Merge tag 'libata-5.7-2020-04-09' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 10 Apr 2020 17:26:28 +0000 (10:26 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 10 Apr 2020 17:26:28 +0000 (10:26 -0700)
Pull libata fixes from Jens Axboe:
 "A few followup changes/fixes for libata:

   - PMP removal fix (Kai-Heng)

   - Add remapped NVMe device attribute to sysfs (Kai-Heng)

   - Remove redundant assignment (Colin)

   - Add yet another Comet Lake ID (Jian-Hong)"

* tag 'libata-5.7-2020-04-09' of git://git.kernel.dk/linux-block:
  ahci: Add Intel Comet Lake PCH RAID PCI ID
  ata: ahci: Add sysfs attribute to show remapped NVMe device count
  ata: ahci-imx: remove redundant assignment to ret
  libata: Return correct status in sata_pmp_eh_recover_pm() when ATA_DFLAG_DETACH is set

791 files changed:
Documentation/ABI/testing/sysfs-fs-f2fs
Documentation/admin-guide/kernel-parameters.txt
Documentation/admin-guide/mm/transhuge.rst
Documentation/admin-guide/mm/userfaultfd.rst
Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
Documentation/devicetree/bindings/display/panel/panel-dpi.yaml
Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
Documentation/devicetree/bindings/input/iqs62x-keys.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt [deleted file]
Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/goodix.yaml
Documentation/devicetree/bindings/mfd/iqs62x.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/mfd/rn5t618.txt
Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt [deleted file]
Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml
Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt [deleted file]
Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.txt
Documentation/devicetree/bindings/thermal/sprd-thermal.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/thermal/thermal.txt
Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml [new file with mode: 0644]
Documentation/driver-api/thermal/cpu-idle-cooling.rst
Documentation/filesystems/9p.rst
Documentation/filesystems/ceph.rst
Documentation/filesystems/f2fs.rst
Documentation/filesystems/overlayfs.rst
Documentation/firmware-guide/acpi/namespace.rst
Documentation/vm/free_page_reporting.rst [new file with mode: 0644]
Documentation/vm/zswap.rst
MAINTAINERS
arch/alpha/include/asm/mmzone.h
arch/alpha/kernel/syscalls/syscallhdr.sh
arch/arm/mach-pxa/cm-x300.c
arch/arm/mach-pxa/colibri-pxa270-income.c
arch/arm/mach-pxa/corgi.c
arch/arm/mach-pxa/ezx.c
arch/arm/mach-pxa/hx4700.c
arch/arm/mach-pxa/lpd270.c
arch/arm/mach-pxa/magician.c
arch/arm/mach-pxa/mainstone.c
arch/arm/mach-pxa/mioa701.c
arch/arm/mach-pxa/palm27x.c
arch/arm/mach-pxa/palmtc.c
arch/arm/mach-pxa/palmte2.c
arch/arm/mach-pxa/pcm990-baseboard.c
arch/arm/mach-pxa/spitz.c
arch/arm/mach-pxa/tavorevb.c
arch/arm/mach-pxa/viper.c
arch/arm/mach-pxa/z2.c
arch/arm/mach-pxa/zylonite.c
arch/arm/mach-s3c24xx/mach-h1940.c
arch/arm/mach-s3c24xx/mach-rx1950.c
arch/arm/mach-s3c64xx/dev-backlight.c
arch/arm/mach-s3c64xx/mach-crag6410.c
arch/arm/mach-s3c64xx/mach-hmt.c
arch/arm/mach-s3c64xx/mach-smartq.c
arch/arm/mach-s3c64xx/mach-smdk6410.c
arch/arm64/Kconfig
arch/arm64/Kconfig.debug
arch/arm64/Makefile
arch/arm64/include/asm/memory.h
arch/arm64/kernel/armv8_deprecated.c
arch/arm64/kvm/Kconfig
arch/csky/mm/fault.c
arch/ia64/kernel/syscalls/syscallhdr.sh
arch/ia64/kernel/vmlinux.lds.S
arch/m68k/68000/timers.c
arch/m68k/coldfire/pit.c
arch/m68k/coldfire/sltimers.c
arch/m68k/coldfire/timers.c
arch/m68k/mm/fault.c
arch/microblaze/kernel/syscalls/syscallhdr.sh
arch/mips/kernel/syscalls/syscallhdr.sh
arch/mips/kvm/Kconfig
arch/mips/mm/fault.c
arch/nds32/kernel/vmlinux.lds.S
arch/parisc/kernel/syscalls/syscallhdr.sh
arch/powerpc/Kconfig
arch/powerpc/configs/ps3_defconfig
arch/powerpc/include/asm/thread_info.h
arch/powerpc/include/asm/unistd.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/ppc_save_regs.S
arch/powerpc/kernel/ptrace/Makefile
arch/powerpc/kernel/signal.c
arch/powerpc/kernel/signal_32.c
arch/powerpc/kernel/syscall_64.c
arch/powerpc/kernel/syscalls/syscallhdr.sh
arch/powerpc/kernel/time.c
arch/powerpc/kernel/vdso.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/e500_mmu_host.c
arch/powerpc/mm/fault.c
arch/powerpc/mm/ioremap.c
arch/powerpc/perf/Makefile
arch/powerpc/perf/callchain.c
arch/powerpc/perf/callchain.h [new file with mode: 0644]
arch/powerpc/perf/callchain_32.c [new file with mode: 0644]
arch/powerpc/perf/callchain_64.c [new file with mode: 0644]
arch/powerpc/perf/imc-pmu.c
arch/powerpc/platforms/powernv/memtrace.c
arch/powerpc/platforms/powernv/opal-imc.c
arch/powerpc/platforms/ps3/os-area.c
arch/powerpc/platforms/pseries/iommu.c
arch/powerpc/platforms/pseries/papr_scm.c
arch/powerpc/platforms/pseries/ras.c
arch/riscv/Kconfig
arch/riscv/Kconfig.socs
arch/riscv/Makefile
arch/riscv/boot/Makefile
arch/riscv/boot/dts/Makefile
arch/riscv/boot/dts/kendryte/Makefile [new file with mode: 0644]
arch/riscv/boot/dts/kendryte/k210.dts [new file with mode: 0644]
arch/riscv/boot/dts/kendryte/k210.dtsi [new file with mode: 0644]
arch/riscv/configs/defconfig
arch/riscv/configs/nommu_k210_defconfig [new file with mode: 0644]
arch/riscv/configs/rv32_defconfig
arch/riscv/include/asm/bug.h
arch/riscv/include/asm/cacheflush.h
arch/riscv/include/asm/cpu_ops.h [new file with mode: 0644]
arch/riscv/include/asm/current.h
arch/riscv/include/asm/fixmap.h
arch/riscv/include/asm/kasan.h
arch/riscv/include/asm/patch.h [new file with mode: 0644]
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/ptdump.h [new file with mode: 0644]
arch/riscv/include/asm/sbi.h
arch/riscv/include/asm/set_memory.h [new file with mode: 0644]
arch/riscv/include/asm/smp.h
arch/riscv/include/asm/soc.h [new file with mode: 0644]
arch/riscv/kernel/Makefile
arch/riscv/kernel/cpu-hotplug.c [new file with mode: 0644]
arch/riscv/kernel/cpu_ops.c [new file with mode: 0644]
arch/riscv/kernel/cpu_ops_sbi.c [new file with mode: 0644]
arch/riscv/kernel/cpu_ops_spinwait.c [new file with mode: 0644]
arch/riscv/kernel/entry.S
arch/riscv/kernel/ftrace.c
arch/riscv/kernel/head.S
arch/riscv/kernel/patch.c [new file with mode: 0644]
arch/riscv/kernel/process.c
arch/riscv/kernel/sbi.c
arch/riscv/kernel/setup.c
arch/riscv/kernel/smpboot.c
arch/riscv/kernel/soc.c [new file with mode: 0644]
arch/riscv/kernel/stacktrace.c
arch/riscv/kernel/traps.c
arch/riscv/kernel/traps_misaligned.c [new file with mode: 0644]
arch/riscv/kernel/vmlinux.lds.S
arch/riscv/lib/uaccess.S
arch/riscv/mm/Makefile
arch/riscv/mm/hugetlbpage.c
arch/riscv/mm/init.c
arch/riscv/mm/pageattr.c [new file with mode: 0644]
arch/riscv/mm/ptdump.c [new file with mode: 0644]
arch/s390/include/asm/qdio.h
arch/s390/kvm/Kconfig
arch/s390/kvm/vsie.c
arch/s390/mm/fault.c
arch/s390/mm/gmap.c
arch/sh/kernel/syscalls/syscallhdr.sh
arch/sh/mm/fault.c
arch/sparc/kernel/syscalls/syscallhdr.sh
arch/sparc/vdso/vdso32/vclock_gettime.c
arch/unicore32/kernel/puv3-nb0916.c
arch/x86/Kconfig
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/entry/vdso/vdso32/vclock_gettime.c
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/acpi/boot.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/svm/avic.c [new file with mode: 0644]
arch/x86/kvm/svm/nested.c [new file with mode: 0644]
arch/x86/kvm/svm/pmu.c [moved from arch/x86/kvm/pmu_amd.c with 100% similarity]
arch/x86/kvm/svm/sev.c [new file with mode: 0644]
arch/x86/kvm/svm/svm.c [moved from arch/x86/kvm/svm.c with 54% similarity]
arch/x86/kvm/svm/svm.h [new file with mode: 0644]
arch/x86/kvm/svm/vmenter.S [new file with mode: 0644]
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/vmenter.S
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/mm/fault.c
arch/x86/mm/numa.c
arch/xtensa/kernel/syscalls/syscallhdr.sh
block/blk-cgroup.c
block/blk-mq.c
block/partitions/core.c
drivers/Kconfig
drivers/Makefile
drivers/acpi/arm64/iort.c
drivers/acpi/ec.c
drivers/acpi/nfit/core.c
drivers/acpi/nfit/nfit.h
drivers/acpi/numa/srat.c
drivers/base/memory.c
drivers/block/loop.c
drivers/block/rbd.c
drivers/cpuidle/cpuidle-haltpoll.c
drivers/crypto/hisilicon/Kconfig
drivers/crypto/marvell/octeontx/otx_cptvf_algs.c
drivers/dax/bus.c
drivers/dax/super.c
drivers/dma-buf/Kconfig
drivers/firmware/efi/libstub/arm64-stub.c
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
drivers/gpu/drm/amd/display/dc/core/dc.c
drivers/gpu/drm/amd/display/dc/core/dc_resource.c
drivers/gpu/drm/amd/display/dc/dc.h
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.c
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_optc.h
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c
drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c
drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c
drivers/gpu/drm/amd/display/dc/dml/dc_features.h
drivers/gpu/drm/amd/display/dc/dml/display_mode_structs.h
drivers/gpu/drm/amd/display/modules/freesync/freesync.c
drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c
drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c
drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h
drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
drivers/gpu/drm/amd/powerplay/navi10_ppt.c
drivers/gpu/drm/amd/powerplay/smu_v11_0.c
drivers/gpu/drm/amd/powerplay/vega20_ppt.c
drivers/gpu/drm/drm_mm.c
drivers/gpu/drm/i915/gvt/cmd_parser.c
drivers/gpu/drm/i915/gvt/display.c
drivers/gpu/drm/i915/gvt/handlers.c
drivers/gpu/drm/i915/gvt/scheduler.c
drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c
drivers/gpu/drm/nouveau/dispnv04/dac.c
drivers/gpu/drm/nouveau/dispnv04/hw.c
drivers/gpu/drm/nouveau/dispnv50/base507c.c
drivers/gpu/drm/nouveau/dispnv50/core507d.c
drivers/gpu/drm/nouveau/dispnv50/corec37d.c
drivers/gpu/drm/nouveau/dispnv50/curs507a.c
drivers/gpu/drm/nouveau/dispnv50/cursc37a.c
drivers/gpu/drm/nouveau/dispnv50/disp.c
drivers/gpu/drm/nouveau/dispnv50/ovly827e.c
drivers/gpu/drm/nouveau/dispnv50/wndw.h
drivers/gpu/drm/nouveau/include/nvif/device.h
drivers/gpu/drm/nouveau/include/nvif/timer.h [new file with mode: 0644]
drivers/gpu/drm/nouveau/include/nvif/user.h
drivers/gpu/drm/nouveau/nouveau_bo.c
drivers/gpu/drm/nouveau/nouveau_debugfs.c
drivers/gpu/drm/nouveau/nouveau_drm.c
drivers/gpu/drm/nouveau/nouveau_drv.h
drivers/gpu/drm/nouveau/nouveau_svm.c
drivers/gpu/drm/nouveau/nvif/Kbuild
drivers/gpu/drm/nouveau/nvif/device.c
drivers/gpu/drm/nouveau/nvif/timer.c [new file with mode: 0644]
drivers/gpu/drm/nouveau/nvif/userc361.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/gf100.c
drivers/gpu/drm/panel/panel-simple.c
drivers/gpu/drm/vboxvideo/vbox_drv.c
drivers/gpu/drm/vc4/vc4_hdmi.c
drivers/gpu/drm/xen/xen_drm_front.c
drivers/hv/hv_balloon.c
drivers/iio/Kconfig
drivers/iio/Makefile
drivers/iio/accel/cros_ec_accel_legacy.c
drivers/iio/adc/Kconfig
drivers/iio/adc/Makefile
drivers/iio/adc/rn5t618-adc.c [new file with mode: 0644]
drivers/iio/common/cros_ec_sensors/cros_ec_lid_angle.c
drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c
drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
drivers/iio/industrialio-core.c
drivers/iio/light/Kconfig
drivers/iio/light/Makefile
drivers/iio/light/cros_ec_light_prox.c
drivers/iio/light/iqs621-als.c [new file with mode: 0644]
drivers/iio/position/Kconfig [new file with mode: 0644]
drivers/iio/position/Makefile [new file with mode: 0644]
drivers/iio/position/iqs624-pos.c [new file with mode: 0644]
drivers/iio/pressure/cros_ec_baro.c
drivers/iio/temperature/Kconfig
drivers/iio/temperature/Makefile
drivers/iio/temperature/iqs620at-temp.c [new file with mode: 0644]
drivers/input/keyboard/Kconfig
drivers/input/keyboard/Makefile
drivers/input/keyboard/iqs62x-keys.c [new file with mode: 0644]
drivers/input/serio/i8042-x86ia64io.h
drivers/input/touchscreen/elants_i2c.c
drivers/input/touchscreen/goodix.c
drivers/input/touchscreen/of_touchscreen.c
drivers/iommu/Kconfig
drivers/iommu/amd_iommu_types.h
drivers/iommu/arm-smmu-v3.c
drivers/iommu/arm-smmu.c
drivers/iommu/intel-iommu.c
drivers/iommu/intel-svm.c
drivers/iommu/iommu.c
drivers/iommu/ipmmu-vmsa.c
drivers/iommu/mtk_iommu.c
drivers/iommu/mtk_iommu_v1.c
drivers/iommu/omap-iommu.c
drivers/iommu/omap-iopgtable.h
drivers/iommu/qcom_iommu.c
drivers/iommu/tegra-gart.c
drivers/iommu/virtio-iommu.c
drivers/leds/Kconfig
drivers/leds/Makefile
drivers/leds/led-class.c
drivers/leds/leds-bd2802.c
drivers/leds/leds-ip30.c [new file with mode: 0644]
drivers/leds/leds-is31fl32xx.c
drivers/leds/leds-lm3532.c
drivers/leds/leds-lm3697.c
drivers/leds/leds-ns2.c
drivers/leds/leds-pwm.c
drivers/md/dm-linear.c
drivers/md/dm-log-writes.c
drivers/md/dm-stripe.c
drivers/md/dm.c
drivers/mfd/Kconfig
drivers/mfd/Makefile
drivers/mfd/aat2870-core.c
drivers/mfd/cros_ec_dev.c
drivers/mfd/da9062-core.c
drivers/mfd/dln2.c
drivers/mfd/intel-lpss-pci.c
drivers/mfd/iqs62x.c [new file with mode: 0644]
drivers/mfd/omap-usb-host.c
drivers/mfd/omap-usb-tll.c
drivers/mfd/qcom-pm8xxx.c
drivers/mfd/rk808.c
drivers/mfd/rn5t618.c
drivers/mfd/sprd-sc27xx-spi.c
drivers/misc/lkdtm/bugs.c
drivers/misc/lkdtm/core.c
drivers/misc/lkdtm/lkdtm.h
drivers/misc/mic/Kconfig
drivers/net/caif/Kconfig
drivers/nvdimm/bus.c
drivers/nvdimm/dimm.c
drivers/nvdimm/dimm_devs.c
drivers/nvdimm/e820.c
drivers/nvdimm/label.h
drivers/nvdimm/namespace_devs.c
drivers/nvdimm/nd.h
drivers/nvdimm/of_pmem.c
drivers/nvdimm/pfn.h
drivers/nvdimm/pfn_devs.c
drivers/nvdimm/pmem.c
drivers/nvdimm/region_devs.c
drivers/nvme/host/core.c
drivers/nvme/host/fc.c
drivers/nvme/host/multipath.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/target/configfs.c
drivers/nvme/target/fc.c
drivers/nvme/target/fcloop.c
drivers/nvme/target/rdma.c
drivers/pci/ats.c
drivers/platform/chrome/Kconfig
drivers/platform/chrome/Makefile
drivers/platform/chrome/chromeos_laptop.c
drivers/platform/chrome/cros_ec.c
drivers/platform/chrome/cros_ec_chardev.c
drivers/platform/chrome/cros_ec_lightbar.c
drivers/platform/chrome/cros_ec_proto.c
drivers/platform/chrome/cros_ec_rpmsg.c
drivers/platform/chrome/cros_ec_sensorhub.c
drivers/platform/chrome/cros_ec_sensorhub_ring.c [new file with mode: 0644]
drivers/platform/chrome/cros_ec_spi.c
drivers/platform/chrome/cros_ec_sysfs.c
drivers/platform/chrome/cros_ec_typec.c [new file with mode: 0644]
drivers/platform/chrome/cros_ec_vbc.c
drivers/platform/chrome/cros_usbpd_notify.c [new file with mode: 0644]
drivers/platform/chrome/wilco_ec/event.c
drivers/platform/chrome/wilco_ec/properties.c
drivers/platform/chrome/wilco_ec/sysfs.c
drivers/power/supply/Kconfig
drivers/power/supply/cros_usbpd-charger.c
drivers/ps3/sys-manager-core.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/rtc-rc5t619.c [new file with mode: 0644]
drivers/s390/block/dcssblk.c
drivers/s390/cio/device.c
drivers/s390/cio/qdio.h
drivers/s390/cio/qdio_debug.c
drivers/s390/cio/qdio_debug.h
drivers/s390/cio/qdio_main.c
drivers/s390/cio/qdio_setup.c
drivers/s390/cio/qdio_thinint.c
drivers/s390/cio/vfio_ccw_drv.c
drivers/s390/net/qeth_core.h
drivers/s390/net/qeth_core_main.c
drivers/s390/scsi/zfcp_qdio.c
drivers/scsi/lpfc/lpfc_nvme.c
drivers/scsi/qla2xxx/qla_nvme.c
drivers/soc/Kconfig
drivers/soc/Makefile
drivers/soc/kendryte/Kconfig [new file with mode: 0644]
drivers/soc/kendryte/Makefile [new file with mode: 0644]
drivers/soc/kendryte/k210-sysctl.c [new file with mode: 0644]
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/cpufreq_cooling.c
drivers/thermal/imx8mm_thermal.c [new file with mode: 0644]
drivers/thermal/imx_sc_thermal.c [new file with mode: 0644]
drivers/thermal/imx_thermal.c
drivers/thermal/intel/int340x_thermal/int3400_thermal.c
drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
drivers/thermal/of-thermal.c
drivers/thermal/qcom/tsens-8960.c
drivers/thermal/qcom/tsens-common.c
drivers/thermal/qcom/tsens-v0_1.c
drivers/thermal/qcom/tsens-v1.c
drivers/thermal/qcom/tsens-v2.c
drivers/thermal/qcom/tsens.c
drivers/thermal/qcom/tsens.h
drivers/thermal/qoriq_thermal.c
drivers/thermal/rcar_gen3_thermal.c
drivers/thermal/rcar_thermal.c
drivers/thermal/samsung/exynos_tmu.c
drivers/thermal/sprd_thermal.c [new file with mode: 0644]
drivers/thermal/st/stm_thermal.c
drivers/thermal/ti-soc-thermal/ti-bandgap.c
drivers/thermal/ti-soc-thermal/ti-bandgap.h
drivers/vdpa/Kconfig [new file with mode: 0644]
drivers/vdpa/Makefile [new file with mode: 0644]
drivers/vdpa/ifcvf/Makefile [new file with mode: 0644]
drivers/vdpa/ifcvf/ifcvf_base.c [new file with mode: 0644]
drivers/vdpa/ifcvf/ifcvf_base.h [new file with mode: 0644]
drivers/vdpa/ifcvf/ifcvf_main.c [new file with mode: 0644]
drivers/vdpa/vdpa.c [new file with mode: 0644]
drivers/vdpa/vdpa_sim/Makefile [new file with mode: 0644]
drivers/vdpa/vdpa_sim/vdpa_sim.c [new file with mode: 0644]
drivers/vhost/Kconfig
drivers/vhost/Kconfig.vringh [deleted file]
drivers/vhost/Makefile
drivers/vhost/iotlb.c [new file with mode: 0644]
drivers/vhost/net.c
drivers/vhost/scsi.c
drivers/vhost/vdpa.c [new file with mode: 0644]
drivers/vhost/vhost.c
drivers/vhost/vhost.h
drivers/vhost/vringh.c
drivers/vhost/vsock.c
drivers/video/backlight/corgi_lcd.c
drivers/video/backlight/pwm_bl.c
drivers/video/fbdev/core/fbcon.c
drivers/virtio/Kconfig
drivers/virtio/Makefile
drivers/virtio/virtio_balloon.c
drivers/virtio/virtio_vdpa.c [new file with mode: 0644]
drivers/watchdog/Kconfig
drivers/watchdog/Makefile
drivers/watchdog/imx2_wdt.c
drivers/watchdog/imx7ulp_wdt.c
drivers/watchdog/imx_sc_wdt.c
drivers/watchdog/npcm_wdt.c
drivers/watchdog/orion_wdt.c
drivers/watchdog/pm8916_wdt.c
drivers/watchdog/qcom-wdt.c
drivers/watchdog/rti_wdt.c [new file with mode: 0644]
drivers/watchdog/watchdog_core.c
drivers/watchdog/watchdog_dev.c
drivers/watchdog/wm831x_wdt.c
drivers/watchdog/ziirave_wdt.c
fs/binfmt_elf.c
fs/ceph/addr.c
fs/ceph/cache.c
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/locks.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.c
fs/ceph/super.h
fs/dax.c
fs/eventpoll.c
fs/f2fs/Kconfig
fs/f2fs/checkpoint.c
fs/f2fs/compress.c
fs/f2fs/data.c
fs/f2fs/debug.c
fs/f2fs/dir.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/inode.c
fs/f2fs/namei.c
fs/f2fs/node.c
fs/f2fs/recovery.c
fs/f2fs/segment.c
fs/f2fs/segment.h
fs/f2fs/shrinker.c
fs/f2fs/super.c
fs/f2fs/sysfs.c
fs/f2fs/xattr.c
fs/f2fs/xattr.h
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
fs/iomap/buffered-io.c
fs/nfs/blocklayout/blocklayout.c
fs/nfs/callback.h
fs/nfs/callback_proc.c
fs/nfs/delegation.c
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/fs_context.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/namespace.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4file.c
fs/nfs/nfs4namespace.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfsroot.c
fs/nfs/nfstrace.h
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/read.c
fs/nfs/super.c
fs/nfs/unlink.c
fs/nfs/write.c
fs/overlayfs/copy_up.c
fs/overlayfs/dir.c
fs/overlayfs/export.c
fs/overlayfs/inode.c
fs/overlayfs/namei.c
fs/overlayfs/overlayfs.h
fs/overlayfs/ovl_entry.h
fs/overlayfs/readdir.c
fs/overlayfs/super.c
fs/overlayfs/util.c
fs/proc/array.c
fs/proc/cpuinfo.c
fs/proc/generic.c
fs/proc/inode.c
fs/proc/internal.h
fs/proc/kmsg.c
fs/proc/stat.c
fs/proc/task_mmu.c
fs/read_write.c
fs/reiserfs/do_balan.c
fs/reiserfs/ioctl.c
fs/reiserfs/namei.c
fs/seq_file.c
fs/userfaultfd.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot_item.c
fs/xfs/xfs_export.c
fs/xfs/xfs_file.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_mount.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_priv.h
include/asm-generic/pgtable.h
include/asm-generic/pgtable_uffd.h [new file with mode: 0644]
include/asm-generic/tlb.h
include/dt-bindings/clock/k210-clk.h [new file with mode: 0644]
include/dt-bindings/leds/common.h
include/linux/acpi.h
include/linux/bitops.h
include/linux/bits.h
include/linux/blk-cgroup.h
include/linux/ceph/ceph_fs.h
include/linux/ceph/debugfs.h
include/linux/ceph/libceph.h
include/linux/ceph/osd_client.h
include/linux/compiler.h
include/linux/compiler_types.h
include/linux/dax.h
include/linux/devfreq_cooling.h
include/linux/device-mapper.h
include/linux/device.h
include/linux/f2fs_fs.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/iio/common/cros_ec_sensors_core.h
include/linux/iio/iio.h
include/linux/io.h
include/linux/iommu.h
include/linux/leds.h
include/linux/leds_pwm.h [deleted file]
include/linux/libnvdimm.h
include/linux/memory.h
include/linux/memory_hotplug.h
include/linux/memremap.h
include/linux/mfd/iqs62x.h [new file with mode: 0644]
include/linux/mfd/rk808.h
include/linux/mfd/rn5t618.h
include/linux/mfd/sc27xx-pmic.h [new file with mode: 0644]
include/linux/mfd/wm831x/pdata.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/nfs_fs.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/numa.h
include/linux/nvme-fc-driver.h
include/linux/page-flags.h
include/linux/page_reporting.h [new file with mode: 0644]
include/linux/pagemap.h
include/linux/percpu_counter.h
include/linux/platform_data/cros_ec_proto.h
include/linux/platform_data/cros_ec_sensorhub.h
include/linux/platform_data/cros_usbpd_notify.h [new file with mode: 0644]
include/linux/platform_data/leds-kirkwood-ns2.h [deleted file]
include/linux/platform_data/wilco-ec.h
include/linux/proc_fs.h
include/linux/pwm_backlight.h
include/linux/seq_file.h
include/linux/shmem_fs.h
include/linux/spi/corgi_lcd.h
include/linux/stackdepot.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/xdr.h
include/linux/swapops.h
include/linux/thermal.h
include/linux/userfaultfd_k.h
include/linux/vdpa.h [new file with mode: 0644]
include/linux/vhost_iotlb.h [new file with mode: 0644]
include/linux/vm_event_item.h
include/linux/vringh.h
include/trace/events/f2fs.h
include/trace/events/huge_memory.h
include/trace/events/mmflags.h
include/trace/events/rpcrdma.h
include/trace/events/vmscan.h
include/uapi/linux/input-event-codes.h
include/uapi/linux/userfaultfd.h
include/uapi/linux/vhost.h
include/uapi/linux/vhost_types.h
include/uapi/linux/virtio_balloon.h
include/uapi/linux/virtio_iommu.h
include/uapi/linux/virtio_net.h
init/Kconfig
ipc/mqueue.c
ipc/shm.c
ipc/util.c
kernel/configs/tiny.config
kernel/events/core.c
kernel/extable.c
kernel/fork.c
kernel/gcov/fs.c
kernel/gcov/gcc_3_4.c
kernel/gcov/gcc_4_7.c
kernel/kallsyms.c
kernel/kmod.c
kernel/module.c
kernel/power/user.c
kernel/sched/fair.c
lib/Kconfig
lib/Kconfig.debug
lib/Kconfig.ubsan
lib/Makefile
lib/bch.c
lib/dynamic_debug.c
lib/rbtree.c
lib/scatterlist.c
lib/stackdepot.c
lib/test_bitmap.c
lib/test_kmod.c
lib/test_lockup.c [new file with mode: 0644]
lib/test_stackinit.c
lib/ts_bm.c
lib/ts_fsm.c
lib/ts_kmp.c
lib/ubsan.c
mm/Kconfig
mm/Makefile
mm/backing-dev.c
mm/compaction.c
mm/dmapool.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/hugetlb_cgroup.c
mm/internal.h
mm/kasan/common.c
mm/kasan/report.c
mm/khugepaged.c
mm/ksm.c
mm/list_lru.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/memremap.c
mm/migrate.c
mm/mm_init.c
mm/mmap.c
mm/mprotect.c
mm/page_alloc.c
mm/page_ext.c
mm/page_isolation.c
mm/page_reporting.c [new file with mode: 0644]
mm/page_reporting.h [new file with mode: 0644]
mm/rmap.c
mm/shmem.c
mm/shuffle.c
mm/shuffle.h
mm/slab_common.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swapfile.c
mm/userfaultfd.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
mm/zsmalloc.c
mm/zswap.c
net/ceph/debugfs.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/clnt.c
net/sunrpc/sched.c
net/sunrpc/xdr.c
net/sunrpc/xprtrdma/backchannel.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
samples/hw_breakpoint/data_breakpoint.c
scripts/Makefile.ubsan
scripts/checkpatch.pl
tools/lib/rbtree.c
tools/testing/nvdimm/Kbuild
tools/testing/nvdimm/test/Kbuild
tools/testing/nvdimm/test/nfit.c
tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
tools/testing/selftests/powerpc/eeh/eeh-basic.sh
tools/testing/selftests/powerpc/tm/Makefile
tools/testing/selftests/vm/userfaultfd.c
tools/thermal/tmon/tmon.c
tools/virtio/Makefile

diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 1a6cd53..bd8a0d1 100644 (file)
@@ -318,3 +318,8 @@ Date:               September 2019
 Contact:       "Hridya Valsaraju" <hridya@google.com>
 Description:   Average number of valid blocks.
                Available when CONFIG_F2FS_STAT_FS=y.
+
+What:          /sys/fs/f2fs/<disk>/mounted_time_sec
+Date:          February 2020
+Contact:       "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:   Show the mounted time in secs of this partition.
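
The new attribute behaves like any other read-only sysfs file; a minimal, hypothetical reader in C (the partition name "sda1" and the trivial error handling are assumptions, not taken from the patch):

    #include <stdio.h>

    int main(void)
    {
            /* Path follows the ABI entry above: /sys/fs/f2fs/<disk>/mounted_time_sec.
             * "sda1" is only a placeholder partition name. */
            FILE *f = fopen("/sys/fs/f2fs/sda1/mounted_time_sec", "r");
            long secs;

            if (f && fscanf(f, "%ld", &secs) == 1)
                    printf("mounted for %ld seconds\n", secs);
            if (f)
                    fclose(f);
            return 0;
    }
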
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2d31d81..86aae1f 100644 (file)
                        For details see: Documentation/admin-guide/hw-vuln/mds.rst
 
        mem=nn[KMG]     [KNL,BOOT] Force usage of a specific amount of memory
-                       Amount of memory to be used when the kernel is not able
-                       to see the whole system memory or for test.
+                       Amount of memory to be used in cases as follows:
+
+                       1 for test;
+                       2 when the kernel is not able to see the whole system memory;
+                       3 memory that lies after 'mem=' boundary is excluded from
+                        the hypervisor, then assigned to KVM guests.
+
                        [X86] Work as limiting max address. Use together
                        with memmap= to avoid physical address space collisions.
                        Without memmap= PCI devices could be placed at addresses
                        belonging to unused RAM.
 
+                       Note that this only takes effects during boot time since
+                       in above case 3, memory may need be hot added after boot
+                       if system memory of hypervisor is not sufficient.
+
        mem=nopentium   [BUGS=X86-32] Disable usage of 4MB pages for kernel
                        memory.
 
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index bd57145..2f31de8 100644 (file)
@@ -310,6 +310,11 @@ thp_fault_fallback
        is incremented if a page fault fails to allocate
        a huge page and instead falls back to using small pages.
 
+thp_fault_fallback_charge
+       is incremented if a page fault fails to charge a huge page and
+       instead falls back to using small pages even though the
+       allocation was successful.
+
 thp_collapse_alloc_failed
        is incremented if khugepaged found a range
        of pages that should be collapsed into one huge page but failed
@@ -319,6 +324,15 @@ thp_file_alloc
        is incremented every time a file huge page is successfully
        allocated.
 
+thp_file_fallback
+       is incremented if a file huge page is attempted to be allocated
+       but fails and instead falls back to using small pages.
+
+thp_file_fallback_charge
+       is incremented if a file huge page cannot be charged and instead
+       falls back to using small pages even though the allocation was
+       successful.
+
 thp_file_mapped
        is incremented every time a file huge page is mapped into
        user address space.
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 5048cf6..c30176e 100644 (file)
@@ -108,6 +108,57 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
 half copied page since it'll keep userfaulting until the copy has
 finished.
 
+Notes:
+
+- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then
+  you must provide some kind of page in your thread after reading from
+  the uffd.  You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE.
+  The normal behavior of the OS automatically providing a zero page on
+  an annonymous mmaping is not in place.
+
+- None of the page-delivering ioctls default to the range that you
+  registered with.  You must fill in all fields for the appropriate
+  ioctl struct including the range.
+
+- You get the address of the access that triggered the missing page
+  event out of a struct uffd_msg that you read in the thread from the
+  uffd.  You can supply as many pages as you want with UFFDIO_COPY or
+  UFFDIO_ZEROPAGE.  Keep in mind that unless you used DONTWAKE then
+  the first of any of those IOCTLs wakes up the faulting thread.
+
+- Be sure to test for all errors including (pollfd[0].revents &
+  POLLERR).  This can happen, e.g. when ranges supplied were
+  incorrect.
+
+Write Protect Notifications
+---------------------------
+
+This is equivalent to (but faster than) using mprotect and a SIGSEGV
+signal handler.
+
+Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP.
+Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT,
+struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP
+in the struct passed in.  The range does not default to and does not
+have to be identical to the range you registered with.  You can write
+protect as many ranges as you like (inside the registered range).
+Then, in the thread reading from uffd the struct will have
+msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send
+ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again
+while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set.
+This wakes up the thread which will continue to run with writes. This
+allows you to do the bookkeeping about the write in the uffd reading
+thread before the ioctl.
+
+If you registered with both UFFDIO_REGISTER_MODE_MISSING and
+UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in
+which you supply a page and undo write protect.  Note that there is a
+difference between writes into a WP area and into a !WP area.  The
+former will have UFFD_PAGEFAULT_FLAG_WP set, the latter
+UFFD_PAGEFAULT_FLAG_WRITE.  The latter did not fail on protection but
+you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was
+used.
+
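
As a rough sketch of the flow these notes describe, the following hypothetical monitor-thread routine registers a range for both MISSING and WP events, answers missing-page faults with UFFDIO_COPY and lifts write protection on WP faults. It is illustrative only: error handling is trimmed, and write protection is assumed to have been armed earlier with the same UFFDIO_WRITEPROTECT ioctl, passing UFFDIO_WRITEPROTECT_MODE_WP in mode.

    /* Illustrative sketch, not taken from the kernel tree. */
    #include <fcntl.h>
    #include <poll.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    static void monitor_range(void *area, size_t len, size_t page_size)
    {
            int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

            struct uffdio_api api = { .api = UFFD_API };
            ioctl(uffd, UFFDIO_API, &api);

            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)area, .len = len },
                    .mode  = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP,
            };
            ioctl(uffd, UFFDIO_REGISTER, &reg);

            /* A page of prepared contents to hand out via UFFDIO_COPY. */
            void *src = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            memset(src, 0, page_size);

            for (;;) {
                    struct pollfd pfd = { .fd = uffd, .events = POLLIN };
                    struct uffd_msg msg;

                    poll(&pfd, 1, -1);
                    if (pfd.revents & POLLERR)      /* e.g. a bad range was supplied */
                            break;
                    if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
                        msg.event != UFFD_EVENT_PAGEFAULT)
                            continue;

                    __u64 addr = msg.arg.pagefault.address & ~((__u64)page_size - 1);

                    if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
                            /* Write to a write-protected page: do the bookkeeping
                             * here, then clear WP so the writer resumes. */
                            struct uffdio_writeprotect wp = {
                                    .range = { .start = addr, .len = page_size },
                                    .mode  = 0,     /* 0 == remove write protection */
                            };
                            ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
                    } else {
                            /* Missing page: supply contents; unless DONTWAKE is set
                             * this also wakes the faulting thread. */
                            struct uffdio_copy copy = {
                                    .dst = addr,
                                    .src = (unsigned long)src,
                                    .len = page_size,
                            };
                            ioctl(uffd, UFFDIO_COPY, &copy);
                    }
            }
    }
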
 QEMU/KVM
 ========
 
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
index 8cac6fa..623fedf 100644 (file)
@@ -166,6 +166,17 @@ Required properties:
               followed by "fsl,imx-sc-key";
 - linux,keycodes: See Documentation/devicetree/bindings/input/input.yaml
 
+Thermal bindings based on SCU Message Protocol
+------------------------------------------------------------
+
+Required properties:
+- compatible:                  Should be :
+                                 "fsl,imx8qxp-sc-thermal"
+                               followed by "fsl,imx-sc-thermal";
+
+- #thermal-sensor-cells:       See Documentation/devicetree/bindings/thermal/thermal.txt
+                               for a description.
+
 Example (imx8qxp):
 -------------
 aliases {
@@ -238,6 +249,11 @@ firmware {
                        compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
                        timeout-sec = <60>;
                };
+
+               tsens: thermal-sensor {
+                       compatible = "fsl,imx8qxp-sc-thermal", "fsl,imx-sc-thermal";
+                       #thermal-sensor-cells = <1>;
+               };
        };
 };
 
diff --git a/Documentation/devicetree/bindings/display/panel/panel-dpi.yaml b/Documentation/devicetree/bindings/display/panel/panel-dpi.yaml
index f638703..0cd74c8 100644 (file)
@@ -21,15 +21,6 @@ properties:
       - {}
       - const: panel-dpi
 
-  data-mapping:
-    enum:
-      - rgb24
-      - rgb565
-      - bgr666
-    description: |
-      Describes the media format, how the display panel is connected
-      to the display interface.
-
   backlight: true
   enable-gpios: true
   height-mm: true
@@ -52,7 +43,6 @@ examples:
         compatible = "osddisplays,osd057T0559-34ts", "panel-dpi";
         label = "osddisplay";
         power-supply = <&vcc_supply>;
-        data-mapping = "rgb565";
         backlight = <&backlight>;
 
         port {
diff --git a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
index cac61a9..eb04c23 100644 (file)
@@ -65,7 +65,7 @@ properties:
   ports:
     type: object
     description:
-      Ports as described in Documentation/devictree/bindings/graph.txt
+      Ports as described in Documentation/devicetree/bindings/graph.txt
     properties:
       "#address-cells":
         const: 1
@@ -121,7 +121,7 @@ examples:
     #include <dt-bindings/interrupt-controller/irq.h>
     #include <dt-bindings/soc/ti,sci_pm_domain.h>
 
-    dss: dss@04a00000 {
+    dss: dss@4a00000 {
             compatible = "ti,am65x-dss";
             reg =   <0x0 0x04a00000 0x0 0x1000>, /* common */
                     <0x0 0x04a02000 0x0 0x1000>, /* vidl1 */
diff --git a/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
index ade9b2f..eb4b1a2 100644 (file)
@@ -98,7 +98,7 @@ properties:
   ports:
     type: object
     description:
-      Ports as described in Documentation/devictree/bindings/graph.txt
+      Ports as described in Documentation/devicetree/bindings/graph.txt
     properties:
       "#address-cells":
         const: 1
@@ -154,7 +154,7 @@ examples:
     #include <dt-bindings/interrupt-controller/irq.h>
     #include <dt-bindings/soc/ti,sci_pm_domain.h>
 
-    dss: dss@04a00000 {
+    dss: dss@4a00000 {
             compatible = "ti,j721e-dss";
             reg =   <0x00 0x04a00000 0x00 0x10000>, /* common_m */
                     <0x00 0x04a10000 0x00 0x10000>, /* common_s0*/
diff --git a/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
index 385bd06..8f87b82 100644 (file)
@@ -56,7 +56,7 @@ properties:
   port:
     type: object
     description:
-      Port as described in Documentation/devictree/bindings/graph.txt.
+      Port as described in Documentation/devicetree/bindings/graph.txt.
       The DSS DPI output port node
 
   max-memory-bandwidth:
@@ -81,7 +81,7 @@ examples:
     #include <dt-bindings/interrupt-controller/arm-gic.h>
     #include <dt-bindings/interrupt-controller/irq.h>
 
-    dss: dss@02540000 {
+    dss: dss@2540000 {
             compatible = "ti,k2g-dss";
             reg =   <0x02540000 0x400>,
                     <0x02550000 0x1000>,
diff --git a/Documentation/devicetree/bindings/input/iqs62x-keys.yaml b/Documentation/devicetree/bindings/input/iqs62x-keys.yaml
new file mode 100644 (file)
index 0000000..5625c22
--- /dev/null
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/iqs62x-keys.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS620A/621/622/624/625 Keys and Switches
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS620A, IQS621, IQS622, IQS624 and IQS625 multi-function sensors
+  feature a variety of self-capacitive, mutual-inductive and Hall-effect sens-
+  ing capabilities that can facilitate a variety of contactless key and switch
+  applications.
+
+  These functions are collectively represented by a "keys" child node from the
+  parent MFD driver. See Documentation/devicetree/bindings/mfd/iqs62x.yaml for
+  further details and examples. Sensor hardware configuration (self-capacitive
+  vs. mutual-inductive, etc.) is selected based on the device's firmware.
+
+properties:
+  compatible:
+    enum:
+      - azoteq,iqs620a-keys
+      - azoteq,iqs621-keys
+      - azoteq,iqs622-keys
+      - azoteq,iqs624-keys
+      - azoteq,iqs625-keys
+
+  linux,keycodes:
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32-array
+      - minItems: 1
+        maxItems: 16
+    description: |
+      Specifies the numeric keycodes associated with each available touch or
+      proximity event according to the following table. An 'x' indicates the
+      event is supported for a given device. Specify 0 for unused events.
+
+      -------------------------------------------------------------------------
+      | #  | Event              | IQS620A | IQS621 | IQS622 | IQS624 | IQS625 |
+      -------------------------------------------------------------------------
+      | 0  | CH0 Touch          |    x    |    x   |    x   |    x   |    x   |
+      |    | Antenna 1 Touch*   |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 1  | CH0 Proximity      |    x    |    x   |    x   |    x   |    x   |
+      |    | Antenna 1 Prox.*   |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 2  | CH1 Touch          |    x    |    x   |    x   |    x   |    x   |
+      |    | Ant. 1 Deep Touch* |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 3  | CH1 Proximity      |    x    |    x   |    x   |    x   |    x   |
+      -------------------------------------------------------------------------
+      | 4  | CH2 Touch          |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 5  | CH2 Proximity      |    x    |        |        |        |        |
+      |    | Antenna 2 Prox.*   |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 6  | Metal (+) Touch**  |    x    |    x   |        |        |        |
+      |    | Ant. 2 Deep Touch* |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 7  | Metal (+) Prox.**  |    x    |    x   |        |        |        |
+      |    | Antenna 2 Touch*   |    x    |        |        |        |        |
+      -------------------------------------------------------------------------
+      | 8  | Metal (-) Touch**  |    x    |    x   |        |        |        |
+      -------------------------------------------------------------------------
+      | 9  | Metal (-) Prox.**  |    x    |    x   |        |        |        |
+      -------------------------------------------------------------------------
+      | 10 | SAR Active***      |    x    |        |    x   |        |        |
+      -------------------------------------------------------------------------
+      | 11 | SAR Quick Rel.***  |    x    |        |    x   |        |        |
+      -------------------------------------------------------------------------
+      | 12 | SAR Movement***    |    x    |        |    x   |        |        |
+      -------------------------------------------------------------------------
+      | 13 | SAR Filter Halt*** |    x    |        |    x   |        |        |
+      -------------------------------------------------------------------------
+      | 14 | Wheel Up           |         |        |        |    x   |        |
+      -------------------------------------------------------------------------
+      | 15 | Wheel Down         |         |        |        |    x   |        |
+      -------------------------------------------------------------------------
+      *   Two-channel SAR. Replaces CH0-2 plus metal touch and proximity events
+          if enabled via firmware.
+      **  "+" and "-" refer to the polarity of a channel's delta (LTA - counts),
+          where "LTA" is defined as the channel's long-term average.
+      *** One-channel SAR. Replaces CH0-2 touch and proximity events if enabled
+          via firmware.
+
+patternProperties:
+  "^hall-switch-(north|south)$":
+    type: object
+    description:
+      Represents north/south-field Hall-effect sensor touch or proximity
+      events. Note that north/south-field orientation is reversed on the
+      IQS620AXzCSR device due to its flip-chip package.
+
+    properties:
+      linux,code:
+        $ref: /schemas/types.yaml#/definitions/uint32
+        description: Numeric switch code associated with the event.
+
+      azoteq,use-prox:
+        $ref: /schemas/types.yaml#/definitions/flag
+        description:
+          If present, specifies that Hall-effect sensor reporting should
+          use the device's wide-range proximity threshold instead of its
+          close-range touch threshold (default).
+
+    required:
+      - linux,code
+
+    additionalProperties: false
+
+if:
+  properties:
+    compatible:
+      contains:
+        enum:
+          - azoteq,iqs624-keys
+          - azoteq,iqs625-keys
+then:
+  patternProperties:
+    "^hall-switch-(north|south)$": false
+
+required:
+  - compatible
+  - linux,keycodes
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.txt
deleted file mode 100644 (file)
index 0e57315..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-FocalTech EDT-FT5x06 Polytouch driver
-=====================================
-
-There are 5 variants of the chip for various touch panel sizes
-FT5206GE1  2.8" .. 3.8"
-FT5306DE4  4.3" .. 7"
-FT5406EE8  7"   .. 8.9"
-FT5506EEG  7"   .. 8.9"
-FT5726NEI  5.7” .. 11.6"
-
-The software interface is identical for all those chips, so that
-currently there is no need for the driver to distinguish between the
-different chips. Nevertheless distinct compatible strings are used so
-that a distinction can be added if necessary without changing the DT
-bindings.
-
-
-Required properties:
- - compatible:  "edt,edt-ft5206"
-           or:  "edt,edt-ft5306"
-           or:  "edt,edt-ft5406"
-           or:  "edt,edt-ft5506"
-           or:  "evervision,ev-ft5726"
-           or:  "focaltech,ft6236"
-
- - reg:         I2C slave address of the chip (0x38)
- - interrupts:       interrupt specification for the touchdetect
-                     interrupt
-
-Optional properties:
- - reset-gpios: GPIO specification for the RESET input
- - wake-gpios:  GPIO specification for the WAKE input
- - vcc-supply:  Regulator that supplies the touchscreen
-
- - pinctrl-names: should be "default"
- - pinctrl-0:   a phandle pointing to the pin settings for the
-                control gpios
-
- - wakeup-source: If present the device will act as wakeup-source
-
- - threshold:   allows setting the "click"-threshold in the range
-                from 0 to 80.
-
- - gain:        allows setting the sensitivity in the range from 0 to
-                31. Note that lower values indicate higher
-                sensitivity.
-
- - offset:      allows setting the edge compensation in the range from
-                0 to 31.
-
- - offset-x:    Same as offset, but applies only to the horizontal position.
-                Range from 0 to 80, only supported by evervision,ev-ft5726
-                devices.
-
- - offset-y:    Same as offset, but applies only to the vertical position.
-                Range from 0 to 80, only supported by evervision,ev-ft5726
-                devices.
-
- - touchscreen-size-x     : See touchscreen.txt
- - touchscreen-size-y     : See touchscreen.txt
- - touchscreen-fuzz-x      : See touchscreen.txt
- - touchscreen-fuzz-y      : See touchscreen.txt
- - touchscreen-inverted-x  : See touchscreen.txt
- - touchscreen-inverted-y  : See touchscreen.txt
- - touchscreen-swapped-x-y : See touchscreen.txt
-
-Example:
-       polytouch: edt-ft5x06@38 {
-               compatible = "edt,edt-ft5406", "edt,edt-ft5x06";
-               reg = <0x38>;
-               pinctrl-names = "default";
-               pinctrl-0 = <&edt_ft5x06_pins>;
-               interrupt-parent = <&gpio2>;
-               interrupts = <5 IRQ_TYPE_EDGE_FALLING>;
-               reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>;
-               wake-gpios = <&gpio4 9 GPIO_ACTIVE_HIGH>;
-       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml
new file mode 100644 (file)
index 0000000..8d58709
--- /dev/null
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/input/touchscreen/edt-ft5x06.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: FocalTech EDT-FT5x06 Polytouch Bindings
+
+description: |
+             There are 5 variants of the chip for various touch panel sizes
+              FT5206GE1  2.8" .. 3.8"
+              FT5306DE4  4.3" .. 7"
+              FT5406EE8  7"   .. 8.9"
+              FT5506EEG  7"   .. 8.9"
+              FT5726NEI  5.7” .. 11.6"
+
+maintainers:
+  - Dmitry Torokhov <dmitry.torokhov@gmail.com>
+
+allOf:
+  - $ref: touchscreen.yaml#
+  - if:
+     properties:
+       compatible:
+         contains:
+           enum:
+             - evervision,ev-ft5726
+
+    then:
+      properties:
+        offset-x: true
+        offset-y: true
+
+properties:
+  compatible:
+    enum:
+      - edt,edt-ft5206
+      - edt,edt-ft5306
+      - edt,edt-ft5406
+      - edt,edt-ft5506
+      - evervision,ev-ft5726
+      - focaltech,ft6236
+
+  reg:
+    const: 0x38
+
+  interrupts:
+    maxItems: 1
+
+  reset-gpios:
+    maxItems: 1
+
+  wake-gpios:
+    maxItems: 1
+
+  wakeup-source: true
+
+  vcc-supply:
+    maxItems: 1
+
+  gain:
+    description: Allows setting the sensitivity in the range from 0 to 31.
+                 Note that lower values indicate higher sensitivity.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 0
+      - maximum: 31
+
+  offset:
+    description: Allows setting the edge compensation in the range from 0 to 31.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 0
+      - maximum: 31
+
+  offset-x:
+    description: Same as offset, but applies only to the horizontal position.
+                 Range from 0 to 80, only supported by evervision,ev-ft5726 devices.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 0
+      - maximum: 80
+
+  offset-y:
+    description: Same as offset, but applies only to the vertical position.
+                 Range from 0 to 80, only supported by evervision,ev-ft5726 devices.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - minimum: 0
+      - maximum: 80
+
+  touchscreen-size-x: true
+  touchscreen-size-y: true
+  touchscreen-fuzz-x: true
+  touchscreen-fuzz-y: true
+  touchscreen-inverted-x: true
+  touchscreen-inverted-y: true
+  touchscreen-swapped-x-y: true
+  interrupt-controller: true
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    i2c@00000000 {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      edt-ft5x06@38 {
+        compatible = "edt,edt-ft5406";
+        reg = <0x38>;
+        interrupt-parent = <&gpio2>;
+        interrupts = <5 IRQ_TYPE_EDGE_FALLING>;
+        reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>;
+        wake-gpios = <&gpio4 9 GPIO_ACTIVE_HIGH>;
+      };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/input/touchscreen/goodix.yaml b/Documentation/devicetree/bindings/input/touchscreen/goodix.yaml
index c99ed39..c8ea943 100644 (file)
@@ -21,6 +21,8 @@ properties:
       - goodix,gt911
       - goodix,gt9110
       - goodix,gt912
+      - goodix,gt9147
+      - goodix,gt917s
       - goodix,gt927
       - goodix,gt9271
       - goodix,gt928
diff --git a/Documentation/devicetree/bindings/mfd/iqs62x.yaml b/Documentation/devicetree/bindings/mfd/iqs62x.yaml
new file mode 100644 (file)
index 0000000..541b06d
--- /dev/null
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mfd/iqs62x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS620A/621/622/624/625 Multi-Function Sensors
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS620A, IQS621, IQS622, IQS624 and IQS625 multi-function sensors
+  integrate multiple sensing technologies in a single package.
+
+  Link to datasheets: https://www.azoteq.com/
+
+properties:
+  compatible:
+    enum:
+      - azoteq,iqs620a
+      - azoteq,iqs621
+      - azoteq,iqs622
+      - azoteq,iqs624
+      - azoteq,iqs625
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  firmware-name:
+    $ref: /schemas/types.yaml#/definitions/string
+    description:
+      Specifies the name of the calibration and configuration file selected by
+      the driver. If this property is omitted, the name is chosen based on the
+      device name with ".bin" as the extension (e.g. iqs620a.bin for IQS620A).
+
+  keys:
+    $ref: ../input/iqs62x-keys.yaml
+
+  pwm:
+    $ref: ../pwm/iqs620a-pwm.yaml
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+additionalProperties: false
+
+examples:
+  - |
+    /*
+     * Dual capacitive buttons with proximity-activated function, unipolar lid
+     * switch and panel-mounted LED.
+     */
+    #include <dt-bindings/input/input.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            iqs620a@44 {
+                    compatible = "azoteq,iqs620a";
+                    reg = <0x44>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+                    keys {
+                            compatible = "azoteq,iqs620a-keys";
+
+                            linux,keycodes = <KEY_SELECT>,
+                                             <KEY_MENU>,
+                                             <KEY_OK>,
+                                             <KEY_MENU>;
+
+                            hall-switch-south {
+                                    linux,code = <SW_LID>;
+                                    azoteq,use-prox;
+                            };
+                    };
+
+                    iqs620a_pwm: pwm {
+                            compatible = "azoteq,iqs620a-pwm";
+                            #pwm-cells = <2>;
+                    };
+            };
+    };
+
+    pwmleds {
+            compatible = "pwm-leds";
+
+            panel {
+                    pwms = <&iqs620a_pwm 0 1000000>;
+                    max-brightness = <255>;
+            };
+    };
+
+  - |
+    /* Single inductive button with bipolar dock/tablet-mode switch. */
+    #include <dt-bindings/input/input.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            iqs620a@44 {
+                    compatible = "azoteq,iqs620a";
+                    reg = <0x44>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+                    firmware-name = "iqs620a_coil.bin";
+
+                    keys {
+                            compatible = "azoteq,iqs620a-keys";
+
+                            linux,keycodes = <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <KEY_MUTE>;
+
+                            hall-switch-north {
+                                    linux,code = <SW_DOCK>;
+                            };
+
+                            hall-switch-south {
+                                    linux,code = <SW_TABLET_MODE>;
+                            };
+                    };
+            };
+    };
+
+  - |
+    /* Dual capacitive buttons with volume knob. */
+    #include <dt-bindings/input/input.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    i2c {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            iqs624@44 {
+                    compatible = "azoteq,iqs624";
+                    reg = <0x44>;
+                    interrupt-parent = <&gpio>;
+                    interrupts = <17 IRQ_TYPE_LEVEL_LOW>;
+
+                    keys {
+                            compatible = "azoteq,iqs624-keys";
+
+                            linux,keycodes = <BTN_0>,
+                                             <0>,
+                                             <BTN_1>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <0>,
+                                             <KEY_VOLUMEUP>,
+                                             <KEY_VOLUMEDOWN>;
+                    };
+            };
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/mfd/rn5t618.txt b/Documentation/devicetree/bindings/mfd/rn5t618.txt
index b74e5e9..16778ea 100644 (file)
@@ -15,6 +15,8 @@ Required properties:
  - reg: the I2C slave address of the device
 
 Optional properties:
+ - interrupts: interrupt mapping for IRQ
+   See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
  - system-power-controller:
    See Documentation/devicetree/bindings/power/power-controller.txt
 
@@ -32,6 +34,8 @@ Example:
        pmic@32 {
                compatible = "ricoh,rn5t618";
                reg = <0x32>;
+               interrupt-parent = <&gpio5>;
+               interrupts = <11 IRQ_TYPE_EDGE_FALLING>;
                system-power-controller;
 
                regulators {
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
deleted file mode 100644 (file)
index f22d74c..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-* ROHM BD71837 and BD71847 Power Management Integrated Circuit bindings
-
-BD71837MWV and BD71847MWV are programmable Power Management ICs for powering
-single-core, dual-core, and quad-core SoCs such as NXP-i.MX 8M. They are
-optimized for low BOM cost and compact solution footprint. BD71837MWV
-integrates 8 Buck regulators and 7 LDOs. BD71847MWV contains 6 Buck regulators
-and 6 LDOs.
-
-Datasheet for BD71837 is available at:
-https://www.rohm.com/datasheet/BD71837MWV/bd71837mwv-e
-Datasheet for BD71847 is available at:
-https://www.rohm.com/datasheet/BD71847AMWV/bd71847amwv-e
-
-Required properties:
- - compatible          : Should be "rohm,bd71837" for bd71837
-                                   "rohm,bd71847" for bd71847.
- - reg                 : I2C slave address.
- - interrupt-parent    : Phandle to the parent interrupt controller.
- - interrupts          : The interrupt line the device is connected to.
- - clocks              : The parent clock connected to PMIC. If this is missing
-                         32768 KHz clock is assumed.
- - #clock-cells                : Should be 0.
- - regulators:         : List of child nodes that specify the regulators.
-                         Please see ../regulator/rohm,bd71837-regulator.txt
-
-Optional properties:
-- clock-output-names   : Should contain name for output clock.
-- rohm,reset-snvs-powered : Transfer BD718x7 to SNVS state at reset.
-
-The BD718x7 supports two different HW states as reset target states. States
-are called as SNVS and READY. At READY state all the PMIC power outputs go
-down and OTP is reload. At the SNVS state all other logic and external
-devices apart from the SNVS power domain are shut off. Please refer to NXP
-i.MX8 documentation for further information regarding SNVS state. When a
-reset is done via SNVS state the PMIC OTP data is not reload. This causes
-power outputs that have been under SW control to stay down when reset has
-switched power state to SNVS. If reset is done via READY state the power
-outputs will be returned to HW control by OTP loading. Thus the reset
-target state is set to READY by default. If SNVS state is used the boot
-crucial regulators must have the regulator-always-on and regulator-boot-on
-properties set in regulator node.
-
-- rohm,short-press-ms  : Short press duration in milliseconds
-- rohm,long-press-ms   : Long press duration in milliseconds
-
-Configure the "short press" and "long press" timers for the power button.
-Values are rounded to what hardware supports (500ms multiple for short and
-1000ms multiple for long). If these properties are not present the existing
-configuration (from bootloader or OTP) is not touched.
-
-Example:
-
-       /* external oscillator node */
-       osc: oscillator {
-               compatible = "fixed-clock";
-               #clock-cells = <1>;
-               clock-frequency  = <32768>;
-               clock-output-names = "osc";
-       };
-
-       pmic: pmic@4b {
-               compatible = "rohm,bd71837";
-               reg = <0x4b>;
-               interrupt-parent = <&gpio1>;
-               interrupts = <29 GPIO_ACTIVE_LOW>;
-               interrupt-names = "irq";
-               #clock-cells = <0>;
-               clocks = <&osc 0>;
-               clock-output-names = "bd71837-32k-out";
-               rohm,reset-snvs-powered;
-
-               regulators {
-                       buck1: BUCK1 {
-                               regulator-name = "buck1";
-                               regulator-min-microvolt = <700000>;
-                               regulator-max-microvolt = <1300000>;
-                               regulator-boot-on;
-                               regulator-always-on;
-                               regulator-ramp-delay = <1250>;
-                       };
-                       // [...]
-               };
-       };
-
-       /* Clock consumer node */
-       rtc@0 {
-               compatible = "company,my-rtc";
-               clock-names = "my-clock";
-               clocks = <&pmic>;
-       };
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml
new file mode 100644 (file)
index 0000000..aa922c5
--- /dev/null
@@ -0,0 +1,236 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mfd/rohm,bd71837-pmic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71837 Power Management Integrated Circuit bindings
+
+maintainers:
+  - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+  BD71837MWV is a programmable Power Management IC for powering single-core,
+  dual-core, and quad-core SoCs such as NXP-i.MX 8M. It is optimized for low
+  BOM cost and compact solution footprint. BD71837MWV integrates 8 Buck
+  regulators and 7 LDOs.
+  Datasheet for BD71837 is available at
+  https://www.rohm.com/products/power-management/power-management-ic-for-system/industrial-consumer-applications/nxp-imx/bd71837amwv-product
+
+properties:
+  compatible:
+    const: rohm,bd71837
+
+  reg:
+    description:
+      I2C slave address.
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  "#clock-cells":
+    const: 0
+
+# The BD718x7 supports two different HW states as reset target states. The
+# states are called SNVS and READY. In the READY state all the PMIC power
+# outputs go down and the OTP is reloaded. In the SNVS state all other logic
+# and external devices apart from the SNVS power domain are shut off. Please
+# refer to NXP i.MX8 documentation for further information regarding the SNVS
+# state. When a reset is done via the SNVS state the PMIC OTP data is not
+# reloaded. This causes power outputs that have been under SW control to stay
+# down when reset has switched the power state to SNVS. If reset is done via
+# the READY state the power outputs will be returned to HW control by OTP
+# loading. Thus the reset target state is set to READY by default. If the
+# SNVS state is used, the boot crucial regulators must have the
+# regulator-always-on and regulator-boot-on properties set in their
+# regulator nodes.
+
+  rohm,reset-snvs-powered:
+    description: |
+      Transfer PMIC to SNVS state at reset
+    type: boolean
+
+# Configure the "short press" and "long press" timers for the power button.
+# Values are rounded to what hardware supports
+# Short-press:
+#   Shortest being 10ms, next 500ms and then multiple of 500ms up to 7,5s
+# Long-press:
+#   Shortest being 10ms, next 1000ms and then multiple of 1000ms up to 15s
+# If these properties are not present the existing configuration (from
+# bootloader or OTP) is not touched.
+
+  rohm,short-press-ms:
+    description:
+      Short press duration in milliseconds
+    enum:
+      - 10
+      - 500
+      - 1000
+      - 1500
+      - 2000
+      - 2500
+      - 3000
+      - 3500
+      - 4000
+      - 4500
+      - 5000
+      - 5500
+      - 6000
+      - 6500
+      - 7000
+
+  rohm,long-press-ms:
+    description:
+      Long press duration in milliseconds
+    enum:
+      - 10
+      - 1000
+      - 2000
+      - 3000
+      - 4000
+      - 5000
+      - 6000
+      - 7000
+      - 8000
+      - 9000
+      - 10000
+      - 11000
+      - 12000
+      - 13000
+      - 14000
+
+  regulators:
+    $ref: ../regulator/rohm,bd71837-regulator.yaml
+    description:
+      List of child nodes that specify the regulators.
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - "#clock-cells"
+  - regulators
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/leds/common.h>
+
+    i2c {
+      pmic: pmic@4b {
+            compatible = "rohm,bd71837";
+            reg = <0x4b>;
+            interrupt-parent = <&gpio1>;
+            interrupts = <29 IRQ_TYPE_LEVEL_LOW>;
+            #clock-cells = <0>;
+            clocks = <&osc 0>;
+            rohm,reset-snvs-powered;
+            rohm,short-press-ms = <10>;
+            rohm,long-press-ms = <2000>;
+
+            regulators {
+                buck1: BUCK1 {
+                    regulator-name = "buck1";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1300000>;
+                    regulator-boot-on;
+                    regulator-always-on;
+                    regulator-ramp-delay = <1250>;
+                    rohm,dvs-run-voltage = <900000>;
+                    rohm,dvs-idle-voltage = <850000>;
+                    rohm,dvs-suspend-voltage = <800000>;
+                };
+                buck2: BUCK2 {
+                    regulator-name = "buck2";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1300000>;
+                    regulator-boot-on;
+                    regulator-always-on;
+                    regulator-ramp-delay = <1250>;
+                    rohm,dvs-run-voltage = <1000000>;
+                    rohm,dvs-idle-voltage = <900000>;
+                };
+                buck3: BUCK3 {
+                    regulator-name = "buck3";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1300000>;
+                    regulator-boot-on;
+                    rohm,dvs-run-voltage = <1000000>;
+                };
+                buck4: BUCK4 {
+                    regulator-name = "buck4";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1300000>;
+                    regulator-boot-on;
+                    rohm,dvs-run-voltage = <1000000>;
+                };
+                buck5: BUCK5 {
+                    regulator-name = "buck5";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1350000>;
+                    regulator-boot-on;
+                };
+                buck6: BUCK6 {
+                    regulator-name = "buck6";
+                    regulator-min-microvolt = <3000000>;
+                    regulator-max-microvolt = <3300000>;
+                    regulator-boot-on;
+                };
+                buck7: BUCK7 {
+                    regulator-name = "buck7";
+                    regulator-min-microvolt = <1605000>;
+                    regulator-max-microvolt = <1995000>;
+                    regulator-boot-on;
+                };
+                buck8: BUCK8 {
+                    regulator-name = "buck8";
+                    regulator-min-microvolt = <800000>;
+                    regulator-max-microvolt = <1400000>;
+                };
+
+                ldo1: LDO1 {
+                    regulator-name = "ldo1";
+                    regulator-min-microvolt = <3000000>;
+                    regulator-max-microvolt = <3300000>;
+                    regulator-boot-on;
+                };
+                ldo2: LDO2 {
+                    regulator-name = "ldo2";
+                    regulator-min-microvolt = <900000>;
+                    regulator-max-microvolt = <900000>;
+                    regulator-boot-on;
+                };
+                ldo3: LDO3 {
+                    regulator-name = "ldo3";
+                    regulator-min-microvolt = <1800000>;
+                    regulator-max-microvolt = <3300000>;
+                };
+                ldo4: LDO4 {
+                    regulator-name = "ldo4";
+                    regulator-min-microvolt = <900000>;
+                    regulator-max-microvolt = <1800000>;
+                };
+                ldo5: LDO5 {
+                    regulator-name = "ldo5";
+                    regulator-min-microvolt = <1800000>;
+                    regulator-max-microvolt = <3300000>;
+                };
+                ldo6: LDO6 {
+                    regulator-name = "ldo6";
+                    regulator-min-microvolt = <900000>;
+                    regulator-max-microvolt = <1800000>;
+                };
+                ldo7_reg: LDO7 {
+                    regulator-name = "ldo7";
+                    regulator-min-microvolt = <1800000>;
+                    regulator-max-microvolt = <3300000>;
+                };
+            };
+        };
+    };
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml b/Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml
new file mode 100644 (file)
index 0000000..402e40d
--- /dev/null
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mfd/rohm,bd71847-pmic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71847 and BD71850 Power Management Integrated Circuit bindings
+
+maintainers:
+  - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+  BD71847AMWV and BD71850MWV are programmable Power Management ICs for powering
+  single-core, dual-core, and quad-core SoCs such as NXP-i.MX 8M. They are
+  optimized for low BOM cost and compact solution footprint. BD71847MWV and
+  BD71850MWV integrate 6 Buck regulators and 6 LDOs.
+  Datasheets are available at
+  https://www.rohm.com/products/power-management/power-management-ic-for-system/industrial-consumer-applications/nxp-imx/bd71847amwv-product
+  https://www.rohm.com/products/power-management/power-management-ic-for-system/industrial-consumer-applications/nxp-imx/bd71850mwv-product
+
+properties:
+  compatible:
+    enum:
+      - rohm,bd71847
+      - rohm,bd71850
+
+  reg:
+    description:
+      I2C slave address.
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  "#clock-cells":
+    const: 0
+
+# The BD71847 and BD71850 support two different HW states as reset target
+# states. The states are called SNVS and READY. In the READY state all the
+# PMIC power outputs go down and the OTP is reloaded. In the SNVS state all
+# other logic and external devices apart from the SNVS power domain are shut
+# off. Please refer to NXP i.MX8 documentation for further information
+# regarding the SNVS state. When a reset is done via the SNVS state the PMIC
+# OTP data is not reloaded. This causes power outputs that have been under SW
+# control to stay down when reset has switched the power state to SNVS. If
+# reset is done via the READY state the power outputs will be returned to HW
+# control by OTP loading. Thus the reset target state is set to READY by
+# default. If the SNVS state is used, the boot crucial regulators must have
+# the regulator-always-on and regulator-boot-on properties set in their
+# regulator nodes.
+
+  rohm,reset-snvs-powered:
+    description:
+      Transfer PMIC to SNVS state at reset.
+    type: boolean
+
+# Configure the "short press" and "long press" timers for the power button.
+# Values are rounded to what hardware supports
+# Short-press:
+#   Shortest being 10ms, next 500ms and then multiple of 500ms up to 7,5s
+# Long-press:
+#   Shortest being 10ms, next 1000ms and then multiple of 1000ms up to 15s
+# If these properties are not present the existing # configuration (from
+# bootloader or OTP) is not touched.
+
+  rohm,short-press-ms:
+    description:
+      Short press duration in milliseconds
+    enum:
+      - 10
+      - 500
+      - 1000
+      - 1500
+      - 2000
+      - 2500
+      - 3000
+      - 3500
+      - 4000
+      - 4500
+      - 5000
+      - 5500
+      - 6000
+      - 6500
+      - 7000
+      - 7500
+
+  rohm,long-press-ms:
+    description:
+      Long press duration in milliseconds
+    enum:
+      - 10
+      - 1000
+      - 2000
+      - 3000
+      - 4000
+      - 5000
+      - 6000
+      - 7000
+      - 8000
+      - 9000
+      - 10000
+      - 11000
+      - 12000
+      - 13000
+      - 14000
+      - 15000
+
+  regulators:
+    $ref: ../regulator/rohm,bd71847-regulator.yaml
+    description:
+      List of child nodes that specify the regulators.
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - "#clock-cells"
+  - regulators
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/leds/common.h>
+
+    i2c {
+      pmic: pmic@4b {
+            compatible = "rohm,bd71847";
+            reg = <0x4b>;
+            interrupt-parent = <&gpio1>;
+            interrupts = <29 IRQ_TYPE_LEVEL_LOW>;
+            #clock-cells = <0>;
+            clocks = <&osc 0>;
+            rohm,reset-snvs-powered;
+            rohm,short-press-ms = <10>;
+            rohm,long-press-ms = <2000>;
+
+            regulators {
+                buck1: BUCK1 {
+                    regulator-name = "buck1";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1300000>;
+                    regulator-boot-on;
+                    regulator-always-on;
+                    regulator-ramp-delay = <1250>;
+                    rohm,dvs-run-voltage = <900000>;
+                    rohm,dvs-idle-voltage = <850000>;
+                    rohm,dvs-suspend-voltage = <800000>;
+                };
+                buck2: BUCK2 {
+                    regulator-name = "buck2";
+                    regulator-min-microvolt = <700000>;
+                    regulator-max-microvolt = <1300000>;
+                    regulator-boot-on;
+                    regulator-always-on;
+                    regulator-ramp-delay = <1250>;
+                    rohm,dvs-run-voltage = <1000000>;
+                    rohm,dvs-idle-voltage = <900000>;
+                };
+                buck3: BUCK3 {
+                    regulator-name = "buck3";
+                    regulator-min-microvolt = <550000>;
+                    regulator-max-microvolt = <1350000>;
+                    regulator-boot-on;
+                };
+                buck4: BUCK4 {
+                    regulator-name = "buck4";
+                    regulator-min-microvolt = <2600000>;
+                    regulator-max-microvolt = <3300000>;
+                    regulator-boot-on;
+                };
+                buck5: BUCK5 {
+                    regulator-name = "buck5";
+                    regulator-min-microvolt = <1605000>;
+                    regulator-max-microvolt = <1995000>;
+                    regulator-boot-on;
+                };
+                buck6: BUCK6 {
+                    regulator-name = "buck6";
+                    regulator-min-microvolt = <800000>;
+                    regulator-max-microvolt = <1400000>;
+                };
+
+                ldo1: LDO1 {
+                    regulator-name = "ldo1";
+                    regulator-min-microvolt = <1600000>;
+                    regulator-max-microvolt = <3300000>;
+                    regulator-boot-on;
+                };
+                ldo2: LDO2 {
+                    regulator-name = "ldo2";
+                    regulator-min-microvolt = <800000>;
+                    regulator-max-microvolt = <900000>;
+                    regulator-boot-on;
+                };
+                ldo3: LDO3 {
+                    regulator-name = "ldo3";
+                    regulator-min-microvolt = <1800000>;
+                    regulator-max-microvolt = <3300000>;
+                };
+                ldo4: LDO4 {
+                    regulator-name = "ldo4";
+                    regulator-min-microvolt = <900000>;
+                    regulator-max-microvolt = <1800000>;
+                };
+                ldo5: LDO5 {
+                    regulator-name = "ldo5";
+                    regulator-min-microvolt = <800000>;
+                    regulator-max-microvolt = <3300000>;
+                };
+                ldo6: LDO6 {
+                    regulator-name = "ldo6";
+                    regulator-min-microvolt = <900000>;
+                    regulator-max-microvolt = <1800000>;
+                };
+            };
+        };
+    };
index 1a4cc5f..ddf190c 100644 (file)
@@ -39,6 +39,8 @@ properties:
   "#size-cells":
     const: 0
 
+  wakeup-source: true
+
   pwm:
     type: object
 
@@ -81,6 +83,16 @@ patternProperties:
     required:
       - compatible
 
+  timer:
+    type: object
+
+    properties:
+      compatible:
+        const: st,stm32-lptimer-timer
+
+    required:
+      - compatible
+
 required:
   - "#address-cells"
   - "#size-cells"
@@ -115,6 +127,10 @@ examples:
       counter {
         compatible = "st,stm32-lptimer-counter";
       };
+
+      timer {
+        compatible = "st,stm32-lptimer-timer";
+      };
     };
 
 ...
diff --git a/Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml b/Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml
new file mode 100644 (file)
index 0000000..1d7c27b
--- /dev/null
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/iqs620a-pwm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Azoteq IQS620A PWM Generator
+
+maintainers:
+  - Jeff LaBundy <jeff@labundy.com>
+
+description: |
+  The Azoteq IQS620A multi-function sensor generates a fixed-frequency PWM
+  output represented by a "pwm" child node from the parent MFD driver. See
+  Documentation/devicetree/bindings/mfd/iqs62x.yaml for further details as
+  well as an example.
+
+properties:
+  compatible:
+    enum:
+      - azoteq,iqs620a-pwm
+
+  "#pwm-cells":
+    const: 2
+
+required:
+  - compatible
+  - "#pwm-cells"
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.txt
deleted file mode 100644 (file)
index cbce62c..0000000
+++ /dev/null
@@ -1,162 +0,0 @@
-ROHM BD71837 and BD71847 Power Management Integrated Circuit regulator bindings
-
-Required properties:
- - regulator-name: should be "buck1", ..., "buck8" and "ldo1", ..., "ldo7" for
-                   BD71837. For BD71847 names should be "buck1", ..., "buck6"
-                  and "ldo1", ..., "ldo6"
-
-List of regulators provided by this controller. BD71837 regulators node
-should be sub node of the BD71837 MFD node. See BD71837 MFD bindings at
-Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
-Regulator nodes should be named to BUCK_<number> and LDO_<number>. The
-definition for each of these nodes is defined using the standard
-binding for regulators at
-Documentation/devicetree/bindings/regulator/regulator.txt.
-Note that if BD71837 starts at RUN state you probably want to use
-regulator-boot-on at least for BUCK6 and BUCK7 so that those are not
-disabled by driver at startup. LDO5 and LDO6 are supplied by those and
-if they are disabled at startup the voltage monitoring for LDO5/LDO6 will
-cause PMIC to reset.
-
-The valid names for BD71837 regulator nodes are:
-BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6, BUCK7, BUCK8
-LDO1, LDO2, LDO3, LDO4, LDO5, LDO6, LDO7
-
-The valid names for BD71847 regulator nodes are:
-BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6
-LDO1, LDO2, LDO3, LDO4, LDO5, LDO6
-
-Optional properties:
-- rohm,dvs-run-voltage         : PMIC default "RUN" state voltage in uV.
-                                 See below table for bucks which support this.
-- rohm,dvs-idle-voltage                : PMIC default "IDLE" state voltage in uV.
-                                 See below table for bucks which support this.
-- rohm,dvs-suspend-voltage     : PMIC default "SUSPEND" state voltage in uV.
-                                 See below table for bucks which support this.
-- Any optional property defined in bindings/regulator/regulator.txt
-
-Supported default DVS states:
-
-BD71837:
-buck   | dvs-run-voltage       | dvs-idle-voltage      | dvs-suspend-voltage
------------------------------------------------------------------------------
-1      | supported             | supported             | supported
-----------------------------------------------------------------------------
-2      | supported             | supported             | not supported
-----------------------------------------------------------------------------
-3      | supported             | not supported         | not supported
-----------------------------------------------------------------------------
-4      | supported             | not supported         | not supported
-----------------------------------------------------------------------------
-rest   | not supported         | not supported         | not supported
-
-BD71847:
-buck   | dvs-run-voltage       | dvs-idle-voltage      | dvs-suspend-voltage
------------------------------------------------------------------------------
-1      | supported             | supported             | supported
-----------------------------------------------------------------------------
-2      | supported             | supported             | not supported
-----------------------------------------------------------------------------
-rest   | not supported         | not supported         | not supported
-
-Example:
-regulators {
-       buck1: BUCK1 {
-               regulator-name = "buck1";
-               regulator-min-microvolt = <700000>;
-               regulator-max-microvolt = <1300000>;
-               regulator-boot-on;
-               regulator-always-on;
-               regulator-ramp-delay = <1250>;
-               rohm,dvs-run-voltage = <900000>;
-               rohm,dvs-idle-voltage = <850000>;
-               rohm,dvs-suspend-voltage = <800000>;
-       };
-       buck2: BUCK2 {
-               regulator-name = "buck2";
-               regulator-min-microvolt = <700000>;
-               regulator-max-microvolt = <1300000>;
-               regulator-boot-on;
-               regulator-always-on;
-               regulator-ramp-delay = <1250>;
-               rohm,dvs-run-voltage = <1000000>;
-               rohm,dvs-idle-voltage = <900000>;
-       };
-       buck3: BUCK3 {
-               regulator-name = "buck3";
-               regulator-min-microvolt = <700000>;
-               regulator-max-microvolt = <1300000>;
-               regulator-boot-on;
-               rohm,dvs-run-voltage = <1000000>;
-       };
-       buck4: BUCK4 {
-               regulator-name = "buck4";
-               regulator-min-microvolt = <700000>;
-               regulator-max-microvolt = <1300000>;
-               regulator-boot-on;
-               rohm,dvs-run-voltage = <1000000>;
-       };
-       buck5: BUCK5 {
-               regulator-name = "buck5";
-               regulator-min-microvolt = <700000>;
-               regulator-max-microvolt = <1350000>;
-               regulator-boot-on;
-       };
-       buck6: BUCK6 {
-               regulator-name = "buck6";
-               regulator-min-microvolt = <3000000>;
-               regulator-max-microvolt = <3300000>;
-               regulator-boot-on;
-       };
-       buck7: BUCK7 {
-               regulator-name = "buck7";
-               regulator-min-microvolt = <1605000>;
-               regulator-max-microvolt = <1995000>;
-               regulator-boot-on;
-       };
-       buck8: BUCK8 {
-               regulator-name = "buck8";
-               regulator-min-microvolt = <800000>;
-               regulator-max-microvolt = <1400000>;
-       };
-
-       ldo1: LDO1 {
-               regulator-name = "ldo1";
-               regulator-min-microvolt = <3000000>;
-               regulator-max-microvolt = <3300000>;
-               regulator-boot-on;
-       };
-       ldo2: LDO2 {
-               regulator-name = "ldo2";
-               regulator-min-microvolt = <900000>;
-               regulator-max-microvolt = <900000>;
-               regulator-boot-on;
-       };
-       ldo3: LDO3 {
-               regulator-name = "ldo3";
-               regulator-min-microvolt = <1800000>;
-               regulator-max-microvolt = <3300000>;
-       };
-       ldo4: LDO4 {
-               regulator-name = "ldo4";
-               regulator-min-microvolt = <900000>;
-               regulator-max-microvolt = <1800000>;
-       };
-       ldo5: LDO5 {
-               regulator-name = "ldo5";
-               regulator-min-microvolt = <1800000>;
-               regulator-max-microvolt = <3300000>;
-       };
-       ldo6: LDO6 {
-               regulator-name = "ldo6";
-               regulator-min-microvolt = <900000>;
-               regulator-max-microvolt = <1800000>;
-       };
-       ldo7_reg: LDO7 {
-               regulator-name = "ldo7";
-               regulator-min-microvolt = <1800000>;
-               regulator-max-microvolt = <3300000>;
-       };
-};
-
-
diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml b/Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml
new file mode 100644 (file)
index 0000000..a323b16
--- /dev/null
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/rohm,bd71837-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71837 Power Management Integrated Circuit regulators
+
+maintainers:
+  - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+  List of regulators provided by this controller. The BD71837 regulators node
+  should be a sub node of the BD71837 MFD node. See BD71837 MFD bindings at
+  Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml
+  Regulator nodes should be named BUCK<number> and LDO<number>. Each of
+  these nodes is defined using the standard binding for regulators at
+  Documentation/devicetree/bindings/regulator/regulator.txt.
+  Note that if BD71837 starts at RUN state you probably want to use
+  regulator-boot-on at least for BUCK6 and BUCK7 so that those are not
+  disabled by the driver at startup. LDO5 and LDO6 are supplied by those and
+  if they are disabled at startup the voltage monitoring for LDO5/LDO6 will
+  cause the PMIC to reset.
+
+#The valid names for BD71837 regulator nodes are:
+#BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6, BUCK7, BUCK8
+#LDO1, LDO2, LDO3, LDO4, LDO5, LDO6, LDO7
+
+patternProperties:
+  "^LDO[1-7]$":
+    type: object
+    allOf:
+      - $ref: regulator.yaml#
+    description:
+      Properties for single LDO regulator.
+
+    properties:
+      regulator-name:
+        pattern: "^ldo[1-7]$"
+        description:
+          should be "ldo1", ..., "ldo7"
+
+  "^BUCK[1-8]$":
+    type: object
+    allOf:
+      - $ref: regulator.yaml#
+    description:
+      Properties for single BUCK regulator.
+
+    properties:
+      regulator-name:
+        pattern: "^buck[1-8]$"
+        description:
+          should be "buck1", ..., "buck8"
+
+      rohm,dvs-run-voltage:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 1300000
+        description:
+          PMIC default "RUN" state voltage in uV. See below table for
+          bucks which support this. 0 means disabled.
+
+      rohm,dvs-idle-voltage:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 1300000
+        description:
+          PMIC default "IDLE" state voltage in uV. See below table for
+          bucks which support this. 0 means disabled.
+
+      rohm,dvs-suspend-voltage:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 1300000
+        description:
+          PMIC default "SUSPEND" state voltage in uV. See below table for
+          bucks which support this. 0 means disabled.
+
+        # Supported default DVS states:
+        #
+        # BD71837:
+        # buck | dvs-run-voltage | dvs-idle-voltage | dvs-suspend-voltage
+        # ----------------------------------------------------------------
+        # 1    | supported       | supported        | supported
+        # ----------------------------------------------------------------
+        # 2    | supported       | supported        | not supported
+        # ----------------------------------------------------------------
+        # 3    | supported       | not supported    | not supported
+        # ----------------------------------------------------------------
+        # 4    | supported       | not supported    | not supported
+        # ----------------------------------------------------------------
+        # rest | not supported   | not supported    | not supported
+
+
+    required:
+      - regulator-name
+  additionalProperties: false
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml b/Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml
new file mode 100644 (file)
index 0000000..526fd00
--- /dev/null
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/rohm,bd71847-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ROHM BD71847 and BD71850 Power Management Integrated Circuit regulators
+
+maintainers:
+  - Matti Vaittinen <matti.vaittinen@fi.rohmeurope.com>
+
+description: |
+  List of regulators provided by this controller. The BD71847 regulators node
+  should be a sub node of the BD71847 MFD node. See BD71847 MFD bindings at
+  Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml
+  Regulator nodes should be named BUCK<number> and LDO<number>. Each of
+  these nodes is defined using the standard binding for regulators at
+  Documentation/devicetree/bindings/regulator/regulator.txt.
+  Note that if BD71847 starts at RUN state you probably want to use
+  regulator-boot-on at least for BUCK5. LDO6 is supplied by it and must
+  not be disabled by the driver at startup. If BUCK5 is disabled at startup
+  the voltage monitoring for LDO5/LDO6 can cause the PMIC to reset.
+
+#The valid names for BD71847 regulator nodes are:
+#BUCK1, BUCK2, BUCK3, BUCK4, BUCK5, BUCK6
+#LDO1, LDO2, LDO3, LDO4, LDO5, LDO6
+
+patternProperties:
+  "^LDO[1-6]$":
+    type: object
+    allOf:
+      - $ref: regulator.yaml#
+    description:
+      Properties for single LDO regulator.
+
+    properties:
+      regulator-name:
+        pattern: "^ldo[1-6]$"
+        description:
+          should be "ldo1", ..., "ldo6"
+
+  "^BUCK[1-6]$":
+    type: object
+    allOf:
+      - $ref: regulator.yaml#
+    description:
+      Properties for single BUCK regulator.
+
+    properties:
+      regulator-name:
+        pattern: "^buck[1-6]$"
+        description:
+          should be "buck1", ..., "buck6"
+
+      rohm,dvs-run-voltage:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 1300000
+        description:
+          PMIC default "RUN" state voltage in uV. See below table for
+          bucks which support this. 0 means disabled.
+
+      rohm,dvs-idle-voltage:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 1300000
+        description:
+          PMIC default "IDLE" state voltage in uV. See below table for
+          bucks which support this. 0 means disabled.
+
+      rohm,dvs-suspend-voltage:
+        allOf:
+          - $ref: "/schemas/types.yaml#/definitions/uint32"
+          - minimum: 0
+            maximum: 1300000
+        description:
+          PMIC default "SUSPEND" state voltage in uV. See below table for
+          bucks which support this. 0 means disabled.
+
+        # Supported default DVS states:
+        #
+        # BD71847:
+        # buck | dvs-run-voltage | dvs-idle-voltage | dvs-suspend-voltage
+        # ----------------------------------------------------------------
+        # 1    | supported       | supported        | supported
+        # ----------------------------------------------------------------
+        # 2    | supported       | supported        | not supported
+        # ----------------------------------------------------------------
+        # rest | not supported   | not supported    | not supported
+
+    required:
+      - regulator-name
+  additionalProperties: false
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt b/Documentation/devicetree/bindings/thermal/imx8mm-thermal.txt
new file mode 100644 (file)
index 0000000..3629d3c
--- /dev/null
@@ -0,0 +1,15 @@
+* Thermal Monitoring Unit (TMU) on Freescale i.MX8MM SoC
+
+Required properties:
+- compatible : Must be "fsl,imx8mm-tmu" or "fsl,imx8mp-tmu".
+- reg : Address range of TMU registers.
+- clocks : TMU's clock source.
+- #thermal-sensor-cells : Should be 0 or 1. See ./thermal.txt for a description.
+
+Example:
+tmu: tmu@30260000 {
+       compatible = "fsl,imx8mm-tmu";
+       reg = <0x30260000 0x10000>;
+       clocks = <&clk IMX8MM_CLK_TMU_ROOT>;
+       #thermal-sensor-cells = <0>;
+};
index a57b76a..2ddd39d 100644 (file)
@@ -38,11 +38,11 @@ properties:
           - enum:
               - qcom,msm8996-tsens
               - qcom,msm8998-tsens
+              - qcom,sc7180-tsens
               - qcom,sdm845-tsens
           - const: qcom,tsens-v2
 
   reg:
-    maxItems: 2
     items:
       - description: TM registers
       - description: SROT registers
index 12c740b..2993fa7 100644 (file)
@@ -11,6 +11,7 @@ Required properties:
                            - "renesas,r8a774b1-thermal" (RZ/G2N)
                            - "renesas,r8a7795-thermal" (R-Car H3)
                            - "renesas,r8a7796-thermal" (R-Car M3-W)
+                           - "renesas,r8a77961-thermal" (R-Car M3-W+)
                            - "renesas,r8a77965-thermal" (R-Car M3-N)
                            - "renesas,r8a77980-thermal" (R-Car V3H)
 - reg                  : Address ranges of the thermal registers. Each sensor
diff --git a/Documentation/devicetree/bindings/thermal/sprd-thermal.yaml b/Documentation/devicetree/bindings/thermal/sprd-thermal.yaml
new file mode 100644 (file)
index 0000000..058c4cc
--- /dev/null
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/thermal/sprd-thermal.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Spreadtrum thermal sensor controller bindings
+
+maintainers:
+  - Orson Zhai <orsonzhai@gmail.com>
+  - Baolin Wang <baolin.wang7@gmail.com>
+  - Chunyan Zhang <zhang.lyra@gmail.com>
+
+properties:
+  compatible:
+    const: sprd,ums512-thermal
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    items:
+      - const: enable
+
+  nvmem-cells:
+    maxItems: 2
+    description:
+      Reference to nvmem nodes for the calibration data.
+
+  nvmem-cell-names:
+    items:
+      - const: thm_sign_cal
+      - const: thm_ratio_cal
+
+  "#thermal-sensor-cells":
+    const: 1
+
+  "#address-cells":
+    const: 1
+
+  "#size-cells":
+    const: 0
+
+patternProperties:
+  "^([a-z]*-)?sensor(-section)?@[0-9]+$":
+    type: object
+    description:
+      Represent one thermal sensor.
+
+    properties:
+      reg:
+        description: Specify the sensor id.
+        maxItems: 1
+
+      nvmem-cells:
+        maxItems: 1
+        description:
+          Reference to an nvmem node for the calibration data.
+
+      nvmem-cell-names:
+        const: sen_delta_cal
+
+    required:
+      - reg
+      - nvmem-cells
+      - nvmem-cell-names
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - nvmem-cells
+  - nvmem-cell-names
+  - "#thermal-sensor-cells"
+  - "#address-cells"
+  - "#size-cells"
+
+examples:
+  - |
+        ap_thm0: thermal@32200000 {
+                compatible = "sprd,ums512-thermal";
+                reg = <0 0x32200000 0 0x10000>;
+                clock-names = "enable";
+                clocks = <&aonapb_gate 32>;
+                #thermal-sensor-cells = <1>;
+                nvmem-cells = <&thm0_sign>, <&thm0_ratio>;
+                nvmem-cell-names = "thm_sign_cal", "thm_ratio_cal";
+                #address-cells = <1>;
+                #size-cells = <0>;
+
+                prometheus-sensor@0 {
+                        reg = <0>;
+                        nvmem-cells = <&thm0_sen0>;
+                        nvmem-cell-names = "sen_delta_cal";
+                };
+
+                ank-sensor@1 {
+                        reg = <1>;
+                        nvmem-cells = <&thm0_sen1>;
+                        nvmem-cell-names = "sen_delta_cal";
+                };
+        };
+...
index ca14ba9..f78bec1 100644 (file)
@@ -142,11 +142,11 @@ Required properties:
 - trips:               A sub-node which is a container of only trip point nodes
   Type: sub-node       required to describe the thermal zone.
 
+Optional property:
 - cooling-maps:                A sub-node which is a container of only cooling device
   Type: sub-node       map nodes, used to describe the relation between trips
                        and cooling devices.
 
-Optional property:
 - coefficients:                An array of integers (one signed cell) containing
   Type: array          coefficients to compose a linear relation between
   Elem size: one cell  the sensors listed in the thermal-sensors property.
diff --git a/Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml b/Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml
new file mode 100644 (file)
index 0000000..e83026f
--- /dev/null
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/watchdog/ti,rti-wdt.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments K3 SoC Watchdog Timer
+
+maintainers:
+  - Tero Kristo <t-kristo@ti.com>
+
+description:
+  The TI K3 SoC watchdog timer is implemented via the RTI (Real Time
+  Interrupt) IP module. This timer adds support for windowed watchdog
+  mode, which will signal an error if it is pinged outside the watchdog
+  time window, meaning either too early or too late. The generated error
+  signal can be routed either to interrupt a safety controller or to
+  directly reset the SoC.
+
+allOf:
+  - $ref: "watchdog.yaml#"
+
+properties:
+  compatible:
+    enum:
+      - ti,j7-rti-wdt
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  power-domains:
+    maxItems: 1
+
+  assigned-clocks:
+    maxItems: 1
+
+  assigned-clock-parents:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - power-domains
+
+examples:
+  - |
+    /*
+     * RTI WDT in main domain on J721e SoC. Assigned clocks are used to
+     * select the source clock for the watchdog, forcing it to tick with
+     * a 32kHz clock in this case.
+     */
+    #include <dt-bindings/soc/ti,sci_pm_domain.h>
+
+    watchdog0: rti@2200000 {
+        compatible = "ti,rti-wdt";
+        reg = <0x0 0x2200000 0x0 0x100>;
+        clocks = <&k3_clks 252 1>;
+        power-domains = <&k3_pds 252 TI_SCI_PD_EXCLUSIVE>;
+        assigned-clocks = <&k3_clks 252 1>;
+        assigned-clock-parents = <&k3_clks 252 5>;
+    };
index 9f0016e..a1c3ede 100644 (file)
@@ -105,8 +105,8 @@ and this variation will modulate the cooling effect.
        idle  <-------------->
                 running
 
-      <----------------------------->
-              duty cycle 33%
+      <--------------------->
+          duty cycle 33%
 
 
      ^
index f054d1c..671fef3 100644 (file)
@@ -158,6 +158,16 @@ Options
                /sys/fs/9p/caches. (applies only to cache=fscache)
   ============= ===============================================================
 
+Behavior
+========
+
+This section describes 9p 'quirks' that can differ from local
+filesystem behavior.
+
+ - Setting O_NONBLOCK on a file will make client reads return as soon
+   as the server returns some data, instead of trying to fill the read
+   buffer with the requested number of bytes or until end of file is
+   reached.
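+
+   As a rough illustration of what a client may observe (hypothetical
+   path and buffer size, not part of the 9p sources)::
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		/* hypothetical file on a 9p mount */
+		int fd = open("/mnt/9p/file", O_RDONLY | O_NONBLOCK);
+		char buf[4096];
+		ssize_t n;
+
+		if (fd < 0)
+			return 1;
+		/* may return fewer than sizeof(buf) bytes as soon as the
+		 * server has sent some data; loop if the full amount is
+		 * required */
+		n = read(fd, buf, sizeof(buf));
+		printf("read %zd bytes\n", n);
+		close(fd);
+		return 0;
+	}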
+
 Resources
 =========
 
index b46a721..0aa7075 100644 (file)
@@ -107,17 +107,17 @@ Mount Options
        address its connection to the monitor originates from.
 
   wsize=X
-       Specify the maximum write size in bytes.  Default: 16 MB.
+       Specify the maximum write size in bytes.  Default: 64 MB.
 
   rsize=X
-       Specify the maximum read size in bytes.  Default: 16 MB.
+       Specify the maximum read size in bytes.  Default: 64 MB.
 
   rasize=X
        Specify the maximum readahead size in bytes.  Default: 8 MB.
 
   mount_timeout=X
        Specify the timeout value for mount (in seconds), in the case
-       of a non-responsive Ceph file system.  The default is 30
+       of a non-responsive Ceph file system.  The default is 60
        seconds.
 
   caps_max=X
index d681203..87d794b 100644 (file)
@@ -243,8 +243,8 @@ checkpoint=%s[:%u[%]]  Set to "disable" to turn off checkpointing. Set to "enabl
                        hide up to all remaining free space. The actual space that
                        would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
                        This space is reclaimed once checkpoint=enable.
-compress_algorithm=%s  Control compress algorithm, currently f2fs supports "lzo"
-                       and "lz4" algorithm.
+compress_algorithm=%s  Control compress algorithm, currently f2fs supports "lzo",
+                       "lz4" and "zstd" algorithms.
 compress_log_size=%u   Support configuring compress cluster size, the size will
                        be 4KB * (1 << %u), 16KB is minimum size, also it's
                        default size.
index e443be7..c9d2bf9 100644 (file)
@@ -40,13 +40,46 @@ On 64bit systems, even if all overlay layers are not on the same
 underlying filesystem, the same compliant behavior could be achieved
 with the "xino" feature.  The "xino" feature composes a unique object
 identifier from the real object st_ino and an underlying fsid index.
+
 If all underlying filesystems support NFS file handles and export file
 handles with 32bit inode number encoding (e.g. ext4), overlay filesystem
 will use the high inode number bits for fsid.  Even when the underlying
 filesystem uses 64bit inode numbers, users can still enable the "xino"
 feature with the "-o xino=on" overlay mount option.  That is useful for the
 case of underlying filesystems like xfs and tmpfs, which use 64bit inode
-numbers, but are very unlikely to use the high inode number bit.
+numbers, but are very unlikely to use the high inode number bits.  In case
+the underlying inode number does overflow into the high xino bits, the
+overlay filesystem will fall back to the non-xino behavior for that inode.
+
+The following table summarizes what can be expected in different overlay
+configurations.
+
+Inode properties
+````````````````
+
++--------------+------------+------------+-----------------+----------------+
+|Configuration | Persistent | Uniform    | st_ino == d_ino | d_ino == i_ino |
+|              | st_ino     | st_dev     |                 | [*]            |
++==============+=====+======+=====+======+========+========+========+=======+
+|              | dir | !dir | dir | !dir |  dir   |  !dir  |  dir   | !dir  |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| All layers   |  Y  |  Y   |  Y  |  Y   |  Y     |   Y    |  Y     |  Y    |
+| on same fs   |     |      |     |      |        |        |        |       |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| Layers not   |  N  |  Y   |  Y  |  N   |  N     |   Y    |  N     |  Y    |
+| on same fs,  |     |      |     |      |        |        |        |       |
+| xino=off     |     |      |     |      |        |        |        |       |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| xino=on/auto |  Y  |  Y   |  Y  |  Y   |  Y     |   Y    |  Y     |  Y    |
+|              |     |      |     |      |        |        |        |       |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+| xino=on/auto,|  N  |  Y   |  Y  |  N   |  N     |   Y    |  N     |  Y    |
+| ino overflow |     |      |     |      |        |        |        |       |
++--------------+-----+------+-----+------+--------+--------+--------+-------+
+
+[*] nfsd v3 readdirplus verifies d_ino == i_ino. i_ino is exposed via several
+/proc files, such as /proc/locks and /proc/self/fdinfo/<fd> of an inotify
+file descriptor.
 
 
 Upper and Lower
@@ -248,6 +281,50 @@ overlay filesystem (though an operation on the name of the file such as
 rename or unlink will of course be noticed and handled).
 
 
+Permission model
+----------------
+
+Permission checking in the overlay filesystem follows these principles:
+
+ 1) permission check SHOULD return the same result before and after copy up
+
+ 2) task creating the overlay mount MUST NOT gain additional privileges
+
+ 3) non-mounting task MAY gain additional privileges through the overlay,
+ compared to direct access on underlying lower or upper filesystems
+
+This is achieved by performing two permission checks on each access
+
+ a) check if the current task is allowed access based on local DAC (owner,
+    group, mode and posix acl), as well as MAC checks
+
+ b) check if the mounting task would be allowed the real operation on the
+    lower or upper layer based on the underlying filesystem permissions,
+    again including MAC checks
+
+Check (a) ensures consistency (1) since owner, group, mode and posix acls
+are copied up.  On the other hand it can result in server enforced
+permissions (used by NFS, for example) being ignored (3).
+
+Check (b) ensures that no task gains permissions to underlying layers that
+the mounting task does not have (2).  This also means that it is possible
+to create setups where the consistency rule (1) does not hold; normally,
+however, the mounting task will have sufficient privileges to perform all
+operations.
+
+Another way to demonstrate this model is drawing parallels between
+
+  mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,... /merged
+
+and
+
+  cp -a /lower /upper
+  mount --bind /upper /merged
+
+The resulting access permissions should be the same.  The difference is in
+the time of copy (on-demand vs. up-front).
+
+
 Multiple lower layers
 ---------------------
 
@@ -383,7 +460,8 @@ guarantee that the values of st_ino and st_dev returned by stat(2) and the
 value of d_ino returned by readdir(3) will act like on a normal filesystem.
 E.g. the value of st_dev may be different for two objects in the same
 overlay filesystem and the value of st_ino for directory objects may not be
-persistent and could change even while the overlay filesystem is mounted.
+persistent and could change even while the overlay filesystem is mounted, as
+summarized in the `Inode properties`_ table above.
 
 
 Changes to underlying filesystems
index 3eb763d..6193582 100644 (file)
@@ -56,13 +56,13 @@ are illustrated in the following diagram::
                   +- - - -+                  |  +-------------------| |
                   | Entry | - - - - - - - -+ |  | Definition Blocks | |
                   +- - - -+                | |  +-------------------+ |
-                                          | |  +- - - - - - - - - -+ |
-                                          +-|->|       SSDT        | |
+                                           | |  +- - - - - - - - - -+ |
+                                           +-|->|       SSDT        | |
                                              |  +-------------------+ |
                                              |  | Definition Blocks | |
                                              |  +- - - - - - - - - -+ |
                                              +------------------------+
-                                                         |
+                                                          |
                                              OSPM Loading |
                                                          \|/
                                                    +----------------+
diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/vm/free_page_reporting.rst
new file mode 100644 (file)
index 0000000..8c05e62
--- /dev/null
@@ -0,0 +1,40 @@
+.. _free_page_reporting:
+
+=====================
+Free Page Reporting
+=====================
+
+Free page reporting is an API by which a device can register to receive
+lists of pages that are currently unused by the system. This is useful in
+the case of virtualization where a guest is then able to use this data to
+notify the hypervisor that it is no longer using certain pages in memory.
+
+For the driver, typically a balloon driver, to make use of this functionality
+it will allocate and initialize a page_reporting_dev_info structure. The
+field within the structure it will populate is the "report" function
+pointer used to process the scatterlist. It must also guarantee that it can
+handle at least PAGE_REPORTING_CAPACITY worth of scatterlist entries per
+call to the function. A call to page_reporting_register will register the
+page reporting interface with the reporting framework assuming no other
+page reporting devices are already registered.
+
+Once registered the page reporting API will begin reporting batches of
+pages to the driver. The API will start reporting pages 2 seconds after
+the interface is registered and will continue to do so 2 seconds after any
+page of a sufficiently high order is freed.
+
+Pages reported will be stored in the scatterlist passed to the reporting
+function, with the end bit set in the final entry (entry nents - 1).
+While pages are being processed by the report function they will not be
+accessible to the allocator. Once the report function has been completed
+the pages will be returned to the free area from which they were obtained.
+
+Prior to removing a driver that is making use of free page reporting it
+is necessary to call page_reporting_unregister to have the
+page_reporting_dev_info structure that is currently in use by free page
+reporting removed. Doing this will prevent further reports from being
+issued via the interface. If another driver or the same driver is
+registered it is possible for it to resume where the previous driver had
+left off in terms of reporting free pages.
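+
+Putting the pieces together, a minimal registration sketch might look as
+follows (hypothetical driver names; the exact structure layout and function
+prototypes live in include/linux/page_reporting.h)::
+
+	#include <linux/page_reporting.h>
+	#include <linux/printk.h>
+	#include <linux/scatterlist.h>
+
+	/* hypothetical callback: hand each run of free pages to the device */
+	static int my_report(struct page_reporting_dev_info *prdev,
+			     struct scatterlist *sgl, unsigned int nents)
+	{
+		struct scatterlist *sg;
+		unsigned int i;
+
+		/* at most PAGE_REPORTING_CAPACITY entries arrive per call */
+		for_each_sg(sgl, sg, nents, i)
+			pr_info("would report page %p, len %u\n",
+				sg_page(sg), sg->length);
+		return 0;
+	}
+
+	static struct page_reporting_dev_info my_prdev = {
+		.report = my_report,
+	};
+
+	/* driver init path: start receiving reports */
+	static int my_driver_init(void)
+	{
+		return page_reporting_register(&my_prdev);
+	}
+
+	/* driver removal path: stop further reports */
+	static void my_driver_exit(void)
+	{
+		page_reporting_unregister(&my_prdev);
+	}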
+
+Alexander Duyck, Dec 04, 2019
index 61f6185..f8c6a79 100644 (file)
@@ -35,9 +35,11 @@ Zswap evicts pages from compressed cache on an LRU basis to the backing swap
 device when the compressed pool reaches its size limit.  This requirement had
 been identified in prior community discussions.
 
-Zswap is disabled by default but can be enabled at boot time by setting
-the ``enabled`` attribute to 1 at boot time. ie: ``zswap.enabled=1``.  Zswap
-can also be enabled and disabled at runtime using the sysfs interface.
+Whether Zswap is enabled at boot time depends on whether
+the ``CONFIG_ZSWAP_DEFAULT_ON`` Kconfig option is enabled.
+This setting can then be overridden by providing the kernel command line
+``zswap.enabled=`` option, for example ``zswap.enabled=0``.
+Zswap can also be enabled and disabled at runtime using the sysfs interface.
 An example command to enable zswap at runtime, assuming sysfs is mounted
 at ``/sys``, is::
 
@@ -64,9 +66,10 @@ allocation in zpool is not directly accessible by address.  Rather, a handle is
 returned by the allocation routine and that handle must be mapped before being
 accessed.  The compressed memory pool grows on demand and shrinks as compressed
 pages are freed.  The pool is not preallocated.  By default, a zpool
-of type zbud is created, but it can be selected at boot time by
-setting the ``zpool`` attribute, e.g. ``zswap.zpool=zbud``. It can
-also be changed at runtime using the sysfs ``zpool`` attribute, e.g.::
+of the type selected by the ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is
+created, but it can be overridden at boot time by setting the ``zpool``
+attribute, e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using
+the sysfs ``zpool`` attribute, e.g.::
 
        echo zbud > /sys/module/zswap/parameters/zpool
 
@@ -97,8 +100,9 @@ controlled policy:
 * max_pool_percent - The maximum percentage of memory that the compressed
   pool can occupy.
 
-The default compressor is lzo, but it can be selected at boot time by
-setting the ``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
+The default compressor is selected by the ``CONFIG_ZSWAP_COMPRESSOR_DEFAULT``
+Kconfig option, but it can be overridden at boot time by setting the
+``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
 It can also be changed at runtime using the sysfs "compressor"
 attribute, e.g.::
 
index 534a8dc..d5b1878 100644 (file)
@@ -77,21 +77,13 @@ Tips for patch submitters
 
 8.     Happy hacking.
 
-Descriptions of section entries
--------------------------------
+Descriptions of section entries and preferred order
+---------------------------------------------------
 
        M: *Mail* patches to: FullName <address@domain>
        R: Designated *Reviewer*: FullName <address@domain>
           These reviewers should be CCed on patches.
        L: *Mailing list* that is relevant to this area
-       W: *Web-page* with status/info
-       B: URI for where to file *bugs*. A web-page with detailed bug
-          filing info, a direct bug tracker link, or a mailto: URI.
-       C: URI for *chat* protocol, server and channel where developers
-          usually hang out, for example irc://server/channel.
-       Q: *Patchwork* web based patch tracking system site
-       T: *SCM* tree type and location.
-          Type is one of: git, hg, quilt, stgit, topgit
        S: *Status*, one of the following:
           Supported:   Someone is actually paid to look after this.
           Maintained:  Someone actually looks after it.
@@ -102,30 +94,39 @@ Descriptions of section entries
           Obsolete:    Old code. Something tagged obsolete generally means
                        it has been replaced by a better system and you
                        should be using that.
+       W: *Web-page* with status/info
+       Q: *Patchwork* web based patch tracking system site
+       B: URI for where to file *bugs*. A web-page with detailed bug
+          filing info, a direct bug tracker link, or a mailto: URI.
+       C: URI for *chat* protocol, server and channel where developers
+          usually hang out, for example irc://server/channel.
        P: Subsystem Profile document for more details submitting
           patches to the given subsystem. This is either an in-tree file,
           or a URI. See Documentation/maintainer/maintainer-entry-profile.rst
           for details.
+       T: *SCM* tree type and location.
+          Type is one of: git, hg, quilt, stgit, topgit
        F: *Files* and directories wildcard patterns.
           A trailing slash includes all files and subdirectory files.
           F:   drivers/net/    all files in and below drivers/net
           F:   drivers/net/*   all files in drivers/net, but not below
           F:   */net/*         all files in "any top level directory"/net
           One pattern per line.  Multiple F: lines acceptable.
+       X: *Excluded* files and directories that are NOT maintained, same
+          rules as F:. File exclusions are tested before file matches.
+          Can be useful for excluding a specific subdirectory, for instance:
+          F:   net/
+          X:   net/ipv6/
+          matches all files in and below net excluding net/ipv6/
        N: Files and directories *Regex* patterns.
-          N:   [^a-z]tegra     all files whose path contains the word tegra
+          N:   [^a-z]tegra     all files whose path contains tegra
+                               (not including files like integrator)
           One pattern per line.  Multiple N: lines acceptable.
           scripts/get_maintainer.pl has different behavior for files that
           match F: pattern and matches of N: patterns.  By default,
           get_maintainer will not look at git log history when an F: pattern
           match occurs.  When an N: match occurs, git log history is used
           to also notify the people that have git commit signatures.
-       X: *Excluded* files and directories that are NOT maintained, same
-          rules as F:. Files exclusions are tested before file matches.
-          Can be useful for excluding a specific subdirectory, for instance:
-          F:   net/
-          X:   net/ipv6/
-          matches all files in and below net excluding net/ipv6/
        K: *Content regex* (perl extended) pattern match in a patch or file.
           For instance:
           K: of_get_profile
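Taken together, a purely hypothetical entry laid out in the preferred order
described above (the maintainer, list, URL, and paths are all invented for
illustration) would look like:

	SAMPLE SUBSYSTEM DRIVER
	M:	Jane Developer <jane@example.org>
	L:	linux-sample@vger.kernel.org
	S:	Maintained
	W:	https://example.org/sample
	F:	drivers/sample/
	F:	include/linux/sample.h
	X:	drivers/sample/legacy/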
@@ -1443,6 +1444,7 @@ M:        Will Deacon <will@kernel.org>
 R:     Robin Murphy <robin.murphy@arm.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
+F:     Documentation/devicetree/bindings/iommu/arm,smmu*
 F:     drivers/iommu/arm-smmu*
 F:     drivers/iommu/io-pgtable-arm.c
 F:     drivers/iommu/io-pgtable-arm-v7s.c
@@ -9668,6 +9670,7 @@ F:        drivers/acpi/nfit/*
 F:     include/linux/nd.h
 F:     include/linux/libnvdimm.h
 F:     include/uapi/linux/ndctl.h
+F:     tools/testing/nvdimm/
 
 LICENSES and SPDX stuff
 M:     Thomas Gleixner <tglx@linutronix.de>
@@ -17869,10 +17872,12 @@ L:    virtualization@lists.linux-foundation.org
 S:     Maintained
 F:     Documentation/devicetree/bindings/virtio/
 F:     drivers/virtio/
+F:     drivers/vdpa/
 F:     tools/virtio/
 F:     drivers/net/virtio_net.c
 F:     drivers/block/virtio_blk.c
 F:     include/linux/virtio*.h
+F:     include/linux/vdpa.h
 F:     include/uapi/linux/virtio_*.h
 F:     drivers/crypto/virtio/
 F:     mm/balloon_compaction.c
@@ -17940,6 +17945,7 @@ T:      git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git
 S:     Maintained
 F:     drivers/vhost/
 F:     include/uapi/linux/vhost.h
+F:     include/linux/vhost_iotlb.h
 
 VIRTIO INPUT DRIVER
 M:     Gerd Hoffmann <kraxel@redhat.com>
index 7ee144f..9b521c8 100644 (file)
@@ -8,8 +8,6 @@
 
 #include <asm/smp.h>
 
-struct bootmem_data_t; /* stupid forward decl. */
-
 /*
  * Following are macros that are specific to this numa platform.
  */
index e5b99bd..1780e86 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index 425855f..2e35354 100644 (file)
@@ -312,7 +312,6 @@ static struct pwm_lookup cm_x300_pwm_lookup[] = {
 static struct platform_pwm_backlight_data cm_x300_backlight_data = {
        .max_brightness = 100,
        .dft_brightness = 100,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device cm_x300_backlight_device = {
index dbad2f1..e5879e8 100644 (file)
@@ -202,7 +202,6 @@ static struct pwm_lookup income_pwm_lookup[] = {
 static struct platform_pwm_backlight_data income_backlight_data = {
        .max_brightness = 0x3ff,
        .dft_brightness = 0x1ff,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device income_backlight = {
index f2d7328..593c7f7 100644 (file)
@@ -563,13 +563,20 @@ static void corgi_bl_kick_battery(void)
        }
 }
 
+static struct gpiod_lookup_table corgi_lcdcon_gpio_table = {
+       .dev_id = "spi1.1",
+       .table = {
+               GPIO_LOOKUP("gpio-pxa", CORGI_GPIO_BACKLIGHT_CONT,
+                           "BL_CONT", GPIO_ACTIVE_HIGH),
+               { },
+       },
+};
+
 static struct corgi_lcd_platform_data corgi_lcdcon_info = {
        .init_mode              = CORGI_LCD_MODE_VGA,
        .max_intensity          = 0x2f,
        .default_intensity      = 0x1f,
        .limit_mask             = 0x0b,
-       .gpio_backlight_cont    = CORGI_GPIO_BACKLIGHT_CONT,
-       .gpio_backlight_on      = -1,
        .kick_battery           = corgi_bl_kick_battery,
 };
 
@@ -609,6 +616,7 @@ static struct spi_board_info corgi_spi_devices[] = {
 static void __init corgi_init_spi(void)
 {
        pxa2xx_set_spi_info(1, &corgi_spi_info);
+       gpiod_add_lookup_table(&corgi_lcdcon_gpio_table);
        spi_register_board_info(ARRAY_AND_SIZE(corgi_spi_devices));
 }
 #else
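With the board file now registering a gpiod lookup table instead of passing raw
GPIO numbers in ``corgi_lcd_platform_data``, the LCD driver is expected to obtain
the line through the gpiod consumer API. A rough, illustrative sketch of that
consumer side (the helper name and flags here are assumptions, not the actual
corgi_lcd.c code) might be:

	#include <linux/device.h>
	#include <linux/err.h>
	#include <linux/gpio/consumer.h>

	/* Illustrative only: fetch the "BL_CONT" line registered in the lookup
	 * table above.  gpiolib applies the GPIO_ACTIVE_HIGH/LOW polarity from
	 * the table, so the caller deals in logical values only.
	 */
	static int example_get_bl_cont(struct device *dev)
	{
		struct gpio_desc *bl_cont;

		bl_cont = devm_gpiod_get(dev, "BL_CONT", GPIOD_OUT_LOW);
		if (IS_ERR(bl_cont))
			return PTR_ERR(bl_cont);

		gpiod_set_value_cansleep(bl_cont, 1);	/* logical "backlight on" */
		return 0;
	}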
index ec10851..eb85950 100644 (file)
@@ -55,7 +55,6 @@ static struct pwm_lookup ezx_pwm_lookup[] __maybe_unused = {
 static struct platform_pwm_backlight_data ezx_backlight_data = {
        .max_brightness = 1023,
        .dft_brightness = 1023,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device ezx_backlight_device = {
index 238a751..1d4c5db 100644 (file)
@@ -556,7 +556,6 @@ static struct platform_device hx4700_lcd = {
 static struct platform_pwm_backlight_data backlight_data = {
        .max_brightness = 200,
        .dft_brightness = 100,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device backlight = {
index 20e00e9..6fc40bc 100644 (file)
@@ -277,7 +277,6 @@ static struct pwm_lookup lpd270_pwm_lookup[] = {
 static struct platform_pwm_backlight_data lpd270_backlight_data = {
        .max_brightness = 1,
        .dft_brightness = 1,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device lpd270_backlight_device = {
index 5d0591f..cd9fa46 100644 (file)
@@ -401,7 +401,6 @@ static void magician_backlight_exit(struct device *dev)
 static struct platform_pwm_backlight_data backlight_data = {
        .max_brightness = 272,
        .dft_brightness = 100,
-       .enable_gpio    = -1,
        .init           = magician_backlight_init,
        .notify         = magician_backlight_notify,
        .exit           = magician_backlight_exit,
index 1b78829..d1010ec 100644 (file)
@@ -256,7 +256,6 @@ static struct pwm_lookup mainstone_pwm_lookup[] = {
 static struct platform_pwm_backlight_data mainstone_backlight_data = {
        .max_brightness = 1023,
        .dft_brightness = 1023,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device mainstone_backlight_device = {
index 0b8bae9..d3af803 100644 (file)
@@ -176,7 +176,6 @@ static struct pwm_lookup mioa701_pwm_lookup[] = {
 static struct platform_pwm_backlight_data mioa701_backlight_data = {
        .max_brightness = 100,
        .dft_brightness = 50,
-       .enable_gpio    = -1,
 };
 
 /*
index b600b63..0d246a1 100644 (file)
@@ -318,7 +318,6 @@ static void palm27x_backlight_exit(struct device *dev)
 static struct platform_pwm_backlight_data palm27x_backlight_data = {
        .max_brightness = 0xfe,
        .dft_brightness = 0x7e,
-       .enable_gpio    = -1,
        .init           = palm27x_backlight_init,
        .notify         = palm27x_backlight_notify,
        .exit           = palm27x_backlight_exit,
index fda9dea..455cb8c 100644 (file)
@@ -174,6 +174,15 @@ static inline void palmtc_keys_init(void) {}
  * Backlight
  ******************************************************************************/
 #if defined(CONFIG_BACKLIGHT_PWM) || defined(CONFIG_BACKLIGHT_PWM_MODULE)
+
+static struct gpiod_lookup_table palmtc_pwm_bl_gpio_table = {
+       .dev_id = "pwm-backlight.0",
+       .table = {
+               GPIO_LOOKUP("gpio-pxa", GPIO_NR_PALMTC_BL_POWER,
+                           "enable", GPIO_ACTIVE_HIGH),
+       },
+};
+
 static struct pwm_lookup palmtc_pwm_lookup[] = {
        PWM_LOOKUP("pxa25x-pwm.1", 0, "pwm-backlight.0", NULL, PALMTC_PERIOD_NS,
                   PWM_POLARITY_NORMAL),
@@ -182,7 +191,6 @@ static struct pwm_lookup palmtc_pwm_lookup[] = {
 static struct platform_pwm_backlight_data palmtc_backlight_data = {
        .max_brightness = PALMTC_MAX_INTENSITY,
        .dft_brightness = PALMTC_MAX_INTENSITY,
-       .enable_gpio    = GPIO_NR_PALMTC_BL_POWER,
 };
 
 static struct platform_device palmtc_backlight = {
@@ -195,6 +203,7 @@ static struct platform_device palmtc_backlight = {
 
 static void __init palmtc_pwm_init(void)
 {
+       gpiod_add_lookup_table(&palmtc_pwm_bl_gpio_table);
        pwm_add_table(palmtc_pwm_lookup, ARRAY_SIZE(palmtc_pwm_lookup));
        platform_device_register(&palmtc_backlight);
 }
index 7171014..e3bcf58 100644 (file)
@@ -175,7 +175,6 @@ static void palmte2_backlight_exit(struct device *dev)
 static struct platform_pwm_backlight_data palmte2_backlight_data = {
        .max_brightness = PALMTE2_MAX_INTENSITY,
        .dft_brightness = PALMTE2_MAX_INTENSITY,
-       .enable_gpio    = -1,
        .init           = palmte2_backlight_init,
        .notify         = palmte2_backlight_notify,
        .exit           = palmte2_backlight_exit,
index cb1c567..bf613f8 100644 (file)
@@ -154,7 +154,6 @@ static struct pwm_lookup pcm990_pwm_lookup[] = {
 static struct platform_pwm_backlight_data pcm990_backlight_data = {
        .max_brightness = 1023,
        .dft_brightness = 1023,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device pcm990_backlight_device = {
index a4fdc39..371008e 100644 (file)
@@ -525,13 +525,33 @@ static void spitz_bl_kick_battery(void)
        }
 }
 
+static struct gpiod_lookup_table spitz_lcdcon_gpio_table = {
+       .dev_id = "spi2.1",
+       .table = {
+               GPIO_LOOKUP("gpio-pxa", SPITZ_GPIO_BACKLIGHT_CONT,
+                           "BL_CONT", GPIO_ACTIVE_LOW),
+               GPIO_LOOKUP("gpio-pxa", SPITZ_GPIO_BACKLIGHT_ON,
+                           "BL_ON", GPIO_ACTIVE_HIGH),
+               { },
+       },
+};
+
+static struct gpiod_lookup_table akita_lcdcon_gpio_table = {
+       .dev_id = "spi2.1",
+       .table = {
+               GPIO_LOOKUP("gpio-pxa", AKITA_GPIO_BACKLIGHT_CONT,
+                           "BL_CONT", GPIO_ACTIVE_LOW),
+               GPIO_LOOKUP("gpio-pxa", AKITA_GPIO_BACKLIGHT_ON,
+                           "BL_ON", GPIO_ACTIVE_HIGH),
+               { },
+       },
+};
+
 static struct corgi_lcd_platform_data spitz_lcdcon_info = {
        .init_mode              = CORGI_LCD_MODE_VGA,
        .max_intensity          = 0x2f,
        .default_intensity      = 0x1f,
        .limit_mask             = 0x0b,
-       .gpio_backlight_cont    = SPITZ_GPIO_BACKLIGHT_CONT,
-       .gpio_backlight_on      = SPITZ_GPIO_BACKLIGHT_ON,
        .kick_battery           = spitz_bl_kick_battery,
 };
 
@@ -574,12 +594,10 @@ static struct pxa2xx_spi_controller spitz_spi_info = {
 
 static void __init spitz_spi_init(void)
 {
-       struct corgi_lcd_platform_data *lcd_data = &spitz_lcdcon_info;
-
-       if (machine_is_akita()) {
-               lcd_data->gpio_backlight_cont = AKITA_GPIO_BACKLIGHT_CONT;
-               lcd_data->gpio_backlight_on = AKITA_GPIO_BACKLIGHT_ON;
-       }
+       if (machine_is_akita())
+               gpiod_add_lookup_table(&akita_lcdcon_gpio_table);
+       else
+               gpiod_add_lookup_table(&spitz_lcdcon_gpio_table);
 
        pxa2xx_set_spi_info(2, &spitz_spi_info);
        spi_register_board_info(ARRAY_AND_SIZE(spitz_spi_devices));
index 93466fa..a15eb3b 100644 (file)
@@ -178,13 +178,11 @@ static struct platform_pwm_backlight_data tavorevb_backlight_data[] = {
                /* primary backlight */
                .max_brightness = 100,
                .dft_brightness = 100,
-               .enable_gpio    = -1,
        },
        [1] = {
                /* secondary backlight */
                .max_brightness = 100,
                .dft_brightness = 100,
-               .enable_gpio    = -1,
        },
 };
 
index c06031d..3aa34e9 100644 (file)
@@ -404,7 +404,6 @@ static void viper_backlight_exit(struct device *dev)
 static struct platform_pwm_backlight_data viper_backlight_data = {
        .max_brightness = 100,
        .dft_brightness = 100,
-       .enable_gpio    = -1,
        .init           = viper_backlight_init,
        .notify         = viper_backlight_notify,
        .exit           = viper_backlight_exit,
index 900cefc..21fd76b 100644 (file)
@@ -210,13 +210,11 @@ static struct platform_pwm_backlight_data z2_backlight_data[] = {
                /* Keypad Backlight */
                .max_brightness = 1023,
                .dft_brightness = 0,
-               .enable_gpio    = -1,
        },
        [1] = {
                /* LCD Backlight */
                .max_brightness = 1023,
                .dft_brightness = 512,
-               .enable_gpio    = -1,
        },
 };
 
index bf2ab5b..79f0025 100644 (file)
@@ -117,7 +117,6 @@ static struct pwm_lookup zylonite_pwm_lookup[] = {
 static struct platform_pwm_backlight_data zylonite_backlight_data = {
        .max_brightness = 100,
        .dft_brightness = 100,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device zylonite_backlight_device = {
index 74d6b68..e1c372e 100644 (file)
@@ -516,7 +516,6 @@ static void h1940_backlight_exit(struct device *dev)
 static struct platform_pwm_backlight_data backlight_data = {
        .max_brightness = 100,
        .dft_brightness = 50,
-       .enable_gpio    = -1,
        .init           = h1940_backlight_init,
        .notify         = h1940_backlight_notify,
        .exit           = h1940_backlight_exit,
index 03d8f27..fde98b1 100644 (file)
@@ -534,7 +534,6 @@ static int rx1950_backlight_notify(struct device *dev, int brightness)
 static struct platform_pwm_backlight_data rx1950_backlight_data = {
        .max_brightness = 24,
        .dft_brightness = 4,
-       .enable_gpio = -1,
        .init = rx1950_backlight_init,
        .notify = rx1950_backlight_notify,
        .exit = rx1950_backlight_exit,
index 799cfdf..09e6da3 100644 (file)
@@ -65,7 +65,6 @@ static struct samsung_bl_drvdata samsung_dfl_bl_data __initdata = {
        .plat_data = {
                .max_brightness = 255,
                .dft_brightness = 255,
-               .enable_gpio    = -1,
                .init           = samsung_bl_init,
                .exit           = samsung_bl_exit,
        },
@@ -111,8 +110,6 @@ void __init samsung_bl_set(struct samsung_bl_gpio_info *gpio_info,
                samsung_bl_data->dft_brightness = bl_data->dft_brightness;
        if (bl_data->lth_brightness)
                samsung_bl_data->lth_brightness = bl_data->lth_brightness;
-       if (bl_data->enable_gpio >= 0)
-               samsung_bl_data->enable_gpio = bl_data->enable_gpio;
        if (bl_data->init)
                samsung_bl_data->init = bl_data->init;
        if (bl_data->notify)
index 8ec6a4f..da96542 100644 (file)
@@ -114,7 +114,6 @@ static struct pwm_lookup crag6410_pwm_lookup[] = {
 static struct platform_pwm_backlight_data crag6410_backlight_data = {
        .max_brightness = 1000,
        .dft_brightness = 600,
-       .enable_gpio    = -1,
 };
 
 static struct platform_device crag6410_backlight_device = {
index bfe9881..e708021 100644 (file)
@@ -115,7 +115,6 @@ static void hmt_bl_exit(struct device *dev)
 static struct platform_pwm_backlight_data hmt_backlight_data = {
        .max_brightness = 100 * 256,
        .dft_brightness = 40 * 256,
-       .enable_gpio    = -1,
        .init           = hmt_bl_init,
        .notify         = hmt_bl_notify,
        .exit           = hmt_bl_exit,
index 829d5db..5025db6 100644 (file)
@@ -150,7 +150,6 @@ static int smartq_bl_init(struct device *dev)
 static struct platform_pwm_backlight_data smartq_backlight_data = {
        .max_brightness = 1000,
        .dft_brightness = 600,
-       .enable_gpio    = -1,
        .init           = smartq_bl_init,
 };
 
index 908e5aa..56f406c 100644 (file)
@@ -623,7 +623,7 @@ static struct pwm_lookup smdk6410_pwm_lookup[] = {
 };
 
 static struct platform_pwm_backlight_data smdk6410_bl_data = {
-       .enable_gpio = -1,
+       /* Intentionally blank */
 };
 
 static struct dwc2_hsotg_plat smdk6410_hsotg_pdata;
index 6e41c4b..40fb05d 100644 (file)
@@ -1502,7 +1502,10 @@ config ARM64_PTR_AUTH
        default y
        depends on !KVM || ARM64_VHE
        depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
-       depends on CC_IS_GCC || (CC_IS_CLANG && AS_HAS_CFI_NEGATE_RA_STATE)
+       # GCC 9.1 and later inserts a .note.gnu.property section note for PAC
+       # which is only understood by binutils starting with version 2.33.1.
+       depends on !CC_IS_GCC || GCC_VERSION < 90100 || LD_VERSION >= 233010000
+       depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
        depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
        help
          Pointer authentication (part of the ARMv8.3 Extensions) provides
index 1c906d9..a1efa24 100644 (file)
@@ -52,19 +52,6 @@ config DEBUG_WX
 
          If in doubt, say "Y".
 
-config DEBUG_ALIGN_RODATA
-       depends on STRICT_KERNEL_RWX
-       bool "Align linker sections up to SECTION_SIZE"
-       help
-         If this option is enabled, sections that may potentially be marked as
-         read only or non-executable will be aligned up to the section size of
-         the kernel. This prevents sections from being split into pages and
-         avoids a potential TLB penalty. The downside is an increase in
-         alignment and potentially wasted space. Turn on this option if
-         performance is more important than memory pressure.
-
-         If in doubt, say N.
-
 config DEBUG_EFI
        depends on EFI && DEBUG_INFO
        bool "UEFI debugging"
index f15f92b..85e4149 100644 (file)
@@ -65,6 +65,10 @@ stack_protector_prepare: prepare0
                                        include/generated/asm-offsets.h))
 endif
 
+# Ensure that if the compiler supports branch protection we default it
+# off, this will be overridden if we are using branch protection.
+branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
+
 ifeq ($(CONFIG_ARM64_PTR_AUTH),y)
 branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all
 branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf
@@ -73,9 +77,10 @@ branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pa
 # we pass it only to the assembler. This option is utilized only in case of non
 # integrated assemblers.
 branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a
-KBUILD_CFLAGS += $(branch-prot-flags-y)
 endif
 
+KBUILD_CFLAGS += $(branch-prot-flags-y)
+
 ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
 KBUILD_CPPFLAGS        += -mbig-endian
 CHECKFLAGS     += -D__AARCH64EB__
index 2be67b2..a1871bb 100644 (file)
 
 /*
  * Alignment of kernel segments (e.g. .text, .data).
- */
-#if defined(CONFIG_DEBUG_ALIGN_RODATA)
-/*
- *  4 KB granule:   1 level 2 entry
- * 16 KB granule: 128 level 3 entries, with contiguous bit
- * 64 KB granule:  32 level 3 entries, with contiguous bit
- */
-#define SEGMENT_ALIGN          SZ_2M
-#else
-/*
+ *
  *  4 KB granule:  16 level 3 entries, with contiguous bit
  * 16 KB granule:   4 level 3 entries, without contiguous bit
  * 64 KB granule:   1 level 3 entry
  */
 #define SEGMENT_ALIGN          SZ_64K
-#endif
 
 /*
  * Memory types available.
index 4cc581a..c19aa81 100644 (file)
@@ -601,7 +601,7 @@ static struct undef_hook setend_hooks[] = {
        },
        {
                /* Thumb mode */
-               .instr_mask     = 0x0000fff7,
+               .instr_mask     = 0xfffffff7,
                .instr_val      = 0x0000b650,
                .pstate_mask    = (PSR_AA32_T_BIT | PSR_AA32_MODE_MASK),
                .pstate_val     = (PSR_AA32_T_BIT | PSR_AA32_MODE_USR),
index a475c68..449386d 100644 (file)
@@ -64,6 +64,4 @@ config KVM_ARM_PMU
 config KVM_INDIRECT_VECTORS
        def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS)
 
-source "drivers/vhost/Kconfig"
-
 endif # VIRTUALIZATION
index d3c61b8..4e6dc68 100644 (file)
@@ -141,7 +141,7 @@ good_area:
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
        } else {
-               if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+               if (unlikely(!vma_is_accessible(vma)))
                        goto bad_area;
        }
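This hunk, like the matching ones in the other architectures' fault handlers
further down, replaces the open-coded read/write/exec flag test with the new
``vma_is_accessible()`` helper. For reference, the helper at this point in the
tree is essentially the following (paraphrased from include/linux/mm.h):

	static inline bool vma_is_accessible(struct vm_area_struct *vma)
	{
		return vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
	}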
 
index 0c2d2c7..f407b6e 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index 1ec6b70..6b5652e 100644 (file)
@@ -54,6 +54,8 @@ SECTIONS {
                CPUIDLE_TEXT
                LOCK_TEXT
                KPROBES_TEXT
+               IRQENTRY_TEXT
+               SOFTIRQENTRY_TEXT
                *(.gnu.linkonce.t*)
        }
 
index 71ddb4c..1c8e8a8 100644 (file)
@@ -68,14 +68,6 @@ static irqreturn_t hw_tick(int irq, void *dummy)
 
 /***************************************************************************/
 
-static struct irqaction m68328_timer_irq = {
-       .name    = "timer",
-       .flags   = IRQF_TIMER,
-       .handler = hw_tick,
-};
-
-/***************************************************************************/
-
 static u64 m68328_read_clk(struct clocksource *cs)
 {
        unsigned long flags;
@@ -102,11 +94,17 @@ static struct clocksource m68328_clk = {
 
 void hw_timer_init(irq_handler_t handler)
 {
+       int ret;
+
        /* disable timer 1 */
        TCTL = 0;
 
        /* set ISR */
-       setup_irq(TMR_IRQ_NUM, &m68328_timer_irq);
+       ret = request_irq(TMR_IRQ_NUM, hw_tick, IRQF_TIMER, "timer", NULL);
+       if (ret) {
+               pr_err("Failed to request irq %d (timer): %pe\n", TMR_IRQ_NUM,
+                      ERR_PTR(ret));
+       }
 
        /* Restart mode, Enable int, Set clock source */
        TCTL = TCTL_OM | TCTL_IRQEN | CLOCK_SOURCE;
index eb6f16b..fd1d9c9 100644 (file)
@@ -111,14 +111,6 @@ static irqreturn_t pit_tick(int irq, void *dummy)
 
 /***************************************************************************/
 
-static struct irqaction pit_irq = {
-       .name    = "timer",
-       .flags   = IRQF_TIMER,
-       .handler = pit_tick,
-};
-
-/***************************************************************************/
-
 static u64 pit_read_clk(struct clocksource *cs)
 {
        unsigned long flags;
@@ -146,6 +138,8 @@ static struct clocksource pit_clk = {
 
 void hw_timer_init(irq_handler_t handler)
 {
+       int ret;
+
        cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id());
        cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32);
        cf_pit_clockevent.max_delta_ns =
@@ -156,7 +150,11 @@ void hw_timer_init(irq_handler_t handler)
        cf_pit_clockevent.min_delta_ticks = 0x3f;
        clockevents_register_device(&cf_pit_clockevent);
 
-       setup_irq(MCF_IRQ_PIT1, &pit_irq);
+       ret = request_irq(MCF_IRQ_PIT1, pit_tick, IRQF_TIMER, "timer", NULL);
+       if (ret) {
+               pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_PIT1,
+                      ERR_PTR(ret));
+       }
 
        clocksource_register_hz(&pit_clk, FREQ);
 }
index 1b11e7b..5ab81c9 100644 (file)
@@ -50,18 +50,19 @@ irqreturn_t mcfslt_profile_tick(int irq, void *dummy)
        return IRQ_HANDLED;
 }
 
-static struct irqaction mcfslt_profile_irq = {
-       .name    = "profile timer",
-       .flags   = IRQF_TIMER,
-       .handler = mcfslt_profile_tick,
-};
-
 void mcfslt_profile_init(void)
 {
+       int ret;
+
        printk(KERN_INFO "PROFILE: lodging TIMER 1 @ %dHz as profile timer\n",
               PROFILEHZ);
 
-       setup_irq(MCF_IRQ_PROFILER, &mcfslt_profile_irq);
+       ret = request_irq(MCF_IRQ_PROFILER, mcfslt_profile_tick, IRQF_TIMER,
+                         "profile timer", NULL);
+       if (ret) {
+               pr_err("Failed to request irq %d (profile timer): %pe\n",
+                      MCF_IRQ_PROFILER, ERR_PTR(ret));
+       }
 
        /* Set up TIMER 2 as high speed profile clock */
        __raw_writel(MCF_BUSCLK / PROFILEHZ - 1, PA(MCFSLT_STCNT));
@@ -92,12 +93,6 @@ static irqreturn_t mcfslt_tick(int irq, void *dummy)
        return timer_interrupt(irq, dummy);
 }
 
-static struct irqaction mcfslt_timer_irq = {
-       .name    = "timer",
-       .flags   = IRQF_TIMER,
-       .handler = mcfslt_tick,
-};
-
 static u64 mcfslt_read_clk(struct clocksource *cs)
 {
        unsigned long flags;
@@ -126,6 +121,8 @@ static struct clocksource mcfslt_clk = {
 
 void hw_timer_init(irq_handler_t handler)
 {
+       int r;
+
        mcfslt_cycles_per_jiffy = MCF_BUSCLK / HZ;
        /*
         *      The coldfire slice timer (SLT) runs from STCNT to 0 included,
@@ -140,7 +137,11 @@ void hw_timer_init(irq_handler_t handler)
        mcfslt_cnt = mcfslt_cycles_per_jiffy;
 
        timer_interrupt = handler;
-       setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq);
+       r = request_irq(MCF_IRQ_TIMER, mcfslt_tick, IRQF_TIMER, "timer", NULL);
+       if (r) {
+               pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_TIMER,
+                      ERR_PTR(r));
+       }
 
        clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK);
 
index 227aa5d..b8301fd 100644 (file)
@@ -82,14 +82,6 @@ static irqreturn_t mcftmr_tick(int irq, void *dummy)
 
 /***************************************************************************/
 
-static struct irqaction mcftmr_timer_irq = {
-       .name    = "timer",
-       .flags   = IRQF_TIMER,
-       .handler = mcftmr_tick,
-};
-
-/***************************************************************************/
-
 static u64 mcftmr_read_clk(struct clocksource *cs)
 {
        unsigned long flags;
@@ -118,6 +110,8 @@ static struct clocksource mcftmr_clk = {
 
 void hw_timer_init(irq_handler_t handler)
 {
+       int r;
+
        __raw_writew(MCFTIMER_TMR_DISABLE, TA(MCFTIMER_TMR));
        mcftmr_cycles_per_jiffy = FREQ / HZ;
        /*
@@ -134,7 +128,11 @@ void hw_timer_init(irq_handler_t handler)
 
        timer_interrupt = handler;
        init_timer_irq();
-       setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq);
+       r = request_irq(MCF_IRQ_TIMER, mcftmr_tick, IRQF_TIMER, "timer", NULL);
+       if (r) {
+               pr_err("Failed to request irq %d (timer): %pe\n", MCF_IRQ_TIMER,
+                      ERR_PTR(r));
+       }
 
 #ifdef CONFIG_HIGHPROFILE
        coldfire_profile_init();
@@ -170,14 +168,10 @@ irqreturn_t coldfire_profile_tick(int irq, void *dummy)
 
 /***************************************************************************/
 
-static struct irqaction coldfire_profile_irq = {
-       .name    = "profile timer",
-       .flags   = IRQF_TIMER,
-       .handler = coldfire_profile_tick,
-};
-
 void coldfire_profile_init(void)
 {
+       int ret;
+
        printk(KERN_INFO "PROFILE: lodging TIMER2 @ %dHz as profile timer\n",
               PROFILEHZ);
 
@@ -188,7 +182,12 @@ void coldfire_profile_init(void)
        __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 |
                MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, PA(MCFTIMER_TMR));
 
-       setup_irq(MCF_IRQ_PROFILER, &coldfire_profile_irq);
+       ret = request_irq(MCF_IRQ_PROFILER, coldfire_profile_tick, IRQF_TIMER,
+                         "profile timer", NULL);
+       if (ret) {
+               pr_err("Failed to request irq %d (profile timer): %pe\n",
+                      MCF_IRQ_PROFILER, ERR_PTR(ret));
+       }
 }
 
 /***************************************************************************/
index f7afb98..3bfb5c8 100644 (file)
@@ -125,7 +125,7 @@ good_area:
                case 1:         /* read, present */
                        goto acc_err;
                case 0:         /* read, not present */
-                       if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                       if (unlikely(!vma_is_accessible(vma)))
                                goto acc_err;
        }
 
index 2e9062a..a914854 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index d2bcfa8..2e241e7 100644 (file)
@@ -32,6 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
-       printf "\n"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index eac25ae..b91d145 100644 (file)
@@ -72,6 +72,4 @@ config KVM_MIPS_DEBUG_COP0_COUNTERS
 
          If unsure, say N.
 
-source "drivers/vhost/Kconfig"
-
 endif # VIRTUALIZATION
index 4a0eafe..f8d62cd 100644 (file)
@@ -142,7 +142,7 @@ good_area:
                                goto bad_area;
                        }
                } else {
-                       if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+                       if (unlikely(!vma_is_accessible(vma)))
                                goto bad_area;
                }
        }
index f679d33..7a6c1ce 100644 (file)
@@ -47,6 +47,7 @@ SECTIONS
                LOCK_TEXT
                KPROBES_TEXT
                IRQENTRY_TEXT
+               SOFTIRQENTRY_TEXT
                *(.fixup)
        }
 
index 50242b7..730db28 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index 5fc4536..924c541 100644 (file)
@@ -122,6 +122,7 @@ config PPC
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_KCOV
        select ARCH_HAS_HUGEPD                  if HUGETLB_PAGE
+       select ARCH_HAS_MEMREMAP_COMPAT_ALIGN
        select ARCH_HAS_MMIOWB                  if PPC64
        select ARCH_HAS_PHYS_TO_DMA
        select ARCH_HAS_PMEM_API
@@ -265,8 +266,9 @@ config PANIC_TIMEOUT
        default 180
 
 config COMPAT
-       bool
-       default y if PPC64
+       bool "Enable support for 32bit binaries"
+       depends on PPC64
+       default y if !CPU_LITTLE_ENDIAN
        select COMPAT_BINFMT_ELF
        select ARCH_WANT_OLD_COMPAT_IPC
        select COMPAT_OLD_SIGACTION
index 4db5171..81b55c8 100644 (file)
@@ -60,6 +60,8 @@ CONFIG_CFG80211=m
 CONFIG_CFG80211_WEXT=y
 CONFIG_MAC80211=m
 # CONFIG_MAC80211_RC_MINSTREL is not set
+CONFIG_UEVENT_HELPER=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=65535
index a227074..ca6c970 100644 (file)
@@ -162,10 +162,10 @@ static inline bool test_thread_local_flags(unsigned int flags)
        return (ti->local_flags & flags) != 0;
 }
 
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_COMPAT
 #define is_32bit_task()        (test_thread_flag(TIF_32BIT))
 #else
-#define is_32bit_task()        (1)
+#define is_32bit_task()        (IS_ENABLED(CONFIG_PPC32))
 #endif
 
 #if defined(CONFIG_PPC64)
index b0720c7..700fcda 100644 (file)
@@ -31,6 +31,7 @@
 #define __ARCH_WANT_SYS_SOCKETCALL
 #define __ARCH_WANT_SYS_FADVISE64
 #define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
 #define __ARCH_WANT_SYS_NICE
 #define __ARCH_WANT_SYS_OLD_GETRLIMIT
 #define __ARCH_WANT_SYS_OLD_UNAME
index 570660e..1c43858 100644 (file)
@@ -40,16 +40,17 @@ CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING
 endif
 
 obj-y                          := cputable.o syscalls.o \
-                                  irq.o align.o signal_32.o pmc.o vdso.o \
+                                  irq.o align.o signal_$(BITS).o pmc.o vdso.o \
                                   process.o systbl.o idle.o \
                                   signal.o sysfs.o cacheinfo.o time.o \
                                   prom.o traps.o setup-common.o \
                                   udbg.o misc.o io.o misc_$(BITS).o \
                                   of_platform.o prom_parse.o
 obj-y                          += ptrace/
-obj-$(CONFIG_PPC64)            += setup_64.o sys_ppc32.o signal_64.o \
+obj-$(CONFIG_PPC64)            += setup_64.o \
                                   paca.o nvram_64.o firmware.o note.o \
                                   syscall_64.o
+obj-$(CONFIG_COMPAT)           += sys_ppc32.o signal_32.o
 obj-$(CONFIG_VDSO32)           += vdso32/
 obj-$(CONFIG_PPC_WATCHDOG)     += watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)       += hw_breakpoint.o
index 63f0a44..9a1e5d6 100644 (file)
 SYS_CALL_TABLE:
        .tc sys_call_table[TC],sys_call_table
 
+#ifdef CONFIG_COMPAT
 COMPAT_SYS_CALL_TABLE:
        .tc compat_sys_call_table[TC],compat_sys_call_table
+#endif
 
 /* This value is used to mark exception frames on the stack. */
 exception_marker:
index 18bbce1..728ccb0 100644 (file)
@@ -3121,22 +3121,3 @@ handle_dabr_fault:
        li      r5,SIGSEGV
        bl      bad_page_fault
        b       interrupt_return
-
-/*
- * When doorbell is triggered from system reset wakeup, the message is
- * not cleared, so it would fire again when EE is enabled.
- *
- * When coming from local_irq_enable, there may be the same problem if
- * we were hard disabled.
- *
- * Execute msgclr to clear pending exceptions before handling it.
- */
-h_doorbell_common_msgclr:
-       LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
-       PPC_MSGCLR(3)
-       b       h_doorbell_common_virt
-
-doorbell_super_common_msgclr:
-       LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
-       PPC_MSGCLRP(3)
-       b       doorbell_super_common_virt
index a25ed47..1f11698 100644 (file)
@@ -527,6 +527,19 @@ void irq_set_pending_from_srr1(unsigned long srr1)
                return;
        }
 
+       if (reason == PACA_IRQ_DBELL) {
+               /*
+                * When doorbell triggers a system reset wakeup, the message
+                * is not cleared, so if the doorbell interrupt is replayed
+                * and the IPI handled, the doorbell interrupt would still
+                * fire when EE is enabled.
+                *
+                * To avoid taking the superfluous doorbell interrupt,
+                * execute a msgclr here before the interrupt is replayed.
+                */
+               ppc_msgclr(PPC_DBELL_MSGTYPE);
+       }
+
        /*
         * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
         * so this can be called unconditionally with the SRR1 wake
index f3bd0bb..2d4d21b 100644 (file)
@@ -55,14 +55,17 @@ _GLOBAL(ppc_save_regs)
        PPC_STL r29,29*SZL(r3)
        PPC_STL r30,30*SZL(r3)
        PPC_STL r31,31*SZL(r3)
+       lbz     r0,PACAIRQSOFTMASK(r13)
+       PPC_STL r0,SOFTE-STACK_FRAME_OVERHEAD(r3)
 #endif
        /* go up one stack frame for SP */
        PPC_LL  r4,0(r1)
        PPC_STL r4,1*SZL(r3)
        /* get caller's LR */
        PPC_LL  r0,LRSAVE(r4)
-       PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3)
        PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3)
+       mflr    r0
+       PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3)
        mfmsr   r0
        PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3)
        mfctr   r0
@@ -73,4 +76,5 @@ _GLOBAL(ppc_save_regs)
        PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3)
        li      r0,0
        PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3)
+       PPC_STL r0,ORIG_GPR3-STACK_FRAME_OVERHEAD(r3)
        blr
index e9d97c2..c2f2402 100644 (file)
@@ -6,7 +6,7 @@
 CFLAGS_ptrace-view.o           += -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
 obj-y                          += ptrace.o ptrace-view.o
-obj-$(CONFIG_PPC64)            += ptrace32.o
+obj-$(CONFIG_COMPAT)           += ptrace32.o
 obj-$(CONFIG_VSX)              += ptrace-vsx.o
 ifneq ($(CONFIG_VSX),y)
 obj-y                          += ptrace-novsx.o
index d215f95..a264989 100644 (file)
 #include <linux/syscalls.h>
 #include <asm/hw_breakpoint.h>
 #include <linux/uaccess.h>
+#include <asm/switch_to.h>
 #include <asm/unistd.h>
 #include <asm/debug.h>
 #include <asm/tm.h>
 
 #include "signal.h"
 
+#ifdef CONFIG_VSX
+unsigned long copy_fpr_to_user(void __user *to,
+                              struct task_struct *task)
+{
+       u64 buf[ELF_NFPREG];
+       int i;
+
+       /* save FPR copy to local buffer then write to the thread_struct */
+       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+               buf[i] = task->thread.TS_FPR(i);
+       buf[i] = task->thread.fp_state.fpscr;
+       return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_fpr_from_user(struct task_struct *task,
+                                void __user *from)
+{
+       u64 buf[ELF_NFPREG];
+       int i;
+
+       if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+               return 1;
+       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+               task->thread.TS_FPR(i) = buf[i];
+       task->thread.fp_state.fpscr = buf[i];
+
+       return 0;
+}
+
+unsigned long copy_vsx_to_user(void __user *to,
+                              struct task_struct *task)
+{
+       u64 buf[ELF_NVSRHALFREG];
+       int i;
+
+       /* save FPR copy to local buffer then write to the thread_struct */
+       for (i = 0; i < ELF_NVSRHALFREG; i++)
+               buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+       return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_vsx_from_user(struct task_struct *task,
+                                void __user *from)
+{
+       u64 buf[ELF_NVSRHALFREG];
+       int i;
+
+       if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+               return 1;
+       for (i = 0; i < ELF_NVSRHALFREG ; i++)
+               task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+       return 0;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+unsigned long copy_ckfpr_to_user(void __user *to,
+                                 struct task_struct *task)
+{
+       u64 buf[ELF_NFPREG];
+       int i;
+
+       /* save FPR copy to local buffer then write to the thread_struct */
+       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+               buf[i] = task->thread.TS_CKFPR(i);
+       buf[i] = task->thread.ckfp_state.fpscr;
+       return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
+}
+
+unsigned long copy_ckfpr_from_user(struct task_struct *task,
+                                         void __user *from)
+{
+       u64 buf[ELF_NFPREG];
+       int i;
+
+       if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
+               return 1;
+       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
+               task->thread.TS_CKFPR(i) = buf[i];
+       task->thread.ckfp_state.fpscr = buf[i];
+
+       return 0;
+}
+
+unsigned long copy_ckvsx_to_user(void __user *to,
+                                 struct task_struct *task)
+{
+       u64 buf[ELF_NVSRHALFREG];
+       int i;
+
+       /* save FPR copy to local buffer then write to the thread_struct */
+       for (i = 0; i < ELF_NVSRHALFREG; i++)
+               buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
+       return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
+}
+
+unsigned long copy_ckvsx_from_user(struct task_struct *task,
+                                         void __user *from)
+{
+       u64 buf[ELF_NVSRHALFREG];
+       int i;
+
+       if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
+               return 1;
+       for (i = 0; i < ELF_NVSRHALFREG ; i++)
+               task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+       return 0;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#else
+inline unsigned long copy_fpr_to_user(void __user *to,
+                                     struct task_struct *task)
+{
+       return __copy_to_user(to, task->thread.fp_state.fpr,
+                             ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_fpr_from_user(struct task_struct *task,
+                                       void __user *from)
+{
+       return __copy_from_user(task->thread.fp_state.fpr, from,
+                             ELF_NFPREG * sizeof(double));
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+inline unsigned long copy_ckfpr_to_user(void __user *to,
+                                        struct task_struct *task)
+{
+       return __copy_to_user(to, task->thread.ckfp_state.fpr,
+                             ELF_NFPREG * sizeof(double));
+}
+
+inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
+                                                void __user *from)
+{
+       return __copy_from_user(task->thread.ckfp_state.fpr, from,
+                               ELF_NFPREG * sizeof(double));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#endif
+
 /* Log an error when sending an unhandled signal to a process. Controlled
  * through debug.exception-trace sysctl.
  */
@@ -106,7 +247,6 @@ static void do_signal(struct task_struct *tsk)
        sigset_t *oldset = sigmask_to_save();
        struct ksignal ksig = { .sig = 0 };
        int ret;
-       int is32 = is_32bit_task();
 
        BUG_ON(tsk != current);
 
@@ -136,7 +276,7 @@ static void do_signal(struct task_struct *tsk)
 
        rseq_signal_deliver(&ksig, tsk->thread.regs);
 
-       if (is32) {
+       if (is_32bit_task()) {
                if (ksig.ka.sa.sa_flags & SA_SIGINFO)
                        ret = handle_rt_signal32(&ksig, oldset, tsk);
                else
index 1b090a7..4f96d29 100644 (file)
@@ -235,146 +235,6 @@ struct rt_sigframe {
        int                     abigap[56];
 };
 
-#ifdef CONFIG_VSX
-unsigned long copy_fpr_to_user(void __user *to,
-                              struct task_struct *task)
-{
-       u64 buf[ELF_NFPREG];
-       int i;
-
-       /* save FPR copy to local buffer then write to the thread_struct */
-       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
-               buf[i] = task->thread.TS_FPR(i);
-       buf[i] = task->thread.fp_state.fpscr;
-       return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
-}
-
-unsigned long copy_fpr_from_user(struct task_struct *task,
-                                void __user *from)
-{
-       u64 buf[ELF_NFPREG];
-       int i;
-
-       if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
-               return 1;
-       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
-               task->thread.TS_FPR(i) = buf[i];
-       task->thread.fp_state.fpscr = buf[i];
-
-       return 0;
-}
-
-unsigned long copy_vsx_to_user(void __user *to,
-                              struct task_struct *task)
-{
-       u64 buf[ELF_NVSRHALFREG];
-       int i;
-
-       /* save FPR copy to local buffer then write to the thread_struct */
-       for (i = 0; i < ELF_NVSRHALFREG; i++)
-               buf[i] = task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
-       return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
-}
-
-unsigned long copy_vsx_from_user(struct task_struct *task,
-                                void __user *from)
-{
-       u64 buf[ELF_NVSRHALFREG];
-       int i;
-
-       if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
-               return 1;
-       for (i = 0; i < ELF_NVSRHALFREG ; i++)
-               task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
-       return 0;
-}
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-unsigned long copy_ckfpr_to_user(void __user *to,
-                                 struct task_struct *task)
-{
-       u64 buf[ELF_NFPREG];
-       int i;
-
-       /* save FPR copy to local buffer then write to the thread_struct */
-       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
-               buf[i] = task->thread.TS_CKFPR(i);
-       buf[i] = task->thread.ckfp_state.fpscr;
-       return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
-}
-
-unsigned long copy_ckfpr_from_user(struct task_struct *task,
-                                         void __user *from)
-{
-       u64 buf[ELF_NFPREG];
-       int i;
-
-       if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
-               return 1;
-       for (i = 0; i < (ELF_NFPREG - 1) ; i++)
-               task->thread.TS_CKFPR(i) = buf[i];
-       task->thread.ckfp_state.fpscr = buf[i];
-
-       return 0;
-}
-
-unsigned long copy_ckvsx_to_user(void __user *to,
-                                 struct task_struct *task)
-{
-       u64 buf[ELF_NVSRHALFREG];
-       int i;
-
-       /* save FPR copy to local buffer then write to the thread_struct */
-       for (i = 0; i < ELF_NVSRHALFREG; i++)
-               buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
-       return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
-}
-
-unsigned long copy_ckvsx_from_user(struct task_struct *task,
-                                         void __user *from)
-{
-       u64 buf[ELF_NVSRHALFREG];
-       int i;
-
-       if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
-               return 1;
-       for (i = 0; i < ELF_NVSRHALFREG ; i++)
-               task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
-       return 0;
-}
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-#else
-inline unsigned long copy_fpr_to_user(void __user *to,
-                                     struct task_struct *task)
-{
-       return __copy_to_user(to, task->thread.fp_state.fpr,
-                             ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_fpr_from_user(struct task_struct *task,
-                                       void __user *from)
-{
-       return __copy_from_user(task->thread.fp_state.fpr, from,
-                             ELF_NFPREG * sizeof(double));
-}
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-inline unsigned long copy_ckfpr_to_user(void __user *to,
-                                        struct task_struct *task)
-{
-       return __copy_to_user(to, task->thread.ckfp_state.fpr,
-                             ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
-                                                void __user *from)
-{
-       return __copy_from_user(task->thread.ckfp_state.fpr, from,
-                               ELF_NFPREG * sizeof(double));
-}
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-#endif
-
 /*
  * Save the current user registers on the user stack.
  * We only save the altivec/spe registers if the process has used
index cf06eb4..c74295a 100644 (file)
@@ -22,7 +22,6 @@ notrace long system_call_exception(long r3, long r4, long r5,
                                   long r6, long r7, long r8,
                                   unsigned long r0, struct pt_regs *regs)
 {
-       unsigned long ti_flags;
        syscall_fn f;
 
        if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
@@ -60,8 +59,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
        local_irq_enable();
 
-       ti_flags = current_thread_info()->flags;
-       if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
+       if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
                /*
                 * We use the return value of do_syscall_trace_enter() as the
                 * syscall number. If the syscall was rejected for any reason
@@ -86,7 +84,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
        /* May be faster to do array_index_nospec? */
        barrier_nospec();
 
-       if (unlikely(ti_flags & _TIF_32BIT)) {
+       if (unlikely(is_32bit_task())) {
                f = (void *)compat_sys_call_table[r0];
 
                r3 &= 0x00000000ffffffffULL;
index c0a9a32..02d6751 100644 (file)
@@ -32,6 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
-       printf "\n"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index bda9cb4..6fcae43 100644 (file)
@@ -50,7 +50,7 @@
 #include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/irq_work.h>
-#include <linux/clk-provider.h>
+#include <linux/of_clk.h>
 #include <linux/suspend.h>
 #include <linux/sched/cputime.h>
 #include <linux/processor.h>
@@ -522,35 +522,6 @@ static inline void clear_irq_work_pending(void)
                "i" (offsetof(struct paca_struct, irq_work_pending)));
 }
 
-void arch_irq_work_raise(void)
-{
-       preempt_disable();
-       set_irq_work_pending_flag();
-       /*
-        * Non-nmi code running with interrupts disabled will replay
-        * irq_happened before it re-enables interrupts, so setthe
-        * decrementer there instead of causing a hardware exception
-        * which would immediately hit the masked interrupt handler
-        * and have the net effect of setting the decrementer in
-        * irq_happened.
-        *
-        * NMI interrupts can not check this when they return, so the
-        * decrementer hardware exception is raised, which will fire
-        * when interrupts are next enabled.
-        *
-        * BookE does not support this yet, it must audit all NMI
-        * interrupt handlers to ensure they call nmi_enter() so this
-        * check would be correct.
-        */
-       if (IS_ENABLED(CONFIG_BOOKE) || !irqs_disabled() || in_nmi()) {
-               set_dec(1);
-       } else {
-               hard_irq_disable();
-               local_paca->irq_happened |= PACA_IRQ_DEC;
-       }
-       preempt_enable();
-}
-
 #else /* 32-bit */
 
 DEFINE_PER_CPU(u8, irq_work_pending);
@@ -559,16 +530,27 @@ DEFINE_PER_CPU(u8, irq_work_pending);
 #define test_irq_work_pending()                __this_cpu_read(irq_work_pending)
 #define clear_irq_work_pending()       __this_cpu_write(irq_work_pending, 0)
 
+#endif /* 32 vs 64 bit */
+
 void arch_irq_work_raise(void)
 {
+       /*
+        * 64-bit code that uses irq soft-mask can just cause an immediate
+        * interrupt here that gets soft masked, if this is called under
+        * local_irq_disable(). It might be possible to prevent that happening
+        * by noticing interrupts are disabled and setting decrementer pending
+        * to be replayed when irqs are enabled. The problem there is that
+        * tracing can call irq_work_raise, including in code that does low
+        * level manipulations of irq soft-mask state (e.g., trace_hardirqs_on)
+        * which could get tangled up if we're messing with the same state
+        * here.
+        */
        preempt_disable();
        set_irq_work_pending_flag();
        set_dec(1);
        preempt_enable();
 }
 
-#endif /* 32 vs 64 bit */
-
 #else  /* CONFIG_IRQ_WORK */
 
 #define test_irq_work_pending()        0
@@ -1149,9 +1131,7 @@ void __init time_init(void)
        init_decrementer_clockevent();
        tick_setup_hrtimer_broadcast();
 
-#ifdef CONFIG_COMMON_CLK
        of_clk_init(NULL);
-#endif
 }
 
 /*
index d3b77c1..f38f26e 100644 (file)
@@ -651,7 +651,8 @@ static void __init vdso_setup_syscall_map(void)
                if (sys_call_table[i] != sys_ni_syscall)
                        vdso_data->syscall_map_64[i >> 5] |=
                                0x80000000UL >> (i & 0x1f);
-               if (compat_sys_call_table[i] != sys_ni_syscall)
+               if (IS_ENABLED(CONFIG_COMPAT) &&
+                   compat_sys_call_table[i] != sys_ni_syscall)
                        vdso_data->syscall_map_32[i >> 5] |=
                                0x80000000UL >> (i & 0x1f);
 #else /* CONFIG_PPC64 */
index 711fca9..12885ed 100644 (file)
@@ -204,6 +204,4 @@ config KVM_XIVE
        default y
        depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE
 
-source "drivers/vhost/Kconfig"
-
 endif # VIRTUALIZATION
index 425d138..df9989c 100644 (file)
@@ -422,7 +422,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
                                break;
                        }
                } else if (vma && hva >= vma->vm_start &&
-                          (vma->vm_flags & VM_HUGETLB)) {
+                          is_vm_hugetlb_page(vma)) {
                        unsigned long psize = vma_kernel_pagesize(vma);
 
                        tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
index d15f0f0..84af6c8 100644 (file)
@@ -314,7 +314,7 @@ static bool access_error(bool is_write, bool is_exec,
                return false;
        }
 
-       if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+       if (unlikely(!vma_is_accessible(vma)))
                return true;
        /*
         * We should ideally do the vma pkey access check here. But in the
index fc66964..b1a0aeb 100644 (file)
@@ -2,6 +2,7 @@
 
 #include <linux/io.h>
 #include <linux/slab.h>
+#include <linux/mmzone.h>
 #include <linux/vmalloc.h>
 #include <asm/io-workarounds.h>
 
@@ -97,3 +98,23 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
 
        return NULL;
 }
+
+#ifdef CONFIG_ZONE_DEVICE
+/*
+ * Override the generic version in mm/memremap.c.
+ *
+ * With hash translation, the direct-map range is mapped with just one
+ * page size selected by htab_init_page_sizes(). Consult
+ * mmu_psize_defs[] to determine the minimum page size alignment.
+*/
+unsigned long memremap_compat_align(void)
+{
+       unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
+
+       if (radix_enabled())
+               return SUBSECTION_SIZE;
+       return max(SUBSECTION_SIZE, 1UL << shift);
+
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
index c155dcb..53d614e 100644 (file)
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-$(CONFIG_PERF_EVENTS)      += callchain.o perf_regs.o
+obj-$(CONFIG_PERF_EVENTS)      += callchain.o callchain_$(BITS).o perf_regs.o
+ifdef CONFIG_COMPAT
+obj-$(CONFIG_PERF_EVENTS)      += callchain_32.o
+endif
 
 obj-$(CONFIG_PPC_PERF_CTRS)    += core-book3s.o bhrb.o
 obj64-$(CONFIG_PPC_PERF_CTRS)  += ppc970-pmu.o power5-pmu.o \
index cbc2519..dd50510 100644 (file)
 #include <asm/sigcontext.h>
 #include <asm/ucontext.h>
 #include <asm/vdso.h>
-#ifdef CONFIG_PPC64
-#include "../kernel/ppc32.h"
-#endif
 #include <asm/pte-walk.h>
 
+#include "callchain.h"
 
 /*
  * Is sp valid as the address of the next kernel stack frame after prev_sp?
@@ -102,358 +100,6 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
        }
 }
 
-#ifdef CONFIG_PPC64
-/*
- * On 64-bit we don't want to invoke hash_page on user addresses from
- * interrupt context, so if the access faults, we read the page tables
- * to find which page (if any) is mapped and access it directly.
- */
-static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
-{
-       int ret = -EFAULT;
-       pgd_t *pgdir;
-       pte_t *ptep, pte;
-       unsigned shift;
-       unsigned long addr = (unsigned long) ptr;
-       unsigned long offset;
-       unsigned long pfn, flags;
-       void *kaddr;
-
-       pgdir = current->mm->pgd;
-       if (!pgdir)
-               return -EFAULT;
-
-       local_irq_save(flags);
-       ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
-       if (!ptep)
-               goto err_out;
-       if (!shift)
-               shift = PAGE_SHIFT;
-
-       /* align address to page boundary */
-       offset = addr & ((1UL << shift) - 1);
-
-       pte = READ_ONCE(*ptep);
-       if (!pte_present(pte) || !pte_user(pte))
-               goto err_out;
-       pfn = pte_pfn(pte);
-       if (!page_is_ram(pfn))
-               goto err_out;
-
-       /* no highmem to worry about here */
-       kaddr = pfn_to_kaddr(pfn);
-       memcpy(buf, kaddr + offset, nb);
-       ret = 0;
-err_out:
-       local_irq_restore(flags);
-       return ret;
-}
-
-static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
-{
-       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
-           ((unsigned long)ptr & 7))
-               return -EFAULT;
-
-       if (!probe_user_read(ret, ptr, sizeof(*ret)))
-               return 0;
-
-       return read_user_stack_slow(ptr, ret, 8);
-}
-
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-{
-       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
-           ((unsigned long)ptr & 3))
-               return -EFAULT;
-
-       if (!probe_user_read(ret, ptr, sizeof(*ret)))
-               return 0;
-
-       return read_user_stack_slow(ptr, ret, 4);
-}
-
-static inline int valid_user_sp(unsigned long sp, int is_64)
-{
-       if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
-               return 0;
-       return 1;
-}
-
-/*
- * 64-bit user processes use the same stack frame for RT and non-RT signals.
- */
-struct signal_frame_64 {
-       char            dummy[__SIGNAL_FRAMESIZE];
-       struct ucontext uc;
-       unsigned long   unused[2];
-       unsigned int    tramp[6];
-       struct siginfo  *pinfo;
-       void            *puc;
-       struct siginfo  info;
-       char            abigap[288];
-};
-
-static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
-{
-       if (nip == fp + offsetof(struct signal_frame_64, tramp))
-               return 1;
-       if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
-           nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
-               return 1;
-       return 0;
-}
-
-/*
- * Do some sanity checking on the signal frame pointed to by sp.
- * We check the pinfo and puc pointers in the frame.
- */
-static int sane_signal_64_frame(unsigned long sp)
-{
-       struct signal_frame_64 __user *sf;
-       unsigned long pinfo, puc;
-
-       sf = (struct signal_frame_64 __user *) sp;
-       if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
-           read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
-               return 0;
-       return pinfo == (unsigned long) &sf->info &&
-               puc == (unsigned long) &sf->uc;
-}
-
-static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
-                                  struct pt_regs *regs)
-{
-       unsigned long sp, next_sp;
-       unsigned long next_ip;
-       unsigned long lr;
-       long level = 0;
-       struct signal_frame_64 __user *sigframe;
-       unsigned long __user *fp, *uregs;
-
-       next_ip = perf_instruction_pointer(regs);
-       lr = regs->link;
-       sp = regs->gpr[1];
-       perf_callchain_store(entry, next_ip);
-
-       while (entry->nr < entry->max_stack) {
-               fp = (unsigned long __user *) sp;
-               if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
-                       return;
-               if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
-                       return;
-
-               /*
-                * Note: the next_sp - sp >= signal frame size check
-                * is true when next_sp < sp, which can happen when
-                * transitioning from an alternate signal stack to the
-                * normal stack.
-                */
-               if (next_sp - sp >= sizeof(struct signal_frame_64) &&
-                   (is_sigreturn_64_address(next_ip, sp) ||
-                    (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
-                   sane_signal_64_frame(sp)) {
-                       /*
-                        * This looks like an signal frame
-                        */
-                       sigframe = (struct signal_frame_64 __user *) sp;
-                       uregs = sigframe->uc.uc_mcontext.gp_regs;
-                       if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
-                           read_user_stack_64(&uregs[PT_LNK], &lr) ||
-                           read_user_stack_64(&uregs[PT_R1], &sp))
-                               return;
-                       level = 0;
-                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
-                       perf_callchain_store(entry, next_ip);
-                       continue;
-               }
-
-               if (level == 0)
-                       next_ip = lr;
-               perf_callchain_store(entry, next_ip);
-               ++level;
-               sp = next_sp;
-       }
-}
-
-#else  /* CONFIG_PPC64 */
-/*
- * On 32-bit we just access the address and let hash_page create a
- * HPTE if necessary, so there is no need to fall back to reading
- * the page tables.  Since this is called at interrupt level,
- * do_page_fault() won't treat a DSI as a page fault.
- */
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-{
-       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
-           ((unsigned long)ptr & 3))
-               return -EFAULT;
-
-       return probe_user_read(ret, ptr, sizeof(*ret));
-}
-
-static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
-                                         struct pt_regs *regs)
-{
-}
-
-static inline int valid_user_sp(unsigned long sp, int is_64)
-{
-       if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
-               return 0;
-       return 1;
-}
-
-#define __SIGNAL_FRAMESIZE32   __SIGNAL_FRAMESIZE
-#define sigcontext32           sigcontext
-#define mcontext32             mcontext
-#define ucontext32             ucontext
-#define compat_siginfo_t       struct siginfo
-
-#endif /* CONFIG_PPC64 */
-
-/*
- * Layout for non-RT signal frames
- */
-struct signal_frame_32 {
-       char                    dummy[__SIGNAL_FRAMESIZE32];
-       struct sigcontext32     sctx;
-       struct mcontext32       mctx;
-       int                     abigap[56];
-};
-
-/*
- * Layout for RT signal frames
- */
-struct rt_signal_frame_32 {
-       char                    dummy[__SIGNAL_FRAMESIZE32 + 16];
-       compat_siginfo_t        info;
-       struct ucontext32       uc;
-       int                     abigap[56];
-};
-
-static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
-{
-       if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
-               return 1;
-       if (vdso32_sigtramp && current->mm->context.vdso_base &&
-           nip == current->mm->context.vdso_base + vdso32_sigtramp)
-               return 1;
-       return 0;
-}
-
-static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
-{
-       if (nip == fp + offsetof(struct rt_signal_frame_32,
-                                uc.uc_mcontext.mc_pad))
-               return 1;
-       if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
-           nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
-               return 1;
-       return 0;
-}
-
-static int sane_signal_32_frame(unsigned int sp)
-{
-       struct signal_frame_32 __user *sf;
-       unsigned int regs;
-
-       sf = (struct signal_frame_32 __user *) (unsigned long) sp;
-       if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
-               return 0;
-       return regs == (unsigned long) &sf->mctx;
-}
-
-static int sane_rt_signal_32_frame(unsigned int sp)
-{
-       struct rt_signal_frame_32 __user *sf;
-       unsigned int regs;
-
-       sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
-       if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
-               return 0;
-       return regs == (unsigned long) &sf->uc.uc_mcontext;
-}
-
-static unsigned int __user *signal_frame_32_regs(unsigned int sp,
-                               unsigned int next_sp, unsigned int next_ip)
-{
-       struct mcontext32 __user *mctx = NULL;
-       struct signal_frame_32 __user *sf;
-       struct rt_signal_frame_32 __user *rt_sf;
-
-       /*
-        * Note: the next_sp - sp >= signal frame size check
-        * is true when next_sp < sp, for example, when
-        * transitioning from an alternate signal stack to the
-        * normal stack.
-        */
-       if (next_sp - sp >= sizeof(struct signal_frame_32) &&
-           is_sigreturn_32_address(next_ip, sp) &&
-           sane_signal_32_frame(sp)) {
-               sf = (struct signal_frame_32 __user *) (unsigned long) sp;
-               mctx = &sf->mctx;
-       }
-
-       if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
-           is_rt_sigreturn_32_address(next_ip, sp) &&
-           sane_rt_signal_32_frame(sp)) {
-               rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
-               mctx = &rt_sf->uc.uc_mcontext;
-       }
-
-       if (!mctx)
-               return NULL;
-       return mctx->mc_gregs;
-}
-
-static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
-                                  struct pt_regs *regs)
-{
-       unsigned int sp, next_sp;
-       unsigned int next_ip;
-       unsigned int lr;
-       long level = 0;
-       unsigned int __user *fp, *uregs;
-
-       next_ip = perf_instruction_pointer(regs);
-       lr = regs->link;
-       sp = regs->gpr[1];
-       perf_callchain_store(entry, next_ip);
-
-       while (entry->nr < entry->max_stack) {
-               fp = (unsigned int __user *) (unsigned long) sp;
-               if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
-                       return;
-               if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
-                       return;
-
-               uregs = signal_frame_32_regs(sp, next_sp, next_ip);
-               if (!uregs && level <= 1)
-                       uregs = signal_frame_32_regs(sp, next_sp, lr);
-               if (uregs) {
-                       /*
-                        * This looks like an signal frame, so restart
-                        * the stack trace with the values in it.
-                        */
-                       if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
-                           read_user_stack_32(&uregs[PT_LNK], &lr) ||
-                           read_user_stack_32(&uregs[PT_R1], &sp))
-                               return;
-                       level = 0;
-                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
-                       perf_callchain_store(entry, next_ip);
-                       continue;
-               }
-
-               if (level == 0)
-                       next_ip = lr;
-               perf_callchain_store(entry, next_ip);
-               ++level;
-               sp = next_sp;
-       }
-}
-
 void
 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
diff --git a/arch/powerpc/perf/callchain.h b/arch/powerpc/perf/callchain.h
new file mode 100644 (file)
index 0000000..7a2cb9e
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _POWERPC_PERF_CALLCHAIN_H
+#define _POWERPC_PERF_CALLCHAIN_H
+
+int read_user_stack_slow(void __user *ptr, void *buf, int nb);
+void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
+                           struct pt_regs *regs);
+void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
+                           struct pt_regs *regs);
+
+static inline bool invalid_user_sp(unsigned long sp)
+{
+       unsigned long mask = is_32bit_task() ? 3 : 7;
+       unsigned long top = STACK_TOP - (is_32bit_task() ? 16 : 32);
+
+       return (!sp || (sp & mask) || (sp > top));
+}
+
+#endif /* _POWERPC_PERF_CALLCHAIN_H */
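
A quick sanity check of the new invalid_user_sp() logic as a user-space harness; STACK_TOP and the is_32bit_task() stub below are assumed stand-ins, not the kernel's definitions.

#include <stdbool.h>
#include <stdio.h>

#define STACK_TOP 0x0000400000000000UL			/* assumed value */
static bool is_32bit_task(void) { return false; }	/* assume a 64-bit task */

static bool invalid_user_sp(unsigned long sp)
{
	unsigned long mask = is_32bit_task() ? 3 : 7;
	unsigned long top = STACK_TOP - (is_32bit_task() ? 16 : 32);

	return (!sp || (sp & mask) || (sp > top));
}

int main(void)
{
	printf("%d %d %d %d\n",
	       invalid_user_sp(0),		/* NULL sp: invalid */
	       invalid_user_sp(0x7ffc),		/* misaligned for 64-bit: invalid */
	       invalid_user_sp(STACK_TOP),	/* above the limit: invalid */
	       invalid_user_sp(0x7ff0));	/* aligned and in range: valid */
	return 0;	/* prints: 1 1 1 0 */
}
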
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
new file mode 100644 (file)
index 0000000..8aa9510
--- /dev/null
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#include <asm/pte-walk.h>
+
+#include "callchain.h"
+
+#ifdef CONFIG_PPC64
+#include "../kernel/ppc32.h"
+#else  /* CONFIG_PPC64 */
+
+#define __SIGNAL_FRAMESIZE32   __SIGNAL_FRAMESIZE
+#define sigcontext32           sigcontext
+#define mcontext32             mcontext
+#define ucontext32             ucontext
+#define compat_siginfo_t       struct siginfo
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables.  Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+       int rc;
+
+       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+           ((unsigned long)ptr & 3))
+               return -EFAULT;
+
+       rc = probe_user_read(ret, ptr, sizeof(*ret));
+
+       if (IS_ENABLED(CONFIG_PPC64) && rc)
+               return read_user_stack_slow(ptr, ret, 4);
+
+       return rc;
+}
+
+/*
+ * Layout for non-RT signal frames
+ */
+struct signal_frame_32 {
+       char                    dummy[__SIGNAL_FRAMESIZE32];
+       struct sigcontext32     sctx;
+       struct mcontext32       mctx;
+       int                     abigap[56];
+};
+
+/*
+ * Layout for RT signal frames
+ */
+struct rt_signal_frame_32 {
+       char                    dummy[__SIGNAL_FRAMESIZE32 + 16];
+       compat_siginfo_t        info;
+       struct ucontext32       uc;
+       int                     abigap[56];
+};
+
+static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+       if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
+               return 1;
+       if (vdso32_sigtramp && current->mm->context.vdso_base &&
+           nip == current->mm->context.vdso_base + vdso32_sigtramp)
+               return 1;
+       return 0;
+}
+
+static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+       if (nip == fp + offsetof(struct rt_signal_frame_32,
+                                uc.uc_mcontext.mc_pad))
+               return 1;
+       if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
+           nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
+               return 1;
+       return 0;
+}
+
+static int sane_signal_32_frame(unsigned int sp)
+{
+       struct signal_frame_32 __user *sf;
+       unsigned int regs;
+
+       sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+       if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
+               return 0;
+       return regs == (unsigned long) &sf->mctx;
+}
+
+static int sane_rt_signal_32_frame(unsigned int sp)
+{
+       struct rt_signal_frame_32 __user *sf;
+       unsigned int regs;
+
+       sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+       if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
+               return 0;
+       return regs == (unsigned long) &sf->uc.uc_mcontext;
+}
+
+static unsigned int __user *signal_frame_32_regs(unsigned int sp,
+                               unsigned int next_sp, unsigned int next_ip)
+{
+       struct mcontext32 __user *mctx = NULL;
+       struct signal_frame_32 __user *sf;
+       struct rt_signal_frame_32 __user *rt_sf;
+
+       /*
+        * Note: the next_sp - sp >= signal frame size check
+        * is true when next_sp < sp, for example, when
+        * transitioning from an alternate signal stack to the
+        * normal stack.
+        */
+       if (next_sp - sp >= sizeof(struct signal_frame_32) &&
+           is_sigreturn_32_address(next_ip, sp) &&
+           sane_signal_32_frame(sp)) {
+               sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+               mctx = &sf->mctx;
+       }
+
+       if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
+           is_rt_sigreturn_32_address(next_ip, sp) &&
+           sane_rt_signal_32_frame(sp)) {
+               rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+               mctx = &rt_sf->uc.uc_mcontext;
+       }
+
+       if (!mctx)
+               return NULL;
+       return mctx->mc_gregs;
+}
+
+void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
+                           struct pt_regs *regs)
+{
+       unsigned int sp, next_sp;
+       unsigned int next_ip;
+       unsigned int lr;
+       long level = 0;
+       unsigned int __user *fp, *uregs;
+
+       next_ip = perf_instruction_pointer(regs);
+       lr = regs->link;
+       sp = regs->gpr[1];
+       perf_callchain_store(entry, next_ip);
+
+       while (entry->nr < entry->max_stack) {
+               fp = (unsigned int __user *) (unsigned long) sp;
+               if (invalid_user_sp(sp) || read_user_stack_32(fp, &next_sp))
+                       return;
+               if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
+                       return;
+
+               uregs = signal_frame_32_regs(sp, next_sp, next_ip);
+               if (!uregs && level <= 1)
+                       uregs = signal_frame_32_regs(sp, next_sp, lr);
+               if (uregs) {
+                       /*
+                        * This looks like a signal frame, so restart
+                        * the stack trace with the values in it.
+                        */
+                       if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
+                           read_user_stack_32(&uregs[PT_LNK], &lr) ||
+                           read_user_stack_32(&uregs[PT_R1], &sp))
+                               return;
+                       level = 0;
+                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store(entry, next_ip);
+                       continue;
+               }
+
+               if (level == 0)
+                       next_ip = lr;
+               perf_callchain_store(entry, next_ip);
+               ++level;
+               sp = next_sp;
+       }
+}
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
new file mode 100644 (file)
index 0000000..df1ffd8
--- /dev/null
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#include <asm/pte-walk.h>
+
+#include "callchain.h"
+
+/*
+ * On 64-bit we don't want to invoke hash_page on user addresses from
+ * interrupt context, so if the access faults, we read the page tables
+ * to find which page (if any) is mapped and access it directly.
+ */
+int read_user_stack_slow(void __user *ptr, void *buf, int nb)
+{
+       int ret = -EFAULT;
+       pgd_t *pgdir;
+       pte_t *ptep, pte;
+       unsigned int shift;
+       unsigned long addr = (unsigned long) ptr;
+       unsigned long offset;
+       unsigned long pfn, flags;
+       void *kaddr;
+
+       pgdir = current->mm->pgd;
+       if (!pgdir)
+               return -EFAULT;
+
+       local_irq_save(flags);
+       ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
+       if (!ptep)
+               goto err_out;
+       if (!shift)
+               shift = PAGE_SHIFT;
+
+       /* align address to page boundary */
+       offset = addr & ((1UL << shift) - 1);
+
+       pte = READ_ONCE(*ptep);
+       if (!pte_present(pte) || !pte_user(pte))
+               goto err_out;
+       pfn = pte_pfn(pte);
+       if (!page_is_ram(pfn))
+               goto err_out;
+
+       /* no highmem to worry about here */
+       kaddr = pfn_to_kaddr(pfn);
+       memcpy(buf, kaddr + offset, nb);
+       ret = 0;
+err_out:
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+{
+       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
+           ((unsigned long)ptr & 7))
+               return -EFAULT;
+
+       if (!probe_user_read(ret, ptr, sizeof(*ret)))
+               return 0;
+
+       return read_user_stack_slow(ptr, ret, 8);
+}
+
+/*
+ * 64-bit user processes use the same stack frame for RT and non-RT signals.
+ */
+struct signal_frame_64 {
+       char            dummy[__SIGNAL_FRAMESIZE];
+       struct ucontext uc;
+       unsigned long   unused[2];
+       unsigned int    tramp[6];
+       struct siginfo  *pinfo;
+       void            *puc;
+       struct siginfo  info;
+       char            abigap[288];
+};
+
+static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
+{
+       if (nip == fp + offsetof(struct signal_frame_64, tramp))
+               return 1;
+       if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
+           nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
+               return 1;
+       return 0;
+}
+
+/*
+ * Do some sanity checking on the signal frame pointed to by sp.
+ * We check the pinfo and puc pointers in the frame.
+ */
+static int sane_signal_64_frame(unsigned long sp)
+{
+       struct signal_frame_64 __user *sf;
+       unsigned long pinfo, puc;
+
+       sf = (struct signal_frame_64 __user *) sp;
+       if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
+           read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
+               return 0;
+       return pinfo == (unsigned long) &sf->info &&
+               puc == (unsigned long) &sf->uc;
+}
+
+void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
+                           struct pt_regs *regs)
+{
+       unsigned long sp, next_sp;
+       unsigned long next_ip;
+       unsigned long lr;
+       long level = 0;
+       struct signal_frame_64 __user *sigframe;
+       unsigned long __user *fp, *uregs;
+
+       next_ip = perf_instruction_pointer(regs);
+       lr = regs->link;
+       sp = regs->gpr[1];
+       perf_callchain_store(entry, next_ip);
+
+       while (entry->nr < entry->max_stack) {
+               fp = (unsigned long __user *) sp;
+               if (invalid_user_sp(sp) || read_user_stack_64(fp, &next_sp))
+                       return;
+               if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
+                       return;
+
+               /*
+                * Note: the next_sp - sp >= signal frame size check
+                * is true when next_sp < sp, which can happen when
+                * transitioning from an alternate signal stack to the
+                * normal stack.
+                */
+               if (next_sp - sp >= sizeof(struct signal_frame_64) &&
+                   (is_sigreturn_64_address(next_ip, sp) ||
+                    (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
+                   sane_signal_64_frame(sp)) {
+                       /*
+                        * This looks like a signal frame
+                        */
+                       sigframe = (struct signal_frame_64 __user *) sp;
+                       uregs = sigframe->uc.uc_mcontext.gp_regs;
+                       if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
+                           read_user_stack_64(&uregs[PT_LNK], &lr) ||
+                           read_user_stack_64(&uregs[PT_R1], &sp))
+                               return;
+                       level = 0;
+                       perf_callchain_store_context(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store(entry, next_ip);
+                       continue;
+               }
+
+               if (level == 0)
+                       next_ip = lr;
+               perf_callchain_store(entry, next_ip);
+               ++level;
+               sp = next_sp;
+       }
+}
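
Both new files walk the powerpc user back chain: word 0 of a frame points at the caller's frame and the saved return address sits at a fixed offset (fp[2] on 64-bit, fp[1] on 32-bit). A rough, hypothetical user-space model of that walk, with made-up frames and none of the signal-frame handling:

#include <stdio.h>

/* Heavily simplified stand-in for a ppc64 stack frame. */
struct frame {
	unsigned long back_chain;	/* fp[0]: caller's stack pointer */
	unsigned long cr_save;
	unsigned long lr_save;		/* fp[2]: saved return address */
};

int main(void)
{
	struct frame top  = { 0, 0, 0 };
	struct frame mid  = { (unsigned long)&top, 0, 0x2000 };
	struct frame leaf = { (unsigned long)&mid, 0, 0x1000 };

	for (struct frame *fp = &leaf; fp->back_chain;
	     fp = (struct frame *)fp->back_chain)
		printf("ip = 0x%lx\n", fp->lr_save);	/* 0x1000 then 0x2000 */

	return 0;
}
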
index cb50a9e..eb82dda 100644 (file)
@@ -44,6 +44,16 @@ static DEFINE_PER_CPU(u64 *, trace_imc_mem);
 static struct imc_pmu_ref *trace_imc_refc;
 static int trace_imc_mem_size;
 
+/*
+ * Global data structure used to avoid races between thread,
+ * core and trace-imc
+ */
+static struct imc_pmu_ref imc_global_refc = {
+       .lock = __MUTEX_INITIALIZER(imc_global_refc.lock),
+       .id = 0,
+       .refc = 0,
+};
+
 static struct imc_pmu *imc_event_to_pmu(struct perf_event *event)
 {
        return container_of(event->pmu, struct imc_pmu, pmu);
@@ -698,6 +708,16 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
                        return -EINVAL;
 
                ref->refc = 0;
+               /*
+                * Reduce the global reference count, if this is the
+                * last cpu in this core and a core-imc event is running
+                * on this cpu.
+                */
+               mutex_lock(&imc_global_refc.lock);
+               if (imc_global_refc.id == IMC_DOMAIN_CORE)
+                       imc_global_refc.refc--;
+
+               mutex_unlock(&imc_global_refc.lock);
        }
        return 0;
 }
@@ -710,6 +730,23 @@ static int core_imc_pmu_cpumask_init(void)
                                 ppc_core_imc_cpu_offline);
 }
 
+static void reset_global_refc(struct perf_event *event)
+{
+       mutex_lock(&imc_global_refc.lock);
+       imc_global_refc.refc--;
+
+       /*
+        * If no other thread is running any
+        * event for this domain (thread/core/trace),
+        * set the global id to zero.
+        */
+       if (imc_global_refc.refc <= 0) {
+               imc_global_refc.refc = 0;
+               imc_global_refc.id = 0;
+       }
+       mutex_unlock(&imc_global_refc.lock);
+}
+
 static void core_imc_counters_release(struct perf_event *event)
 {
        int rc, core_id;
@@ -759,6 +796,8 @@ static void core_imc_counters_release(struct perf_event *event)
                ref->refc = 0;
        }
        mutex_unlock(&ref->lock);
+
+       reset_global_refc(event);
 }
 
 static int core_imc_event_init(struct perf_event *event)
@@ -819,6 +858,29 @@ static int core_imc_event_init(struct perf_event *event)
        ++ref->refc;
        mutex_unlock(&ref->lock);
 
+       /*
+        * Since the system can run IMC in either accumulation or trace
+        * mode at a time, core-imc events are allowed only if no other
+        * trace/thread imc events are enabled/monitored.
+        *
+        * Take the global lock, and check the refc.id
+        * to know whether any other trace/thread imc
+        * events are running.
+        */
+       mutex_lock(&imc_global_refc.lock);
+       if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) {
+               /*
+                * No other trace/thread imc events are running in
+                * the system, so set the refc.id to core-imc.
+                */
+               imc_global_refc.id = IMC_DOMAIN_CORE;
+               imc_global_refc.refc++;
+       } else {
+               mutex_unlock(&imc_global_refc.lock);
+               return -EBUSY;
+       }
+       mutex_unlock(&imc_global_refc.lock);
+
        event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK);
        event->destroy = core_imc_counters_release;
        return 0;
@@ -877,7 +939,23 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
 
 static int ppc_thread_imc_cpu_offline(unsigned int cpu)
 {
-       mtspr(SPRN_LDBAR, 0);
+       /*
+        * Set the bit 0 of LDBAR to zero.
+        *
+        * If bit 0 of LDBAR is unset, it will stop posting
+        * the counter data to memory.
+        * For thread-imc, bit 0 of LDBAR will be set to 1 in the
+        * event_add function. So reset this bit here, to stop the updates
+        * to memory in the cpu_offline path.
+        */
+       mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
+
+       /* Reduce the refc if a thread-imc event is running on this cpu */
+       mutex_lock(&imc_global_refc.lock);
+       if (imc_global_refc.id == IMC_DOMAIN_THREAD)
+               imc_global_refc.refc--;
+       mutex_unlock(&imc_global_refc.lock);
+
        return 0;
 }
 
@@ -916,7 +994,22 @@ static int thread_imc_event_init(struct perf_event *event)
        if (!target)
                return -EINVAL;
 
+       mutex_lock(&imc_global_refc.lock);
+       /*
+        * Check if any other trace/core imc events are running in the
+        * system; if not, set the global id to thread-imc.
+        */
+       if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_THREAD) {
+               imc_global_refc.id = IMC_DOMAIN_THREAD;
+               imc_global_refc.refc++;
+       } else {
+               mutex_unlock(&imc_global_refc.lock);
+               return -EBUSY;
+       }
+       mutex_unlock(&imc_global_refc.lock);
+
        event->pmu->task_ctx_nr = perf_sw_context;
+       event->destroy = reset_global_refc;
        return 0;
 }
 
@@ -1063,10 +1156,12 @@ static void thread_imc_event_del(struct perf_event *event, int flags)
        int core_id;
        struct imc_pmu_ref *ref;
 
-       mtspr(SPRN_LDBAR, 0);
-
        core_id = smp_processor_id() / threads_per_core;
        ref = &core_imc_refc[core_id];
+       if (!ref) {
+               pr_debug("imc: Failed to get event reference count\n");
+               return;
+       }
 
        mutex_lock(&ref->lock);
        ref->refc--;
@@ -1082,6 +1177,10 @@ static void thread_imc_event_del(struct perf_event *event, int flags)
                ref->refc = 0;
        }
        mutex_unlock(&ref->lock);
+
+       /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */
+       mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
+
        /*
         * Take a snapshot and calculate the delta and update
         * the event counter values.
@@ -1133,7 +1232,18 @@ static int ppc_trace_imc_cpu_online(unsigned int cpu)
 
 static int ppc_trace_imc_cpu_offline(unsigned int cpu)
 {
-       mtspr(SPRN_LDBAR, 0);
+       /*
+        * No need to set bit 0 of LDBAR to zero, as
+        * it is set to zero for imc trace-mode.
+        *
+        * Reduce the refc if any trace-imc event is running
+        * on this cpu.
+        */
+       mutex_lock(&imc_global_refc.lock);
+       if (imc_global_refc.id == IMC_DOMAIN_TRACE)
+               imc_global_refc.refc--;
+       mutex_unlock(&imc_global_refc.lock);
+
        return 0;
 }
 
@@ -1226,15 +1336,14 @@ static int trace_imc_event_add(struct perf_event *event, int flags)
        local_mem = get_trace_imc_event_base_addr();
        ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
 
-       if (core_imc_refc)
-               ref = &core_imc_refc[core_id];
+       /* trace-imc reference count */
+       if (trace_imc_refc)
+               ref = &trace_imc_refc[core_id];
        if (!ref) {
-               /* If core-imc is not enabled, use trace-imc reference count */
-               if (trace_imc_refc)
-                       ref = &trace_imc_refc[core_id];
-               if (!ref)
-                       return -EINVAL;
+               pr_debug("imc: Failed to get the event reference count\n");
+               return -EINVAL;
        }
+
        mtspr(SPRN_LDBAR, ldbar_value);
        mutex_lock(&ref->lock);
        if (ref->refc == 0) {
@@ -1242,13 +1351,11 @@ static int trace_imc_event_add(struct perf_event *event, int flags)
                                get_hard_smp_processor_id(smp_processor_id()))) {
                        mutex_unlock(&ref->lock);
                        pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
-                       mtspr(SPRN_LDBAR, 0);
                        return -EINVAL;
                }
        }
        ++ref->refc;
        mutex_unlock(&ref->lock);
-
        return 0;
 }
 
@@ -1274,16 +1381,13 @@ static void trace_imc_event_del(struct perf_event *event, int flags)
        int core_id = smp_processor_id() / threads_per_core;
        struct imc_pmu_ref *ref = NULL;
 
-       if (core_imc_refc)
-               ref = &core_imc_refc[core_id];
+       if (trace_imc_refc)
+               ref = &trace_imc_refc[core_id];
        if (!ref) {
-               /* If core-imc is not enabled, use trace-imc reference count */
-               if (trace_imc_refc)
-                       ref = &trace_imc_refc[core_id];
-               if (!ref)
-                       return;
+               pr_debug("imc: Failed to get event reference count\n");
+               return;
        }
-       mtspr(SPRN_LDBAR, 0);
+
        mutex_lock(&ref->lock);
        ref->refc--;
        if (ref->refc == 0) {
@@ -1297,6 +1401,7 @@ static void trace_imc_event_del(struct perf_event *event, int flags)
                ref->refc = 0;
        }
        mutex_unlock(&ref->lock);
+
        trace_imc_event_stop(event, flags);
 }
 
@@ -1314,10 +1419,30 @@ static int trace_imc_event_init(struct perf_event *event)
        if (event->attr.sample_period == 0)
                return -ENOENT;
 
+       /*
+        * Take the global lock, and make sure
+        * no other thread is running any core/thread imc
+        * events
+        */
+       mutex_lock(&imc_global_refc.lock);
+       if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) {
+               /*
+                * No core/thread imc events are running in the
+                * system, so set the refc.id to trace-imc.
+                */
+               imc_global_refc.id = IMC_DOMAIN_TRACE;
+               imc_global_refc.refc++;
+       } else {
+               mutex_unlock(&imc_global_refc.lock);
+               return -EBUSY;
+       }
+       mutex_unlock(&imc_global_refc.lock);
+
        event->hw.idx = -1;
        target = event->hw.target;
 
        event->pmu->task_ctx_nr = perf_hw_context;
+       event->destroy = reset_global_refc;
        return 0;
 }
 
@@ -1429,10 +1554,10 @@ static void cleanup_all_core_imc_memory(void)
 static void thread_imc_ldbar_disable(void *dummy)
 {
        /*
-        * By Zeroing LDBAR, we disable thread-imc
-        * updates.
+        * By setting 0th bit of LDBAR to zero, we disable thread-imc
+        * updates to memory.
         */
-       mtspr(SPRN_LDBAR, 0);
+       mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63))));
 }
 
 void thread_imc_disable(void)
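
The imc-pmu hunks above all apply one pattern: event_init tries to claim the single global IMC domain under imc_global_refc.lock and returns -EBUSY if a different domain (core, thread or trace) already owns it, while the destroy and cpu-offline paths drop the reference and clear the id once it reaches zero. A standalone toy model of that claim/release protocol, using a pthread mutex and invented domain ids purely for illustration:

#include <pthread.h>
#include <stdio.h>

enum { DOMAIN_NONE, DOMAIN_CORE, DOMAIN_THREAD, DOMAIN_TRACE };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int cur_id = DOMAIN_NONE;
static int refc;

static int claim(int id)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (cur_id == DOMAIN_NONE || cur_id == id) {
		cur_id = id;		/* first user, or same domain again */
		refc++;
	} else {
		ret = -1;		/* another domain is active: -EBUSY in the kernel */
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

static void release(void)
{
	pthread_mutex_lock(&lock);
	if (--refc <= 0) {		/* last user gone: free the domain */
		refc = 0;
		cur_id = DOMAIN_NONE;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	printf("%d %d ", claim(DOMAIN_CORE), claim(DOMAIN_TRACE));	/* 0 -1 */
	release();
	printf("%d\n", claim(DOMAIN_TRACE));				/* 0 */
	return 0;
}
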
index d6d64f8..13b369d 100644 (file)
@@ -231,16 +231,10 @@ static int memtrace_online(void)
                        continue;
                }
 
-               /*
-                * If kernel isn't compiled with the auto online option
-                * we need to online the memory ourselves.
-                */
-               if (!memhp_auto_online) {
-                       lock_device_hotplug();
-                       walk_memory_blocks(ent->start, ent->size, NULL,
-                                          online_mem_block);
-                       unlock_device_hotplug();
-               }
+               lock_device_hotplug();
+               walk_memory_blocks(ent->start, ent->size, NULL,
+                                  online_mem_block);
+               unlock_device_hotplug();
 
                /*
                 * Memory was added successfully so clean up references to it
index 968b9a4..7824cc3 100644 (file)
@@ -268,14 +268,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
                        domain = IMC_DOMAIN_THREAD;
                        break;
                case IMC_TYPE_TRACE:
-                       /*
-                        * FIXME. Using trace_imc events to monitor application
-                        * or KVM thread performance can cause a checkstop
-                        * (system crash).
-                        * Disable it for now.
-                        */
-                       pr_info_once("IMC: disabling trace_imc PMU\n");
-                       domain = -1;
+                       domain = IMC_DOMAIN_TRACE;
                        break;
                default:
                        pr_warn("IMC Unknown Device type \n");
index cbddd63..e853037 100644 (file)
@@ -613,10 +613,8 @@ static int update_flash_db(void)
        /* Read in header and db from flash. */
 
        header = kmalloc(buf_len, GFP_KERNEL);
-       if (!header) {
-               pr_debug("%s: kmalloc failed\n", __func__);
+       if (!header)
                return -ENOMEM;
-       }
 
        count = os_area_flash_read(header, buf_len, 0);
        if (count < 0) {
index 2e0a8ea..6d47b4a 100644 (file)
@@ -945,6 +945,15 @@ static phys_addr_t ddw_memory_hotplug_max(void)
        phys_addr_t max_addr = memory_hotplug_max();
        struct device_node *memory;
 
+       /*
+        * The "ibm,pmemory" node can appear anywhere in the address space.
+        * Assuming it is still backed by page structs, set the upper limit
+        * for the huge DMA window as MAX_PHYSMEM_BITS.
+        */
+       if (of_find_node_by_type(NULL, "ibm,pmemory"))
+               return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
+                       (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
+
        for_each_node_by_type(memory, "memory") {
                unsigned long start, size;
                int n_mem_addr_cells, n_mem_size_cells, len;
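
In the new early return the window limit is all-ones only when phys_addr_t cannot represent 1ULL << MAX_PHYSMEM_BITS; otherwise it is the top of the range that page structs can describe. A worked example with an assumed MAX_PHYSMEM_BITS of 51 and a 64-bit phys_addr_t:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	typedef uint64_t phys_addr_t;			/* assumed 64-bit */
	const unsigned int max_physmem_bits = 51;	/* assumed value */

	phys_addr_t limit = (sizeof(phys_addr_t) * 8 <= max_physmem_bits) ?
			    (phys_addr_t)-1 : (1ULL << max_physmem_bits);

	/* 64 > 51, so the limit is 1ULL << 51 = 0x8000000000000. */
	printf("DDW upper limit = 0x%llx\n", (unsigned long long)limit);
	return 0;
}
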
index e460610..f355924 100644 (file)
@@ -286,25 +286,6 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
        return 0;
 }
 
-static inline int papr_scm_node(int node)
-{
-       int min_dist = INT_MAX, dist;
-       int nid, min_node;
-
-       if ((node == NUMA_NO_NODE) || node_online(node))
-               return node;
-
-       min_node = first_online_node;
-       for_each_online_node(nid) {
-               dist = node_distance(node, nid);
-               if (dist < min_dist) {
-                       min_dist = dist;
-                       min_node = nid;
-               }
-       }
-       return min_node;
-}
-
 static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 {
        struct device *dev = &p->pdev->dev;
@@ -329,7 +310,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
        }
 
        dimm_flags = 0;
-       set_bit(NDD_ALIASING, &dimm_flags);
+       set_bit(NDD_LABELING, &dimm_flags);
 
        p->nvdimm = nvdimm_create(p->bus, p, NULL, dimm_flags,
                                  PAPR_SCM_DIMM_CMD_MASK, 0, NULL);
@@ -350,7 +331,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 
        memset(&ndr_desc, 0, sizeof(ndr_desc));
        target_nid = dev_to_node(&p->pdev->dev);
-       online_nid = papr_scm_node(target_nid);
+       online_nid = numa_map_to_online_node(target_nid);
        ndr_desc.numa_node = online_nid;
        ndr_desc.target_node = target_nid;
        ndr_desc.res = &p->res;
@@ -362,8 +343,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 
        if (p->is_volatile)
                p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc);
-       else
+       else {
+               set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
                p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc);
+       }
        if (!p->region) {
                dev_err(dev, "Error registering region %pR from %pOF\n",
                                ndr_desc.res, p->dn);
index aa6208c..1d1da63 100644 (file)
@@ -686,6 +686,17 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 #endif
 
 out:
+       /*
+        * Enable translation as we will be accessing per-cpu variables
+        * in save_mce_event(), which may fall outside the RMO region. Also
+        * leave it enabled afterwards, because we will subsequently queue
+        * work to workqueues that again access per-cpu variables, and
+        * fwnmi_release_errinfo() crashes when called in real mode on
+        * pseries.
+        * Note: All the realmode handling like flushing SLB entries for
+        *       SLB multihit is done by now.
+        */
+       mtmsr(mfmsr() | MSR_IR | MSR_DR);
        save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
                        &mce_err, regs->nip, eaddr, paddr);
 
index 8672e77..a197258 100644 (file)
@@ -20,7 +20,6 @@ config RISCV
        select CLONE_BACKWARDS
        select COMMON_CLK
        select GENERIC_CLOCKEVENTS
-       select GENERIC_CPU_DEVICES
        select GENERIC_IRQ_SHOW
        select GENERIC_PCI_IOMAP
        select GENERIC_SCHED_CLOCK
@@ -29,6 +28,7 @@ config RISCV
        select GENERIC_SMP_IDLE_THREAD
        select GENERIC_ATOMIC64 if !64BIT
        select GENERIC_IOREMAP
+       select GENERIC_PTDUMP if MMU
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ASM_MODVERSIONS
@@ -58,6 +58,9 @@ config RISCV
        select HAVE_EBPF_JIT
        select EDAC_SUPPORT
        select ARCH_HAS_GIGANTIC_PAGE
+       select ARCH_HAS_SET_DIRECT_MAP
+       select ARCH_HAS_SET_MEMORY
+       select ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
        select SPARSEMEM_STATIC if 32BIT
        select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
@@ -129,6 +132,9 @@ config ARCH_SELECT_MEMORY_MODEL
 config ARCH_WANT_GENERAL_HUGETLB
        def_bool y
 
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       def_bool y
+
 config SYS_SUPPORTS_HUGETLBFS
        def_bool y
 
@@ -247,6 +253,17 @@ config NR_CPUS
        depends on SMP
        default "8"
 
+config HOTPLUG_CPU
+       bool "Support for hot-pluggable CPUs"
+       depends on SMP
+       select GENERIC_IRQ_MIGRATION
+       help
+
+         Say Y here to experiment with turning CPUs off and on.  CPUs
+         can be controlled through /sys/devices/system/cpu.
+
+         Say N if you want to disable CPU hotplug.
+
 choice
        prompt "CPU Tuning"
        default TUNE_GENERIC
@@ -307,6 +324,13 @@ config SECCOMP
          and the task is only allowed to execute a few safe syscalls
          defined by each seccomp mode.
 
+config RISCV_SBI_V01
+       bool "SBI v0.1 support"
+       default y
+       depends on RISCV_SBI
+       help
+         This config allows the kernel to use SBI v0.1 APIs. This will be
+         deprecated in the future once legacy M-mode software is no longer in use.
 endmenu
 
 menu "Boot options"
index a131174..216286d 100644 (file)
@@ -20,4 +20,14 @@ config SOC_VIRT
        help
          This enables support for QEMU Virt Machine.
 
+config SOC_KENDRYTE
+       bool "Kendryte K210 SoC"
+       depends on !MMU
+       select BUILTIN_DTB
+       select SERIAL_SIFIVE if TTY
+       select SERIAL_SIFIVE_CONSOLE if TTY
+       select SIFIVE_PLIC
+       help
+         This enables support for Kendryte K210 SoC platform hardware.
+
 endmenu
index 259cb53..fb6e37d 100644 (file)
@@ -85,12 +85,12 @@ PHONY += vdso_install
 vdso_install:
        $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
 
-ifeq ($(CONFIG_RISCV_M_MODE),y)
-KBUILD_IMAGE := $(boot)/loader
+ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_KENDRYTE),yy)
+KBUILD_IMAGE := $(boot)/loader.bin
 else
 KBUILD_IMAGE := $(boot)/Image.gz
 endif
-BOOT_TARGETS := Image Image.gz loader
+BOOT_TARGETS := Image Image.gz loader loader.bin
 
 all:   $(notdir $(KBUILD_IMAGE))
 
index 36db814..3530c59 100644 (file)
@@ -41,6 +41,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
 $(obj)/Image.lzo: $(obj)/Image FORCE
        $(call if_changed,lzo)
 
+$(obj)/loader.bin: $(obj)/loader FORCE
+       $(call if_changed,objcopy)
+
 install:
        $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
        $(obj)/Image System.map "$(INSTALL_PATH)"
index dcc3ada..557f0b5 100644 (file)
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 subdir-y += sifive
+subdir-y += kendryte
diff --git a/arch/riscv/boot/dts/kendryte/Makefile b/arch/riscv/boot/dts/kendryte/Makefile
new file mode 100644 (file)
index 0000000..815444e
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+dtb-$(CONFIG_SOC_KENDRYTE) += k210.dtb
diff --git a/arch/riscv/boot/dts/kendryte/k210.dts b/arch/riscv/boot/dts/kendryte/k210.dts
new file mode 100644 (file)
index 0000000..0d1f28f
--- /dev/null
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+/dts-v1/;
+
+#include "k210.dtsi"
+
+/ {
+       model = "Kendryte K210 generic";
+       compatible = "kendryte,k210";
+
+       chosen {
+               bootargs = "earlycon console=ttySIF0";
+               stdout-path = "serial0";
+       };
+};
+
+&uarths0 {
+       status = "okay";
+};
+
diff --git a/arch/riscv/boot/dts/kendryte/k210.dtsi b/arch/riscv/boot/dts/kendryte/k210.dtsi
new file mode 100644 (file)
index 0000000..c1df56c
--- /dev/null
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Sean Anderson <seanga2@gmail.com>
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include <dt-bindings/clock/k210-clk.h>
+
+/ {
+       /*
+        * Although the K210 is a 64-bit CPU, the address bus is only 32 bits
+        * wide, and the upper half of all addresses is ignored.
+        */
+       #address-cells = <1>;
+       #size-cells = <1>;
+       compatible = "kendryte,k210";
+
+       aliases {
+               serial0 = &uarths0;
+       };
+
+       /*
+        * The K210 has an sv39 MMU following the privilege specification v1.9.
+        * Since this is a non-ratified draft specification, the kernel does not
+        * support it and the K210 support is enabled only for the !MMU case.
+        * Be consistent with this by setting the CPUs MMU type to "none".
+        */
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+               timebase-frequency = <7800000>;
+               cpu0: cpu@0 {
+                       device_type = "cpu";
+                       reg = <0>;
+                       compatible = "kendryte,k210", "sifive,rocket0", "riscv";
+                       riscv,isa = "rv64imafdc";
+                       mmu-type = "none";
+                       i-cache-size = <0x8000>;
+                       i-cache-block-size = <64>;
+                       d-cache-size = <0x8000>;
+                       d-cache-block-size = <64>;
+                       clocks = <&sysctl K210_CLK_CPU>;
+                       clock-frequency = <390000000>;
+                       cpu0_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               interrupt-controller;
+                               compatible = "riscv,cpu-intc";
+                       };
+               };
+               cpu1: cpu@1 {
+                       device_type = "cpu";
+                       reg = <1>;
+                       compatible = "kendryte,k210", "sifive,rocket0", "riscv";
+                       riscv,isa = "rv64imafdc";
+                       mmu-type = "none";
+                       i-cache-size = <0x8000>;
+                       i-cache-block-size = <64>;
+                       d-cache-size = <0x8000>;
+                       d-cache-block-size = <64>;
+                       clocks = <&sysctl K210_CLK_CPU>;
+                       clock-frequency = <390000000>;
+                       cpu1_intc: interrupt-controller {
+                               #interrupt-cells = <1>;
+                               interrupt-controller;
+                               compatible = "riscv,cpu-intc";
+                       };
+               };
+       };
+
+       sram: memory@80000000 {
+               device_type = "memory";
+               reg = <0x80000000 0x400000>,
+                     <0x80400000 0x200000>,
+                     <0x80600000 0x200000>;
+               reg-names = "sram0", "sram1", "aisram";
+       };
+
+       clocks {
+               in0: oscillator {
+                       compatible = "fixed-clock";
+                       #clock-cells = <0>;
+                       clock-frequency = <26000000>;
+               };
+       };
+
+       soc {
+               #address-cells = <1>;
+               #size-cells = <1>;
+               compatible = "kendryte,k210-soc", "simple-bus";
+               ranges;
+               interrupt-parent = <&plic0>;
+
+               sysctl: sysctl@50440000 {
+                       compatible = "kendryte,k210-sysctl", "simple-mfd";
+                       reg = <0x50440000 0x1000>;
+                       #clock-cells = <1>;
+               };
+
+               clint0: interrupt-controller@2000000 {
+                       compatible = "riscv,clint0";
+                       reg = <0x2000000 0xC000>;
+                       interrupts-extended = <&cpu0_intc 3>,  <&cpu1_intc 3>;
+                       clocks = <&sysctl K210_CLK_ACLK>;
+               };
+
+               plic0: interrupt-controller@c000000 {
+                       #interrupt-cells = <1>;
+                       interrupt-controller;
+                       compatible = "kendryte,k210-plic0", "riscv,plic0";
+                       reg = <0xC000000 0x4000000>;
+                       interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 0xffffffff>,
+                                             <&cpu1_intc 11>, <&cpu1_intc 0xffffffff>;
+                       riscv,ndev = <65>;
+                       riscv,max-priority = <7>;
+               };
+
+               uarths0: serial@38000000 {
+                       compatible = "kendryte,k210-uarths", "sifive,uart0";
+                       reg = <0x38000000 0x1000>;
+                       interrupts = <33>;
+                       clocks = <&sysctl K210_CLK_CPU>;
+               };
+       };
+};
index 2557c53..4da4886 100644 (file)
@@ -128,3 +128,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y
 # CONFIG_FTRACE is not set
 # CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_MEMTEST=y
+# CONFIG_SYSFS_SYSCALL is not set
diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig
new file mode 100644 (file)
index 0000000..632aa2f
--- /dev/null
@@ -0,0 +1,68 @@
+# CONFIG_CPU_ISOLATION is not set
+CONFIG_LOG_BUF_SHIFT=15
+CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_INITRAMFS_FORCE=y
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+# CONFIG_RD_XZ is not set
+# CONFIG_RD_LZO is not set
+# CONFIG_RD_LZ4 is not set
+# CONFIG_BOOT_CONFIG is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_SYSFS_SYSCALL is not set
+# CONFIG_FHANDLE is not set
+# CONFIG_BASE_FULL is not set
+# CONFIG_EPOLL is not set
+# CONFIG_SIGNALFD is not set
+# CONFIG_TIMERFD is not set
+# CONFIG_EVENTFD is not set
+# CONFIG_AIO is not set
+# CONFIG_IO_URING is not set
+# CONFIG_ADVISE_SYSCALLS is not set
+# CONFIG_MEMBARRIER is not set
+# CONFIG_KALLSYMS is not set
+CONFIG_EMBEDDED=y
+# CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_COMPAT_BRK is not set
+CONFIG_SLOB=y
+# CONFIG_SLAB_MERGE_DEFAULT is not set
+# CONFIG_MMU is not set
+CONFIG_SOC_KENDRYTE=y
+CONFIG_MAXPHYSMEM_2GB=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
+CONFIG_CMDLINE="earlycon console=ttySIF0"
+CONFIG_CMDLINE_FORCE=y
+CONFIG_USE_BUILTIN_DTB=y
+CONFIG_BUILTIN_DTB_SOURCE="kendryte/k210"
+# CONFIG_BLOCK is not set
+CONFIG_BINFMT_FLAT=y
+# CONFIG_COREDUMP is not set
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+# CONFIG_FW_LOADER is not set
+# CONFIG_ALLOW_DEV_COREDUMP is not set
+# CONFIG_INPUT_KEYBOARD is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_SERIO is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_LDISC_AUTOLOAD is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_HWMON is not set
+# CONFIG_VGA_CONSOLE is not set
+# CONFIG_HID is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_VIRTIO_MENU is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY_USER is not set
+# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_LSM="[]"
+CONFIG_PRINTK_TIME=y
+# CONFIG_DEBUG_MISC is not set
+# CONFIG_SCHED_DEBUG is not set
+# CONFIG_RCU_TRACE is not set
+# CONFIG_FTRACE is not set
+# CONFIG_RUNTIME_TESTING_MENU is not set
index 0292879..05bbf52 100644 (file)
@@ -124,3 +124,4 @@ CONFIG_DEBUG_BLOCK_EXT_DEVT=y
 # CONFIG_FTRACE is not set
 # CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_MEMTEST=y
+# CONFIG_SYSFS_SYSCALL is not set
index 75604fe..d6f1ec0 100644 (file)
 #define __BUG_INSN_32  _UL(0x00100073) /* ebreak */
 #define __BUG_INSN_16  _UL(0x9002) /* c.ebreak */
 
+#define GET_INSN_LENGTH(insn)                                          \
+({                                                                     \
+       unsigned long __len;                                            \
+       __len = ((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ?     \
+               4UL : 2UL;                                              \
+       __len;                                                          \
+})
+
 typedef u32 bug_insn_t;
 
 #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
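
GET_INSN_LENGTH() works because RISC-V encodes 32-bit instructions with the two lowest opcode bits set, while 16-bit compressed instructions use any other value there. A standalone check against the two __BUG_INSN_* opcodes defined just above; the 0x3 mask and match values are assumed to mirror __INSN_LENGTH_MASK and __INSN_LENGTH_32, which this hunk does not show:

#include <stdio.h>

#define INSN_LENGTH_MASK	0x3UL	/* assumed __INSN_LENGTH_MASK */
#define INSN_LENGTH_32		0x3UL	/* assumed __INSN_LENGTH_32 */

#define GET_INSN_LENGTH(insn) \
	(((insn) & INSN_LENGTH_MASK) == INSN_LENGTH_32 ? 4UL : 2UL)

int main(void)
{
	printf("ebreak:   %lu bytes\n", GET_INSN_LENGTH(0x00100073UL));	/* 4 */
	printf("c.ebreak: %lu bytes\n", GET_INSN_LENGTH(0x9002UL));	/* 2 */
	return 0;
}
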
index 555b20b..c8677c7 100644 (file)
@@ -85,7 +85,7 @@ static inline void flush_dcache_page(struct page *page)
  * so instead we just flush the whole thing.
  */
 #define flush_icache_range(start, end) flush_icache_all()
-#define flush_icache_user_range(vma, pg, addr, len) flush_icache_all()
+#define flush_icache_user_range(vma, pg, addr, len) flush_icache_mm(vma->vm_mm, 0)
 
 #ifndef CONFIG_SMP
 
diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h
new file mode 100644 (file)
index 0000000..a8ec3c5
--- /dev/null
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ * Based on arch/arm64/include/asm/cpu_ops.h
+ */
+#ifndef __ASM_CPU_OPS_H
+#define __ASM_CPU_OPS_H
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/threads.h>
+
+/**
+ * struct cpu_operations - Callback operations for hotplugging CPUs.
+ *
+ * @name:              Name of the boot protocol.
+ * @cpu_prepare:       Early one-time preparation step for a cpu. If there
+ *                     is a mechanism for doing so, tests whether it is
+ *                     possible to boot the given HART.
+ * @cpu_start:         Boots a cpu into the kernel.
+ * @cpu_disable:       Prepares a cpu to die. May fail for some
+ *                     mechanism-specific reason, which will cause the hot
+ *                     unplug to be aborted. Called from the cpu to be killed.
+ * @cpu_stop:          Makes a cpu leave the kernel. Must not fail. Called from
+ *                     the cpu being stopped.
+ * @cpu_is_stopped:    Ensures a cpu has left the kernel. Called from another
+ *                     cpu.
+ */
+struct cpu_operations {
+       const char      *name;
+       int             (*cpu_prepare)(unsigned int cpu);
+       int             (*cpu_start)(unsigned int cpu,
+                                    struct task_struct *tidle);
+#ifdef CONFIG_HOTPLUG_CPU
+       int             (*cpu_disable)(unsigned int cpu);
+       void            (*cpu_stop)(void);
+       int             (*cpu_is_stopped)(unsigned int cpu);
+#endif
+};
+
+extern const struct cpu_operations *cpu_ops[NR_CPUS];
+void __init cpu_set_ops(int cpu);
+void cpu_update_secondary_bootdata(unsigned int cpuid,
+                                  struct task_struct *tidle);
+
+#endif /* ifndef __ASM_CPU_OPS_H */
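
A hypothetical sketch of how a boot protocol could fill in the new structure; the example_* names are invented for illustration, and only the struct fields and cpu_update_secondary_bootdata() come from the header above:

#include <asm/cpu_ops.h>

static int example_cpu_prepare(unsigned int cpu)
{
	return 0;	/* nothing to probe for this imaginary protocol */
}

static int example_cpu_start(unsigned int cpu, struct task_struct *tidle)
{
	/* Publish the idle thread/stack, then kick the hart by some means. */
	cpu_update_secondary_bootdata(cpu, tidle);
	return 0;
}

static const struct cpu_operations example_cpu_ops = {
	.name		= "example",
	.cpu_prepare	= example_cpu_prepare,
	.cpu_start	= example_cpu_start,
};
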
index dd973ef..1de233d 100644 (file)
@@ -17,6 +17,8 @@
 
 struct task_struct;
 
+register struct task_struct *riscv_current_is_tp __asm__("tp");
+
 /*
  * This only works because "struct thread_info" is at offset 0 from "struct
  * task_struct".  This constraint seems to be necessary on other architectures
@@ -26,8 +28,7 @@ struct task_struct;
  */
 static __always_inline struct task_struct *get_current(void)
 {
-       register struct task_struct *tp __asm__("tp");
-       return tp;
+       return riscv_current_is_tp;
 }
 
 #define current get_current()
index 42d2c42..2368d49 100644 (file)
@@ -27,6 +27,8 @@ enum fixed_addresses {
        FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
        FIX_PTE,
        FIX_PMD,
+       FIX_TEXT_POKE1,
+       FIX_TEXT_POKE0,
        FIX_EARLYCON_MEM_BASE,
        __end_of_fixed_addresses
 };
index eee6e65..b47045c 100644 (file)
@@ -13,7 +13,7 @@
 #define KASAN_SHADOW_SCALE_SHIFT       3
 
 #define KASAN_SHADOW_SIZE      (UL(1) << (38 - KASAN_SHADOW_SCALE_SHIFT))
-#define KASAN_SHADOW_START     0xffffffc000000000 /* 2^64 - 2^38 */
+#define KASAN_SHADOW_START     KERN_VIRT_START /* 2^64 - 2^38 */
 #define KASAN_SHADOW_END       (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
 
 #define KASAN_SHADOW_OFFSET    (KASAN_SHADOW_END - (1ULL << \
diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h
new file mode 100644 (file)
index 0000000..b5918a6
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#ifndef _ASM_RISCV_PATCH_H
+#define _ASM_RISCV_PATCH_H
+
+int riscv_patch_text_nosync(void *addr, const void *insns, size_t len);
+int riscv_patch_text(void *addr, u32 insn);
+
+#endif /* _ASM_RISCV_PATCH_H */
index 393f201..9c188ad 100644 (file)
@@ -449,6 +449,16 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
 #define __swp_entry_to_pte(x)  ((pte_t) { (x).val })
 
 /*
+ * In the RV64 Linux scheme, we give the user half of the virtual-address space
+ * and give the kernel the other (upper) half.
+ */
+#ifdef CONFIG_64BIT
+#define KERN_VIRT_START        (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE)
+#else
+#define KERN_VIRT_START        FIXADDR_START
+#endif
+
+/*
  * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
  * Note that PGDIR_SIZE must evenly divide TASK_SIZE.
  */
diff --git a/arch/riscv/include/asm/ptdump.h b/arch/riscv/include/asm/ptdump.h
new file mode 100644 (file)
index 0000000..e29af71
--- /dev/null
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#ifndef _ASM_RISCV_PTDUMP_H
+#define _ASM_RISCV_PTDUMP_H
+
+void ptdump_check_wx(void);
+
+#endif /* _ASM_RISCV_PTDUMP_H */
index 2570c1e..653edb2 100644 (file)
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright (C) 2015 Regents of the University of California
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
  */
 
 #ifndef _ASM_RISCV_SBI_H
 #include <linux/types.h>
 
 #ifdef CONFIG_RISCV_SBI
-#define SBI_SET_TIMER 0
-#define SBI_CONSOLE_PUTCHAR 1
-#define SBI_CONSOLE_GETCHAR 2
-#define SBI_CLEAR_IPI 3
-#define SBI_SEND_IPI 4
-#define SBI_REMOTE_FENCE_I 5
-#define SBI_REMOTE_SFENCE_VMA 6
-#define SBI_REMOTE_SFENCE_VMA_ASID 7
-#define SBI_SHUTDOWN 8
-
-#define SBI_CALL(which, arg0, arg1, arg2, arg3) ({             \
-       register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);   \
-       register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);   \
-       register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);   \
-       register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);   \
-       register uintptr_t a7 asm ("a7") = (uintptr_t)(which);  \
-       asm volatile ("ecall"                                   \
-                     : "+r" (a0)                               \
-                     : "r" (a1), "r" (a2), "r" (a3), "r" (a7)  \
-                     : "memory");                              \
-       a0;                                                     \
-})
-
-/* Lazy implementations until SBI is finalized */
-#define SBI_CALL_0(which) SBI_CALL(which, 0, 0, 0, 0)
-#define SBI_CALL_1(which, arg0) SBI_CALL(which, arg0, 0, 0, 0)
-#define SBI_CALL_2(which, arg0, arg1) SBI_CALL(which, arg0, arg1, 0, 0)
-#define SBI_CALL_3(which, arg0, arg1, arg2) \
-               SBI_CALL(which, arg0, arg1, arg2, 0)
-#define SBI_CALL_4(which, arg0, arg1, arg2, arg3) \
-               SBI_CALL(which, arg0, arg1, arg2, arg3)
-
-static inline void sbi_console_putchar(int ch)
-{
-       SBI_CALL_1(SBI_CONSOLE_PUTCHAR, ch);
-}
+enum sbi_ext_id {
+#ifdef CONFIG_RISCV_SBI_V01
+       SBI_EXT_0_1_SET_TIMER = 0x0,
+       SBI_EXT_0_1_CONSOLE_PUTCHAR = 0x1,
+       SBI_EXT_0_1_CONSOLE_GETCHAR = 0x2,
+       SBI_EXT_0_1_CLEAR_IPI = 0x3,
+       SBI_EXT_0_1_SEND_IPI = 0x4,
+       SBI_EXT_0_1_REMOTE_FENCE_I = 0x5,
+       SBI_EXT_0_1_REMOTE_SFENCE_VMA = 0x6,
+       SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID = 0x7,
+       SBI_EXT_0_1_SHUTDOWN = 0x8,
+#endif
+       SBI_EXT_BASE = 0x10,
+       SBI_EXT_TIME = 0x54494D45,
+       SBI_EXT_IPI = 0x735049,
+       SBI_EXT_RFENCE = 0x52464E43,
+       SBI_EXT_HSM = 0x48534D,
+};
 
-static inline int sbi_console_getchar(void)
-{
-       return SBI_CALL_0(SBI_CONSOLE_GETCHAR);
-}
+enum sbi_ext_base_fid {
+       SBI_EXT_BASE_GET_SPEC_VERSION = 0,
+       SBI_EXT_BASE_GET_IMP_ID,
+       SBI_EXT_BASE_GET_IMP_VERSION,
+       SBI_EXT_BASE_PROBE_EXT,
+       SBI_EXT_BASE_GET_MVENDORID,
+       SBI_EXT_BASE_GET_MARCHID,
+       SBI_EXT_BASE_GET_MIMPID,
+};
 
-static inline void sbi_set_timer(uint64_t stime_value)
-{
-#if __riscv_xlen == 32
-       SBI_CALL_2(SBI_SET_TIMER, stime_value, stime_value >> 32);
-#else
-       SBI_CALL_1(SBI_SET_TIMER, stime_value);
-#endif
-}
+enum sbi_ext_time_fid {
+       SBI_EXT_TIME_SET_TIMER = 0,
+};
 
-static inline void sbi_shutdown(void)
-{
-       SBI_CALL_0(SBI_SHUTDOWN);
-}
+enum sbi_ext_ipi_fid {
+       SBI_EXT_IPI_SEND_IPI = 0,
+};
 
-static inline void sbi_clear_ipi(void)
-{
-       SBI_CALL_0(SBI_CLEAR_IPI);
-}
+enum sbi_ext_rfence_fid {
+       SBI_EXT_RFENCE_REMOTE_FENCE_I = 0,
+       SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
+       SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
+       SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
+       SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID,
+       SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
+       SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID,
+};
 
-static inline void sbi_send_ipi(const unsigned long *hart_mask)
-{
-       SBI_CALL_1(SBI_SEND_IPI, hart_mask);
-}
+enum sbi_ext_hsm_fid {
+       SBI_EXT_HSM_HART_START = 0,
+       SBI_EXT_HSM_HART_STOP,
+       SBI_EXT_HSM_HART_STATUS,
+};
+
+enum sbi_hsm_hart_status {
+       SBI_HSM_HART_STATUS_STARTED = 0,
+       SBI_HSM_HART_STATUS_STOPPED,
+       SBI_HSM_HART_STATUS_START_PENDING,
+       SBI_HSM_HART_STATUS_STOP_PENDING,
+};
+
+#define SBI_SPEC_VERSION_DEFAULT       0x1
+#define SBI_SPEC_VERSION_MAJOR_SHIFT   24
+#define SBI_SPEC_VERSION_MAJOR_MASK    0x7f
+#define SBI_SPEC_VERSION_MINOR_MASK    0xffffff
+
+/* SBI return error codes */
+#define SBI_SUCCESS            0
+#define SBI_ERR_FAILURE                -1
+#define SBI_ERR_NOT_SUPPORTED  -2
+#define SBI_ERR_INVALID_PARAM  -3
+#define SBI_ERR_DENIED         -4
+#define SBI_ERR_INVALID_ADDRESS        -5
 
-static inline void sbi_remote_fence_i(const unsigned long *hart_mask)
+extern unsigned long sbi_spec_version;
+struct sbiret {
+       long error;
+       long value;
+};
+
+int sbi_init(void);
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+                       unsigned long arg1, unsigned long arg2,
+                       unsigned long arg3, unsigned long arg4,
+                       unsigned long arg5);
+
+void sbi_console_putchar(int ch);
+int sbi_console_getchar(void);
+void sbi_set_timer(uint64_t stime_value);
+void sbi_shutdown(void);
+void sbi_clear_ipi(void);
+void sbi_send_ipi(const unsigned long *hart_mask);
+void sbi_remote_fence_i(const unsigned long *hart_mask);
+void sbi_remote_sfence_vma(const unsigned long *hart_mask,
+                          unsigned long start,
+                          unsigned long size);
+
+void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
+                               unsigned long start,
+                               unsigned long size,
+                               unsigned long asid);
+int sbi_remote_hfence_gvma(const unsigned long *hart_mask,
+                          unsigned long start,
+                          unsigned long size);
+int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask,
+                               unsigned long start,
+                               unsigned long size,
+                               unsigned long vmid);
+int sbi_remote_hfence_vvma(const unsigned long *hart_mask,
+                          unsigned long start,
+                          unsigned long size);
+int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask,
+                               unsigned long start,
+                               unsigned long size,
+                               unsigned long asid);
+int sbi_probe_extension(int ext);
+
+/* Check if current SBI specification version is 0.1 or not */
+static inline int sbi_spec_is_0_1(void)
 {
-       SBI_CALL_1(SBI_REMOTE_FENCE_I, hart_mask);
+       return (sbi_spec_version == SBI_SPEC_VERSION_DEFAULT) ? 1 : 0;
 }
 
-static inline void sbi_remote_sfence_vma(const unsigned long *hart_mask,
-                                        unsigned long start,
-                                        unsigned long size)
+/* Get the major version of SBI */
+static inline unsigned long sbi_major_version(void)
 {
-       SBI_CALL_3(SBI_REMOTE_SFENCE_VMA, hart_mask, start, size);
+       return (sbi_spec_version >> SBI_SPEC_VERSION_MAJOR_SHIFT) &
+               SBI_SPEC_VERSION_MAJOR_MASK;
 }
 
-static inline void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
-                                             unsigned long start,
-                                             unsigned long size,
-                                             unsigned long asid)
+/* Get the minor version of SBI */
+static inline unsigned long sbi_minor_version(void)
 {
-       SBI_CALL_4(SBI_REMOTE_SFENCE_VMA_ASID, hart_mask, start, size, asid);
+       return sbi_spec_version & SBI_SPEC_VERSION_MINOR_MASK;
 }
+
+int sbi_err_map_linux_errno(int err);
 #else /* CONFIG_RISCV_SBI */
 /* stubs for code that is only reachable under IS_ENABLED(CONFIG_RISCV_SBI): */
 void sbi_set_timer(uint64_t stime_value);
 void sbi_clear_ipi(void);
 void sbi_send_ipi(const unsigned long *hart_mask);
 void sbi_remote_fence_i(const unsigned long *hart_mask);
+void sbi_init(void);
 #endif /* CONFIG_RISCV_SBI */
 #endif /* _ASM_RISCV_SBI_H */
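
As a usage sketch of the new interface (hypothetical caller, not part of the patch; RV64 assumed so the 64-bit deadline fits in one argument register), a driver would probe an extension before issuing the raw ecall and translate the error on return:

    /* Illustrative only: probe the TIME extension and program a timer event. */
    #include <linux/errno.h>
    #include <linux/types.h>
    #include <asm/sbi.h>

    static int example_set_timer(u64 next_event)
    {
            struct sbiret ret;

            if (sbi_probe_extension(SBI_EXT_TIME) <= 0)
                    return -ENODEV;         /* firmware does not implement TIME */

            ret = sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER,
                            next_event, 0, 0, 0, 0, 0);
            return ret.error ? sbi_err_map_linux_errno(ret.error) : 0;
    }

In-tree code would normally just call the sbi_set_timer() wrapper declared above, which also handles the RV32 split of the 64-bit value; the sketch only spells out the calling convention.
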
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
new file mode 100644 (file)
index 0000000..c38df47
--- /dev/null
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#ifndef _ASM_RISCV_SET_MEMORY_H
+#define _ASM_RISCV_SET_MEMORY_H
+
+#ifndef __ASSEMBLY__
+/*
+ * Functions to change memory attributes.
+ */
+#ifdef CONFIG_MMU
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+#else
+static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
+static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
+#endif
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void set_kernel_text_ro(void);
+void set_kernel_text_rw(void);
+#else
+static inline void set_kernel_text_ro(void) { }
+static inline void set_kernel_text_rw(void) { }
+#endif
+
+int set_direct_map_invalid_noflush(struct page *page);
+int set_direct_map_default_noflush(struct page *page);
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_ARCH_HAS_STRICT_KERNEL_RWX
+#ifdef CONFIG_64BIT
+#define SECTION_ALIGN (1 << 21)
+#else
+#define SECTION_ALIGN (1 << 22)
+#endif
+#else /* !CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */
+#define SECTION_ALIGN L1_CACHE_BYTES
+#endif /* CONFIG_ARCH_HAS_STRICT_KERNEL_RWX */
+
+#endif /* _ASM_RISCV_SET_MEMORY_H */
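
For illustration only (the buffer and helper below are hypothetical), callers flip protections on whole pages through these interfaces; addresses must be page aligned and numpages counts pages rather than bytes:

    /* Illustrative only: write-protect or re-enable writes on one page. */
    #include <linux/types.h>
    #include <asm/set_memory.h>

    static void example_write_protect(void *page_aligned_buf, bool protect)
    {
            unsigned long addr = (unsigned long)page_aligned_buf;

            if (protect)
                    set_memory_ro(addr, 1);         /* one page */
            else
                    set_memory_rw(addr, 1);
    }
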
index a83451d..f4c7cfd 100644 (file)
@@ -43,6 +43,13 @@ void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out);
  */
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
+#if defined CONFIG_HOTPLUG_CPU
+int __cpu_disable(void);
+void __cpu_die(unsigned int cpu);
+void cpu_stop(void);
+#else
+#endif /* CONFIG_HOTPLUG_CPU */
+
 #else
 
 static inline void show_ipi_stats(struct seq_file *p, int prec)
@@ -61,5 +68,22 @@ static inline unsigned long cpuid_to_hartid_map(int cpu)
        return boot_cpu_hartid;
 }
 
+static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in,
+                                             struct cpumask *out)
+{
+       cpumask_clear(out);
+       cpumask_set_cpu(boot_cpu_hartid, out);
+}
+
 #endif /* CONFIG_SMP */
+
+#if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP)
+bool cpu_has_hotplug(unsigned int cpu);
+#else
+static inline bool cpu_has_hotplug(unsigned int cpu)
+{
+       return false;
+}
+#endif
+
 #endif /* _ASM_RISCV_SMP_H */
diff --git a/arch/riscv/include/asm/soc.h b/arch/riscv/include/asm/soc.h
new file mode 100644 (file)
index 0000000..7cec196
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#ifndef _ASM_RISCV_SOC_H
+#define _ASM_RISCV_SOC_H
+
+#include <linux/of.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+
+#define SOC_EARLY_INIT_DECLARE(name, compat, fn)                       \
+       static const struct of_device_id __soc_early_init__##name       \
+               __used __section(__soc_early_init_table)                \
+                = { .compatible = compat, .data = fn  }
+
+void soc_early_init(void);
+
+extern unsigned long __soc_early_init_table_start;
+extern unsigned long __soc_early_init_table_end;
+
+#endif
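
A hypothetical SoC driver would hook into this table roughly as follows (the compatible string and the callback are invented, and the callback signature assumes soc_early_init() invokes each matching entry with a pointer to the flattened device tree):

    /* Illustrative only: register an early-init hook for a made-up SoC. */
    #include <linux/init.h>
    #include <asm/soc.h>

    static void __init example_soc_early_init(const void *fdt)
    {
            /* e.g. apply an erratum workaround before MMU/SMP bring-up */
    }

    SOC_EARLY_INIT_DECLARE(example_soc, "vendor,example-soc", example_soc_early_init);
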
index f40205c..86c8308 100644 (file)
@@ -4,12 +4,14 @@
 #
 
 ifdef CONFIG_FTRACE
-CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_patch.o  = -pg
 endif
 
 extra-y += head.o
 extra-y += vmlinux.lds
 
+obj-y  += soc.o
 obj-y  += cpu.o
 obj-y  += cpufeature.o
 obj-y  += entry.o
@@ -26,12 +28,15 @@ obj-y       += traps.o
 obj-y  += riscv_ksyms.o
 obj-y  += stacktrace.o
 obj-y  += cacheinfo.o
+obj-y  += patch.o
 obj-$(CONFIG_MMU) += vdso.o vdso/
 
-obj-$(CONFIG_RISCV_M_MODE)     += clint.o
+obj-$(CONFIG_RISCV_M_MODE)     += clint.o traps_misaligned.o
 obj-$(CONFIG_FPU)              += fpu.o
 obj-$(CONFIG_SMP)              += smpboot.o
 obj-$(CONFIG_SMP)              += smp.o
+obj-$(CONFIG_SMP)              += cpu_ops.o
+obj-$(CONFIG_SMP)              += cpu_ops_spinwait.o
 obj-$(CONFIG_MODULES)          += module.o
 obj-$(CONFIG_MODULE_SECTIONS)  += module-sections.o
 
@@ -42,5 +47,9 @@ obj-$(CONFIG_PERF_EVENTS)     += perf_event.o
 obj-$(CONFIG_PERF_EVENTS)      += perf_callchain.o
 obj-$(CONFIG_HAVE_PERF_REGS)   += perf_regs.o
 obj-$(CONFIG_RISCV_SBI)                += sbi.o
+ifeq ($(CONFIG_RISCV_SBI), y)
+obj-$(CONFIG_SMP) += cpu_ops_sbi.o
+endif
+obj-$(CONFIG_HOTPLUG_CPU)      += cpu-hotplug.o
 
 clean:
diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c
new file mode 100644 (file)
index 0000000..df84e0c
--- /dev/null
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/irq.h>
+#include <linux/cpu.h>
+#include <linux/sched/hotplug.h>
+#include <asm/irq.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+
+void cpu_stop(void);
+void arch_cpu_idle_dead(void)
+{
+       cpu_stop();
+}
+
+bool cpu_has_hotplug(unsigned int cpu)
+{
+       if (cpu_ops[cpu]->cpu_stop)
+               return true;
+
+       return false;
+}
+
+/*
+ * __cpu_disable runs on the processor to be shut down.
+ */
+int __cpu_disable(void)
+{
+       int ret = 0;
+       unsigned int cpu = smp_processor_id();
+
+       if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_stop)
+               return -EOPNOTSUPP;
+
+       if (cpu_ops[cpu]->cpu_disable)
+               ret = cpu_ops[cpu]->cpu_disable(cpu);
+
+       if (ret)
+               return ret;
+
+       remove_cpu_topology(cpu);
+       set_cpu_online(cpu, false);
+       irq_migrate_all_off_this_cpu();
+
+       return ret;
+}
+
+/*
+ * Called on the thread which is asking for a CPU to be shut down.
+ */
+void __cpu_die(unsigned int cpu)
+{
+       int ret = 0;
+
+       if (!cpu_wait_death(cpu, 5)) {
+               pr_err("CPU %u: didn't die\n", cpu);
+               return;
+       }
+       pr_notice("CPU%u: off\n", cpu);
+
+       /* Verify from the firmware if the cpu is really stopped */
+       if (cpu_ops[cpu]->cpu_is_stopped)
+               ret = cpu_ops[cpu]->cpu_is_stopped(cpu);
+       if (ret)
+               pr_warn("CPU%d may not have stopped: %d\n", cpu, ret);
+}
+
+/*
+ * Called from the idle thread for the CPU which has been shut down.
+ */
+void cpu_stop(void)
+{
+       idle_task_exit();
+
+       (void)cpu_report_death();
+
+       cpu_ops[smp_processor_id()]->cpu_stop();
+       /* It should never reach here */
+       BUG();
+}
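
For orientation (not part of the patch): once a boot method that provides cpu_stop is selected, the generic hotplug core drives this file through __cpu_disable()/__cpu_die() and the dying hart's idle loop. A minimal built-in test along the following lines would exercise the path, assuming the generic remove_cpu()/add_cpu() helpers are available in this kernel; the CPU number is arbitrary:

    /* Illustrative only: offline and re-online CPU 1 once at late boot. */
    #include <linux/cpu.h>
    #include <linux/init.h>
    #include <linux/printk.h>

    static int __init example_hotplug_roundtrip(void)
    {
            int ret = remove_cpu(1);        /* ends up in __cpu_disable()/__cpu_die() */

            if (ret) {
                    pr_info("cpu1 could not be offlined: %d\n", ret);
                    return 0;
            }
            return add_cpu(1);
    }
    late_initcall(example_hotplug_roundtrip);
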
diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c
new file mode 100644 (file)
index 0000000..c4c33bf
--- /dev/null
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+
+const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;
+
+void *__cpu_up_stack_pointer[NR_CPUS];
+void *__cpu_up_task_pointer[NR_CPUS];
+
+extern const struct cpu_operations cpu_ops_sbi;
+extern const struct cpu_operations cpu_ops_spinwait;
+
+void cpu_update_secondary_bootdata(unsigned int cpuid,
+                                  struct task_struct *tidle)
+{
+       int hartid = cpuid_to_hartid_map(cpuid);
+
+       /* Make sure tidle is updated */
+       smp_mb();
+       WRITE_ONCE(__cpu_up_stack_pointer[hartid],
+                  task_stack_page(tidle) + THREAD_SIZE);
+       WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
+}
+
+void __init cpu_set_ops(int cpuid)
+{
+#if IS_ENABLED(CONFIG_RISCV_SBI)
+       if (sbi_probe_extension(SBI_EXT_HSM) > 0) {
+               if (!cpuid)
+                       pr_info("SBI v0.2 HSM extension detected\n");
+               cpu_ops[cpuid] = &cpu_ops_sbi;
+       } else
+#endif
+               cpu_ops[cpuid] = &cpu_ops_spinwait;
+}
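
To show how the table is meant to be consumed (a condensed, illustrative sketch, not a copy of the smpboot.c changes, whose hunk is cut off at the end of this page), the boot CPU assigns an operations structure per cpu and later starts each one through it:

    /* Illustrative only: filling and consuming the per-cpu ops table. */
    #include <linux/errno.h>
    #include <asm/cpu_ops.h>

    static void example_enumerate(unsigned int cpuid)
    {
            cpu_set_ops(cpuid);             /* picks HSM or spinwait for this cpu */

            if (cpu_ops[cpuid]->cpu_prepare)
                    cpu_ops[cpuid]->cpu_prepare(cpuid);
    }

    static int example_bringup(unsigned int cpuid, struct task_struct *tidle)
    {
            if (!cpu_ops[cpuid]->cpu_start)
                    return -EOPNOTSUPP;
            return cpu_ops[cpuid]->cpu_start(cpuid, tidle);  /* kicks the hart */
    }
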
diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c
new file mode 100644 (file)
index 0000000..685fae7
--- /dev/null
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HSM extension and cpu_ops implementation.
+ *
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+
+extern char secondary_start_sbi[];
+const struct cpu_operations cpu_ops_sbi;
+
+static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr,
+                             unsigned long priv)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_START,
+                       hartid, saddr, priv, 0, 0, 0);
+       if (ret.error)
+               return sbi_err_map_linux_errno(ret.error);
+       else
+               return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int sbi_hsm_hart_stop(void)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_STOP, 0, 0, 0, 0, 0, 0);
+
+       if (ret.error)
+               return sbi_err_map_linux_errno(ret.error);
+       else
+               return 0;
+}
+
+static int sbi_hsm_hart_get_status(unsigned long hartid)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_STATUS,
+                       hartid, 0, 0, 0, 0, 0);
+       if (ret.error)
+               return sbi_err_map_linux_errno(ret.error);
+       else
+               return ret.value;
+}
+#endif
+
+static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle)
+{
+       int rc;
+       unsigned long boot_addr = __pa_symbol(secondary_start_sbi);
+       int hartid = cpuid_to_hartid_map(cpuid);
+
+       cpu_update_secondary_bootdata(cpuid, tidle);
+       rc = sbi_hsm_hart_start(hartid, boot_addr, 0);
+
+       return rc;
+}
+
+static int sbi_cpu_prepare(unsigned int cpuid)
+{
+       if (!cpu_ops_sbi.cpu_start) {
+               pr_err("cpu start method not defined for CPU [%d]\n", cpuid);
+               return -ENODEV;
+       }
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int sbi_cpu_disable(unsigned int cpuid)
+{
+       if (!cpu_ops_sbi.cpu_stop)
+               return -EOPNOTSUPP;
+       return 0;
+}
+
+static void sbi_cpu_stop(void)
+{
+       int ret;
+
+       ret = sbi_hsm_hart_stop();
+       pr_crit("Unable to stop the cpu %u (%d)\n", smp_processor_id(), ret);
+}
+
+static int sbi_cpu_is_stopped(unsigned int cpuid)
+{
+       int rc;
+       int hartid = cpuid_to_hartid_map(cpuid);
+
+       rc = sbi_hsm_hart_get_status(hartid);
+
+       if (rc == SBI_HSM_HART_STATUS_STOPPED)
+               return 0;
+       return rc;
+}
+#endif
+
+const struct cpu_operations cpu_ops_sbi = {
+       .name           = "sbi",
+       .cpu_prepare    = sbi_cpu_prepare,
+       .cpu_start      = sbi_cpu_start,
+#ifdef CONFIG_HOTPLUG_CPU
+       .cpu_disable    = sbi_cpu_disable,
+       .cpu_stop       = sbi_cpu_stop,
+       .cpu_is_stopped = sbi_cpu_is_stopped,
+#endif
+};
diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c
new file mode 100644 (file)
index 0000000..b2c957b
--- /dev/null
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/errno.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include <asm/cpu_ops.h>
+#include <asm/sbi.h>
+#include <asm/smp.h>
+
+const struct cpu_operations cpu_ops_spinwait;
+
+static int spinwait_cpu_prepare(unsigned int cpuid)
+{
+       if (!cpu_ops_spinwait.cpu_start) {
+               pr_err("cpu start method not defined for CPU [%d]\n", cpuid);
+               return -ENODEV;
+       }
+       return 0;
+}
+
+static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle)
+{
+       /*
+        * In this protocol, all cpus boot of their own accord.  _start
+        * selects the first cpu to boot the kernel and causes the remainder
+        * of the cpus to spin in a loop waiting for their stack pointer to be
+        * set up by that main cpu.  Writing to bootdata
+        * (i.e. __cpu_up_stack_pointer) signals to the spinning cpus that they
+        * can continue the boot process.
+        */
+       cpu_update_secondary_bootdata(cpuid, tidle);
+
+       return 0;
+}
+
+const struct cpu_operations cpu_ops_spinwait = {
+       .name           = "spinwait",
+       .cpu_prepare    = spinwait_cpu_prepare,
+       .cpu_start      = spinwait_cpu_start,
+};
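
The comment in spinwait_cpu_start() describes the handshake from the boot hart's side; read back into C, the waiting side is essentially the loop below (an illustrative rendering of the .Lwait_for_cpu_up assembly in head.S, not code from this patch):

    /* Illustrative only: what each secondary hart does until the boot hart
     * publishes a stack and an idle task for it. */
    #include <linux/compiler.h>
    #include <asm/processor.h>

    extern void *__cpu_up_stack_pointer[];
    extern void *__cpu_up_task_pointer[];

    static void example_secondary_wait(unsigned long hartid)
    {
            while (!READ_ONCE(__cpu_up_task_pointer[hartid]) ||
                   !READ_ONCE(__cpu_up_stack_pointer[hartid]))
                    cpu_relax();

            /* the assembly then installs sp/tp and jumps to
             * secondary_start_common -> smp_callin */
    }
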
index 208702d..56d071b 100644 (file)
 #include <asm/thread_info.h>
 #include <asm/asm-offsets.h>
 
-       .text
-       .altmacro
-
-/*
- * Prepares to enter a system call or exception by saving all registers to the
- * stack.
- */
-       .macro SAVE_ALL
-       LOCAL _restore_kernel_tpsp
-       LOCAL _save_context
+#if !IS_ENABLED(CONFIG_PREEMPTION)
+.set resume_kernel, restore_all
+#endif
 
+ENTRY(handle_exception)
        /*
         * If coming from userspace, preserve the user thread pointer and load
         * the kernel thread pointer.  If we came from the kernel, the scratch
@@ -90,77 +84,6 @@ _save_context:
        REG_S s3, PT_BADADDR(sp)
        REG_S s4, PT_CAUSE(sp)
        REG_S s5, PT_TP(sp)
-       .endm
-
-/*
- * Prepares to return from a system call or exception by restoring all
- * registers from the stack.
- */
-       .macro RESTORE_ALL
-       REG_L a0, PT_STATUS(sp)
-       /*
-        * The current load reservation is effectively part of the processor's
-        * state, in the sense that load reservations cannot be shared between
-        * different hart contexts.  We can't actually save and restore a load
-        * reservation, so instead here we clear any existing reservation --
-        * it's always legal for implementations to clear load reservations at
-        * any point (as long as the forward progress guarantee is kept, but
-        * we'll ignore that here).
-        *
-        * Dangling load reservations can be the result of taking a trap in the
-        * middle of an LR/SC sequence, but can also be the result of a taken
-        * forward branch around an SC -- which is how we implement CAS.  As a
-        * result we need to clear reservations between the last CAS and the
-        * jump back to the new context.  While it is unlikely the store
-        * completes, implementations are allowed to expand reservations to be
-        * arbitrarily large.
-        */
-       REG_L  a2, PT_EPC(sp)
-       REG_SC x0, a2, PT_EPC(sp)
-
-       csrw CSR_STATUS, a0
-       csrw CSR_EPC, a2
-
-       REG_L x1,  PT_RA(sp)
-       REG_L x3,  PT_GP(sp)
-       REG_L x4,  PT_TP(sp)
-       REG_L x5,  PT_T0(sp)
-       REG_L x6,  PT_T1(sp)
-       REG_L x7,  PT_T2(sp)
-       REG_L x8,  PT_S0(sp)
-       REG_L x9,  PT_S1(sp)
-       REG_L x10, PT_A0(sp)
-       REG_L x11, PT_A1(sp)
-       REG_L x12, PT_A2(sp)
-       REG_L x13, PT_A3(sp)
-       REG_L x14, PT_A4(sp)
-       REG_L x15, PT_A5(sp)
-       REG_L x16, PT_A6(sp)
-       REG_L x17, PT_A7(sp)
-       REG_L x18, PT_S2(sp)
-       REG_L x19, PT_S3(sp)
-       REG_L x20, PT_S4(sp)
-       REG_L x21, PT_S5(sp)
-       REG_L x22, PT_S6(sp)
-       REG_L x23, PT_S7(sp)
-       REG_L x24, PT_S8(sp)
-       REG_L x25, PT_S9(sp)
-       REG_L x26, PT_S10(sp)
-       REG_L x27, PT_S11(sp)
-       REG_L x28, PT_T3(sp)
-       REG_L x29, PT_T4(sp)
-       REG_L x30, PT_T5(sp)
-       REG_L x31, PT_T6(sp)
-
-       REG_L x2,  PT_SP(sp)
-       .endm
-
-#if !IS_ENABLED(CONFIG_PREEMPTION)
-.set resume_kernel, restore_all
-#endif
-
-ENTRY(handle_exception)
-       SAVE_ALL
 
        /*
         * Set the scratch register to 0, so that if a recursive exception
@@ -291,7 +214,63 @@ resume_userspace:
        csrw CSR_SCRATCH, tp
 
 restore_all:
-       RESTORE_ALL
+       REG_L a0, PT_STATUS(sp)
+       /*
+        * The current load reservation is effectively part of the processor's
+        * state, in the sense that load reservations cannot be shared between
+        * different hart contexts.  We can't actually save and restore a load
+        * reservation, so instead here we clear any existing reservation --
+        * it's always legal for implementations to clear load reservations at
+        * any point (as long as the forward progress guarantee is kept, but
+        * we'll ignore that here).
+        *
+        * Dangling load reservations can be the result of taking a trap in the
+        * middle of an LR/SC sequence, but can also be the result of a taken
+        * forward branch around an SC -- which is how we implement CAS.  As a
+        * result we need to clear reservations between the last CAS and the
+        * jump back to the new context.  While it is unlikely the store
+        * completes, implementations are allowed to expand reservations to be
+        * arbitrarily large.
+        */
+       REG_L  a2, PT_EPC(sp)
+       REG_SC x0, a2, PT_EPC(sp)
+
+       csrw CSR_STATUS, a0
+       csrw CSR_EPC, a2
+
+       REG_L x1,  PT_RA(sp)
+       REG_L x3,  PT_GP(sp)
+       REG_L x4,  PT_TP(sp)
+       REG_L x5,  PT_T0(sp)
+       REG_L x6,  PT_T1(sp)
+       REG_L x7,  PT_T2(sp)
+       REG_L x8,  PT_S0(sp)
+       REG_L x9,  PT_S1(sp)
+       REG_L x10, PT_A0(sp)
+       REG_L x11, PT_A1(sp)
+       REG_L x12, PT_A2(sp)
+       REG_L x13, PT_A3(sp)
+       REG_L x14, PT_A4(sp)
+       REG_L x15, PT_A5(sp)
+       REG_L x16, PT_A6(sp)
+       REG_L x17, PT_A7(sp)
+       REG_L x18, PT_S2(sp)
+       REG_L x19, PT_S3(sp)
+       REG_L x20, PT_S4(sp)
+       REG_L x21, PT_S5(sp)
+       REG_L x22, PT_S6(sp)
+       REG_L x23, PT_S7(sp)
+       REG_L x24, PT_S8(sp)
+       REG_L x25, PT_S9(sp)
+       REG_L x26, PT_S10(sp)
+       REG_L x27, PT_S11(sp)
+       REG_L x28, PT_T3(sp)
+       REG_L x29, PT_T4(sp)
+       REG_L x30, PT_T5(sp)
+       REG_L x31, PT_T6(sp)
+
+       REG_L x2,  PT_SP(sp)
+
 #ifdef CONFIG_RISCV_M_MODE
        mret
 #else
index c40fdcd..ce69b34 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/ftrace.h>
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
+#include <asm/patch.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 static int ftrace_check_current_call(unsigned long hook_pos,
@@ -46,20 +47,14 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
 {
        unsigned int call[2];
        unsigned int nops[2] = {NOP4, NOP4};
-       int ret = 0;
 
        make_call(hook_pos, target, call);
 
-       /* replace the auipc-jalr pair at once */
-       ret = probe_kernel_write((void *)hook_pos, enable ? call : nops,
-                                MCOUNT_INSN_SIZE);
-       /* return must be -EPERM on write error */
-       if (ret)
+       /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */
+       if (riscv_patch_text_nosync
+           ((void *)hook_pos, enable ? call : nops, MCOUNT_INSN_SIZE))
                return -EPERM;
 
-       smp_mb();
-       flush_icache_range((void *)hook_pos, (void *)hook_pos + MCOUNT_INSN_SIZE);
-
        return 0;
 }
 
index 85f2073..98a4064 100644 (file)
@@ -14,7 +14,7 @@
 #include <asm/hwcap.h>
 #include <asm/image.h>
 
-__INIT
+__HEAD
 ENTRY(_start)
        /*
         * Image header expected by Linux boot-loaders. The image header data
@@ -45,8 +45,111 @@ ENTRY(_start)
        .ascii RISCV_IMAGE_MAGIC2
        .word 0
 
-.global _start_kernel
-_start_kernel:
+.align 2
+#ifdef CONFIG_MMU
+relocate:
+       /* Relocate return address */
+       li a1, PAGE_OFFSET
+       la a2, _start
+       sub a1, a1, a2
+       add ra, ra, a1
+
+       /* Point stvec to virtual address of instruction after satp write */
+       la a2, 1f
+       add a2, a2, a1
+       csrw CSR_TVEC, a2
+
+       /* Compute satp for kernel page tables, but don't load it yet */
+       srl a2, a0, PAGE_SHIFT
+       li a1, SATP_MODE
+       or a2, a2, a1
+
+       /*
+        * Load trampoline page directory, which will cause us to trap to
+        * stvec if VA != PA, or simply fall through if VA == PA.  We need a
+        * full fence here because setup_vm() just wrote these PTEs and we need
+        * to ensure the new translations are in use.
+        */
+       la a0, trampoline_pg_dir
+       srl a0, a0, PAGE_SHIFT
+       or a0, a0, a1
+       sfence.vma
+       csrw CSR_SATP, a0
+.align 2
+1:
+       /* Set trap vector to spin forever to help debug */
+       la a0, .Lsecondary_park
+       csrw CSR_TVEC, a0
+
+       /* Reload the global pointer */
+.option push
+.option norelax
+       la gp, __global_pointer$
+.option pop
+
+       /*
+        * Switch to kernel page tables.  A full fence is necessary in order to
+        * avoid using the trampoline translations, which are only correct for
+        * the first superpage.  Fetching the fence is guaranteed to work
+        * because that first superpage is translated the same way.
+        */
+       csrw CSR_SATP, a2
+       sfence.vma
+
+       ret
+#endif /* CONFIG_MMU */
+#ifdef CONFIG_SMP
+       .global secondary_start_sbi
+secondary_start_sbi:
+       /* Mask all interrupts */
+       csrw CSR_IE, zero
+       csrw CSR_IP, zero
+
+       /* Load the global pointer */
+       .option push
+       .option norelax
+               la gp, __global_pointer$
+       .option pop
+
+       /*
+        * Disable FPU to detect illegal usage of
+        * floating point in kernel space
+        */
+       li t0, SR_FS
+       csrc CSR_STATUS, t0
+
+       /* Set trap vector to spin forever to help debug */
+       la a3, .Lsecondary_park
+       csrw CSR_TVEC, a3
+
+       slli a3, a0, LGREG
+       la a4, __cpu_up_stack_pointer
+       la a5, __cpu_up_task_pointer
+       add a4, a3, a4
+       add a5, a3, a5
+       REG_L sp, (a4)
+       REG_L tp, (a5)
+
+       .global secondary_start_common
+secondary_start_common:
+
+#ifdef CONFIG_MMU
+       /* Enable virtual memory and relocate to virtual address */
+       la a0, swapper_pg_dir
+       call relocate
+#endif
+       tail smp_callin
+#endif /* CONFIG_SMP */
+
+.Lsecondary_park:
+       /* We lack SMP support or have too many harts, so park this hart */
+       wfi
+       j .Lsecondary_park
+
+END(_start)
+
+       __INIT
+ENTRY(_start_kernel)
        /* Mask all interrupts */
        csrw CSR_IE, zero
        csrw CSR_IP, zero
@@ -131,62 +234,10 @@ clear_bss_done:
        call kasan_early_init
 #endif
        /* Start the kernel */
+       call soc_early_init
        call parse_dtb
        tail start_kernel
 
-#ifdef CONFIG_MMU
-relocate:
-       /* Relocate return address */
-       li a1, PAGE_OFFSET
-       la a2, _start
-       sub a1, a1, a2
-       add ra, ra, a1
-
-       /* Point stvec to virtual address of intruction after satp write */
-       la a2, 1f
-       add a2, a2, a1
-       csrw CSR_TVEC, a2
-
-       /* Compute satp for kernel page tables, but don't load it yet */
-       srl a2, a0, PAGE_SHIFT
-       li a1, SATP_MODE
-       or a2, a2, a1
-
-       /*
-        * Load trampoline page directory, which will cause us to trap to
-        * stvec if VA != PA, or simply fall through if VA == PA.  We need a
-        * full fence here because setup_vm() just wrote these PTEs and we need
-        * to ensure the new translations are in use.
-        */
-       la a0, trampoline_pg_dir
-       srl a0, a0, PAGE_SHIFT
-       or a0, a0, a1
-       sfence.vma
-       csrw CSR_SATP, a0
-.align 2
-1:
-       /* Set trap vector to spin forever to help debug */
-       la a0, .Lsecondary_park
-       csrw CSR_TVEC, a0
-
-       /* Reload the global pointer */
-.option push
-.option norelax
-       la gp, __global_pointer$
-.option pop
-
-       /*
-        * Switch to kernel page tables.  A full fence is necessary in order to
-        * avoid using the trampoline translations, which are only correct for
-        * the first superpage.  Fetching the fence is guarnteed to work
-        * because that first superpage is translated the same way.
-        */
-       csrw CSR_SATP, a2
-       sfence.vma
-
-       ret
-#endif /* CONFIG_MMU */
-
 .Lsecondary_start:
 #ifdef CONFIG_SMP
        /* Set trap vector to spin forever to help debug */
@@ -211,16 +262,10 @@ relocate:
        beqz tp, .Lwait_for_cpu_up
        fence
 
-#ifdef CONFIG_MMU
-       /* Enable virtual memory and relocate to virtual address */
-       la a0, swapper_pg_dir
-       call relocate
+       tail secondary_start_common
 #endif
 
-       tail smp_callin
-#endif
-
-END(_start)
+END(_start_kernel)
 
 #ifdef CONFIG_RISCV_M_MODE
 ENTRY(reset_regs)
@@ -301,13 +346,6 @@ ENTRY(reset_regs)
 END(reset_regs)
 #endif /* CONFIG_RISCV_M_MODE */
 
-.section ".text", "ax",@progbits
-.align 2
-.Lsecondary_park:
-       /* We lack SMP support or have too many harts, so park this hart */
-       wfi
-       j .Lsecondary_park
-
 __PAGE_ALIGNED_BSS
        /* Empty zero page */
        .balign PAGE_SIZE
diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c
new file mode 100644 (file)
index 0000000..8a4fc65
--- /dev/null
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/stop_machine.h>
+#include <asm/kprobes.h>
+#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
+
+struct riscv_insn_patch {
+       void *addr;
+       u32 insn;
+       atomic_t cpu_count;
+};
+
+#ifdef CONFIG_MMU
+static DEFINE_RAW_SPINLOCK(patch_lock);
+
+static void __kprobes *patch_map(void *addr, int fixmap)
+{
+       uintptr_t uintaddr = (uintptr_t) addr;
+       struct page *page;
+
+       if (core_kernel_text(uintaddr))
+               page = phys_to_page(__pa_symbol(addr));
+       else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+               page = vmalloc_to_page(addr);
+       else
+               return addr;
+
+       BUG_ON(!page);
+
+       return (void *)set_fixmap_offset(fixmap, page_to_phys(page) +
+                                        (uintaddr & ~PAGE_MASK));
+}
+
+static void __kprobes patch_unmap(int fixmap)
+{
+       clear_fixmap(fixmap);
+}
+
+static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
+{
+       void *waddr = addr;
+       bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE;
+       unsigned long flags = 0;
+       int ret;
+
+       raw_spin_lock_irqsave(&patch_lock, flags);
+
+       if (across_pages)
+               patch_map(addr + len, FIX_TEXT_POKE1);
+
+       waddr = patch_map(addr, FIX_TEXT_POKE0);
+
+       ret = probe_kernel_write(waddr, insn, len);
+
+       patch_unmap(FIX_TEXT_POKE0);
+
+       if (across_pages)
+               patch_unmap(FIX_TEXT_POKE1);
+
+       raw_spin_unlock_irqrestore(&patch_lock, flags);
+
+       return ret;
+}
+#else
+static int __kprobes riscv_insn_write(void *addr, const void *insn, size_t len)
+{
+       return probe_kernel_write(addr, insn, len);
+}
+#endif /* CONFIG_MMU */
+
+int __kprobes riscv_patch_text_nosync(void *addr, const void *insns, size_t len)
+{
+       u32 *tp = addr;
+       int ret;
+
+       ret = riscv_insn_write(tp, insns, len);
+
+       if (!ret)
+               flush_icache_range((uintptr_t) tp, (uintptr_t) tp + len);
+
+       return ret;
+}
+
+static int __kprobes riscv_patch_text_cb(void *data)
+{
+       struct riscv_insn_patch *patch = data;
+       int ret = 0;
+
+       if (atomic_inc_return(&patch->cpu_count) == 1) {
+               ret =
+                   riscv_patch_text_nosync(patch->addr, &patch->insn,
+                                           GET_INSN_LENGTH(patch->insn));
+               atomic_inc(&patch->cpu_count);
+       } else {
+               while (atomic_read(&patch->cpu_count) <= num_online_cpus())
+                       cpu_relax();
+               smp_mb();
+       }
+
+       return ret;
+}
+
+int __kprobes riscv_patch_text(void *addr, u32 insn)
+{
+       struct riscv_insn_patch patch = {
+               .addr = addr,
+               .insn = insn,
+               .cpu_count = ATOMIC_INIT(0),
+       };
+
+       return stop_machine_cpuslocked(riscv_patch_text_cb,
+                                      &patch, cpu_online_mask);
+                                      &patch, cpu_online_mask);
+}
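
As a usage sketch (hypothetical caller, not from this series): overwriting a single 32-bit instruction while other CPUs may be executing goes through riscv_patch_text(), which serializes everything with stop_machine() and so must not be called from atomic context; riscv_patch_text_nosync() is the variant for callers that provide their own synchronization, as the ftrace change above does. The NOP encoding below is the standard addi x0,x0,0.

    /* Illustrative only: replace the instruction at a made-up address with a NOP. */
    #include <asm/patch.h>

    #define RISCV_INSN_NOP  0x00000013      /* addi x0, x0, 0 */

    static int example_nop_out(void *target)
    {
            return riscv_patch_text(target, RISCV_INSN_NOP);
    }
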
index 817cf7b..610c11e 100644 (file)
@@ -22,6 +22,8 @@
 #include <asm/switch_to.h>
 #include <asm/thread_info.h>
 
+unsigned long gp_in_global __asm__("gp");
+
 extern asmlinkage void ret_from_fork(void);
 extern asmlinkage void ret_from_kernel_thread(void);
 
@@ -107,9 +109,8 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
        /* p->thread holds context to be restored by __switch_to() */
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* Kernel thread */
-               const register unsigned long gp __asm__ ("gp");
                memset(childregs, 0, sizeof(struct pt_regs));
-               childregs->gp = gp;
+               childregs->gp = gp_in_global;
                /* Supervisor/Machine, irqs on: */
                childregs->status = SR_PP | SR_PIE;
 
index f6c7c3e..7c24da5 100644 (file)
 // SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SBI initialization and all extension implementations.
+ *
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
 
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <asm/sbi.h>
+#include <asm/smp.h>
+
+/* default SBI version is 0.1 */
+unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT;
+EXPORT_SYMBOL(sbi_spec_version);
+
+static void (*__sbi_set_timer)(uint64_t stime);
+static int (*__sbi_send_ipi)(const unsigned long *hart_mask);
+static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask,
+                          unsigned long start, unsigned long size,
+                          unsigned long arg4, unsigned long arg5);
+
+struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
+                       unsigned long arg1, unsigned long arg2,
+                       unsigned long arg3, unsigned long arg4,
+                       unsigned long arg5)
+{
+       struct sbiret ret;
+
+       register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);
+       register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);
+       register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);
+       register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);
+       register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4);
+       register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5);
+       register uintptr_t a6 asm ("a6") = (uintptr_t)(fid);
+       register uintptr_t a7 asm ("a7") = (uintptr_t)(ext);
+       asm volatile ("ecall"
+                     : "+r" (a0), "+r" (a1)
+                     : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7)
+                     : "memory");
+       ret.error = a0;
+       ret.value = a1;
+
+       return ret;
+}
+EXPORT_SYMBOL(sbi_ecall);
+
+int sbi_err_map_linux_errno(int err)
+{
+       switch (err) {
+       case SBI_SUCCESS:
+               return 0;
+       case SBI_ERR_DENIED:
+               return -EPERM;
+       case SBI_ERR_INVALID_PARAM:
+               return -EINVAL;
+       case SBI_ERR_INVALID_ADDRESS:
+               return -EFAULT;
+       case SBI_ERR_NOT_SUPPORTED:
+       case SBI_ERR_FAILURE:
+       default:
+               return -ENOTSUPP;
+       };
+}
+EXPORT_SYMBOL(sbi_err_map_linux_errno);
+
+#ifdef CONFIG_RISCV_SBI_V01
+/**
+ * sbi_console_putchar() - Writes given character to the console device.
+ * @ch: The data to be written to the console.
+ *
+ * Return: None
+ */
+void sbi_console_putchar(int ch)
+{
+       sbi_ecall(SBI_EXT_0_1_CONSOLE_PUTCHAR, 0, ch, 0, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_console_putchar);
+
+/**
+ * sbi_console_getchar() - Reads a byte from console device.
+ *
+ * Returns the value read from console.
+ */
+int sbi_console_getchar(void)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_0_1_CONSOLE_GETCHAR, 0, 0, 0, 0, 0, 0, 0);
+
+       return ret.error;
+}
+EXPORT_SYMBOL(sbi_console_getchar);
+
+/**
+ * sbi_shutdown() - Remove all the harts from executing supervisor code.
+ *
+ * Return: None
+ */
+void sbi_shutdown(void)
+{
+       sbi_ecall(SBI_EXT_0_1_SHUTDOWN, 0, 0, 0, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_set_timer);
+
+/**
+ * sbi_clear_ipi() - Clear any pending IPIs for the calling hart.
+ *
+ * Return: None
+ */
+void sbi_clear_ipi(void)
+{
+       sbi_ecall(SBI_EXT_0_1_CLEAR_IPI, 0, 0, 0, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_shutdown);
+
+/**
+ * __sbi_set_timer_v01() - Program the timer for the next timer event.
+ * @stime_value: The value after which next timer event should fire.
+ *
+ * Return: None
+ */
+static void __sbi_set_timer_v01(uint64_t stime_value)
+{
+#if __riscv_xlen == 32
+       sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value,
+                 stime_value >> 32, 0, 0, 0, 0);
+#else
+       sbi_ecall(SBI_EXT_0_1_SET_TIMER, 0, stime_value, 0, 0, 0, 0, 0);
+#endif
+}
+
+static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
+{
+       sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask,
+                 0, 0, 0, 0, 0);
+       return 0;
+}
+
+static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask,
+                           unsigned long start, unsigned long size,
+                           unsigned long arg4, unsigned long arg5)
+{
+       int result = 0;
+
+       /* v0.2 function IDs are equivalent to v0.1 extension IDs */
+       switch (fid) {
+       case SBI_EXT_RFENCE_REMOTE_FENCE_I:
+               sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0,
+                         (unsigned long)hart_mask, 0, 0, 0, 0, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+               sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0,
+                         (unsigned long)hart_mask, start, size,
+                         0, 0, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+               sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0,
+                         (unsigned long)hart_mask, start, size,
+                         arg4, 0, 0);
+               break;
+       default:
+               pr_err("SBI call [%d] not supported in SBI v0.1\n", fid);
+               result = -EINVAL;
+       }
+
+       return result;
+}
+#else
+static void __sbi_set_timer_v01(uint64_t stime_value)
+{
+       pr_warn("Timer extension is not available in SBI v%lu.%lu\n",
+               sbi_major_version(), sbi_minor_version());
+}
+
+static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
+{
+       pr_warn("IPI extension is not available in SBI v%lu.%lu\n",
+               sbi_major_version(), sbi_minor_version());
+
+       return 0;
+}
+
+static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask,
+                           unsigned long start, unsigned long size,
+                           unsigned long arg4, unsigned long arg5)
+{
+       pr_warn("remote fence extension is not available in SBI v%lu.%lu\n",
+               sbi_major_version(), sbi_minor_version());
+
+       return 0;
+}
+#endif /* CONFIG_RISCV_SBI_V01 */
+
+static void __sbi_set_timer_v02(uint64_t stime_value)
+{
+#if __riscv_xlen == 32
+       sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER, stime_value,
+                 stime_value >> 32, 0, 0, 0, 0);
+#else
+       sbi_ecall(SBI_EXT_TIME, SBI_EXT_TIME_SET_TIMER, stime_value, 0,
+                 0, 0, 0, 0);
+#endif
+}
+
+static int __sbi_send_ipi_v02(const unsigned long *hart_mask)
+{
+       unsigned long hartid, hmask_val, hbase;
+       struct cpumask tmask;
+       struct sbiret ret = {0};
+       int result;
+
+       if (!hart_mask || !(*hart_mask)) {
+               riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask);
+               hart_mask = cpumask_bits(&tmask);
+       }
+
+       hmask_val = 0;
+       hbase = 0;
+       for_each_set_bit(hartid, hart_mask, NR_CPUS) {
+               if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) {
+                       ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI,
+                                       hmask_val, hbase, 0, 0, 0, 0);
+                       if (ret.error)
+                               goto ecall_failed;
+                       hmask_val = 0;
+                       hbase = 0;
+               }
+               if (!hmask_val)
+                       hbase = hartid;
+               hmask_val |= 1UL << (hartid - hbase);
+       }
+
+       if (hmask_val) {
+               ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI,
+                               hmask_val, hbase, 0, 0, 0, 0);
+               if (ret.error)
+                       goto ecall_failed;
+       }
+
+       return 0;
+
+ecall_failed:
+       result = sbi_err_map_linux_errno(ret.error);
+       pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n",
+              __func__, hbase, hmask_val, result);
+       return result;
+}
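
A concrete trace of the batching above may help (hart numbers invented, 64-bit kernel assumed, so BITS_PER_LONG = 64): with harts {1, 2, 65} set in the mask, harts 1 and 2 are accumulated relative to hbase = 1; hart 65 fails the "fits below hbase + BITS_PER_LONG" test and forces a flush, and the tail flush after the loop covers it:

    harts 1, 2   ->  hbase = 1,  hmask_val = 0x3
    hart 65      ->  flush: sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, 0x3, 1, ...)
                     then restart with hbase = 65, hmask_val = 0x1
    after loop   ->  flush: sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, 0x1, 65, ...)

__sbi_rfence_v02() below applies the same windowing to the fence calls.
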
+
+static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val,
+                                unsigned long hbase, unsigned long start,
+                                unsigned long size, unsigned long arg4,
+                                unsigned long arg5)
+{
+       struct sbiret ret = {0};
+       int ext = SBI_EXT_RFENCE;
+       int result = 0;
+
+       switch (fid) {
+       case SBI_EXT_RFENCE_REMOTE_FENCE_I:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+                               size, 0, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+                               size, arg4, 0);
+               break;
+
+       case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+                               size, 0, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+                               size, arg4, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+                               size, 0, 0);
+               break;
+       case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
+               ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+                               size, arg4, 0);
+               break;
+       default:
+               pr_err("unknown function ID [%lu] for SBI extension [%d]\n",
+                      fid, ext);
+               result = -EINVAL;
+       }
+
+       if (ret.error) {
+               result = sbi_err_map_linux_errno(ret.error);
+               pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n",
+                      __func__, hbase, hmask_val, result);
+       }
+
+       return result;
+}
+
+static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask,
+                           unsigned long start, unsigned long size,
+                           unsigned long arg4, unsigned long arg5)
+{
+       unsigned long hmask_val, hartid, hbase;
+       struct cpumask tmask;
+       int result;
+
+       if (!hart_mask || !(*hart_mask)) {
+               riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask);
+               hart_mask = cpumask_bits(&tmask);
+       }
+
+       hmask_val = 0;
+       hbase = 0;
+       for_each_set_bit(hartid, hart_mask, NR_CPUS) {
+               if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) {
+                       result = __sbi_rfence_v02_call(fid, hmask_val, hbase,
+                                                      start, size, arg4, arg5);
+                       if (result)
+                               return result;
+                       hmask_val = 0;
+                       hbase = 0;
+               }
+               if (!hmask_val)
+                       hbase = hartid;
+               hmask_val |= 1UL << (hartid - hbase);
+       }
+
+       if (hmask_val) {
+               result = __sbi_rfence_v02_call(fid, hmask_val, hbase,
+                                              start, size, arg4, arg5);
+               if (result)
+                       return result;
+       }
+
+       return 0;
+}
+
+/**
+ * sbi_set_timer() - Program the timer for the next timer event.
+ * @stime_value: The value after which next timer event should fire.
+ *
+ * Return: None
+ */
+void sbi_set_timer(uint64_t stime_value)
+{
+       __sbi_set_timer(stime_value);
+}
+
+/**
+ * sbi_send_ipi() - Send an IPI to any hart.
+ * @hart_mask: A cpu mask containing all the target harts.
+ *
+ * Return: None
+ */
+void sbi_send_ipi(const unsigned long *hart_mask)
+{
+       __sbi_send_ipi(hart_mask);
+}
+EXPORT_SYMBOL(sbi_send_ipi);
+
+/**
+ * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts.
+ * @hart_mask: A cpu mask containing all the target harts.
+ *
+ * Return: None
+ */
+void sbi_remote_fence_i(const unsigned long *hart_mask)
+{
+       __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I,
+                    hart_mask, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(sbi_remote_fence_i);
+
+/**
+ * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote
+ *                          harts for the specified virtual address range.
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the virtual address
+ * @size: Total size of the virtual address range.
+ *
+ * Return: None
+ */
+void sbi_remote_sfence_vma(const unsigned long *hart_mask,
+                          unsigned long start,
+                          unsigned long size)
+{
+       __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
+                    hart_mask, start, size, 0, 0);
+}
+EXPORT_SYMBOL(sbi_remote_sfence_vma);
+
+/**
+ * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given
+ * remote harts for a virtual address range belonging to a specific ASID.
+ *
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the virtual address
+ * @size: Total size of the virtual address range.
+ * @asid: The value of address space identifier (ASID).
+ *
+ * Return: None
+ */
+void sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
+                               unsigned long start,
+                               unsigned long size,
+                               unsigned long asid)
+{
+       __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
+                    hart_mask, start, size, asid, 0);
+}
+EXPORT_SYMBOL(sbi_remote_sfence_vma_asid);
+
+/**
+ * sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote
+ *                        harts for the specified guest physical address range.
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the guest physical address
+ * @size: Total size of the guest physical address range.
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+int sbi_remote_hfence_gvma(const unsigned long *hart_mask,
+                          unsigned long start,
+                          unsigned long size)
+{
+       return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
+                           hart_mask, start, size, 0, 0);
+}
+EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma);
+
+/**
+ * sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given
+ * remote harts for a guest physical address range belonging to a specific VMID.
+ *
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the guest physical address
+ * @size: Total size of the guest physical address range.
+ * @vmid: The value of guest ID (VMID).
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask,
+                               unsigned long start,
+                               unsigned long size,
+                               unsigned long vmid)
+{
+       return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID,
+                           hart_mask, start, size, vmid, 0);
+}
+EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid);
+
+/**
+ * sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote
+ *                          harts for the current guest virtual address range.
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the current guest virtual address
+ * @size: Total size of the current guest virtual address range.
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+int sbi_remote_hfence_vvma(const unsigned long *hart_mask,
+                          unsigned long start,
+                          unsigned long size)
+{
+       return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
+                           hart_mask, start, size, 0, 0);
+}
+EXPORT_SYMBOL(sbi_remote_hfence_vvma);
+
+/**
+ * sbi_remote_hfence_vvma_asid() - Execute HFENCE.VVMA instructions on given
+ * remote harts for current guest virtual address range belonging to a specific
+ * ASID.
+ *
+ * @hart_mask: A cpu mask containing all the target harts.
+ * @start: Start of the current guest virtual address
+ * @size: Total size of the current guest virtual address range.
+ * @asid: The value of address space identifier (ASID).
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask,
+                               unsigned long start,
+                               unsigned long size,
+                               unsigned long asid)
+{
+       return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID,
+                           hart_mask, start, size, asid, 0);
+}
+EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid);
+
+/**
+ * sbi_probe_extension() - Check if an SBI extension ID is supported or not.
+ * @extid: The extension ID to be probed.
+ *
+ * Return: Extension specific nonzero value if yes, -ENOTSUPP otherwise.
+ */
+int sbi_probe_extension(int extid)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_PROBE_EXT, extid,
+                       0, 0, 0, 0, 0);
+       if (!ret.error)
+               if (ret.value)
+                       return ret.value;
+
+       return -ENOTSUPP;
+}
+EXPORT_SYMBOL(sbi_probe_extension);
+
+static long __sbi_base_ecall(int fid)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_BASE, fid, 0, 0, 0, 0, 0, 0);
+       if (!ret.error)
+               return ret.value;
+       else
+               return sbi_err_map_linux_errno(ret.error);
+}
+
+static inline long sbi_get_spec_version(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_SPEC_VERSION);
+}
+
+static inline long sbi_get_firmware_id(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_ID);
+}
+
+static inline long sbi_get_firmware_version(void)
+{
+       return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION);
+}
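sbi_major_version() and sbi_minor_version(), used by sbi_init() below, are not part of this hunk. Per the SBI v0.2 specification the value returned for SBI_EXT_BASE_GET_SPEC_VERSION carries the major number in bits [30:24] and the minor number in bits [23:0], so the helpers are expected to look roughly like this sketch (based on the spec, not on this diff; the real definitions live in asm/sbi.h):

    #define SBI_SPEC_VERSION_MAJOR_SHIFT    24
    #define SBI_SPEC_VERSION_MAJOR_MASK     0x7f
    #define SBI_SPEC_VERSION_MINOR_MASK     0xffffff

    static inline unsigned long sbi_major_version(void)
    {
            return (sbi_spec_version >> SBI_SPEC_VERSION_MAJOR_SHIFT) &
                    SBI_SPEC_VERSION_MAJOR_MASK;
    }

    static inline unsigned long sbi_minor_version(void)
    {
            return sbi_spec_version & SBI_SPEC_VERSION_MINOR_MASK;
    }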
 
 static void sbi_power_off(void)
 {
        sbi_shutdown();
 }
 
-static int __init sbi_init(void)
+int __init sbi_init(void)
 {
+       int ret;
+
        pm_power_off = sbi_power_off;
+       ret = sbi_get_spec_version();
+       if (ret > 0)
+               sbi_spec_version = ret;
+
+       pr_info("SBI specification v%lu.%lu detected\n",
+               sbi_major_version(), sbi_minor_version());
+
+       if (!sbi_spec_is_0_1()) {
+               pr_info("SBI implementation ID=0x%lx Version=0x%lx\n",
+                       sbi_get_firmware_id(), sbi_get_firmware_version());
+               if (sbi_probe_extension(SBI_EXT_TIME) > 0) {
+                       __sbi_set_timer = __sbi_set_timer_v02;
+                       pr_info("SBI v0.2 TIME extension detected\n");
+               } else {
+                       __sbi_set_timer = __sbi_set_timer_v01;
+               }
+               if (sbi_probe_extension(SBI_EXT_IPI) > 0) {
+                       __sbi_send_ipi  = __sbi_send_ipi_v02;
+                       pr_info("SBI v0.2 IPI extension detected\n");
+               } else {
+                       __sbi_send_ipi  = __sbi_send_ipi_v01;
+               }
+               if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) {
+                       __sbi_rfence    = __sbi_rfence_v02;
+                       pr_info("SBI v0.2 RFENCE extension detected\n");
+               } else {
+                       __sbi_rfence    = __sbi_rfence_v01;
+               }
+       } else {
+               __sbi_set_timer = __sbi_set_timer_v01;
+               __sbi_send_ipi  = __sbi_send_ipi_v01;
+               __sbi_rfence    = __sbi_rfence_v01;
+       }
+
        return 0;
 }
-early_initcall(sbi_init);
index 0a6d415..145128a 100644 (file)
 #include <linux/of_platform.h>
 #include <linux/sched/task.h>
 #include <linux/swiotlb.h>
+#include <linux/smp.h>
 
 #include <asm/clint.h>
+#include <asm/cpu_ops.h>
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
-#include <asm/smp.h>
+#include <asm/sbi.h>
 #include <asm/tlbflush.h>
 #include <asm/thread_info.h>
 #include <asm/kasan.h>
@@ -39,9 +41,14 @@ struct screen_info screen_info = {
 };
 #endif
 
-/* The lucky hart to first increment this variable will boot the other cores */
-atomic_t hart_lottery;
+/*
+ * The lucky hart to first increment this variable will boot the other cores.
+ * This is used before the kernel initializes the BSS so it can't be in the
+ * BSS.
+ */
+atomic_t hart_lottery __section(.sdata);
 unsigned long boot_cpu_hartid;
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
 
 void __init parse_dtb(void)
 {
@@ -79,9 +86,28 @@ void __init setup_arch(char **cmdline_p)
        kasan_init();
 #endif
 
+#if IS_ENABLED(CONFIG_RISCV_SBI)
+       sbi_init();
+#endif
+
 #ifdef CONFIG_SMP
        setup_smp();
 #endif
 
        riscv_fill_hwcap();
 }
+
+static int __init topology_init(void)
+{
+       int i;
+
+       for_each_possible_cpu(i) {
+               struct cpu *cpu = &per_cpu(cpu_devices, i);
+
+               cpu->hotpluggable = cpu_has_hotplug(i);
+               register_cpu(cpu, i);
+       }
+
+       return 0;
+}
+subsys_initcall(topology_init);
index 8bc01f0..4e99227 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/sched/task_stack.h>
 #include <linux/sched/mm.h>
 #include <asm/clint.h>
+#include <asm/cpu_ops.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -34,8 +35,6 @@
 
 #include "head.h"
 
-void *__cpu_up_stack_pointer[NR_CPUS];
-void *__cpu_up_task_pointer[NR_CPUS];
 static DECLARE_COMPLETION(cpu_running);
 
 void __init smp_prepare_boot_cpu(void)
@@ -46,6 +45,7 @@ void __init smp_prepare_boot_cpu(void)
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
        int cpuid;
+       int ret;
 
        /* This covers non-smp usecase mandated by "nosmp" option */
        if (max_cpus == 0)
@@ -54,6 +54,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
        for_each_possible_cpu(cpuid) {
                if (cpuid == smp_processor_id())
                        continue;
+               if (cpu_ops[cpuid]->cpu_prepare) {
+                       ret = cpu_ops[cpuid]->cpu_prepare(cpuid);
+                       if (ret)
+                               continue;
+               }
                set_cpu_present(cpuid, true);
        }
 }
@@ -65,6 +70,8 @@ void __init setup_smp(void)
        bool found_boot_cpu = false;
        int cpuid = 1;
 
+       cpu_set_ops(0);
+
        for_each_of_cpu_node(dn) {
                hart = riscv_of_processor_hartid(dn);
                if (hart < 0)
@@ -92,36 +99,38 @@ void __init setup_smp(void)
                        cpuid, nr_cpu_ids);
 
        for (cpuid = 1; cpuid < nr_cpu_ids; cpuid++) {
-               if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID)
+               if (cpuid_to_hartid_map(cpuid) != INVALID_HARTID) {
+                       cpu_set_ops(cpuid);
                        set_cpu_possible(cpuid, true);
+               }
        }
 }
 
+int start_secondary_cpu(int cpu, struct task_struct *tidle)
+{
+       if (cpu_ops[cpu]->cpu_start)
+               return cpu_ops[cpu]->cpu_start(cpu, tidle);
+
+       return -EOPNOTSUPP;
+}
+
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
        int ret = 0;
-       int hartid = cpuid_to_hartid_map(cpu);
        tidle->thread_info.cpu = cpu;
 
-       /*
-        * On RISC-V systems, all harts boot on their own accord.  Our _start
-        * selects the first hart to boot the kernel and causes the remainder
-        * of the harts to spin in a loop waiting for their stack pointer to be
-        * setup by that main hart.  Writing __cpu_up_stack_pointer signals to
-        * the spinning harts that they can continue the boot process.
-        */
-       smp_mb();
-       WRITE_ONCE(__cpu_up_stack_pointer[hartid],
-                 task_stack_page(tidle) + THREAD_SIZE);
-       WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
-
-       lockdep_assert_held(&cpu_running);
-       wait_for_completion_timeout(&cpu_running,
+       ret = start_secondary_cpu(cpu, tidle);
+       if (!ret) {
+               lockdep_assert_held(&cpu_running);
+               wait_for_completion_timeout(&cpu_running,
                                            msecs_to_jiffies(1000));
 
-       if (!cpu_online(cpu)) {
-               pr_crit("CPU%u: failed to come online\n", cpu);
-               ret = -EIO;
+               if (!cpu_online(cpu)) {
+                       pr_crit("CPU%u: failed to come online\n", cpu);
+                       ret = -EIO;
+               }
+       } else {
+               pr_crit("CPU%u: failed to start\n", cpu);
        }
 
        return ret;
@@ -134,7 +143,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 /*
  * C entry point for a secondary processor.
  */
-asmlinkage __visible void __init smp_callin(void)
+asmlinkage __visible void smp_callin(void)
 {
        struct mm_struct *mm = &init_mm;
 
diff --git a/arch/riscv/kernel/soc.c b/arch/riscv/kernel/soc.c
new file mode 100644 (file)
index 0000000..0b3b3dc
--- /dev/null
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include <linux/init.h>
+#include <linux/libfdt.h>
+#include <asm/pgtable.h>
+#include <asm/soc.h>
+
+/*
+ * This is called extremely early, before parse_dtb(), to allow initializing
+ * SoC hardware before memory or any device driver initialization.
+ */
+void __init soc_early_init(void)
+{
+       void (*early_fn)(const void *fdt);
+       const struct of_device_id *s;
+       const void *fdt = dtb_early_va;
+
+       for (s = (void *)&__soc_early_init_table_start;
+            (void *)s < (void *)&__soc_early_init_table_end; s++) {
+               if (!fdt_node_check_compatible(fdt, 0, s->compatible)) {
+                       early_fn = s->data;
+                       early_fn(fdt);
+                       return;
+               }
+       }
+}
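Entries in __soc_early_init_table are plain struct of_device_id records whose .data member is the early-init callback, placed in the dedicated section that the linker script below KEEPs. An SoC would register one roughly as follows; the identifier, compatible string, and the expectation that asm/soc.h provides a wrapper macro for this are assumptions, only the section name and the iteration over .compatible/.data are taken from this patch.

    /* Illustrative registration of an early SoC hook. */
    static void vendor_soc_early_init(const void *fdt)
    {
            /* minimal fixups needed before memory or driver init */
    }

    static const struct of_device_id __soc_early_init_vendor
            __used __section(__soc_early_init_table) = {
            .compatible = "vendor,example-soc",
            .data = vendor_soc_early_init,
    };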
index 0940681..02087fe 100644 (file)
@@ -19,6 +19,8 @@ struct stackframe {
        unsigned long ra;
 };
 
+register unsigned long sp_in_global __asm__("sp");
+
 void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
                             bool (*fn)(unsigned long, void *), void *arg)
 {
@@ -29,7 +31,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
                sp = user_stack_pointer(regs);
                pc = instruction_pointer(regs);
        } else if (task == NULL || task == current) {
-               const register unsigned long current_sp __asm__ ("sp");
+               const register unsigned long current_sp = sp_in_global;
                fp = (unsigned long)__builtin_frame_address(0);
                sp = current_sp;
                pc = (unsigned long)walk_stackframe;
@@ -73,8 +75,7 @@ static void notrace walk_stackframe(struct task_struct *task,
                sp = user_stack_pointer(regs);
                pc = instruction_pointer(regs);
        } else if (task == NULL || task == current) {
-               const register unsigned long current_sp __asm__ ("sp");
-               sp = current_sp;
+               sp = sp_in_global;
                pc = (unsigned long)walk_stackframe;
        } else {
                /* task blocked in __switch_to */
index 55ea614..7f58fa5 100644 (file)
@@ -97,12 +97,33 @@ DO_ERROR_INFO(do_trap_insn_fault,
        SIGSEGV, SEGV_ACCERR, "instruction access fault");
 DO_ERROR_INFO(do_trap_insn_illegal,
        SIGILL, ILL_ILLOPC, "illegal instruction");
-DO_ERROR_INFO(do_trap_load_misaligned,
-       SIGBUS, BUS_ADRALN, "load address misaligned");
 DO_ERROR_INFO(do_trap_load_fault,
        SIGSEGV, SEGV_ACCERR, "load access fault");
+#ifndef CONFIG_RISCV_M_MODE
+DO_ERROR_INFO(do_trap_load_misaligned,
+       SIGBUS, BUS_ADRALN, "Oops - load address misaligned");
 DO_ERROR_INFO(do_trap_store_misaligned,
-       SIGBUS, BUS_ADRALN, "store (or AMO) address misaligned");
+       SIGBUS, BUS_ADRALN, "Oops - store (or AMO) address misaligned");
+#else
+int handle_misaligned_load(struct pt_regs *regs);
+int handle_misaligned_store(struct pt_regs *regs);
+
+asmlinkage void do_trap_load_misaligned(struct pt_regs *regs)
+{
+       if (!handle_misaligned_load(regs))
+               return;
+       do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
+                     "Oops - load address misaligned");
+}
+
+asmlinkage void do_trap_store_misaligned(struct pt_regs *regs)
+{
+       if (!handle_misaligned_store(regs))
+               return;
+       do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc,
+                     "Oops - store (or AMO) address misaligned");
+}
+#endif
 DO_ERROR_INFO(do_trap_store_fault,
        SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault");
 DO_ERROR_INFO(do_trap_ecall_u,
@@ -118,7 +139,8 @@ static inline unsigned long get_break_insn_length(unsigned long pc)
 
        if (probe_kernel_address((bug_insn_t *)pc, insn))
                return 0;
-       return (((insn & __INSN_LENGTH_MASK) == __INSN_LENGTH_32) ? 4UL : 2UL);
+
+       return GET_INSN_LENGTH(insn);
 }
 
 asmlinkage __visible void do_trap_break(struct pt_regs *regs)
@@ -147,7 +169,7 @@ int is_valid_bugaddr(unsigned long pc)
 }
 #endif /* CONFIG_GENERIC_BUG */
 
-void __init trap_init(void)
+void trap_init(void)
 {
        /*
         * Set sup0 scratch register to 0, indicating to exception vector
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
new file mode 100644 (file)
index 0000000..46c4daf
--- /dev/null
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/csr.h>
+
+#define INSN_MATCH_LB                  0x3
+#define INSN_MASK_LB                   0x707f
+#define INSN_MATCH_LH                  0x1003
+#define INSN_MASK_LH                   0x707f
+#define INSN_MATCH_LW                  0x2003
+#define INSN_MASK_LW                   0x707f
+#define INSN_MATCH_LD                  0x3003
+#define INSN_MASK_LD                   0x707f
+#define INSN_MATCH_LBU                 0x4003
+#define INSN_MASK_LBU                  0x707f
+#define INSN_MATCH_LHU                 0x5003
+#define INSN_MASK_LHU                  0x707f
+#define INSN_MATCH_LWU                 0x6003
+#define INSN_MASK_LWU                  0x707f
+#define INSN_MATCH_SB                  0x23
+#define INSN_MASK_SB                   0x707f
+#define INSN_MATCH_SH                  0x1023
+#define INSN_MASK_SH                   0x707f
+#define INSN_MATCH_SW                  0x2023
+#define INSN_MASK_SW                   0x707f
+#define INSN_MATCH_SD                  0x3023
+#define INSN_MASK_SD                   0x707f
+
+#define INSN_MATCH_FLW                 0x2007
+#define INSN_MASK_FLW                  0x707f
+#define INSN_MATCH_FLD                 0x3007
+#define INSN_MASK_FLD                  0x707f
+#define INSN_MATCH_FLQ                 0x4007
+#define INSN_MASK_FLQ                  0x707f
+#define INSN_MATCH_FSW                 0x2027
+#define INSN_MASK_FSW                  0x707f
+#define INSN_MATCH_FSD                 0x3027
+#define INSN_MASK_FSD                  0x707f
+#define INSN_MATCH_FSQ                 0x4027
+#define INSN_MASK_FSQ                  0x707f
+
+#define INSN_MATCH_C_LD                        0x6000
+#define INSN_MASK_C_LD                 0xe003
+#define INSN_MATCH_C_SD                        0xe000
+#define INSN_MASK_C_SD                 0xe003
+#define INSN_MATCH_C_LW                        0x4000
+#define INSN_MASK_C_LW                 0xe003
+#define INSN_MATCH_C_SW                        0xc000
+#define INSN_MASK_C_SW                 0xe003
+#define INSN_MATCH_C_LDSP              0x6002
+#define INSN_MASK_C_LDSP               0xe003
+#define INSN_MATCH_C_SDSP              0xe002
+#define INSN_MASK_C_SDSP               0xe003
+#define INSN_MATCH_C_LWSP              0x4002
+#define INSN_MASK_C_LWSP               0xe003
+#define INSN_MATCH_C_SWSP              0xc002
+#define INSN_MASK_C_SWSP               0xe003
+
+#define INSN_MATCH_C_FLD               0x2000
+#define INSN_MASK_C_FLD                        0xe003
+#define INSN_MATCH_C_FLW               0x6000
+#define INSN_MASK_C_FLW                        0xe003
+#define INSN_MATCH_C_FSD               0xa000
+#define INSN_MASK_C_FSD                        0xe003
+#define INSN_MATCH_C_FSW               0xe000
+#define INSN_MASK_C_FSW                        0xe003
+#define INSN_MATCH_C_FLDSP             0x2002
+#define INSN_MASK_C_FLDSP              0xe003
+#define INSN_MATCH_C_FSDSP             0xa002
+#define INSN_MASK_C_FSDSP              0xe003
+#define INSN_MATCH_C_FLWSP             0x6002
+#define INSN_MASK_C_FLWSP              0xe003
+#define INSN_MATCH_C_FSWSP             0xe002
+#define INSN_MASK_C_FSWSP              0xe003
+
+#define INSN_LEN(insn)                 ((((insn) & 0x3) < 0x3) ? 2 : 4)
+
+#if defined(CONFIG_64BIT)
+#define LOG_REGBYTES                   3
+#define XLEN                           64
+#else
+#define LOG_REGBYTES                   2
+#define XLEN                           32
+#endif
+#define REGBYTES                       (1 << LOG_REGBYTES)
+#define XLEN_MINUS_16                  ((XLEN) - 16)
+
+#define SH_RD                          7
+#define SH_RS1                         15
+#define SH_RS2                         20
+#define SH_RS2C                                2
+
+#define RV_X(x, s, n)                  (((x) >> (s)) & ((1 << (n)) - 1))
+#define RVC_LW_IMM(x)                  ((RV_X(x, 6, 1) << 2) | \
+                                        (RV_X(x, 10, 3) << 3) | \
+                                        (RV_X(x, 5, 1) << 6))
+#define RVC_LD_IMM(x)                  ((RV_X(x, 10, 3) << 3) | \
+                                        (RV_X(x, 5, 2) << 6))
+#define RVC_LWSP_IMM(x)                        ((RV_X(x, 4, 3) << 2) | \
+                                        (RV_X(x, 12, 1) << 5) | \
+                                        (RV_X(x, 2, 2) << 6))
+#define RVC_LDSP_IMM(x)                        ((RV_X(x, 5, 2) << 3) | \
+                                        (RV_X(x, 12, 1) << 5) | \
+                                        (RV_X(x, 2, 3) << 6))
+#define RVC_SWSP_IMM(x)                        ((RV_X(x, 9, 4) << 2) | \
+                                        (RV_X(x, 7, 2) << 6))
+#define RVC_SDSP_IMM(x)                        ((RV_X(x, 10, 3) << 3) | \
+                                        (RV_X(x, 7, 3) << 6))
+#define RVC_RS1S(insn)                 (8 + RV_X(insn, SH_RD, 3))
+#define RVC_RS2S(insn)                 (8 + RV_X(insn, SH_RS2C, 3))
+#define RVC_RS2(insn)                  RV_X(insn, SH_RS2C, 5)
+
+#define SHIFT_RIGHT(x, y)              \
+       ((y) < 0 ? ((x) << -(y)) : ((x) >> (y)))
+
+#define REG_MASK                       \
+       ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES))
+
+#define REG_OFFSET(insn, pos)          \
+       (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK)
+
+#define REG_PTR(insn, pos, regs)       \
+       (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
+
+#define GET_RM(insn)                   (((insn) >> 12) & 7)
+
+#define GET_RS1(insn, regs)            (*REG_PTR(insn, SH_RS1, regs))
+#define GET_RS2(insn, regs)            (*REG_PTR(insn, SH_RS2, regs))
+#define GET_RS1S(insn, regs)           (*REG_PTR(RVC_RS1S(insn), 0, regs))
+#define GET_RS2S(insn, regs)           (*REG_PTR(RVC_RS2S(insn), 0, regs))
+#define GET_RS2C(insn, regs)           (*REG_PTR(insn, SH_RS2C, regs))
+#define GET_SP(regs)                   (*REG_PTR(2, 0, regs))
+#define SET_RD(insn, regs, val)                (*REG_PTR(insn, SH_RD, regs) = (val))
+#define IMM_I(insn)                    ((s32)(insn) >> 20)
+#define IMM_S(insn)                    (((s32)(insn) >> 25 << 5) | \
+                                        (s32)(((insn) >> 7) & 0x1f))
+#define MASK_FUNCT3                    0x7000
+
+#define GET_PRECISION(insn) (((insn) >> 25) & 3)
+#define GET_RM(insn) (((insn) >> 12) & 7)
+#define PRECISION_S 0
+#define PRECISION_D 1
+
+#define STR(x) XSTR(x)
+#define XSTR(x) #x
+
+#define DECLARE_UNPRIVILEGED_LOAD_FUNCTION(type, insn)                 \
+static inline type load_##type(const type *addr)                       \
+{                                                                      \
+       type val;                                                       \
+       asm (#insn " %0, %1"                                            \
+       : "=&r" (val) : "m" (*addr));                                   \
+       return val;                                                     \
+}
+
+#define DECLARE_UNPRIVILEGED_STORE_FUNCTION(type, insn)                        \
+static inline void store_##type(type *addr, type val)                  \
+{                                                                      \
+       asm volatile (#insn " %0, %1\n"                                 \
+       : : "r" (val), "m" (*addr));                                    \
+}
+
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u8, lbu)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u16, lhu)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s8, lb)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s16, lh)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(s32, lw)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u8, sb)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u16, sh)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u32, sw)
+#if defined(CONFIG_64BIT)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lwu)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u64, ld)
+DECLARE_UNPRIVILEGED_STORE_FUNCTION(u64, sd)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, ld)
+#else
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(u32, lw)
+DECLARE_UNPRIVILEGED_LOAD_FUNCTION(ulong, lw)
+
+static inline u64 load_u64(const u64 *addr)
+{
+       return load_u32((u32 *)addr)
+               + ((u64)load_u32((u32 *)addr + 1) << 32);
+}
+
+static inline void store_u64(u64 *addr, u64 val)
+{
+       store_u32((u32 *)addr, val);
+       store_u32((u32 *)addr + 1, val >> 32);
+}
+#endif
+
+static inline ulong get_insn(ulong mepc)
+{
+       register ulong __mepc asm ("a2") = mepc;
+       ulong val, rvc_mask = 3, tmp;
+
+       asm ("and %[tmp], %[addr], 2\n"
+               "bnez %[tmp], 1f\n"
+#if defined(CONFIG_64BIT)
+               STR(LWU) " %[insn], (%[addr])\n"
+#else
+               STR(LW) " %[insn], (%[addr])\n"
+#endif
+               "and %[tmp], %[insn], %[rvc_mask]\n"
+               "beq %[tmp], %[rvc_mask], 2f\n"
+               "sll %[insn], %[insn], %[xlen_minus_16]\n"
+               "srl %[insn], %[insn], %[xlen_minus_16]\n"
+               "j 2f\n"
+               "1:\n"
+               "lhu %[insn], (%[addr])\n"
+               "and %[tmp], %[insn], %[rvc_mask]\n"
+               "bne %[tmp], %[rvc_mask], 2f\n"
+               "lhu %[tmp], 2(%[addr])\n"
+               "sll %[tmp], %[tmp], 16\n"
+               "add %[insn], %[insn], %[tmp]\n"
+               "2:"
+       : [insn] "=&r" (val), [tmp] "=&r" (tmp)
+       : [addr] "r" (__mepc), [rvc_mask] "r" (rvc_mask),
+         [xlen_minus_16] "i" (XLEN_MINUS_16));
+
+       return val;
+}
+
+union reg_data {
+       u8 data_bytes[8];
+       ulong data_ulong;
+       u64 data_u64;
+};
+
+int handle_misaligned_load(struct pt_regs *regs)
+{
+       union reg_data val;
+       unsigned long epc = regs->epc;
+       unsigned long insn = get_insn(epc);
+       unsigned long addr = csr_read(mtval);
+       int i, fp = 0, shift = 0, len = 0;
+
+       regs->epc = 0;
+
+       if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
+               len = 4;
+               shift = 8 * (sizeof(unsigned long) - len);
+#if defined(CONFIG_64BIT)
+       } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
+               len = 8;
+               shift = 8 * (sizeof(unsigned long) - len);
+       } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) {
+               len = 4;
+#endif
+       } else if ((insn & INSN_MASK_FLD) == INSN_MATCH_FLD) {
+               fp = 1;
+               len = 8;
+       } else if ((insn & INSN_MASK_FLW) == INSN_MATCH_FLW) {
+               fp = 1;
+               len = 4;
+       } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) {
+               len = 2;
+               shift = 8 * (sizeof(unsigned long) - len);
+       } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) {
+               len = 2;
+#if defined(CONFIG_64BIT)
+       } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) {
+               len = 8;
+               shift = 8 * (sizeof(unsigned long) - len);
+               insn = RVC_RS2S(insn) << SH_RD;
+       } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP &&
+                  ((insn >> SH_RD) & 0x1f)) {
+               len = 8;
+               shift = 8 * (sizeof(unsigned long) - len);
+#endif
+       } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) {
+               len = 4;
+               shift = 8 * (sizeof(unsigned long) - len);
+               insn = RVC_RS2S(insn) << SH_RD;
+       } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP &&
+                  ((insn >> SH_RD) & 0x1f)) {
+               len = 4;
+               shift = 8 * (sizeof(unsigned long) - len);
+       } else if ((insn & INSN_MASK_C_FLD) == INSN_MATCH_C_FLD) {
+               fp = 1;
+               len = 8;
+               insn = RVC_RS2S(insn) << SH_RD;
+       } else if ((insn & INSN_MASK_C_FLDSP) == INSN_MATCH_C_FLDSP) {
+               fp = 1;
+               len = 8;
+#if defined(CONFIG_32BIT)
+       } else if ((insn & INSN_MASK_C_FLW) == INSN_MATCH_C_FLW) {
+               fp = 1;
+               len = 4;
+               insn = RVC_RS2S(insn) << SH_RD;
+       } else if ((insn & INSN_MASK_C_FLWSP) == INSN_MATCH_C_FLWSP) {
+               fp = 1;
+               len = 4;
+#endif
+       } else {
+               regs->epc = epc;
+               return -1;
+       }
+
+       val.data_u64 = 0;
+       for (i = 0; i < len; i++)
+               val.data_bytes[i] = load_u8((void *)(addr + i));
+
+       if (fp)
+               return -1;
+       SET_RD(insn, regs, val.data_ulong << shift >> shift);
+
+       regs->epc = epc + INSN_LEN(insn);
+
+       return 0;
+}
+
+int handle_misaligned_store(struct pt_regs *regs)
+{
+       union reg_data val;
+       unsigned long epc = regs->epc;
+       unsigned long insn = get_insn(epc);
+       unsigned long addr = csr_read(mtval);
+       int i, len = 0;
+
+       regs->epc = 0;
+
+       val.data_ulong = GET_RS2(insn, regs);
+
+       if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) {
+               len = 4;
+#if defined(CONFIG_64BIT)
+       } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) {
+               len = 8;
+#endif
+       } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) {
+               len = 2;
+#if defined(CONFIG_64BIT)
+       } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) {
+               len = 8;
+               val.data_ulong = GET_RS2S(insn, regs);
+       } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP &&
+                  ((insn >> SH_RD) & 0x1f)) {
+               len = 8;
+               val.data_ulong = GET_RS2C(insn, regs);
+#endif
+       } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) {
+               len = 4;
+               val.data_ulong = GET_RS2S(insn, regs);
+       } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP &&
+                  ((insn >> SH_RD) & 0x1f)) {
+               len = 4;
+               val.data_ulong = GET_RS2C(insn, regs);
+       } else {
+               regs->epc = epc;
+               return -1;
+       }
+
+       for (i = 0; i < len; i++)
+               store_u8((void *)(addr + i), val.data_bytes[i]);
+
+       regs->epc = epc + INSN_LEN(insn);
+
+       return 0;
+}
index 1e0193d..0339b6b 100644 (file)
@@ -9,7 +9,9 @@
 #include <asm/page.h>
 #include <asm/cache.h>
 #include <asm/thread_info.h>
+#include <asm/set_memory.h>
 
+#include <linux/sizes.h>
 OUTPUT_ARCH(riscv)
 ENTRY(_start)
 
@@ -20,10 +22,18 @@ SECTIONS
        /* Beginning of code and text segment */
        . = LOAD_OFFSET;
        _start = .;
-       __init_begin = .;
        HEAD_TEXT_SECTION
+       . = ALIGN(PAGE_SIZE);
+
+       __init_begin = .;
        INIT_TEXT_SECTION(PAGE_SIZE)
        INIT_DATA_SECTION(16)
+       . = ALIGN(8);
+       __soc_early_init_table : {
+               __soc_early_init_table_start = .;
+               KEEP(*(__soc_early_init_table))
+               __soc_early_init_table_end = .;
+       }
        /* we have to discard exit text and such at runtime, not link time */
        .exit.text :
        {
@@ -36,6 +46,7 @@ SECTIONS
        PERCPU_SECTION(L1_CACHE_BYTES)
        __init_end = .;
 
+       . = ALIGN(SECTION_ALIGN);
        .text : {
                _text = .;
                _stext = .;
@@ -53,24 +64,26 @@ SECTIONS
 
        /* Start of data section */
        _sdata = .;
-       RO_DATA(L1_CACHE_BYTES)
+       RO_DATA(SECTION_ALIGN)
        .srodata : {
                *(.srodata*)
        }
 
+       EXCEPTION_TABLE(0x10)
+
+       . = ALIGN(SECTION_ALIGN);
+       _data = .;
+
        RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
        .sdata : {
                __global_pointer$ = . + 0x800;
                *(.sdata*)
                /* End of data section */
                _edata = .;
-               *(.sbss*)
        }
 
        BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
 
-       EXCEPTION_TABLE(0x10)
-
        .rel.dyn : {
                *(.rel.dyn*)
        }
index f29d2ba..fceaeb1 100644 (file)
@@ -3,14 +3,12 @@
 #include <asm/asm.h>
 #include <asm/csr.h>
 
-       .altmacro
        .macro fixup op reg addr lbl
-       LOCAL _epc
-_epc:
+100:
        \op \reg, \addr
        .section __ex_table,"a"
        .balign RISCV_SZPTR
-       RISCV_PTR _epc, \lbl
+       RISCV_PTR 100b, \lbl
        .previous
        .endm
 
index 50b7af5..363ef01 100644 (file)
@@ -7,7 +7,7 @@ endif
 
 obj-y += init.o
 obj-y += extable.o
-obj-$(CONFIG_MMU) += fault.o
+obj-$(CONFIG_MMU) += fault.o pageattr.o
 obj-y += cacheflush.o
 obj-y += context.o
 
@@ -15,6 +15,7 @@ ifeq ($(CONFIG_MMU),y)
 obj-$(CONFIG_SMP) += tlbflush.o
 endif
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
 obj-$(CONFIG_KASAN)   += kasan_init.o
 
 ifdef CONFIG_KASAN
index 0d4747e..a6189ed 100644 (file)
@@ -4,14 +4,12 @@
 
 int pud_huge(pud_t pud)
 {
-       return pud_present(pud) &&
-               (pud_val(pud) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));
+       return pud_leaf(pud);
 }
 
 int pmd_huge(pmd_t pmd)
 {
-       return pmd_present(pmd) &&
-               (pmd_val(pmd) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC));
+       return pmd_leaf(pmd);
 }
 
 static __init int setup_hugepagesz(char *opt)
index fab8559..b55be44 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/sizes.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <linux/set_memory.h>
 
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
@@ -477,6 +478,17 @@ static void __init setup_vm_final(void)
        csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
        local_flush_tlb_all();
 }
+
+void free_initmem(void)
+{
+       unsigned long init_begin = (unsigned long)__init_begin;
+       unsigned long init_end = (unsigned long)__init_end;
+
+       /* Mark the region as non-executable. */
+       set_memory_nx(init_begin, (init_end - init_begin) >> PAGE_SHIFT);
+       free_initmem_default(POISON_FREE_INITMEM);
+}
+
 #else
 asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 {
@@ -488,6 +500,38 @@ static inline void setup_vm_final(void)
 }
 #endif /* CONFIG_MMU */
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void set_kernel_text_rw(void)
+{
+       unsigned long text_start = (unsigned long)_text;
+       unsigned long text_end = (unsigned long)_etext;
+
+       set_memory_rw(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+       unsigned long text_start = (unsigned long)_text;
+       unsigned long text_end = (unsigned long)_etext;
+
+       set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+       unsigned long text_start = (unsigned long)_text;
+       unsigned long text_end = (unsigned long)_etext;
+       unsigned long rodata_start = (unsigned long)__start_rodata;
+       unsigned long data_start = (unsigned long)_data;
+       unsigned long max_low = (unsigned long)(__va(PFN_PHYS(max_low_pfn)));
+
+       set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+       set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT);
+       set_memory_nx(data_start, (max_low - data_start) >> PAGE_SHIFT);
+}
+#endif
+
 void __init paging_init(void)
 {
        setup_vm_final();
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
new file mode 100644 (file)
index 0000000..728759e
--- /dev/null
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#include <linux/pagewalk.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/bitops.h>
+
+struct pageattr_masks {
+       pgprot_t set_mask;
+       pgprot_t clear_mask;
+};
+
+static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk)
+{
+       struct pageattr_masks *masks = walk->private;
+       unsigned long new_val = val;
+
+       new_val &= ~(pgprot_val(masks->clear_mask));
+       new_val |= (pgprot_val(masks->set_mask));
+
+       return new_val;
+}
+
+static int pageattr_pgd_entry(pgd_t *pgd, unsigned long addr,
+                             unsigned long next, struct mm_walk *walk)
+{
+       pgd_t val = READ_ONCE(*pgd);
+
+       if (pgd_leaf(val)) {
+               val = __pgd(set_pageattr_masks(pgd_val(val), walk));
+               set_pgd(pgd, val);
+       }
+
+       return 0;
+}
+
+static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr,
+                             unsigned long next, struct mm_walk *walk)
+{
+       p4d_t val = READ_ONCE(*p4d);
+
+       if (p4d_leaf(val)) {
+               val = __p4d(set_pageattr_masks(p4d_val(val), walk));
+               set_p4d(p4d, val);
+       }
+
+       return 0;
+}
+
+static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
+                             unsigned long next, struct mm_walk *walk)
+{
+       pud_t val = READ_ONCE(*pud);
+
+       if (pud_leaf(val)) {
+               val = __pud(set_pageattr_masks(pud_val(val), walk));
+               set_pud(pud, val);
+       }
+
+       return 0;
+}
+
+static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
+                             unsigned long next, struct mm_walk *walk)
+{
+       pmd_t val = READ_ONCE(*pmd);
+
+       if (pmd_leaf(val)) {
+               val = __pmd(set_pageattr_masks(pmd_val(val), walk));
+               set_pmd(pmd, val);
+       }
+
+       return 0;
+}
+
+static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
+                             unsigned long next, struct mm_walk *walk)
+{
+       pte_t val = READ_ONCE(*pte);
+
+       val = __pte(set_pageattr_masks(pte_val(val), walk));
+       set_pte(pte, val);
+
+       return 0;
+}
+
+static int pageattr_pte_hole(unsigned long addr, unsigned long next,
+                            int depth, struct mm_walk *walk)
+{
+       /* Nothing to do here */
+       return 0;
+}
+
+static const struct mm_walk_ops pageattr_ops = {
+       .pgd_entry = pageattr_pgd_entry,
+       .p4d_entry = pageattr_p4d_entry,
+       .pud_entry = pageattr_pud_entry,
+       .pmd_entry = pageattr_pmd_entry,
+       .pte_entry = pageattr_pte_entry,
+       .pte_hole = pageattr_pte_hole,
+};
+
+static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
+                       pgprot_t clear_mask)
+{
+       int ret;
+       unsigned long start = addr;
+       unsigned long end = start + PAGE_SIZE * numpages;
+       struct pageattr_masks masks = {
+               .set_mask = set_mask,
+               .clear_mask = clear_mask
+       };
+
+       if (!numpages)
+               return 0;
+
+       down_read(&init_mm.mmap_sem);
+       ret =  walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+                                    &masks);
+       up_read(&init_mm.mmap_sem);
+
+       flush_tlb_kernel_range(start, end);
+
+       return ret;
+}
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+       return __set_memory(addr, numpages, __pgprot(_PAGE_READ),
+                           __pgprot(_PAGE_WRITE));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+       return __set_memory(addr, numpages, __pgprot(_PAGE_READ | _PAGE_WRITE),
+                           __pgprot(0));
+}
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+       return __set_memory(addr, numpages, __pgprot(_PAGE_EXEC), __pgprot(0));
+}
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+       return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC));
+}
+
+int set_direct_map_invalid_noflush(struct page *page)
+{
+       unsigned long start = (unsigned long)page_address(page);
+       unsigned long end = start + PAGE_SIZE;
+       struct pageattr_masks masks = {
+               .set_mask = __pgprot(0),
+               .clear_mask = __pgprot(_PAGE_PRESENT)
+       };
+
+       return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
+}
+
+int set_direct_map_default_noflush(struct page *page)
+{
+       unsigned long start = (unsigned long)page_address(page);
+       unsigned long end = start + PAGE_SIZE;
+       struct pageattr_masks masks = {
+               .set_mask = PAGE_KERNEL,
+               .clear_mask = __pgprot(0)
+       };
+
+       return walk_page_range(&init_mm, start, end, &pageattr_ops, &masks);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+       if (!debug_pagealloc_enabled())
+               return;
+
+       if (enable)
+               __set_memory((unsigned long)page_address(page), numpages,
+                            __pgprot(_PAGE_PRESENT), __pgprot(0));
+       else
+               __set_memory((unsigned long)page_address(page), numpages,
+                            __pgprot(0), __pgprot(_PAGE_PRESENT));
+}
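The set_memory_*() helpers above take a page-aligned kernel virtual address and a count of pages. A caller hardening a region would use them roughly as below; the function and its arguments are hypothetical, only the signatures come from this patch.

    #include <linux/set_memory.h>

    /* Hypothetical caller: make a page-aligned kernel region read-only
     * and non-executable. */
    static int protect_region(unsigned long addr, int nr_pages)
    {
            int ret;

            ret = set_memory_ro(addr, nr_pages);
            if (ret)
                    return ret;
            return set_memory_nx(addr, nr_pages);
    }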
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
new file mode 100644 (file)
index 0000000..7eab76a
--- /dev/null
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 SiFive
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/ptdump.h>
+
+#include <asm/ptdump.h>
+#include <asm/pgtable.h>
+#include <asm/kasan.h>
+
+#define pt_dump_seq_printf(m, fmt, args...)    \
+({                                             \
+       if (m)                                  \
+               seq_printf(m, fmt, ##args);     \
+})
+
+#define pt_dump_seq_puts(m, fmt)       \
+({                                     \
+       if (m)                          \
+               seq_printf(m, fmt);     \
+})
+
+/*
+ * The page dumper groups page table entries of the same type into a single
+ * description. It uses pg_state to track the range information while
+ * iterating over the pte entries. When the continuity is broken it then
+ * dumps out a description of the range.
+ */
+struct pg_state {
+       struct ptdump_state ptdump;
+       struct seq_file *seq;
+       const struct addr_marker *marker;
+       unsigned long start_address;
+       unsigned long start_pa;
+       unsigned long last_pa;
+       int level;
+       u64 current_prot;
+       bool check_wx;
+       unsigned long wx_pages;
+};
+
+/* Address marker */
+struct addr_marker {
+       unsigned long start_address;
+       const char *name;
+};
+
+static struct addr_marker address_markers[] = {
+#ifdef CONFIG_KASAN
+       {KASAN_SHADOW_START,    "Kasan shadow start"},
+       {KASAN_SHADOW_END,      "Kasan shadow end"},
+#endif
+       {FIXADDR_START,         "Fixmap start"},
+       {FIXADDR_TOP,           "Fixmap end"},
+       {PCI_IO_START,          "PCI I/O start"},
+       {PCI_IO_END,            "PCI I/O end"},
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+       {VMEMMAP_START,         "vmemmap start"},
+       {VMEMMAP_END,           "vmemmap end"},
+#endif
+       {VMALLOC_START,         "vmalloc() area"},
+       {VMALLOC_END,           "vmalloc() end"},
+       {PAGE_OFFSET,           "Linear mapping"},
+       {-1, NULL},
+};
+
+/* Page Table Entry */
+struct prot_bits {
+       u64 mask;
+       u64 val;
+       const char *set;
+       const char *clear;
+};
+
+static const struct prot_bits pte_bits[] = {
+       {
+               .mask = _PAGE_SOFT,
+               .val = _PAGE_SOFT,
+               .set = "RSW",
+               .clear = "   ",
+       }, {
+               .mask = _PAGE_DIRTY,
+               .val = _PAGE_DIRTY,
+               .set = "D",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_ACCESSED,
+               .val = _PAGE_ACCESSED,
+               .set = "A",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_GLOBAL,
+               .val = _PAGE_GLOBAL,
+               .set = "G",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_USER,
+               .val = _PAGE_USER,
+               .set = "U",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_EXEC,
+               .val = _PAGE_EXEC,
+               .set = "X",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_WRITE,
+               .val = _PAGE_WRITE,
+               .set = "W",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_READ,
+               .val = _PAGE_READ,
+               .set = "R",
+               .clear = ".",
+       }, {
+               .mask = _PAGE_PRESENT,
+               .val = _PAGE_PRESENT,
+               .set = "V",
+               .clear = ".",
+       }
+};
+
+/* Page Level */
+struct pg_level {
+       const char *name;
+       u64 mask;
+};
+
+static struct pg_level pg_level[] = {
+       { /* pgd */
+               .name = "PGD",
+       }, { /* p4d */
+               .name = (CONFIG_PGTABLE_LEVELS > 4) ? "P4D" : "PGD",
+       }, { /* pud */
+               .name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD",
+       }, { /* pmd */
+               .name = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD",
+       }, { /* pte */
+               .name = "PTE",
+       },
+};
+
+static void dump_prot(struct pg_state *st)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(pte_bits); i++) {
+               const char *s;
+
+               if ((st->current_prot & pte_bits[i].mask) == pte_bits[i].val)
+                       s = pte_bits[i].set;
+               else
+                       s = pte_bits[i].clear;
+
+               if (s)
+                       pt_dump_seq_printf(st->seq, " %s", s);
+       }
+}
+
+#ifdef CONFIG_64BIT
+#define ADDR_FORMAT    "0x%016lx"
+#else
+#define ADDR_FORMAT    "0x%08lx"
+#endif
+static void dump_addr(struct pg_state *st, unsigned long addr)
+{
+       static const char units[] = "KMGTPE";
+       const char *unit = units;
+       unsigned long delta;
+
+       pt_dump_seq_printf(st->seq, ADDR_FORMAT "-" ADDR_FORMAT "   ",
+                          st->start_address, addr);
+
+       pt_dump_seq_printf(st->seq, " " ADDR_FORMAT " ", st->start_pa);
+       delta = (addr - st->start_address) >> 10;
+
+       while (!(delta & 1023) && unit[1]) {
+               delta >>= 10;
+               unit++;
+       }
+
+       pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit,
+                          pg_level[st->level].name);
+}
+
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+       if (!st->check_wx)
+               return;
+
+       if ((st->current_prot & (_PAGE_WRITE | _PAGE_EXEC)) !=
+           (_PAGE_WRITE | _PAGE_EXEC))
+               return;
+
+       WARN_ONCE(1, "riscv/mm: Found insecure W+X mapping at address %p/%pS\n",
+                 (void *)st->start_address, (void *)st->start_address);
+
+       st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr,
+                     int level, unsigned long val)
+{
+       struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+       u64 pa = PFN_PHYS(pte_pfn(__pte(val)));
+       u64 prot = 0;
+
+       if (level >= 0)
+               prot = val & pg_level[level].mask;
+
+       if (st->level == -1) {
+               st->level = level;
+               st->current_prot = prot;
+               st->start_address = addr;
+               st->start_pa = pa;
+               st->last_pa = pa;
+               pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+       } else if (prot != st->current_prot ||
+                  level != st->level || addr >= st->marker[1].start_address) {
+               if (st->current_prot) {
+                       note_prot_wx(st, addr);
+                       dump_addr(st, addr);
+                       dump_prot(st);
+                       pt_dump_seq_puts(st->seq, "\n");
+               }
+
+               while (addr >= st->marker[1].start_address) {
+                       st->marker++;
+                       pt_dump_seq_printf(st->seq, "---[ %s ]---\n",
+                                          st->marker->name);
+               }
+
+               st->start_address = addr;
+               st->start_pa = pa;
+               st->last_pa = pa;
+               st->current_prot = prot;
+               st->level = level;
+       } else {
+               st->last_pa = pa;
+       }
+}
+
+static void ptdump_walk(struct seq_file *s)
+{
+       struct pg_state st = {
+               .seq = s,
+               .marker = address_markers,
+               .level = -1,
+               .ptdump = {
+                       .note_page = note_page,
+                       .range = (struct ptdump_range[]) {
+                               {KERN_VIRT_START, ULONG_MAX},
+                               {0, 0}
+                       }
+               }
+       };
+
+       ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+}
+
+void ptdump_check_wx(void)
+{
+       struct pg_state st = {
+               .seq = NULL,
+               .marker = (struct addr_marker[]) {
+                       {0, NULL},
+                       {-1, NULL},
+               },
+               .level = -1,
+               .check_wx = true,
+               .ptdump = {
+                       .note_page = note_page,
+                       .range = (struct ptdump_range[]) {
+                               {KERN_VIRT_START, ULONG_MAX},
+                               {0, 0}
+                       }
+               }
+       };
+
+       ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+
+       if (st.wx_pages)
+               pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n",
+                       st.wx_pages);
+       else
+               pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+       ptdump_walk(m);
+
+       return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+
+static int ptdump_init(void)
+{
+       unsigned int i, j;
+
+       for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+               for (j = 0; j < ARRAY_SIZE(pte_bits); j++)
+                       pg_level[i].mask |= pte_bits[j].mask;
+
+       debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
+                           &ptdump_fops);
+
+       return 0;
+}
+
+device_initcall(ptdump_init);
index e577f85..86a3796 100644 (file)
@@ -325,7 +325,6 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
 
 /**
  * struct qdio_initialize - qdio initialization data
- * @cdev: associated ccw device
  * @q_format: queue format
  * @qdr_ac: feature flags to set
  * @adapter_name: name for the adapter
@@ -341,12 +340,11 @@ typedef void qdio_handler_t(struct ccw_device *, unsigned int, int,
  * @irq_poll: Data IRQ polling handler (NULL when not supported)
  * @scan_threshold: # of in-use buffers that triggers scan on output queue
  * @int_parm: interruption parameter
- * @input_sbal_addr_array:  address of no_input_qs * 128 pointers
- * @output_sbal_addr_array: address of no_output_qs * 128 pointers
+ * @input_sbal_addr_array:  per-queue array, each element points to 128 SBALs
+ * @output_sbal_addr_array: per-queue array, each element points to 128 SBALs
  * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL)
  */
 struct qdio_initialize {
-       struct ccw_device *cdev;
        unsigned char q_format;
        unsigned char qdr_ac;
        unsigned char adapter_name[8];
@@ -362,8 +360,8 @@ struct qdio_initialize {
        void (*irq_poll)(struct ccw_device *cdev, unsigned long data);
        unsigned int scan_threshold;
        unsigned long int_parm;
-       struct qdio_buffer **input_sbal_addr_array;
-       struct qdio_buffer **output_sbal_addr_array;
+       struct qdio_buffer ***input_sbal_addr_array;
+       struct qdio_buffer ***output_sbal_addr_array;
        struct qdio_outbuf_state *output_sbal_state_array;
 };
 
@@ -408,8 +406,10 @@ int qdio_alloc_buffers(struct qdio_buffer **buf, unsigned int count);
 void qdio_free_buffers(struct qdio_buffer **buf, unsigned int count);
 void qdio_reset_buffers(struct qdio_buffer **buf, unsigned int count);
 
-extern int qdio_allocate(struct qdio_initialize *);
-extern int qdio_establish(struct qdio_initialize *);
+extern int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs,
+                        unsigned int no_output_qs);
+extern int qdio_establish(struct ccw_device *cdev,
+                         struct qdio_initialize *init_data);
 extern int qdio_activate(struct ccw_device *);
 extern void qdio_release_aob(struct qaob *);
 extern int do_QDIO(struct ccw_device *, unsigned int, int, unsigned int,
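With this change the queue counts move to qdio_allocate() and the ccw device is passed explicitly to both calls instead of travelling inside struct qdio_initialize. A driver's setup path would now be ordered roughly as in this sketch; the function, the queue counts, and the qdio_free() cleanup call are illustrative assumptions rather than part of the patch.

    /* Hedged sketch of the new calling convention (see asm/qdio.h). */
    static int example_qdio_setup(struct ccw_device *cdev,
                                  struct qdio_initialize *init_data)
    {
            int rc;

            rc = qdio_allocate(cdev, 1, 1);         /* queue counts passed here now */
            if (rc)
                    return rc;

            rc = qdio_establish(cdev, init_data);   /* cdev no longer in init_data */
            if (rc)
                    qdio_free(cdev);                /* assumed cleanup helper */
            return rc;
    }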
index d3db3d7..def3b60 100644 (file)
@@ -55,8 +55,4 @@ config KVM_S390_UCONTROL
 
          If unsure, say N.
 
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source "drivers/vhost/Kconfig"
-
 endif # VIRTUALIZATION
index 076090f..4f6c22d 100644 (file)
@@ -1202,6 +1202,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
                scb_s->iprcc = PGM_ADDRESSING;
                scb_s->pgmilc = 4;
                scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+               rc = 1;
        }
        return rc;
 }
index d56f677..1bf091b 100644 (file)
@@ -852,9 +852,7 @@ void do_secure_storage_access(struct pt_regs *regs)
                        BUG();
                break;
        case VDSO_FAULT:
-               /* fallthrough */
        case GMAP_FAULT:
-               /* fallthrough */
        default:
                do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
                WARN_ON_ONCE(1);
index 2fbece4..1a95d88 100644 (file)
@@ -787,14 +787,18 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
 static inline unsigned long *gmap_table_walk(struct gmap *gmap,
                                             unsigned long gaddr, int level)
 {
+       const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
        unsigned long *table;
 
        if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
                return NULL;
        if (gmap_is_shadow(gmap) && gmap->removed)
                return NULL;
-       if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+
+       if (asce_type != _ASCE_TYPE_REGION1 &&
+           gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
                return NULL;
+
        table = gmap->table;
        switch (gmap->asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_REGION1:
@@ -1840,6 +1844,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
                goto out_free;
        } else if (*table & _REGION_ENTRY_ORIGIN) {
                rc = -EAGAIN;           /* Race with shadow */
+               goto out_free;
        }
        crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
        /* mark as invalid as long as the parent table is not protected */
index 1de0334..4c05198 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index 13ee4d2..5f23d79 100644 (file)
@@ -355,7 +355,7 @@ static inline int access_error(int error_code, struct vm_area_struct *vma)
                return 1;
 
        /* read, not present: */
-       if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+       if (unlikely(!vma_is_accessible(vma)))
                return 1;
 
        return 0;
index 626b574..cf50a75 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index 026abb3..d7f99e6 100644 (file)
@@ -4,10 +4,6 @@
 
 #define        BUILD_VDSO32
 
-#ifndef        CONFIG_CC_OPTIMIZE_FOR_SIZE
-#undef CONFIG_OPTIMIZE_INLINING
-#endif
-
 #ifdef CONFIG_SPARC64
 
 /*
index a3bf2ff..e251f50 100644 (file)
@@ -55,7 +55,6 @@ static struct pwm_lookup nb0916_pwm_lookup[] = {
 static struct platform_pwm_backlight_data nb0916_backlight_data = {
        .max_brightness = 100,
        .dft_brightness = 100,
-       .enable_gpio    = -1,
 };
 
 static struct gpio_keys_button nb0916_gpio_keys[] = {
index 1edf788..0163d76 100644 (file)
@@ -149,6 +149,7 @@ config X86
        select HAVE_ARCH_TRACEHOOK
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
+       select HAVE_ARCH_USERFAULTFD_WP         if USERFAULTFD
        select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_ARCH_WITHIN_STACK_FRAMES
        select HAVE_ASM_MODVERSIONS
@@ -1660,6 +1661,7 @@ config X86_PMEM_LEGACY
        depends on PHYS_ADDR_T_64BIT
        depends on BLK_DEV
        select X86_PMEM_LEGACY_DEVICE
+       select NUMA_KEEP_MEMINFO if NUMA
        select LIBNVDIMM
        help
          Treat memory marked using the non-standard e820 type of 12 as used
index ab8b30c..5509045 100644 (file)
@@ -285,7 +285,6 @@ CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_OPTIMIZE_INLINING=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_SELINUX=y
index 2d196cb..6149610 100644 (file)
@@ -282,7 +282,6 @@ CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_BOOT_PARAMS=y
-CONFIG_OPTIMIZE_INLINING=y
 CONFIG_UNWINDER_ORC=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
index 1e82bd4..84a4a73 100644 (file)
@@ -1,10 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #define BUILD_VDSO32
 
-#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
-#undef CONFIG_OPTIMIZE_INLINING
-#endif
-
 #ifdef CONFIG_X86_64
 
 /*
index afda66a..28838d7 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/x86_init.h>
 #include <asm/fpu/xstate.h>
 #include <asm/fpu/api.h>
+#include <asm-generic/pgtable_uffd.h>
 
 extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -313,6 +314,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
        return native_make_pte(v & ~clear);
 }
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pte_uffd_wp(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_UFFD_WP;
+}
+
+static inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_UFFD_WP);
+}
+
+static inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 static inline pte_t pte_mkclean(pte_t pte)
 {
        return pte_clear_flags(pte, _PAGE_DIRTY);
@@ -392,6 +410,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
        return native_make_pmd(v & ~clear);
 }
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pmd_uffd_wp(pmd_t pmd)
+{
+       return pmd_flags(pmd) & _PAGE_UFFD_WP;
+}
+
+static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+}
+
+static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 static inline pmd_t pmd_mkold(pmd_t pmd)
 {
        return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -1374,6 +1409,38 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
 #endif
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pte_swp_uffd_wp(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pmd_swp_uffd_wp(pmd_t pmd)
+{
+       return pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 #define PKRU_AD_BIT 0x1
 #define PKRU_WD_BIT 0x2
 #define PKRU_BITS_PER_PKEY 2
index 0b6c404..df13734 100644 (file)
@@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
  *
  * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
  * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
  * erratum where they can be incorrectly set by hardware on
  * non-present PTEs.
  *
+ * Bits 1-4 are not used in the non-present format and are available
+ * for the special uses described below:
+ *
  * SD (1) in swp entry is used to store soft dirty bit, which helps us
  * remember soft dirty over page migration
  *
+ * F (2) in swp entry is used to record when a page-table entry is
+ * write-protected by userfaultfd WP support.
+ *
  * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
  * but also L and G.
  *
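
Per the layout above, a non-present (swap) PTE keeps bit 0 clear, stores soft-dirty in bit 1 (SD) and the new userfaultfd write-protect marker in bit 2 (F), with the swap type in bits 59-63 and the offset in bits 9-58 (stored inverted as ~OFFSET on x86; the inversion is ignored here for brevity). A small standalone sketch of that packing, with placeholder names:

#include <stdint.h>
#include <stdio.h>

#define SWP_SOFT_DIRTY   (1ULL << 1)    /* SD */
#define SWP_UFFD_WP      (1ULL << 2)    /* F  */
#define SWP_OFFSET_SHIFT 9
#define SWP_OFFSET_BITS  50             /* bits 9-58 */
#define SWP_TYPE_SHIFT   59             /* bits 59-63 */

static uint64_t make_swp_entry(uint64_t type, uint64_t offset, int uffd_wp)
{
	uint64_t e = 0;

	e |= (type & 0x1f) << SWP_TYPE_SHIFT;
	e |= (offset & ((1ULL << SWP_OFFSET_BITS) - 1)) << SWP_OFFSET_SHIFT;
	if (uffd_wp)
		e |= SWP_UFFD_WP;
	return e;                           /* bit 0 (present) stays clear */
}

int main(void)
{
	uint64_t e = make_swp_entry(3, 0x1234, 1);

	printf("type=%llu offset=%#llx uffd_wp=%d\n",
	       (unsigned long long)(e >> SWP_TYPE_SHIFT),
	       (unsigned long long)((e >> SWP_OFFSET_SHIFT) &
				    ((1ULL << SWP_OFFSET_BITS) - 1)),
	       !!(e & SWP_UFFD_WP));
	return 0;
}
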
index 65c2ecd..b6606fe 100644 (file)
@@ -32,6 +32,7 @@
 
 #define _PAGE_BIT_SPECIAL      _PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST     _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_UFFD_WP      _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
 #define _PAGE_BIT_SOFT_DIRTY   _PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_DEVMAP       _PAGE_BIT_SOFTW4
 
 #define _PAGE_SWP_SOFT_DIRTY   (_AT(pteval_t, 0))
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+#define _PAGE_UFFD_WP          (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
+#define _PAGE_SWP_UFFD_WP      _PAGE_USER
+#else
+#define _PAGE_UFFD_WP          (_AT(pteval_t, 0))
+#define _PAGE_SWP_UFFD_WP      (_AT(pteval_t, 0))
+#endif
+
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX       (_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP   (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
  */
 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |         \
                         _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
-                        _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC)
+                        _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC |  \
+                        _PAGE_UFFD_WP)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 /*
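
Including _PAGE_UFFD_WP in _PAGE_CHG_MASK matters because a pte_modify()-style protection change keeps only the bits in that mask and takes everything else from the new protection; without it, changing protections on a write-protected range would silently drop the marker. A rough standalone model of that masking (placeholder bit values, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define DEMO_PRESENT   (1ULL << 0)
#define DEMO_DIRTY     (1ULL << 6)
#define DEMO_UFFD_WP   (1ULL << 10)

/* Bits preserved across a protection change, mirroring _PAGE_CHG_MASK. */
#define DEMO_CHG_MASK  (DEMO_DIRTY | DEMO_UFFD_WP)

static uint64_t demo_pte_modify(uint64_t pte, uint64_t newprot)
{
	/* Keep the preserved bits, take everything else from the new protection. */
	return (pte & DEMO_CHG_MASK) | (newprot & ~DEMO_CHG_MASK);
}

int main(void)
{
	uint64_t pte = DEMO_PRESENT | DEMO_DIRTY | DEMO_UFFD_WP;
	uint64_t ro  = DEMO_PRESENT;    /* new protection: read-only, present */

	uint64_t out = demo_pte_modify(pte, ro);
	printf("uffd_wp survives protection change: %d\n", !!(out & DEMO_UFFD_WP)); /* 1 */
	return 0;
}
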
index 1ae5439..683ed9e 100644 (file)
@@ -45,7 +45,7 @@ EXPORT_SYMBOL(acpi_disabled);
 #define PREFIX                 "ACPI: "
 
 int acpi_noirq;                                /* skip ACPI IRQ initialization */
-int acpi_nobgrt;                       /* skip ACPI BGRT */
+static int acpi_nobgrt;                        /* skip ACPI BGRT */
 int acpi_pci_disabled;         /* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
 
index 9fea075..d8154e0 100644 (file)
@@ -107,8 +107,4 @@ config KVM_MMU_AUDIT
         This option adds an R/W KVM module parameter 'mmu_audit', which allows
         auditing of KVM MMU events at runtime.
 
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source "drivers/vhost/Kconfig"
-
 endif # VIRTUALIZATION
index e553f0f..a789759 100644 (file)
@@ -14,7 +14,7 @@ kvm-y                 += x86.o emulate.o i8259.o irq.o lapic.o \
                           hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
 
 kvm-intel-y            += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
-kvm-amd-y              += svm.o pmu_amd.o
+kvm-amd-y              += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
 
 obj-$(CONFIG_KVM)      += kvm.o
 obj-$(CONFIG_KVM_INTEL)        += kvm-intel.o
index ca80daf..9af25c9 100644 (file)
@@ -59,9 +59,6 @@
 #define MAX_APIC_VECTOR                        256
 #define APIC_VECTORS_PER_REG           32
 
-#define APIC_BROADCAST                 0xFF
-#define X2APIC_BROADCAST               0xFFFFFFFFul
-
 static bool lapic_timer_advance_dynamic __read_mostly;
 #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100     /* clock cycles */
 #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000   /* clock cycles */
index 40ed6ed..a0ffb43 100644 (file)
@@ -17,6 +17,9 @@
 #define APIC_BUS_CYCLE_NS       1
 #define APIC_BUS_FREQUENCY      (1000000000ULL / APIC_BUS_CYCLE_NS)
 
+#define APIC_BROADCAST                 0xFF
+#define X2APIC_BROADCAST               0xFFFFFFFFul
+
 enum lapic_mode {
        LAPIC_MODE_DISABLED = 0,
        LAPIC_MODE_INVALID = X2APIC_ENABLE,
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
new file mode 100644 (file)
index 0000000..e80daa9
--- /dev/null
@@ -0,0 +1,1027 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/hashtable.h>
+#include <linux/amd-iommu.h>
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+
+#include "trace.h"
+#include "lapic.h"
+#include "x86.h"
+#include "irq.h"
+#include "svm.h"
+
+/* enable / disable AVIC */
+int avic;
+#ifdef CONFIG_X86_LOCAL_APIC
+module_param(avic, int, S_IRUGO);
+#endif
+
+#define SVM_AVIC_DOORBELL      0xc001011b
+
+#define AVIC_HPA_MASK  ~((0xFFFULL << 52) | 0xFFF)
+
+/*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe.  APIC IDs above 0xff are reserved.
+ */
+#define AVIC_MAX_PHYSICAL_ID_COUNT     255
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK         1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK                0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK                0xFFFFFFFF
+
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS              8
+#define AVIC_VCPU_ID_MASK              ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS                        24
+#define AVIC_VM_ID_NR                  (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK                        ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y)               (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+                                               (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x)          ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x)                (x & AVIC_VCPU_ID_MASK)
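
The GATAG macros above pack a 24-bit VM ID and an 8-bit vCPU ID into the 32-bit tag that later comes back through the IOMMU GA log. A standalone round-trip of the same arithmetic (illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define VCPU_ID_BITS 8
#define VCPU_ID_MASK ((1u << VCPU_ID_BITS) - 1)
#define VM_ID_BITS   24
#define VM_ID_MASK   ((1u << VM_ID_BITS) - 1)

static uint32_t gatag(uint32_t vm_id, uint32_t vcpu_id)
{
	return ((vm_id & VM_ID_MASK) << VCPU_ID_BITS) | (vcpu_id & VCPU_ID_MASK);
}

int main(void)
{
	uint32_t tag = gatag(0x00abcd, 42);

	/* Decode exactly as AVIC_GATAG_TO_VMID()/AVIC_GATAG_TO_VCPUID() do. */
	assert(((tag >> VCPU_ID_BITS) & VM_ID_MASK) == 0x00abcd);
	assert((tag & VCPU_ID_MASK) == 42);
	printf("tag=%#x vm=%#x vcpu=%u\n", tag,
	       (tag >> VCPU_ID_BITS) & VM_ID_MASK, tag & VCPU_ID_MASK);
	return 0;
}
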
+
+/* Note:
+ * This hash table is used to map a VM_ID to a struct kvm_svm when
+ * handling an AMD IOMMU GALOG notification, so that the right vCPU
+ * can be scheduled in.
+ */
+#define SVM_VM_DATA_HASH_BITS  8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
+       void *data;             /* Storing pointer to struct amd_ir_data */
+};
+
+enum avic_ipi_failure_cause {
+       AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+       AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+       AVIC_IPI_FAILURE_INVALID_TARGET,
+       AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+/* Note:
+ * This function is called from the IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+int avic_ga_log_notifier(u32 ga_tag)
+{
+       unsigned long flags;
+       struct kvm_svm *kvm_svm;
+       struct kvm_vcpu *vcpu = NULL;
+       u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+       u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+       pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+       trace_kvm_avic_ga_log(vm_id, vcpu_id);
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+               if (kvm_svm->avic_vm_id != vm_id)
+                       continue;
+               vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+               break;
+       }
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+       /* Note:
+        * At this point, the IOMMU should have already set the pending
+        * bit in the vAPIC backing page. So, we just need to schedule
+        * in the vcpu.
+        */
+       if (vcpu)
+               kvm_vcpu_wake_up(vcpu);
+
+       return 0;
+}
+
+void avic_vm_destroy(struct kvm *kvm)
+{
+       unsigned long flags;
+       struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+       if (!avic)
+               return;
+
+       if (kvm_svm->avic_logical_id_table_page)
+               __free_page(kvm_svm->avic_logical_id_table_page);
+       if (kvm_svm->avic_physical_id_table_page)
+               __free_page(kvm_svm->avic_physical_id_table_page);
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+       hash_del(&kvm_svm->hnode);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+int avic_vm_init(struct kvm *kvm)
+{
+       unsigned long flags;
+       int err = -ENOMEM;
+       struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+       struct kvm_svm *k2;
+       struct page *p_page;
+       struct page *l_page;
+       u32 vm_id;
+
+       if (!avic)
+               return 0;
+
+       /* Allocating physical APIC ID table (4KB) */
+       p_page = alloc_page(GFP_KERNEL_ACCOUNT);
+       if (!p_page)
+               goto free_avic;
+
+       kvm_svm->avic_physical_id_table_page = p_page;
+       clear_page(page_address(p_page));
+
+       /* Allocating logical APIC ID table (4KB) */
+       l_page = alloc_page(GFP_KERNEL_ACCOUNT);
+       if (!l_page)
+               goto free_avic;
+
+       kvm_svm->avic_logical_id_table_page = l_page;
+       clear_page(page_address(l_page));
+
+       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+       vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+       if (vm_id == 0) { /* id is 1-based, zero is not okay */
+               next_vm_id_wrapped = 1;
+               goto again;
+       }
+       /* Is it still in use? Only possible if wrapped at least once */
+       if (next_vm_id_wrapped) {
+               hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+                       if (k2->avic_vm_id == vm_id)
+                               goto again;
+               }
+       }
+       kvm_svm->avic_vm_id = vm_id;
+       hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+       return 0;
+
+free_avic:
+       avic_vm_destroy(kvm);
+       return err;
+}
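
avic_vm_init() hands out VM IDs from a 24-bit counter: ID 0 is skipped, and once the counter has wrapped, each candidate is checked against the hash table of live VMs before it is reused. A simplified userspace model of that allocation loop (a plain array stands in for svm_vm_data_hash, and locking is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VM_ID_MASK ((1u << 24) - 1)

static uint32_t next_vm_id;
static bool next_vm_id_wrapped;

/* Stand-in for the svm_vm_data_hash lookup: is this ID still in use? */
static bool vm_id_in_use(uint32_t id, const uint32_t *live, int n)
{
	for (int i = 0; i < n; i++)
		if (live[i] == id)
			return true;
	return false;
}

static uint32_t alloc_vm_id(const uint32_t *live, int n)
{
	uint32_t id;

again:
	id = next_vm_id = (next_vm_id + 1) & VM_ID_MASK;
	if (id == 0) {                    /* IDs are 1-based, zero is not okay */
		next_vm_id_wrapped = true;
		goto again;
	}
	if (next_vm_id_wrapped && vm_id_in_use(id, live, n))
		goto again;               /* reuse check only needed after a wrap */
	return id;
}

int main(void)
{
	uint32_t live[] = { 1, 2 };

	next_vm_id = VM_ID_MASK;          /* force an immediate wrap */
	next_vm_id_wrapped = false;
	printf("allocated vm_id=%u\n", alloc_vm_id(live, 2)); /* skips 0, 1, 2 -> 3 */
	return 0;
}
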
+
+void avic_init_vmcb(struct vcpu_svm *svm)
+{
+       struct vmcb *vmcb = svm->vmcb;
+       struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+       phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+       phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+       phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+
+       vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+       vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+       vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+       vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+       if (kvm_apicv_activated(svm->vcpu.kvm))
+               vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+       else
+               vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+}
+
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+                                      unsigned int index)
+{
+       u64 *avic_physical_id_table;
+       struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+
+       if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+               return NULL;
+
+       avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
+
+       return &avic_physical_id_table[index];
+}
+
+/**
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+static int avic_update_access_page(struct kvm *kvm, bool activate)
+{
+       int ret = 0;
+
+       mutex_lock(&kvm->slots_lock);
+       /*
+        * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
+        * an APICv mode change, which updates the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
+        * memory region. So, we need to ensure that kvm->mm == current->mm.
+        */
+       if ((kvm->arch.apic_access_page_done == activate) ||
+           (kvm->mm != current->mm))
+               goto out;
+
+       ret = __x86_set_memory_region(kvm,
+                                     APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+                                     APIC_DEFAULT_PHYS_BASE,
+                                     activate ? PAGE_SIZE : 0);
+       if (ret)
+               goto out;
+
+       kvm->arch.apic_access_page_done = activate;
+out:
+       mutex_unlock(&kvm->slots_lock);
+       return ret;
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+       u64 *entry, new_entry;
+       int id = vcpu->vcpu_id;
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+               return -EINVAL;
+
+       if (!svm->vcpu.arch.apic->regs)
+               return -EINVAL;
+
+       if (kvm_apicv_activated(vcpu->kvm)) {
+               int ret;
+
+               ret = avic_update_access_page(vcpu->kvm, true);
+               if (ret)
+                       return ret;
+       }
+
+       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+
+       /* Set the AVIC backing page address in the physical APIC ID table */
+       entry = avic_get_physical_id_entry(vcpu, id);
+       if (!entry)
+               return -EINVAL;
+
+       new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+                             AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+                             AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+       WRITE_ONCE(*entry, new_entry);
+
+       svm->avic_physical_id_cache = entry;
+
+       return 0;
+}
+
+int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+{
+       u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+       u32 icrl = svm->vmcb->control.exit_info_1;
+       u32 id = svm->vmcb->control.exit_info_2 >> 32;
+       u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+       switch (id) {
+       case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+               /*
+                * AVIC hardware handles the generation of
+                * IPIs when the specified Message Type is Fixed
+                * (also known as fixed delivery mode) and
+                * the Trigger Mode is edge-triggered. The hardware
+                * also supports self and broadcast delivery modes
+                * specified via the Destination Shorthand (DSH)
+                * field of the ICRL. Logical and physical APIC ID
+                * formats are supported. All other IPI types cause
+                * a #VMEXIT, which needs to be emulated.
+                */
+               kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+               kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+               break;
+       case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+               int i;
+               struct kvm_vcpu *vcpu;
+               struct kvm *kvm = svm->vcpu.kvm;
+               struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+               /*
+                * At this point, we expect that the AVIC HW has already
+                * set the appropriate IRR bits on the valid target
+                * vcpus. So, we just need to kick the appropriate vcpu.
+                */
+               kvm_for_each_vcpu(i, vcpu, kvm) {
+                       bool m = kvm_apic_match_dest(vcpu, apic,
+                                                    icrl & APIC_SHORT_MASK,
+                                                    GET_APIC_DEST_FIELD(icrh),
+                                                    icrl & APIC_DEST_MASK);
+
+                       if (m && !avic_vcpu_is_running(vcpu))
+                               kvm_vcpu_wake_up(vcpu);
+               }
+               break;
+       }
+       case AVIC_IPI_FAILURE_INVALID_TARGET:
+               WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
+                         index, svm->vcpu.vcpu_id, icrh, icrl);
+               break;
+       case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+               WARN_ONCE(1, "Invalid backing page\n");
+               break;
+       default:
+               pr_err("Unknown IPI interception\n");
+       }
+
+       return 1;
+}
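
For an AVIC incomplete-IPI exit, the handler above unpacks ICRH/ICRL from exit_info_1 and the failure cause plus source index from exit_info_2 before dispatching on the cause. A standalone sketch of that field extraction using the same shifts and masks (the input values are made up):

#include <stdint.h>
#include <stdio.h>

struct decoded_ipi {
	uint32_t icrh, icrl, cause, index;
};

static struct decoded_ipi decode_avic_ipi_exit(uint64_t exit_info_1,
					       uint64_t exit_info_2)
{
	struct decoded_ipi d;

	d.icrh  = exit_info_1 >> 32;          /* high half of the ICR */
	d.icrl  = (uint32_t)exit_info_1;      /* low half of the ICR */
	d.cause = exit_info_2 >> 32;          /* AVIC_IPI_FAILURE_* code */
	d.index = exit_info_2 & 0xFF;         /* source APIC index */
	return d;
}

int main(void)
{
	/* Fabricated exit_info values, purely to show the unpacking. */
	struct decoded_ipi d = decode_avic_ipi_exit(0x000000ff000040fdULL,
						    0x0000000100000003ULL);

	printf("icrh=%#x icrl=%#x cause=%u index=%u\n",
	       d.icrh, d.icrl, d.cause, d.index);
	return 0;
}
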
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+       struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+       int index;
+       u32 *logical_apic_id_table;
+       int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+       if (!dlid)
+               return NULL;
+
+       if (flat) { /* flat */
+               index = ffs(dlid) - 1;
+               if (index > 7)
+                       return NULL;
+       } else { /* cluster */
+               int cluster = (dlid & 0xf0) >> 4;
+               int apic = ffs(dlid & 0x0f) - 1;
+
+               if ((apic < 0) || (apic > 7) ||
+                   (cluster >= 0xf))
+                       return NULL;
+               index = (cluster << 2) + apic;
+       }
+
+       logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
+
+       return &logical_apic_id_table[index];
+}
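
avic_get_logical_id_entry() turns a logical APIC ID into a table index: in flat mode the index is the bit position of the destination bit (limited to 8 entries), and in cluster mode the upper nibble selects the cluster and the lower nibble the APIC within it. A standalone version of just that index computation, with the same bounds checks (logical_id_index() is a name invented for this sketch; ffs() is the POSIX one):

#include <stdbool.h>
#include <stdio.h>
#include <strings.h>   /* ffs() */

/* Returns the logical-ID table index, or -1 if the DLID is unusable. */
static int logical_id_index(unsigned int dlid, bool flat)
{
	if (!dlid)
		return -1;

	if (flat) {
		int index = ffs(dlid) - 1;
		return index > 7 ? -1 : index;
	} else {
		int cluster = (dlid & 0xf0) >> 4;
		int apic = ffs(dlid & 0x0f) - 1;

		if (apic < 0 || apic > 7 || cluster >= 0xf)
			return -1;
		return (cluster << 2) + apic;
	}
}

int main(void)
{
	printf("flat 0x08    -> %d\n", logical_id_index(0x08, true));   /* 3 */
	printf("cluster 0x32 -> %d\n", logical_id_index(0x32, false));  /* 13 */
	printf("invalid 0x00 -> %d\n", logical_id_index(0x00, true));   /* -1 */
	return 0;
}
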
+
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+{
+       bool flat;
+       u32 *entry, new_entry;
+
+       flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+       entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+       if (!entry)
+               return -EINVAL;
+
+       new_entry = READ_ONCE(*entry);
+       new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+       new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+       new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+       WRITE_ONCE(*entry, new_entry);
+
+       return 0;
+}
+
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+       u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+
+       if (entry)
+               clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
+static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+       int ret = 0;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+       u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+       if (ldr == svm->ldr_reg)
+               return 0;
+
+       avic_invalidate_logical_id_entry(vcpu);
+
+       if (ldr)
+               ret = avic_ldr_write(vcpu, id, ldr);
+
+       if (!ret)
+               svm->ldr_reg = ldr;
+
+       return ret;
+}
+
+static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+{
+       u64 *old, *new;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+       if (vcpu->vcpu_id == id)
+               return 0;
+
+       old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+       new = avic_get_physical_id_entry(vcpu, id);
+       if (!new || !old)
+               return 1;
+
+       /* We need to move physical_id_entry to new offset */
+       *new = *old;
+       *old = 0ULL;
+       to_svm(vcpu)->avic_physical_id_cache = new;
+
+       /*
+        * Also update the guest physical APIC ID in the logical
+        * APIC ID table entry if the LDR has already been set up.
+        */
+       if (svm->ldr_reg)
+               avic_handle_ldr_update(vcpu);
+
+       return 0;
+}
+
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+
+       if (svm->dfr_reg == dfr)
+               return;
+
+       avic_invalidate_logical_id_entry(vcpu);
+       svm->dfr_reg = dfr;
+}
+
+static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+{
+       struct kvm_lapic *apic = svm->vcpu.arch.apic;
+       u32 offset = svm->vmcb->control.exit_info_1 &
+                               AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+       switch (offset) {
+       case APIC_ID:
+               if (avic_handle_apic_id_update(&svm->vcpu))
+                       return 0;
+               break;
+       case APIC_LDR:
+               if (avic_handle_ldr_update(&svm->vcpu))
+                       return 0;
+               break;
+       case APIC_DFR:
+               avic_handle_dfr_update(&svm->vcpu);
+               break;
+       default:
+               break;
+       }
+
+       kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+
+       return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+       bool ret = false;
+
+       switch (offset) {
+       case APIC_ID:
+       case APIC_EOI:
+       case APIC_RRR:
+       case APIC_LDR:
+       case APIC_DFR:
+       case APIC_SPIV:
+       case APIC_ESR:
+       case APIC_ICR:
+       case APIC_LVTT:
+       case APIC_LVTTHMR:
+       case APIC_LVTPC:
+       case APIC_LVT0:
+       case APIC_LVT1:
+       case APIC_LVTERR:
+       case APIC_TMICT:
+       case APIC_TDCR:
+               ret = true;
+               break;
+       default:
+               break;
+       }
+       return ret;
+}
+
+int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+{
+       int ret = 0;
+       u32 offset = svm->vmcb->control.exit_info_1 &
+                    AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+       u32 vector = svm->vmcb->control.exit_info_2 &
+                    AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+       bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+                    AVIC_UNACCEL_ACCESS_WRITE_MASK;
+       bool trap = is_avic_unaccelerated_access_trap(offset);
+
+       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+                                           trap, write, vector);
+       if (trap) {
+               /* Handling Trap */
+               WARN_ONCE(!write, "svm: Handling trap read.\n");
+               ret = avic_unaccel_trap_write(svm);
+       } else {
+               /* Handling Fault */
+               ret = kvm_emulate_instruction(&svm->vcpu, 0);
+       }
+
+       return ret;
+}
+
+int avic_init_vcpu(struct vcpu_svm *svm)
+{
+       int ret;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+
+       if (!avic || !irqchip_in_kernel(vcpu->kvm))
+               return 0;
+
+       ret = avic_init_backing_page(&svm->vcpu);
+       if (ret)
+               return ret;
+
+       INIT_LIST_HEAD(&svm->ir_list);
+       spin_lock_init(&svm->ir_list_lock);
+       svm->dfr_reg = APIC_DFR_FLAT;
+
+       return ret;
+}
+
+void avic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+       if (avic_handle_apic_id_update(vcpu) != 0)
+               return;
+       avic_handle_dfr_update(vcpu);
+       avic_handle_ldr_update(vcpu);
+}
+
+void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
+{
+       if (!avic || !lapic_in_kernel(vcpu))
+               return;
+
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       kvm_request_apicv_update(vcpu->kvm, activate,
+                                APICV_INHIBIT_REASON_IRQWIN);
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+}
+
+void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+       return;
+}
+
+void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+}
+
+void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+}
+
+static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
+
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entries targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+       if (list_empty(&svm->ir_list))
+               goto out;
+
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               if (activate)
+                       ret = amd_iommu_activate_guest_mode(ir->data);
+               else
+                       ret = amd_iommu_deactivate_guest_mode(ir->data);
+               if (ret)
+                       break;
+       }
+out:
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
+}
+
+void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
+       bool activated = kvm_vcpu_apicv_active(vcpu);
+
+       if (!avic)
+               return;
+
+       if (activated) {
+               /**
+                * During temporary AVIC deactivation, the guest could update
+                * APIC ID, DFR and LDR registers, which would not be trapped
+                * by avic_unaccelerated_access_interception(). In this case,
+                * we need to check and update the AVIC logical APIC ID table
+                * accordingly before re-activating.
+                */
+               avic_post_state_restore(vcpu);
+               vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+       } else {
+               vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+       }
+       mark_dirty(vmcb, VMCB_AVIC);
+
+       svm_set_pi_irte_mode(vcpu, activated);
+}
+
+void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+       return;
+}
+
+int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+{
+       if (!vcpu->arch.apicv_active)
+               return -1;
+
+       kvm_lapic_set_irr(vec, vcpu->arch.apic);
+       smp_mb__after_atomic();
+
+       if (avic_vcpu_is_running(vcpu)) {
+               int cpuid = vcpu->cpu;
+
+               if (cpuid != get_cpu())
+                       wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
+               put_cpu();
+       } else
+               kvm_vcpu_wake_up(vcpu);
+
+       return 0;
+}
+
+bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+       unsigned long flags;
+       struct amd_svm_iommu_ir *cur;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_for_each_entry(cur, &svm->ir_list, node) {
+               if (cur->data != pi->ir_data)
+                       continue;
+               list_del(&cur->node);
+               kfree(cur);
+               break;
+       }
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+
+       /**
+        * In some cases, the existing IRTE is updated and re-set,
+        * so we need to check here if it has already been added
+        * to the ir_list.
+        */
+       if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+               struct kvm *kvm = svm->vcpu.kvm;
+               u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+               struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+               struct vcpu_svm *prev_svm;
+
+               if (!prev_vcpu) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               prev_svm = to_svm(prev_vcpu);
+               svm_ir_list_del(prev_svm, pi);
+       }
+
+       /**
+        * Allocate a new amd_svm_iommu_ir entry, which will get
+        * added to the per-vcpu ir_list.
+        */
+       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
+       if (!ir) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ir->data = pi->ir_data;
+
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+       list_add(&ir->node, &svm->ir_list);
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+       return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kinds of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with a single CPU as the destination, e.g. the user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+                struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu = NULL;
+
+       kvm_set_msi_irq(kvm, e, &irq);
+
+       if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+           !kvm_irq_is_postable(&irq)) {
+               pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+                        __func__, irq.vector);
+               return -1;
+       }
+
+       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+                irq.vector);
+       *svm = to_svm(vcpu);
+       vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
+       vcpu_info->vector = irq.vector;
+
+       return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                      uint32_t guest_irq, bool set)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       int idx, ret = -EINVAL;
+
+       if (!kvm_arch_has_assigned_device(kvm) ||
+           !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+
+       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+                __func__, host_irq, guest_irq, set);
+
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               struct vcpu_data vcpu_info;
+               struct vcpu_svm *svm = NULL;
+
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+
+               /**
+                * Here, we set up legacy mode in the following cases:
+                * 1. The interrupt cannot be targeted at a specific vcpu.
+                * 2. The posted interrupt is being unset.
+                * 3. APIC virtualization is disabled for the vcpu.
+                * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc).
+                */
+               if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+                   kvm_vcpu_apicv_active(&svm->vcpu)) {
+                       struct amd_iommu_pi_data pi;
+
+                       /* Try to enable guest_mode in IRTE */
+                       pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+                                           AVIC_HPA_MASK);
+                       pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+                                                    svm->vcpu.vcpu_id);
+                       pi.is_guest_mode = true;
+                       pi.vcpu_data = &vcpu_info;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /**
+                        * Here, we have successfully set up vcpu affinity in
+                        * IOMMU guest mode. Now, we need to store the posted
+                        * interrupt information in the per-vcpu ir_list so that
+                        * we can reference it directly when we update the vcpu
+                        * scheduling information in the IOMMU irte.
+                        */
+                       if (!ret && pi.is_guest_mode)
+                               svm_ir_list_add(svm, &pi);
+               } else {
+                       /* Use legacy mode in IRTE */
+                       struct amd_iommu_pi_data pi;
+
+                       /**
+                        * Here, pi is used to:
+                        * - Tell IOMMU to use legacy mode for this interrupt.
+                        * - Retrieve ga_tag of prior interrupt remapping data.
+                        */
+                       pi.is_guest_mode = false;
+                       ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+                       /**
+                        * Check if the posted interrupt was previously
+                        * setup with the guest_mode by checking if the ga_tag
+                        * was cached. If so, we need to clean up the per-vcpu
+                        * ir_list.
+                        */
+                       if (!ret && pi.prev_ga_tag) {
+                               int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+                               struct kvm_vcpu *vcpu;
+
+                               vcpu = kvm_get_vcpu_by_id(kvm, id);
+                               if (vcpu)
+                                       svm_ir_list_del(to_svm(vcpu), &pi);
+                       }
+               }
+
+               if (!ret && svm) {
+                       trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+                                                e->gsi, vcpu_info.vector,
+                                                vcpu_info.pi_desc_addr, set);
+               }
+
+               if (ret < 0) {
+                       pr_err("%s: failed to update PI IRTE\n", __func__);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+}
+
+bool svm_check_apicv_inhibit_reasons(ulong bit)
+{
+       ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+                         BIT(APICV_INHIBIT_REASON_HYPERV) |
+                         BIT(APICV_INHIBIT_REASON_NESTED) |
+                         BIT(APICV_INHIBIT_REASON_IRQWIN) |
+                         BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
+                         BIT(APICV_INHIBIT_REASON_X2APIC);
+
+       return supported & BIT(bit);
+}
+
+void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
+{
+       avic_update_access_page(kvm, activate);
+}
+
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+{
+       int ret = 0;
+       unsigned long flags;
+       struct amd_svm_iommu_ir *ir;
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm))
+               return 0;
+
+       /*
+        * Here, we go through the per-vcpu ir_list to update all existing
+        * interrupt remapping table entries targeting this vcpu.
+        */
+       spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+       if (list_empty(&svm->ir_list))
+               goto out;
+
+       list_for_each_entry(ir, &svm->ir_list, node) {
+               ret = amd_iommu_update_ga(cpu, r, ir->data);
+               if (ret)
+                       break;
+       }
+out:
+       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+       return ret;
+}
+
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       u64 entry;
+       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+       int h_physical_id = kvm_cpu_get_apicid(cpu);
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+
+       /*
+        * Since the host physical APIC id is 8 bits,
+        * we can support host APIC IDs up to 255.
+        */
+       if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+               return;
+
+       entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+       entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       if (svm->avic_is_running)
+               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+                                       svm->avic_is_running);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       u64 entry;
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+
+       entry = READ_ONCE(*(svm->avic_physical_id_cache));
+       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+}
+
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       svm->avic_is_running = is_run;
+       if (is_run)
+               avic_vcpu_load(vcpu, vcpu->cpu);
+       else
+               avic_vcpu_put(vcpu);
+}
+
+void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+       avic_set_running(vcpu, false);
+}
+
+void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+       if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+               kvm_vcpu_update_apicv(vcpu);
+       avic_set_running(vcpu, true);
+}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644 (file)
index 0000000..90a1ca9
--- /dev/null
@@ -0,0 +1,823 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+                                      struct x86_exception *fault)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+               /*
+                * TODO: track the cause of the nested page fault, and
+                * correctly fill in the high bits of exit_info_1.
+                */
+               svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+               svm->vmcb->control.exit_code_hi = 0;
+               svm->vmcb->control.exit_info_1 = (1ULL << 32);
+               svm->vmcb->control.exit_info_2 = fault->address;
+       }
+
+       svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
+       svm->vmcb->control.exit_info_1 |= fault->error_code;
+
+       /*
+        * The present bit is always zero for page structure faults on real
+        * hardware.
+        */
+       if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
+               svm->vmcb->control.exit_info_1 &= ~1;
+
+       nested_svm_vmexit(svm);
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 cr3 = svm->nested.nested_cr3;
+       u64 pdpte;
+       int ret;
+
+       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+                                      offset_in_page(cr3) + index * 8, 8);
+       if (ret)
+               return 0;
+       return pdpte;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       return svm->nested.nested_cr3;
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+       WARN_ON(mmu_is_nested(vcpu));
+
+       vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+       kvm_init_shadow_mmu(vcpu);
+       vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
+       vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
+       vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+       vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu);
+       reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
+       vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+void recalc_intercepts(struct vcpu_svm *svm)
+{
+       struct vmcb_control_area *c, *h;
+       struct nested_state *g;
+
+       mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+       if (!is_guest_mode(&svm->vcpu))
+               return;
+
+       c = &svm->vmcb->control;
+       h = &svm->nested.hsave->control;
+       g = &svm->nested;
+
+       c->intercept_cr = h->intercept_cr;
+       c->intercept_dr = h->intercept_dr;
+       c->intercept_exceptions = h->intercept_exceptions;
+       c->intercept = h->intercept;
+
+       if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+               /* We only want the cr8 intercept bits of L1 */
+               c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
+               c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
+
+               /*
+                * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
+                * affect any interrupt we may want to inject; therefore,
+                * interrupt window vmexits are irrelevant to L0.
+                */
+               c->intercept &= ~(1ULL << INTERCEPT_VINTR);
+       }
+
+       /* We don't want to see VMMCALLs from a nested guest */
+       c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
+       c->intercept_cr |= g->intercept_cr;
+       c->intercept_dr |= g->intercept_dr;
+       c->intercept_exceptions |= g->intercept_exceptions;
+       c->intercept |= g->intercept;
+}
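
recalc_intercepts() rebuilds L2's active intercept masks by starting from the controls saved in hsave, clearing the CR8 and interrupt-window (VINTR) intercepts when V_INTR_MASKING is in effect and L0's VMMCALL intercept unconditionally, and then OR-ing in everything the nested VMCB requested. The merge itself is plain bit arithmetic; a standalone sketch with made-up bit positions (not the real SVM encodings):

#include <stdint.h>
#include <stdio.h>

/* Placeholder intercept bits, not the real SVM encodings. */
#define ICPT_CR8_READ  (1ULL << 0)
#define ICPT_CR8_WRITE (1ULL << 1)
#define ICPT_VINTR     (1ULL << 2)
#define ICPT_VMMCALL   (1ULL << 3)
#define ICPT_CPUID     (1ULL << 4)

static uint64_t merge_intercepts(uint64_t host, uint64_t guest, int vintr_masking)
{
	uint64_t active = host;

	if (vintr_masking) {
		/* L1's CR8 and interrupt-window intercepts are irrelevant to L0. */
		active &= ~(ICPT_CR8_READ | ICPT_CR8_WRITE | ICPT_VINTR);
	}
	active &= ~ICPT_VMMCALL;   /* L0 drops its own VMMCALL intercept for L2 */
	return active | guest;     /* anything either level wants is intercepted */
}

int main(void)
{
	uint64_t host  = ICPT_CR8_READ | ICPT_VINTR | ICPT_VMMCALL;
	uint64_t guest = ICPT_CPUID;

	printf("active=%#llx\n",
	       (unsigned long long)merge_intercepts(host, guest, 1)); /* 0x10 */
	return 0;
}
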
+
+static void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+       struct vmcb_control_area *dst  = &dst_vmcb->control;
+       struct vmcb_control_area *from = &from_vmcb->control;
+
+       dst->intercept_cr         = from->intercept_cr;
+       dst->intercept_dr         = from->intercept_dr;
+       dst->intercept_exceptions = from->intercept_exceptions;
+       dst->intercept            = from->intercept;
+       dst->iopm_base_pa         = from->iopm_base_pa;
+       dst->msrpm_base_pa        = from->msrpm_base_pa;
+       dst->tsc_offset           = from->tsc_offset;
+       dst->asid                 = from->asid;
+       dst->tlb_ctl              = from->tlb_ctl;
+       dst->int_ctl              = from->int_ctl;
+       dst->int_vector           = from->int_vector;
+       dst->int_state            = from->int_state;
+       dst->exit_code            = from->exit_code;
+       dst->exit_code_hi         = from->exit_code_hi;
+       dst->exit_info_1          = from->exit_info_1;
+       dst->exit_info_2          = from->exit_info_2;
+       dst->exit_int_info        = from->exit_int_info;
+       dst->exit_int_info_err    = from->exit_int_info_err;
+       dst->nested_ctl           = from->nested_ctl;
+       dst->event_inj            = from->event_inj;
+       dst->event_inj_err        = from->event_inj_err;
+       dst->nested_cr3           = from->nested_cr3;
+       dst->virt_ext              = from->virt_ext;
+       dst->pause_filter_count   = from->pause_filter_count;
+       dst->pause_filter_thresh  = from->pause_filter_thresh;
+}
+
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+       /*
+        * This function merges the msr permission bitmaps of kvm and the
+        * nested vmcb. It is optimized in that it only merges the parts where
+        * the kvm msr permission bitmap may contain zero bits
+        */
+       int i;
+
+       if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+               return true;
+
+       for (i = 0; i < MSRPM_OFFSETS; i++) {
+               u32 value, p;
+               u64 offset;
+
+               if (msrpm_offsets[i] == 0xffffffff)
+                       break;
+
+               p      = msrpm_offsets[i];
+               offset = svm->nested.vmcb_msrpm + (p * 4);
+
+               if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+                       return false;
+
+               svm->nested.msrpm[p] = svm->msrpm[p] | value;
+       }
+
+       svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+       return true;
+}
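
nested_svm_vmrun_msrpm() merges the two MSR permission bitmaps by OR-ing them a 32-bit word at a time, so an MSR access exits if either L0 or L1 wants to intercept it; only the offsets where KVM's own bitmap can contain zero bits are visited. A standalone model of that merge (1 = intercept, as in the real MSRPM; sizes shrunk for the example):

#include <stdint.h>
#include <stdio.h>

#define WORDS 4   /* tiny stand-in for the real 8 KB bitmap */

/* Merge: an MSR access is intercepted if either L0 or L1 intercepts it. */
static void merge_msrpm(const uint32_t *l0, const uint32_t *l1, uint32_t *out)
{
	for (int i = 0; i < WORDS; i++)
		out[i] = l0[i] | l1[i];
}

int main(void)
{
	uint32_t l0[WORDS] = { 0x0000000f, 0, 0, 0 };  /* L0 intercepts a few MSRs */
	uint32_t l1[WORDS] = { 0x000000f0, 0, 0, 1 };  /* L1 adds its own */
	uint32_t out[WORDS];

	merge_msrpm(l0, l1, out);
	printf("word0=%#x word3=%#x\n", out[0], out[3]);  /* 0xff, 0x1 */
	return 0;
}
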
+
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+       if ((vmcb->save.efer & EFER_SVME) == 0)
+               return false;
+
+       if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+               return false;
+
+       if (vmcb->control.asid == 0)
+               return false;
+
+       if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+           !npt_enabled)
+               return false;
+
+       return true;
+}
+
+void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+                         struct vmcb *nested_vmcb, struct kvm_host_map *map)
+{
+       bool evaluate_pending_interrupts =
+               is_intercept(svm, INTERCEPT_VINTR) ||
+               is_intercept(svm, INTERCEPT_IRET);
+
+       if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
+               svm->vcpu.arch.hflags |= HF_HIF_MASK;
+       else
+               svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+       if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
+               svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+               nested_svm_init_mmu_context(&svm->vcpu);
+       }
+
+       /* Load the nested guest state */
+       svm->vmcb->save.es = nested_vmcb->save.es;
+       svm->vmcb->save.cs = nested_vmcb->save.cs;
+       svm->vmcb->save.ss = nested_vmcb->save.ss;
+       svm->vmcb->save.ds = nested_vmcb->save.ds;
+       svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+       svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+       kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
+       svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+       svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+       svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+       if (npt_enabled) {
+               svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+               svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+       } else
+               (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+
+       /* Guest paging mode is active - reset mmu */
+       kvm_mmu_reset_context(&svm->vcpu);
+
+       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
+       kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
+       kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
+       kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
+
+       /* In case we don't even reach vcpu_run, the fields are not updated */
+       svm->vmcb->save.rax = nested_vmcb->save.rax;
+       svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+       svm->vmcb->save.rip = nested_vmcb->save.rip;
+       svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+       svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+       svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+       svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+       svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
+
+       /* cache intercepts */
+       svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
+       svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
+       svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+       svm->nested.intercept            = nested_vmcb->control.intercept;
+
+       svm_flush_tlb(&svm->vcpu, true);
+       svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+       if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+               svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+       else
+               svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+       svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
+       svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
+       svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+       svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+       svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+       svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+       svm->vmcb->control.pause_filter_count =
+               nested_vmcb->control.pause_filter_count;
+       svm->vmcb->control.pause_filter_thresh =
+               nested_vmcb->control.pause_filter_thresh;
+
+       kvm_vcpu_unmap(&svm->vcpu, map, true);
+
+       /* Enter Guest-Mode */
+       enter_guest_mode(&svm->vcpu);
+
+       /*
+        * Merge guest and host intercepts - must be called with the vcpu in
+        * guest mode to take effect here.
+        */
+       recalc_intercepts(svm);
+
+       svm->nested.vmcb = vmcb_gpa;
+
+       /*
+        * If L1 had a pending IRQ/NMI before executing VMRUN,
+        * which wasn't delivered because it was disallowed (e.g.
+        * interrupts disabled), L0 needs to evaluate if this pending
+        * event should cause an exit from L2 to L1 or be delivered
+        * directly to L2.
+        *
+        * Usually this would be handled by the processor noticing an
+        * IRQ/NMI window request.  However, VMRUN can unblock interrupts
+        * by implicitly setting GIF, so force L0 to perform pending event
+        * evaluation by requesting a KVM_REQ_EVENT.
+        */
+       enable_gif(svm);
+       if (unlikely(evaluate_pending_interrupts))
+               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+       mark_all_dirty(svm->vmcb);
+}
+
+int nested_svm_vmrun(struct vcpu_svm *svm)
+{
+       int ret;
+       struct vmcb *nested_vmcb;
+       struct vmcb *hsave = svm->nested.hsave;
+       struct vmcb *vmcb = svm->vmcb;
+       struct kvm_host_map map;
+       u64 vmcb_gpa;
+
+       vmcb_gpa = svm->vmcb->save.rax;
+
+       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+       if (ret == -EINVAL) {
+               kvm_inject_gp(&svm->vcpu, 0);
+               return 1;
+       } else if (ret) {
+               return kvm_skip_emulated_instruction(&svm->vcpu);
+       }
+
+       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+       nested_vmcb = map.hva;
+
+       if (!nested_vmcb_checks(nested_vmcb)) {
+               nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
+               nested_vmcb->control.exit_code_hi = 0;
+               nested_vmcb->control.exit_info_1  = 0;
+               nested_vmcb->control.exit_info_2  = 0;
+
+               kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+               return ret;
+       }
+
+       trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+                              nested_vmcb->save.rip,
+                              nested_vmcb->control.int_ctl,
+                              nested_vmcb->control.event_inj,
+                              nested_vmcb->control.nested_ctl);
+
+       trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+                                   nested_vmcb->control.intercept_cr >> 16,
+                                   nested_vmcb->control.intercept_exceptions,
+                                   nested_vmcb->control.intercept);
+
+       /* Clear internal status */
+       kvm_clear_exception_queue(&svm->vcpu);
+       kvm_clear_interrupt_queue(&svm->vcpu);
+
+       /*
+        * Save the old vmcb, so we don't need to pick what we save, but can
+        * restore everything when a VMEXIT occurs
+        */
+       hsave->save.es     = vmcb->save.es;
+       hsave->save.cs     = vmcb->save.cs;
+       hsave->save.ss     = vmcb->save.ss;
+       hsave->save.ds     = vmcb->save.ds;
+       hsave->save.gdtr   = vmcb->save.gdtr;
+       hsave->save.idtr   = vmcb->save.idtr;
+       hsave->save.efer   = svm->vcpu.arch.efer;
+       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
+       hsave->save.cr4    = svm->vcpu.arch.cr4;
+       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
+       hsave->save.rsp    = vmcb->save.rsp;
+       hsave->save.rax    = vmcb->save.rax;
+       if (npt_enabled)
+               hsave->save.cr3    = vmcb->save.cr3;
+       else
+               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
+
+       copy_vmcb_control_area(hsave, vmcb);
+
+       enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
+
+       if (!nested_svm_vmrun_msrpm(svm)) {
+               svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
+               svm->vmcb->control.exit_code_hi = 0;
+               svm->vmcb->control.exit_info_1  = 0;
+               svm->vmcb->control.exit_info_2  = 0;
+
+               nested_svm_vmexit(svm);
+       }
+
+       return ret;
+}
+
+void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+       to_vmcb->save.fs = from_vmcb->save.fs;
+       to_vmcb->save.gs = from_vmcb->save.gs;
+       to_vmcb->save.tr = from_vmcb->save.tr;
+       to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+       to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+       to_vmcb->save.star = from_vmcb->save.star;
+       to_vmcb->save.lstar = from_vmcb->save.lstar;
+       to_vmcb->save.cstar = from_vmcb->save.cstar;
+       to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+       to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+       to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+       to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+       int rc;
+       struct vmcb *nested_vmcb;
+       struct vmcb *hsave = svm->nested.hsave;
+       struct vmcb *vmcb = svm->vmcb;
+       struct kvm_host_map map;
+
+       trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+                                      vmcb->control.exit_info_1,
+                                      vmcb->control.exit_info_2,
+                                      vmcb->control.exit_int_info,
+                                      vmcb->control.exit_int_info_err,
+                                      KVM_ISA_SVM);
+
+       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+       if (rc) {
+               if (rc == -EINVAL)
+                       kvm_inject_gp(&svm->vcpu, 0);
+               return 1;
+       }
+
+       nested_vmcb = map.hva;
+
+       /* Exit Guest-Mode */
+       leave_guest_mode(&svm->vcpu);
+       svm->nested.vmcb = 0;
+
+       /* Give the current vmcb to the guest */
+       disable_gif(svm);
+
+       nested_vmcb->save.es     = vmcb->save.es;
+       nested_vmcb->save.cs     = vmcb->save.cs;
+       nested_vmcb->save.ss     = vmcb->save.ss;
+       nested_vmcb->save.ds     = vmcb->save.ds;
+       nested_vmcb->save.gdtr   = vmcb->save.gdtr;
+       nested_vmcb->save.idtr   = vmcb->save.idtr;
+       nested_vmcb->save.efer   = svm->vcpu.arch.efer;
+       nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
+       nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
+       nested_vmcb->save.cr2    = vmcb->save.cr2;
+       nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
+       nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
+       nested_vmcb->save.rip    = vmcb->save.rip;
+       nested_vmcb->save.rsp    = vmcb->save.rsp;
+       nested_vmcb->save.rax    = vmcb->save.rax;
+       nested_vmcb->save.dr7    = vmcb->save.dr7;
+       nested_vmcb->save.dr6    = vmcb->save.dr6;
+       nested_vmcb->save.cpl    = vmcb->save.cpl;
+
+       nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
+       nested_vmcb->control.int_vector        = vmcb->control.int_vector;
+       nested_vmcb->control.int_state         = vmcb->control.int_state;
+       nested_vmcb->control.exit_code         = vmcb->control.exit_code;
+       nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
+       nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
+       nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
+       nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
+       nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+       if (svm->nrips_enabled)
+               nested_vmcb->control.next_rip  = vmcb->control.next_rip;
+
+       /*
+        * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+        * to make sure that we do not lose injected events. So check event_inj
+        * here and copy it to exit_int_info if it is valid.
+        * Exit_int_info and event_inj can't both be valid because the case
+        * below only happens on a VMRUN instruction intercept which has
+        * no valid exit_int_info set.
+        */
+       if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+               struct vmcb_control_area *nc = &nested_vmcb->control;
+
+               nc->exit_int_info     = vmcb->control.event_inj;
+               nc->exit_int_info_err = vmcb->control.event_inj_err;
+       }
+
+       nested_vmcb->control.tlb_ctl           = 0;
+       nested_vmcb->control.event_inj         = 0;
+       nested_vmcb->control.event_inj_err     = 0;
+
+       nested_vmcb->control.pause_filter_count =
+               svm->vmcb->control.pause_filter_count;
+       nested_vmcb->control.pause_filter_thresh =
+               svm->vmcb->control.pause_filter_thresh;
+
+       /* We always set V_INTR_MASKING and remember the old value in hflags */
+       if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+               nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+       /* Restore the original control entries */
+       copy_vmcb_control_area(vmcb, hsave);
+
+       svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
+       kvm_clear_exception_queue(&svm->vcpu);
+       kvm_clear_interrupt_queue(&svm->vcpu);
+
+       svm->nested.nested_cr3 = 0;
+
+       /* Restore selected save entries */
+       svm->vmcb->save.es = hsave->save.es;
+       svm->vmcb->save.cs = hsave->save.cs;
+       svm->vmcb->save.ss = hsave->save.ss;
+       svm->vmcb->save.ds = hsave->save.ds;
+       svm->vmcb->save.gdtr = hsave->save.gdtr;
+       svm->vmcb->save.idtr = hsave->save.idtr;
+       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
+       svm_set_efer(&svm->vcpu, hsave->save.efer);
+       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+       if (npt_enabled) {
+               svm->vmcb->save.cr3 = hsave->save.cr3;
+               svm->vcpu.arch.cr3 = hsave->save.cr3;
+       } else {
+               (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+       }
+       kvm_rax_write(&svm->vcpu, hsave->save.rax);
+       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
+       kvm_rip_write(&svm->vcpu, hsave->save.rip);
+       svm->vmcb->save.dr7 = 0;
+       svm->vmcb->save.cpl = 0;
+       svm->vmcb->control.exit_int_info = 0;
+
+       mark_all_dirty(svm->vmcb);
+
+       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+       nested_svm_uninit_mmu_context(&svm->vcpu);
+       kvm_mmu_reset_context(&svm->vcpu);
+       kvm_mmu_load(&svm->vcpu);
+
+       /*
+        * Drop what we picked up for L2 via svm_complete_interrupts() so it
+        * doesn't end up in L1.
+        */
+       svm->vcpu.arch.nmi_injected = false;
+       kvm_clear_exception_queue(&svm->vcpu);
+       kvm_clear_interrupt_queue(&svm->vcpu);
+
+       return 0;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+       u32 offset, msr, value;
+       int write, mask;
+
+       if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+               return NESTED_EXIT_HOST;
+
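+       /*
+        * Each MSR is represented by two consecutive bits (read, then write)
+        * in the nested MSR permission map, sixteen MSRs per 32-bit word;
+        * svm_msrpm_offset() returns the word that covers this MSR.
+        */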
+       msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+       offset = svm_msrpm_offset(msr);
+       write  = svm->vmcb->control.exit_info_1 & 1;
+       mask   = 1 << ((2 * (msr & 0xf)) + write);
+
+       if (offset == MSR_INVALID)
+               return NESTED_EXIT_DONE;
+
+       /* The offset is in 32-bit units but we need it in 8-bit (byte) units */
+       offset *= 4;
+
+       if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
+               return NESTED_EXIT_DONE;
+
+       return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+/* DB exceptions for our internal use must not cause a vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+       unsigned long dr6;
+
+       /* if we're not singlestepping, it's not ours */
+       if (!svm->nmi_singlestep)
+               return NESTED_EXIT_DONE;
+
+       /* if it's not a singlestep exception, it's not ours */
+       if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+               return NESTED_EXIT_DONE;
+       if (!(dr6 & DR6_BS))
+               return NESTED_EXIT_DONE;
+
+       /* if the guest is singlestepping, it should get the vmexit */
+       if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+               disable_nmi_singlestep(svm);
+               return NESTED_EXIT_DONE;
+       }
+
+       /* it's ours, the nested hypervisor must not see this one */
+       return NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+       unsigned port, size, iopm_len;
+       u16 val, mask;
+       u8 start_bit;
+       u64 gpa;
+
+       if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+               return NESTED_EXIT_HOST;
+
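+       /*
+        * The nested I/O permission bitmap has one bit per port.  An access
+        * of 'size' bytes covers 'size' consecutive bits starting at the
+        * port's bit, which may straddle a byte boundary, so up to two bytes
+        * of the bitmap are read from guest memory.
+        */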
+       port = svm->vmcb->control.exit_info_1 >> 16;
+       size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+               SVM_IOIO_SIZE_SHIFT;
+       gpa  = svm->nested.vmcb_iopm + (port / 8);
+       start_bit = port % 8;
+       iopm_len = (start_bit + size > 8) ? 2 : 1;
+       mask = (0xf >> (4 - size)) << start_bit;
+       val = 0;
+
+       if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+               return NESTED_EXIT_DONE;
+
+       return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+       u32 exit_code = svm->vmcb->control.exit_code;
+       int vmexit = NESTED_EXIT_HOST;
+
+       switch (exit_code) {
+       case SVM_EXIT_MSR:
+               vmexit = nested_svm_exit_handled_msr(svm);
+               break;
+       case SVM_EXIT_IOIO:
+               vmexit = nested_svm_intercept_ioio(svm);
+               break;
+       case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+               u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+               if (svm->nested.intercept_cr & bit)
+                       vmexit = NESTED_EXIT_DONE;
+               break;
+       }
+       case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+               u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+               if (svm->nested.intercept_dr & bit)
+                       vmexit = NESTED_EXIT_DONE;
+               break;
+       }
+       case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+               u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+               if (svm->nested.intercept_exceptions & excp_bits) {
+                       if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+                               vmexit = nested_svm_intercept_db(svm);
+                       else
+                               vmexit = NESTED_EXIT_DONE;
+               }
+               /* async page faults always cause a vmexit */
+               else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+                        svm->vcpu.arch.exception.nested_apf != 0)
+                       vmexit = NESTED_EXIT_DONE;
+               break;
+       }
+       case SVM_EXIT_ERR: {
+               vmexit = NESTED_EXIT_DONE;
+               break;
+       }
+       default: {
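+               /*
+                * The general intercept vector starts at SVM_EXIT_INTR, so the
+                * exit code maps directly onto a bit in the 64-bit intercept
+                * field cached from the nested VMCB.
+                */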
+               u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+               if (svm->nested.intercept & exit_bits)
+                       vmexit = NESTED_EXIT_DONE;
+       }
+       }
+
+       return vmexit;
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+       int vmexit;
+
+       vmexit = nested_svm_intercept(svm);
+
+       if (vmexit == NESTED_EXIT_DONE)
+               nested_svm_vmexit(svm);
+
+       return vmexit;
+}
+
+int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+           !is_paging(&svm->vcpu)) {
+               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (svm->vmcb->save.cpl) {
+               kvm_inject_gp(&svm->vcpu, 0);
+               return 1;
+       }
+
+       return 0;
+}
+
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+                              bool has_error_code, u32 error_code)
+{
+       int vmexit;
+
+       if (!is_guest_mode(&svm->vcpu))
+               return 0;
+
+       vmexit = nested_svm_intercept(svm);
+       if (vmexit != NESTED_EXIT_DONE)
+               return 0;
+
+       svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+       svm->vmcb->control.exit_code_hi = 0;
+       svm->vmcb->control.exit_info_1 = error_code;
+
+       /*
+        * EXITINFO2 is undefined for all exception intercepts other
+        * than #PF.
+        */
+       if (svm->vcpu.arch.exception.nested_apf)
+               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+       else if (svm->vcpu.arch.exception.has_payload)
+               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+       else
+               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+       svm->nested.exit_required = true;
+       return vmexit;
+}
+
+static void nested_svm_intr(struct vcpu_svm *svm)
+{
+       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
+       svm->vmcb->control.exit_info_1 = 0;
+       svm->vmcb->control.exit_info_2 = 0;
+
+       /* nested_svm_vmexit() is called afterwards from handle_exit() */
+       svm->nested.exit_required = true;
+       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+}
+
+static bool nested_exit_on_intr(struct vcpu_svm *svm)
+{
+       return (svm->nested.intercept & 1ULL);
+}
+
+int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool block_nested_events =
+               kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
+
+       if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
+               if (block_nested_events)
+                       return -EBUSY;
+               nested_svm_intr(svm);
+               return 0;
+       }
+
+       return 0;
+}
+
+int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+       u32 exit_code = svm->vmcb->control.exit_code;
+
+       switch (exit_code) {
+       case SVM_EXIT_INTR:
+       case SVM_EXIT_NMI:
+       case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+               return NESTED_EXIT_HOST;
+       case SVM_EXIT_NPF:
+               /* For now we are always handling NPFs when using them */
+               if (npt_enabled)
+                       return NESTED_EXIT_HOST;
+               break;
+       case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+               /* When we're shadowing, trap PFs, but not async PF */
+               if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
+                       return NESTED_EXIT_HOST;
+               break;
+       default:
+               break;
+       }
+
+       return NESTED_EXIT_CONTINUE;
+}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
new file mode 100644 (file)
index 0000000..0e3fc31
--- /dev/null
@@ -0,0 +1,1187 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM-SEV support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/psp-sev.h>
+#include <linux/swap.h>
+
+#include "x86.h"
+#include "svm.h"
+
+static int sev_flush_asids(void);
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
+unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+struct enc_region {
+       struct list_head list;
+       unsigned long npages;
+       struct page **pages;
+       unsigned long uaddr;
+       unsigned long size;
+};
+
+static int sev_flush_asids(void)
+{
+       int ret, error = 0;
+
+       /*
+        * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+        * so it must be guarded.
+        */
+       down_write(&sev_deactivate_lock);
+
+       wbinvd_on_all_cpus();
+       ret = sev_guest_df_flush(&error);
+
+       up_write(&sev_deactivate_lock);
+
+       if (ret)
+               pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+
+       return ret;
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(void)
+{
+       int pos;
+
+       /* Check if there are any ASIDs to reclaim before performing a flush */
+       pos = find_next_bit(sev_reclaim_asid_bitmap,
+                           max_sev_asid, min_sev_asid - 1);
+       if (pos >= max_sev_asid)
+               return false;
+
+       if (sev_flush_asids())
+               return false;
+
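+       /*
+        * Freed ASIDs are still marked as in-use in sev_asid_bitmap and are
+        * also set in sev_reclaim_asid_bitmap, so XOR-ing the two clears
+        * exactly the reclaimed bits and makes those ASIDs available again.
+        */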
+       bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+                  max_sev_asid);
+       bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+
+       return true;
+}
+
+static int sev_asid_new(void)
+{
+       bool retry = true;
+       int pos;
+
+       mutex_lock(&sev_bitmap_lock);
+
+       /*
+        * An SEV-enabled guest must use an ASID in the range
+        * min_sev_asid to max_sev_asid.
+        */
+again:
+       pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+       if (pos >= max_sev_asid) {
+               if (retry && __sev_recycle_asids()) {
+                       retry = false;
+                       goto again;
+               }
+               mutex_unlock(&sev_bitmap_lock);
+               return -EBUSY;
+       }
+
+       __set_bit(pos, sev_asid_bitmap);
+
+       mutex_unlock(&sev_bitmap_lock);
+
+       return pos + 1;
+}
+
+static int sev_get_asid(struct kvm *kvm)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+       return sev->asid;
+}
+
+static void sev_asid_free(int asid)
+{
+       struct svm_cpu_data *sd;
+       int cpu, pos;
+
+       mutex_lock(&sev_bitmap_lock);
+
+       pos = asid - 1;
+       __set_bit(pos, sev_reclaim_asid_bitmap);
+
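+       /*
+        * Drop the cached VMCB pointers for this ASID on every CPU so that a
+        * future guest reusing the ASID is forced to flush the TLB in
+        * pre_sev_run().
+        */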
+       for_each_possible_cpu(cpu) {
+               sd = per_cpu(svm_data, cpu);
+               sd->sev_vmcbs[pos] = NULL;
+       }
+
+       mutex_unlock(&sev_bitmap_lock);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+       struct sev_data_decommission *decommission;
+       struct sev_data_deactivate *data;
+
+       if (!handle)
+               return;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return;
+
+       /* deactivate handle */
+       data->handle = handle;
+
+       /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+       down_read(&sev_deactivate_lock);
+       sev_guest_deactivate(data, NULL);
+       up_read(&sev_deactivate_lock);
+
+       kfree(data);
+
+       decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+       if (!decommission)
+               return;
+
+       /* decommission handle */
+       decommission->handle = handle;
+       sev_guest_decommission(decommission, NULL);
+
+       kfree(decommission);
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       int asid, ret;
+
+       ret = -EBUSY;
+       if (unlikely(sev->active))
+               return ret;
+
+       asid = sev_asid_new();
+       if (asid < 0)
+               return ret;
+
+       ret = sev_platform_init(&argp->error);
+       if (ret)
+               goto e_free;
+
+       sev->active = true;
+       sev->asid = asid;
+       INIT_LIST_HEAD(&sev->regions_list);
+
+       return 0;
+
+e_free:
+       sev_asid_free(asid);
+       return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+       struct sev_data_activate *data;
+       int asid = sev_get_asid(kvm);
+       int ret;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       /* activate ASID on the given handle */
+       data->handle = handle;
+       data->asid   = asid;
+       ret = sev_guest_activate(data, error);
+       kfree(data);
+
+       return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+       struct fd f;
+       int ret;
+
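+       /*
+        * Take a temporary reference to the file behind the user-supplied
+        * SEV device fd for the duration of the command.
+        */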
+       f = fdget(fd);
+       if (!f.file)
+               return -EBADF;
+
+       ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+       fdput(f);
+       return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+       return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_launch_start *start;
+       struct kvm_sev_launch_start params;
+       void *dh_blob, *session_blob;
+       int *error = &argp->error;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
+       if (!start)
+               return -ENOMEM;
+
+       dh_blob = NULL;
+       if (params.dh_uaddr) {
+               dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+               if (IS_ERR(dh_blob)) {
+                       ret = PTR_ERR(dh_blob);
+                       goto e_free;
+               }
+
+               start->dh_cert_address = __sme_set(__pa(dh_blob));
+               start->dh_cert_len = params.dh_len;
+       }
+
+       session_blob = NULL;
+       if (params.session_uaddr) {
+               session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+               if (IS_ERR(session_blob)) {
+                       ret = PTR_ERR(session_blob);
+                       goto e_free_dh;
+               }
+
+               start->session_address = __sme_set(__pa(session_blob));
+               start->session_len = params.session_len;
+       }
+
+       start->handle = params.handle;
+       start->policy = params.policy;
+
+       /* create memory encryption context */
+       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+       if (ret)
+               goto e_free_session;
+
+       /* Bind ASID to this guest */
+       ret = sev_bind_asid(kvm, start->handle, error);
+       if (ret)
+               goto e_free_session;
+
+       /* return handle to userspace */
+       params.handle = start->handle;
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+               sev_unbind_asid(kvm, start->handle);
+               ret = -EFAULT;
+               goto e_free_session;
+       }
+
+       sev->handle = start->handle;
+       sev->fd = argp->sev_fd;
+
+e_free_session:
+       kfree(session_blob);
+e_free_dh:
+       kfree(dh_blob);
+e_free:
+       kfree(start);
+       return ret;
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+                                   unsigned long ulen, unsigned long *n,
+                                   int write)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       unsigned long npages, npinned, size;
+       unsigned long locked, lock_limit;
+       struct page **pages;
+       unsigned long first, last;
+
+       if (ulen == 0 || uaddr + ulen < uaddr)
+               return NULL;
+
+       /* Calculate number of pages. */
+       first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+       last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+       npages = (last - first + 1);
+
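+       /*
+        * Account the pinned pages against RLIMIT_MEMLOCK; only a task with
+        * CAP_IPC_LOCK may exceed the limit.
+        */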
+       locked = sev->pages_locked + npages;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+               pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+               return NULL;
+       }
+
+       /* Avoid using vmalloc for smaller buffers. */
+       size = npages * sizeof(struct page *);
+       if (size > PAGE_SIZE)
+               pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+                                 PAGE_KERNEL);
+       else
+               pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
+
+       if (!pages)
+               return NULL;
+
+       /* Pin the user virtual address. */
+       npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
+       if (npinned != npages) {
+               pr_err("SEV: Failure locking %lu pages.\n", npages);
+               goto err;
+       }
+
+       *n = npages;
+       sev->pages_locked = locked;
+
+       return pages;
+
+err:
+       if (npinned > 0)
+               release_pages(pages, npinned);
+
+       kvfree(pages);
+       return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+                            unsigned long npages)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+       release_pages(pages, npages);
+       kvfree(pages);
+       sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+       uint8_t *page_virtual;
+       unsigned long i;
+
+       if (npages == 0 || pages == NULL)
+               return;
+
+       for (i = 0; i < npages; i++) {
+               page_virtual = kmap_atomic(pages[i]);
+               clflush_cache_range(page_virtual, PAGE_SIZE);
+               kunmap_atomic(page_virtual);
+       }
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+                               struct page **inpages, unsigned long npages)
+{
+       unsigned long paddr, next_paddr;
+       unsigned long i = idx + 1, pages = 1;
+
+       /* find the number of contiguous pages starting from idx */
+       paddr = __sme_page_pa(inpages[idx]);
+       while (i < npages) {
+               next_paddr = __sme_page_pa(inpages[i++]);
+               if ((paddr + PAGE_SIZE) == next_paddr) {
+                       pages++;
+                       paddr = next_paddr;
+                       continue;
+               }
+               break;
+       }
+
+       return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct kvm_sev_launch_update_data params;
+       struct sev_data_launch_update_data *data;
+       struct page **inpages;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       vaddr = params.uaddr;
+       size = params.len;
+       vaddr_end = vaddr + size;
+
+       /* Lock the user memory. */
+       inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+       if (!inpages) {
+               ret = -ENOMEM;
+               goto e_free;
+       }
+
+       /*
+        * The LAUNCH_UPDATE command will perform in-place encryption of the
+        * memory content (i.e., it will write the same memory region with C=1).
+        * It's possible that the cache may contain the data with C=0, i.e.,
+        * unencrypted, so invalidate it first.
+        */
+       sev_clflush_pages(inpages, npages);
+
+       for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+               int offset, len;
+
+               /*
+                * If the user buffer is not page-aligned, calculate the offset
+                * within the page.
+                */
+               offset = vaddr & (PAGE_SIZE - 1);
+
+               /* Calculate the number of pages that can be encrypted in one go. */
+               pages = get_num_contig_pages(i, inpages, npages);
+
+               len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+               data->handle = sev->handle;
+               data->len = len;
+               data->address = __sme_page_pa(inpages[i]) + offset;
+               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+               if (ret)
+                       goto e_unpin;
+
+               size -= len;
+               next_vaddr = vaddr + len;
+       }
+
+e_unpin:
+       /* The memory content has been updated; mark the pages dirty */
+       for (i = 0; i < npages; i++) {
+               set_page_dirty_lock(inpages[i]);
+               mark_page_accessed(inpages[i]);
+       }
+       /* unlock the user pages */
+       sev_unpin_memory(kvm, inpages, npages);
+e_free:
+       kfree(data);
+       return ret;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       void __user *measure = (void __user *)(uintptr_t)argp->data;
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_launch_measure *data;
+       struct kvm_sev_launch_measure params;
+       void __user *p = NULL;
+       void *blob = NULL;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, measure, sizeof(params)))
+               return -EFAULT;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       /* User wants to query the blob length */
+       if (!params.len)
+               goto cmd;
+
+       p = (void __user *)(uintptr_t)params.uaddr;
+       if (p) {
+               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+                       ret = -EINVAL;
+                       goto e_free;
+               }
+
+               ret = -ENOMEM;
+               blob = kmalloc(params.len, GFP_KERNEL);
+               if (!blob)
+                       goto e_free;
+
+               data->address = __psp_pa(blob);
+               data->len = params.len;
+       }
+
+cmd:
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+       /*
+        * If we were only querying the measurement blob length, the firmware
+        * has filled in the expected length; just report it back to userspace.
+        */
+       if (!params.len)
+               goto done;
+
+       if (ret)
+               goto e_free_blob;
+
+       if (blob) {
+               if (copy_to_user(p, blob, params.len))
+                       ret = -EFAULT;
+       }
+
+done:
+       params.len = data->len;
+       if (copy_to_user(measure, &params, sizeof(params)))
+               ret = -EFAULT;
+e_free_blob:
+       kfree(blob);
+e_free:
+       kfree(data);
+       return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_launch_finish *data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+       kfree(data);
+       return ret;
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct kvm_sev_guest_status params;
+       struct sev_data_guest_status *data;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+       if (ret)
+               goto e_free;
+
+       params.policy = data->policy;
+       params.state = data->state;
+       params.handle = data->handle;
+
+       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+               ret = -EFAULT;
+e_free:
+       kfree(data);
+       return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+                              unsigned long dst, int size,
+                              int *error, bool enc)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_dbg *data;
+       int ret;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               return -ENOMEM;
+
+       data->handle = sev->handle;
+       data->dst_addr = dst;
+       data->src_addr = src;
+       data->len = size;
+
+       ret = sev_issue_cmd(kvm,
+                           enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+                           data, error);
+       kfree(data);
+       return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+                            unsigned long dst_paddr, int sz, int *err)
+{
+       int offset;
+
+       /*
+        * It's safe to read more than we were asked for; the caller should
+        * ensure that the destination has enough space.
+        */
+       src_paddr = round_down(src_paddr, 16);
+       offset = src_paddr & 15;
+       sz = round_up(sz + offset, 16);
+
+       return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+                                 unsigned long __user dst_uaddr,
+                                 unsigned long dst_paddr,
+                                 int size, int *err)
+{
+       struct page *tpage = NULL;
+       int ret, offset;
+
+       /* If the inputs are not 16-byte aligned, use an intermediate buffer */
+       if (!IS_ALIGNED(dst_paddr, 16) ||
+           !IS_ALIGNED(paddr,     16) ||
+           !IS_ALIGNED(size,      16)) {
+               tpage = (void *)alloc_page(GFP_KERNEL);
+               if (!tpage)
+                       return -ENOMEM;
+
+               dst_paddr = __sme_page_pa(tpage);
+       }
+
+       ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+       if (ret)
+               goto e_free;
+
+       if (tpage) {
+               offset = paddr & 15;
+               if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+                                page_address(tpage) + offset, size))
+                       ret = -EFAULT;
+       }
+
+e_free:
+       if (tpage)
+               __free_page(tpage);
+
+       return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+                                 unsigned long __user vaddr,
+                                 unsigned long dst_paddr,
+                                 unsigned long __user dst_vaddr,
+                                 int size, int *error)
+{
+       struct page *src_tpage = NULL;
+       struct page *dst_tpage = NULL;
+       int ret, len = size;
+
+       /* If source buffer is not aligned then use an intermediate buffer */
+       if (!IS_ALIGNED(vaddr, 16)) {
+               src_tpage = alloc_page(GFP_KERNEL);
+               if (!src_tpage)
+                       return -ENOMEM;
+
+               if (copy_from_user(page_address(src_tpage),
+                               (void __user *)(uintptr_t)vaddr, size)) {
+                       __free_page(src_tpage);
+                       return -EFAULT;
+               }
+
+               paddr = __sme_page_pa(src_tpage);
+       }
+
+       /*
+        *  If the destination buffer or length is not aligned, do a read-modify-write:
+        *   - decrypt the destination into an intermediate buffer
+        *   - copy the source buffer into an intermediate buffer
+        *   - use the intermediate buffer as the source buffer
+        */
+       if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+               int dst_offset;
+
+               dst_tpage = alloc_page(GFP_KERNEL);
+               if (!dst_tpage) {
+                       ret = -ENOMEM;
+                       goto e_free;
+               }
+
+               ret = __sev_dbg_decrypt(kvm, dst_paddr,
+                                       __sme_page_pa(dst_tpage), size, error);
+               if (ret)
+                       goto e_free;
+
+               /*
+                *  If the source is a kernel buffer then use memcpy(),
+                *  otherwise use copy_from_user().
+                */
+               dst_offset = dst_paddr & 15;
+
+               if (src_tpage)
+                       memcpy(page_address(dst_tpage) + dst_offset,
+                              page_address(src_tpage), size);
+               else {
+                       if (copy_from_user(page_address(dst_tpage) + dst_offset,
+                                          (void __user *)(uintptr_t)vaddr, size)) {
+                               ret = -EFAULT;
+                               goto e_free;
+                       }
+               }
+
+               paddr = __sme_page_pa(dst_tpage);
+               dst_paddr = round_down(dst_paddr, 16);
+               len = round_up(size, 16);
+       }
+
+       ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+       if (src_tpage)
+               __free_page(src_tpage);
+       if (dst_tpage)
+               __free_page(dst_tpage);
+       return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+       unsigned long vaddr, vaddr_end, next_vaddr;
+       unsigned long dst_vaddr;
+       struct page **src_p, **dst_p;
+       struct kvm_sev_dbg debug;
+       unsigned long n;
+       unsigned int size;
+       int ret;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+               return -EFAULT;
+
+       if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+               return -EINVAL;
+       if (!debug.dst_uaddr)
+               return -EINVAL;
+
+       vaddr = debug.src_uaddr;
+       size = debug.len;
+       vaddr_end = vaddr + size;
+       dst_vaddr = debug.dst_uaddr;
+
+       for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+               int len, s_off, d_off;
+
+               /* lock userspace source and destination page */
+               src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+               if (!src_p)
+                       return -EFAULT;
+
+               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+               if (!dst_p) {
+                       sev_unpin_memory(kvm, src_p, n);
+                       return -EFAULT;
+               }
+
+               /*
+                * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
+                * memory content (i.e., it will write the same memory region with C=1).
+                * It's possible that the cache may contain the data with C=0, i.e.,
+                * unencrypted, so invalidate it first.
+                */
+               sev_clflush_pages(src_p, 1);
+               sev_clflush_pages(dst_p, 1);
+
+               /*
+                * Since the user buffers may not be page aligned, calculate
+                * the offsets within the page.
+                */
+               s_off = vaddr & ~PAGE_MASK;
+               d_off = dst_vaddr & ~PAGE_MASK;
+               len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+               if (dec)
+                       ret = __sev_dbg_decrypt_user(kvm,
+                                                    __sme_page_pa(src_p[0]) + s_off,
+                                                    dst_vaddr,
+                                                    __sme_page_pa(dst_p[0]) + d_off,
+                                                    len, &argp->error);
+               else
+                       ret = __sev_dbg_encrypt_user(kvm,
+                                                    __sme_page_pa(src_p[0]) + s_off,
+                                                    vaddr,
+                                                    __sme_page_pa(dst_p[0]) + d_off,
+                                                    dst_vaddr,
+                                                    len, &argp->error);
+
+               sev_unpin_memory(kvm, src_p, n);
+               sev_unpin_memory(kvm, dst_p, n);
+
+               if (ret)
+                       goto err;
+
+               next_vaddr = vaddr + len;
+               dst_vaddr = dst_vaddr + len;
+               size -= len;
+       }
+err:
+       return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct sev_data_launch_secret *data;
+       struct kvm_sev_launch_secret params;
+       struct page **pages;
+       void *blob, *hdr;
+       unsigned long n;
+       int ret, offset;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+               return -EFAULT;
+
+       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+       if (!pages)
+               return -ENOMEM;
+
+       /*
+        * The secret must be copied into a contiguous memory region; let's
+        * verify that the userspace memory pages are contiguous before we
+        * issue the command.
+        */
+       if (get_num_contig_pages(0, pages, n) != n) {
+               ret = -EINVAL;
+               goto e_unpin_memory;
+       }
+
+       ret = -ENOMEM;
+       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+       if (!data)
+               goto e_unpin_memory;
+
+       offset = params.guest_uaddr & (PAGE_SIZE - 1);
+       data->guest_address = __sme_page_pa(pages[0]) + offset;
+       data->guest_len = params.guest_len;
+
+       blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+       if (IS_ERR(blob)) {
+               ret = PTR_ERR(blob);
+               goto e_free;
+       }
+
+       data->trans_address = __psp_pa(blob);
+       data->trans_len = params.trans_len;
+
+       hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+       if (IS_ERR(hdr)) {
+               ret = PTR_ERR(hdr);
+               goto e_free_blob;
+       }
+       data->hdr_address = __psp_pa(hdr);
+       data->hdr_len = params.hdr_len;
+
+       data->handle = sev->handle;
+       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+       kfree(hdr);
+
+e_free_blob:
+       kfree(blob);
+e_free:
+       kfree(data);
+e_unpin_memory:
+       sev_unpin_memory(kvm, pages, n);
+       return ret;
+}
+
+int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+       struct kvm_sev_cmd sev_cmd;
+       int r;
+
+       if (!svm_sev_enabled())
+               return -ENOTTY;
+
+       if (!argp)
+               return 0;
+
+       if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+               return -EFAULT;
+
+       mutex_lock(&kvm->lock);
+
+       switch (sev_cmd.id) {
+       case KVM_SEV_INIT:
+               r = sev_guest_init(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_START:
+               r = sev_launch_start(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_UPDATE_DATA:
+               r = sev_launch_update_data(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_MEASURE:
+               r = sev_launch_measure(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_LAUNCH_FINISH:
+               r = sev_launch_finish(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_GUEST_STATUS:
+               r = sev_guest_status(kvm, &sev_cmd);
+               break;
+       case KVM_SEV_DBG_DECRYPT:
+               r = sev_dbg_crypt(kvm, &sev_cmd, true);
+               break;
+       case KVM_SEV_DBG_ENCRYPT:
+               r = sev_dbg_crypt(kvm, &sev_cmd, false);
+               break;
+       case KVM_SEV_LAUNCH_SECRET:
+               r = sev_launch_secret(kvm, &sev_cmd);
+               break;
+       default:
+               r = -EINVAL;
+               goto out;
+       }
+
+       if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+               r = -EFAULT;
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
+int svm_register_enc_region(struct kvm *kvm,
+                           struct kvm_enc_region *range)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct enc_region *region;
+       int ret = 0;
+
+       if (!sev_guest(kvm))
+               return -ENOTTY;
+
+       if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+               return -EINVAL;
+
+       region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+       if (!region)
+               return -ENOMEM;
+
+       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+       if (!region->pages) {
+               ret = -ENOMEM;
+               goto e_free;
+       }
+
+       /*
+        * The guest may change the memory encryption attribute from C=0 -> C=1
+        * or vice versa for this memory range. Let's make sure the caches are
+        * flushed to ensure that guest data gets written into memory with the
+        * correct C-bit.
+        */
+       sev_clflush_pages(region->pages, region->npages);
+
+       region->uaddr = range->addr;
+       region->size = range->size;
+
+       mutex_lock(&kvm->lock);
+       list_add_tail(&region->list, &sev->regions_list);
+       mutex_unlock(&kvm->lock);
+
+       return ret;
+
+e_free:
+       kfree(region);
+       return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct list_head *head = &sev->regions_list;
+       struct enc_region *i;
+
+       list_for_each_entry(i, head, list) {
+               if (i->uaddr == range->addr &&
+                   i->size == range->size)
+                       return i;
+       }
+
+       return NULL;
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+                                          struct enc_region *region)
+{
+       sev_unpin_memory(kvm, region->pages, region->npages);
+       list_del(&region->list);
+       kfree(region);
+}
+
+int svm_unregister_enc_region(struct kvm *kvm,
+                             struct kvm_enc_region *range)
+{
+       struct enc_region *region;
+       int ret;
+
+       mutex_lock(&kvm->lock);
+
+       if (!sev_guest(kvm)) {
+               ret = -ENOTTY;
+               goto failed;
+       }
+
+       region = find_enc_region(kvm, range);
+       if (!region) {
+               ret = -EINVAL;
+               goto failed;
+       }
+
+       /*
+        * Ensure that all guest tagged cache entries are flushed before
+        * releasing the pages back to the system for use. CLFLUSH will
+        * not do this, so issue a WBINVD.
+        */
+       wbinvd_on_all_cpus();
+
+       __unregister_enc_region_locked(kvm, region);
+
+       mutex_unlock(&kvm->lock);
+       return 0;
+
+failed:
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+void sev_vm_destroy(struct kvm *kvm)
+{
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct list_head *head = &sev->regions_list;
+       struct list_head *pos, *q;
+
+       if (!sev_guest(kvm))
+               return;
+
+       mutex_lock(&kvm->lock);
+
+       /*
+        * Ensure that all guest tagged cache entries are flushed before
+        * releasing the pages back to the system for use. CLFLUSH will
+        * not do this, so issue a WBINVD.
+        */
+       wbinvd_on_all_cpus();
+
+       /*
+        * If userspace was terminated before unregistering the memory regions,
+        * then let's unpin all the registered memory.
+        */
+       if (!list_empty(head)) {
+               list_for_each_safe(pos, q, head) {
+                       __unregister_enc_region_locked(kvm,
+                               list_entry(pos, struct enc_region, list));
+               }
+       }
+
+       mutex_unlock(&kvm->lock);
+
+       sev_unbind_asid(kvm, sev->handle);
+       sev_asid_free(sev->asid);
+}
+
+int __init sev_hardware_setup(void)
+{
+       struct sev_user_data_status *status;
+       int rc;
+
+       /* Maximum number of encrypted guests supported simultaneously */
+       max_sev_asid = cpuid_ecx(0x8000001F);
+
+       if (!max_sev_asid)
+               return 1;
+
+       /* Minimum ASID value that should be used for an SEV guest */
+       min_sev_asid = cpuid_edx(0x8000001F);
+
+       /* Initialize SEV ASID bitmaps */
+       sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+       if (!sev_asid_bitmap)
+               return 1;
+
+       sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+       if (!sev_reclaim_asid_bitmap)
+               return 1;
+
+       status = kmalloc(sizeof(*status), GFP_KERNEL);
+       if (!status)
+               return 1;
+
+       /*
+        * Check SEV platform status.
+        *
+        * PLATFORM_STATUS can be called in any state. If we fail to query
+        * the platform status then either the PSP firmware does not support
+        * the SEV feature or the SEV firmware is dead.
+        */
+       rc = sev_platform_status(status, NULL);
+       if (rc)
+               goto err;
+
+       pr_info("SEV supported\n");
+
+err:
+       kfree(status);
+       return rc;
+}
+
+void sev_hardware_teardown(void)
+{
+       bitmap_free(sev_asid_bitmap);
+       bitmap_free(sev_reclaim_asid_bitmap);
+
+       sev_flush_asids();
+}
+
+void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+       int asid = sev_get_asid(svm->vcpu.kvm);
+
+       /* Assign the ASID allocated for this SEV guest */
+       svm->vmcb->control.asid = asid;
+
+       /*
+        * Flush guest TLB:
+        *
+        * 1) when a different VMCB for the same ASID is to be run on the
+        *    same host CPU, or
+        * 2) this VMCB was executed on a different host CPU in previous VMRUNs.
+        */
+       if (sd->sev_vmcbs[asid] == svm->vmcb &&
+           svm->last_cpu == cpu)
+               return;
+
+       svm->last_cpu = cpu;
+       sd->sev_vmcbs[asid] = svm->vmcb;
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+       mark_dirty(svm->vmcb, VMCB_ASID);
+}
similarity index 54%
rename from arch/x86/kvm/svm.c
rename to arch/x86/kvm/svm/svm.c
index 851e9cc..2be5bba 100644 (file)
@@ -1,17 +1,3 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- *   Yaniv Kamay  <yaniv@qumranet.com>
- *   Avi Kivity   <avi@qumranet.com>
- */
-
 #define pr_fmt(fmt) "SVM: " fmt
 
 #include <linux/kvm_host.h>
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
+#include <linux/amd-iommu.h>
 #include <linux/sched.h>
 #include <linux/trace_events.h>
 #include <linux/slab.h>
-#include <linux/amd-iommu.h>
 #include <linux/hashtable.h>
 #include <linux/frame.h>
 #include <linux/psp-sev.h>
@@ -53,6 +39,8 @@
 #include <asm/virtext.h>
 #include "trace.h"
 
+#include "svm.h"
+
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
 MODULE_AUTHOR("Qumranet");
@@ -80,107 +68,15 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
 #define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
 
-#define SVM_AVIC_DOORBELL      0xc001011b
-
-#define NESTED_EXIT_HOST       0       /* Exit handled on host level */
-#define NESTED_EXIT_DONE       1       /* Exit caused nested vmexit  */
-#define NESTED_EXIT_CONTINUE   2       /* Further checks needed      */
-
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
 #define TSC_RATIO_RSVD          0xffffff0000000000ULL
 #define TSC_RATIO_MIN          0x0000000000000001ULL
 #define TSC_RATIO_MAX          0x000000ffffffffffULL
 
-#define AVIC_HPA_MASK  ~((0xFFFULL << 52) | 0xFFF)
-
-/*
- * 0xff is broadcast, so the max index allowed for physical APIC ID
- * table is 0xfe.  APIC IDs above 0xff are reserved.
- */
-#define AVIC_MAX_PHYSICAL_ID_COUNT     255
-
-#define AVIC_UNACCEL_ACCESS_WRITE_MASK         1
-#define AVIC_UNACCEL_ACCESS_OFFSET_MASK                0xFF0
-#define AVIC_UNACCEL_ACCESS_VECTOR_MASK                0xFFFFFFFF
-
-/* AVIC GATAG is encoded using VM and VCPU IDs */
-#define AVIC_VCPU_ID_BITS              8
-#define AVIC_VCPU_ID_MASK              ((1 << AVIC_VCPU_ID_BITS) - 1)
-
-#define AVIC_VM_ID_BITS                        24
-#define AVIC_VM_ID_NR                  (1 << AVIC_VM_ID_BITS)
-#define AVIC_VM_ID_MASK                        ((1 << AVIC_VM_ID_BITS) - 1)
-
-#define AVIC_GATAG(x, y)               (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
-                                               (y & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG_TO_VMID(x)          ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x)                (x & AVIC_VCPU_ID_MASK)
-
 static bool erratum_383_found __read_mostly;
 
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-       MSR_FS_BASE,
-#endif
-       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-       MSR_TSC_AUX,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_sev_info {
-       bool active;            /* SEV enabled guest */
-       unsigned int asid;      /* ASID used for this guest */
-       unsigned int handle;    /* SEV firmware handle */
-       int fd;                 /* SEV device fd */
-       unsigned long pages_locked; /* Number of pages locked */
-       struct list_head regions_list;  /* List of registered regions */
-};
-
-struct kvm_svm {
-       struct kvm kvm;
-
-       /* Struct members for AVIC */
-       u32 avic_vm_id;
-       struct page *avic_logical_id_table_page;
-       struct page *avic_physical_id_table_page;
-       struct hlist_node hnode;
-
-       struct kvm_sev_info sev_info;
-};
-
-struct kvm_vcpu;
-
-struct nested_state {
-       struct vmcb *hsave;
-       u64 hsave_msr;
-       u64 vm_cr_msr;
-       u64 vmcb;
-
-       /* These are the merged vectors */
-       u32 *msrpm;
-
-       /* gpa pointers to the real vectors */
-       u64 vmcb_msrpm;
-       u64 vmcb_iopm;
-
-       /* A VMEXIT is required but not yet emulated */
-       bool exit_required;
-
-       /* cache for intercepts of the guest */
-       u32 intercept_cr;
-       u32 intercept_dr;
-       u32 intercept_exceptions;
-       u64 intercept;
-
-       /* Nested Paging related state */
-       u64 nested_cr3;
-};
-
-#define MSRPM_OFFSETS  16
-static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 
 /*
  * Set osvw_len to higher value when updated Revision Guides
@@ -188,92 +84,9 @@ static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  */
 static uint64_t osvw_len = 4, osvw_status;
 
-struct vcpu_svm {
-       struct kvm_vcpu vcpu;
-       struct vmcb *vmcb;
-       unsigned long vmcb_pa;
-       struct svm_cpu_data *svm_data;
-       uint64_t asid_generation;
-       uint64_t sysenter_esp;
-       uint64_t sysenter_eip;
-       uint64_t tsc_aux;
-
-       u64 msr_decfg;
-
-       u64 next_rip;
-
-       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-       struct {
-               u16 fs;
-               u16 gs;
-               u16 ldt;
-               u64 gs_base;
-       } host;
-
-       u64 spec_ctrl;
-       /*
-        * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
-        * translated into the appropriate L2_CFG bits on the host to
-        * perform speculative control.
-        */
-       u64 virt_spec_ctrl;
-
-       u32 *msrpm;
-
-       ulong nmi_iret_rip;
-
-       struct nested_state nested;
-
-       bool nmi_singlestep;
-       u64 nmi_singlestep_guest_rflags;
-
-       unsigned int3_injected;
-       unsigned long int3_rip;
-
-       /* cached guest cpuid flags for faster access */
-       bool nrips_enabled      : 1;
-
-       u32 ldr_reg;
-       u32 dfr_reg;
-       struct page *avic_backing_page;
-       u64 *avic_physical_id_cache;
-       bool avic_is_running;
-
-       /*
-        * Per-vcpu list of struct amd_svm_iommu_ir:
-        * This is used mainly to store interrupt remapping information used
-        * when update the vcpu affinity. This avoids the need to scan for
-        * IRTE and try to match ga_tag in the IOMMU driver.
-        */
-       struct list_head ir_list;
-       spinlock_t ir_list_lock;
-
-       /* which host CPU was used for running this vcpu */
-       unsigned int last_cpu;
-};
-
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
-       struct list_head node;  /* Used by SVM for per-vcpu ir_list */
-       void *data;             /* Storing pointer to struct amd_ir_data */
-};
-
-#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK   (0xFF)
-#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT                        31
-#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK               (1 << 31)
-
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK   (0xFFULL)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK       (0xFFFFFFFFFFULL << 12)
-#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK         (1ULL << 62)
-#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK              (1ULL << 63)
-
 static DEFINE_PER_CPU(u64, current_tsc_ratio);
 #define TSC_RATIO_DEFAULT      0x0100000000ULL
 
-#define MSR_INVALID                    0xffffffffU
-
 static const struct svm_direct_access_msrs {
        u32 index;   /* Index of the MSR */
        bool always; /* True if intercept is always on */
@@ -299,9 +112,9 @@ static const struct svm_direct_access_msrs {
 
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static bool npt_enabled = true;
+bool npt_enabled = true;
 #else
-static bool npt_enabled;
+bool npt_enabled;
 #endif
 
 /*
@@ -360,12 +173,6 @@ module_param(npt, int, S_IRUGO);
 static int nested = true;
 module_param(nested, int, S_IRUGO);
 
-/* enable / disable AVIC */
-static int avic;
-#ifdef CONFIG_X86_LOCAL_APIC
-module_param(avic, int, S_IRUGO);
-#endif
-
 /* enable/disable Next RIP Save */
 static int nrips = true;
 module_param(nrips, int, 0444);
@@ -387,303 +194,7 @@ module_param(dump_invalid_vmcb, bool, 0644);
 
 static u8 rsm_ins_bytes[] = "\x0f\xaa";
 
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
-static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
-static inline void avic_post_state_restore(struct kvm_vcpu *vcpu);
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm);
-static int nested_svm_intercept(struct vcpu_svm *svm);
-static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
-                                     bool has_error_code, u32 error_code);
-
-enum {
-       VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
-                           pause filter count */
-       VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
-       VMCB_ASID,       /* ASID */
-       VMCB_INTR,       /* int_ctl, int_vector */
-       VMCB_NPT,        /* npt_en, nCR3, gPAT */
-       VMCB_CR,         /* CR0, CR3, CR4, EFER */
-       VMCB_DR,         /* DR6, DR7 */
-       VMCB_DT,         /* GDT, IDT */
-       VMCB_SEG,        /* CS, DS, SS, ES, CPL */
-       VMCB_CR2,        /* CR2 only */
-       VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
-       VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
-                         * AVIC PHYSICAL_TABLE pointer,
-                         * AVIC LOGICAL_TABLE pointer
-                         */
-       VMCB_DIRTY_MAX,
-};
-
-/* TPR and CR2 are always written before VMRUN */
-#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
-
-#define VMCB_AVIC_APIC_BAR_MASK                0xFFFFFFFFFF000ULL
-
-static int sev_flush_asids(void);
-static DECLARE_RWSEM(sev_deactivate_lock);
-static DEFINE_MUTEX(sev_bitmap_lock);
-static unsigned int max_sev_asid;
-static unsigned int min_sev_asid;
-static unsigned long *sev_asid_bitmap;
-static unsigned long *sev_reclaim_asid_bitmap;
-#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
-
-struct enc_region {
-       struct list_head list;
-       unsigned long npages;
-       struct page **pages;
-       unsigned long uaddr;
-       unsigned long size;
-};
-
-
-static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
-{
-       return container_of(kvm, struct kvm_svm, kvm);
-}
-
-static inline bool svm_sev_enabled(void)
-{
-       return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
-}
-
-static inline bool sev_guest(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_AMD_SEV
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
-       return sev->active;
-#else
-       return false;
-#endif
-}
-
-static inline int sev_get_asid(struct kvm *kvm)
-{
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
-       return sev->asid;
-}
-
-static inline void mark_all_dirty(struct vmcb *vmcb)
-{
-       vmcb->control.clean = 0;
-}
-
-static inline void mark_all_clean(struct vmcb *vmcb)
-{
-       vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
-                              & ~VMCB_ALWAYS_DIRTY_MASK;
-}
-
-static inline void mark_dirty(struct vmcb *vmcb, int bit)
-{
-       vmcb->control.clean &= ~(1 << bit);
-}
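Taken together, the VMCB_* enum, VMCB_ALWAYS_DIRTY_MASK and the three mark_* helpers above implement the VMCB clean-bits protocol: a set bit tells the CPU it may keep its cached copy of that VMCB area, and software clears the bit whenever it modifies the corresponding state; the interrupt-control and CR2 areas are rewritten before every VMRUN, so they are never marked clean. A small illustrative sketch of the same bookkeeping (names shortened, not the kernel definitions):

    #include <stdint.h>
    #include <stdio.h>

    enum { B_INTERCEPTS, B_PERM_MAP, B_ASID, B_INTR, B_CR2, B_MAX };

    #define ALWAYS_DIRTY ((1u << B_INTR) | (1u << B_CR2))

    static uint32_t clean;

    static void mark_all_clean(void) { clean = ((1u << B_MAX) - 1) & ~ALWAYS_DIRTY; }
    static void mark_dirty(int bit)  { clean &= ~(1u << bit); }

    int main(void)
    {
        mark_all_clean();      /* everything may be cached except INTR and CR2 */
        mark_dirty(B_ASID);    /* e.g. after assigning a new ASID */
        printf("clean = %#x\n", clean);
        return 0;
    }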
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
-       return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
-{
-       svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
-       mark_dirty(svm->vmcb, VMCB_AVIC);
-}
-
-static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u64 *entry = svm->avic_physical_id_cache;
-
-       if (!entry)
-               return false;
-
-       return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-}
-
-static void recalc_intercepts(struct vcpu_svm *svm)
-{
-       struct vmcb_control_area *c, *h;
-       struct nested_state *g;
-
-       mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
-       if (!is_guest_mode(&svm->vcpu))
-               return;
-
-       c = &svm->vmcb->control;
-       h = &svm->nested.hsave->control;
-       g = &svm->nested;
-
-       c->intercept_cr = h->intercept_cr;
-       c->intercept_dr = h->intercept_dr;
-       c->intercept_exceptions = h->intercept_exceptions;
-       c->intercept = h->intercept;
-
-       if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
-               /* We only want the cr8 intercept bits of L1 */
-               c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
-               c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);
-
-               /*
-                * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
-                * affect any interrupt we may want to inject; therefore,
-                * interrupt window vmexits are irrelevant to L0.
-                */
-               c->intercept &= ~(1ULL << INTERCEPT_VINTR);
-       }
-
-       /* We don't want to see VMMCALLs from a nested guest */
-       c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
-       c->intercept_cr |= g->intercept_cr;
-       c->intercept_dr |= g->intercept_dr;
-       c->intercept_exceptions |= g->intercept_exceptions;
-       c->intercept |= g->intercept;
-}
-
-static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
-{
-       if (is_guest_mode(&svm->vcpu))
-               return svm->nested.hsave;
-       else
-               return svm->vmcb;
-}
-
-static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_cr |= (1U << bit);
-
-       recalc_intercepts(svm);
-}
-
-static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_cr &= ~(1U << bit);
-
-       recalc_intercepts(svm);
-}
-
-static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       return vmcb->control.intercept_cr & (1U << bit);
-}
-
-static inline void set_dr_intercepts(struct vcpu_svm *svm)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
-               | (1 << INTERCEPT_DR1_READ)
-               | (1 << INTERCEPT_DR2_READ)
-               | (1 << INTERCEPT_DR3_READ)
-               | (1 << INTERCEPT_DR4_READ)
-               | (1 << INTERCEPT_DR5_READ)
-               | (1 << INTERCEPT_DR6_READ)
-               | (1 << INTERCEPT_DR7_READ)
-               | (1 << INTERCEPT_DR0_WRITE)
-               | (1 << INTERCEPT_DR1_WRITE)
-               | (1 << INTERCEPT_DR2_WRITE)
-               | (1 << INTERCEPT_DR3_WRITE)
-               | (1 << INTERCEPT_DR4_WRITE)
-               | (1 << INTERCEPT_DR5_WRITE)
-               | (1 << INTERCEPT_DR6_WRITE)
-               | (1 << INTERCEPT_DR7_WRITE);
-
-       recalc_intercepts(svm);
-}
-
-static inline void clr_dr_intercepts(struct vcpu_svm *svm)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_dr = 0;
-
-       recalc_intercepts(svm);
-}
-
-static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_exceptions |= (1U << bit);
-
-       recalc_intercepts(svm);
-}
-
-static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept_exceptions &= ~(1U << bit);
-
-       recalc_intercepts(svm);
-}
-
-static inline void set_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept |= (1ULL << bit);
-
-       recalc_intercepts(svm);
-}
-
-static inline void clr_intercept(struct vcpu_svm *svm, int bit)
-{
-       struct vmcb *vmcb = get_host_vmcb(svm);
-
-       vmcb->control.intercept &= ~(1ULL << bit);
-
-       recalc_intercepts(svm);
-}
-
-static inline bool is_intercept(struct vcpu_svm *svm, int bit)
-{
-       return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
-}
-
-static inline bool vgif_enabled(struct vcpu_svm *svm)
-{
-       return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
-}
-
-static inline void enable_gif(struct vcpu_svm *svm)
-{
-       if (vgif_enabled(svm))
-               svm->vmcb->control.int_ctl |= V_GIF_MASK;
-       else
-               svm->vcpu.arch.hflags |= HF_GIF_MASK;
-}
-
-static inline void disable_gif(struct vcpu_svm *svm)
-{
-       if (vgif_enabled(svm))
-               svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
-       else
-               svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
-}
-
-static inline bool gif_set(struct vcpu_svm *svm)
-{
-       if (vgif_enabled(svm))
-               return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
-       else
-               return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
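enable_gif(), disable_gif() and gif_set() above keep the guest's global interrupt flag in one of two homes: in the VMCB int_ctl field when hardware virtual GIF is enabled, or in a software hflags bit otherwise. A compact sketch of that dual-location pattern (the mask values here are placeholders, not the real V_GIF_*/HF_GIF_MASK definitions):

    #include <stdbool.h>
    #include <stdio.h>

    #define V_GIF_ENABLE (1u << 25)   /* placeholder bit positions */
    #define V_GIF        (1u << 9)
    #define HF_GIF       (1u << 0)

    struct vcpu { unsigned int int_ctl; unsigned int hflags; };

    static bool vgif_on(const struct vcpu *v) { return v->int_ctl & V_GIF_ENABLE; }

    static void set_gif(struct vcpu *v, bool on)
    {
        if (vgif_on(v)) {
            if (on) v->int_ctl |= V_GIF; else v->int_ctl &= ~V_GIF;
        } else {
            if (on) v->hflags |= HF_GIF; else v->hflags &= ~HF_GIF;
        }
    }

    static bool gif_is_set(const struct vcpu *v)
    {
        return vgif_on(v) ? !!(v->int_ctl & V_GIF) : !!(v->hflags & HF_GIF);
    }

    int main(void)
    {
        struct vcpu v = { .int_ctl = V_GIF_ENABLE, .hflags = 0 };

        set_gif(&v, true);
        printf("gif = %d\n", gif_is_set(&v));
        return 0;
    }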
 
 static unsigned long iopm_base;
 
@@ -696,23 +207,7 @@ struct kvm_ldttss_desc {
        u32 zero1;
 } __attribute__((packed));
 
-struct svm_cpu_data {
-       int cpu;
-
-       u64 asid_generation;
-       u32 max_asid;
-       u32 next_asid;
-       u32 min_asid;
-       struct kvm_ldttss_desc *tss_desc;
-
-       struct page *save_area;
-       struct vmcb *current_vmcb;
-
-       /* index = sev_asid, value = vmcb pointer */
-       struct vmcb **sev_vmcbs;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 
 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 
@@ -720,7 +215,7 @@ static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 #define MSRS_RANGE_SIZE 2048
 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
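msrpm_ranges, MSRS_RANGE_SIZE and MSRS_IN_RANGE define the layout of the MSR permission bitmap: three 2 KiB slices, each covering 8192 MSRs at two bits per MSR (one read-intercept bit, one write-intercept bit), so sixteen MSRs share one 32-bit word. The sketch below reconstructs that addressing from the constants above and from the 16-MSRs-per-u32 masking used later in this file; treat it as illustrative, not the exact body of svm_msrpm_offset():

    #include <stdint.h>
    #include <stdio.h>

    static const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };
    #define RANGE_BYTES   2048
    #define MSRS_IN_RANGE (RANGE_BYTES * 8 / 2)   /* two bits per MSR */

    /* 32-bit word index of an MSR inside the permission bitmap, or -1. */
    static int msrpm_word(uint32_t msr)
    {
        for (unsigned int i = 0; i < 3; i++) {
            if (msr >= ranges[i] && msr < ranges[i] + MSRS_IN_RANGE)
                return (int)((msr - ranges[i]) / 16 + i * (RANGE_BYTES / 4));
        }
        return -1;
    }

    int main(void)
    {
        uint32_t msr = 0xc0000080;            /* EFER, as an example */
        unsigned int bit = 2 * (msr & 0xf);   /* +1 selects the write bit */

        printf("MSR %#x -> word %d, read bit %u\n", msr, msrpm_word(msr), bit);
        return 0;
    }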
 
-static u32 svm_msrpm_offset(u32 msr)
+u32 svm_msrpm_offset(u32 msr)
 {
        u32 offset;
        int i;
@@ -767,7 +262,7 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
 #endif
 }
 
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
        vcpu->arch.efer = efer;
 
@@ -1198,7 +693,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
-static void disable_nmi_singlestep(struct vcpu_svm *svm)
+void disable_nmi_singlestep(struct vcpu_svm *svm)
 {
        svm->nmi_singlestep = false;
 
@@ -1211,97 +706,6 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
        }
 }
 
-/* Note:
- * This hash table is used to map VM_ID to a struct kvm_svm,
- * when handling AMD IOMMU GALOG notification to schedule in
- * a particular vCPU.
- */
-#define SVM_VM_DATA_HASH_BITS  8
-static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
-static u32 next_vm_id = 0;
-static bool next_vm_id_wrapped = 0;
-static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
-
-/* Note:
- * This function is called from IOMMU driver to notify
- * SVM to schedule in a particular vCPU of a particular VM.
- */
-static int avic_ga_log_notifier(u32 ga_tag)
-{
-       unsigned long flags;
-       struct kvm_svm *kvm_svm;
-       struct kvm_vcpu *vcpu = NULL;
-       u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
-       u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
-
-       pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
-       trace_kvm_avic_ga_log(vm_id, vcpu_id);
-
-       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
-       hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
-               if (kvm_svm->avic_vm_id != vm_id)
-                       continue;
-               vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
-               break;
-       }
-       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-
-       /* Note:
-        * At this point, the IOMMU should have already set the pending
-        * bit in the vAPIC backing page. So, we just need to schedule
-        * in the vcpu.
-        */
-       if (vcpu)
-               kvm_vcpu_wake_up(vcpu);
-
-       return 0;
-}
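avic_ga_log_notifier() splits the GA tag reported by the IOMMU into a VM ID, used to find the right struct kvm_svm in the hash table above, and a vCPU ID, used to wake the target vCPU. The exact field widths live in the AVIC_GATAG_* macros, which are not shown in this hunk; the sketch below simply assumes an 8-bit vCPU field with the VM ID above it:

    #include <stdint.h>
    #include <stdio.h>

    #define VCPU_ID_BITS 8                      /* assumed width, see AVIC_GATAG_* */
    #define VCPU_ID_MASK ((1u << VCPU_ID_BITS) - 1)

    static uint32_t make_ga_tag(uint32_t vm_id, uint32_t vcpu_id)
    {
        return (vm_id << VCPU_ID_BITS) | (vcpu_id & VCPU_ID_MASK);
    }

    int main(void)
    {
        uint32_t tag = make_ga_tag(42, 3);

        printf("vm_id = %u, vcpu_id = %u\n", tag >> VCPU_ID_BITS, tag & VCPU_ID_MASK);
        return 0;
    }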
-
-static __init int sev_hardware_setup(void)
-{
-       struct sev_user_data_status *status;
-       int rc;
-
-       /* Maximum number of encrypted guests supported simultaneously */
-       max_sev_asid = cpuid_ecx(0x8000001F);
-
-       if (!max_sev_asid)
-               return 1;
-
-       /* Minimum ASID value that should be used for SEV guest */
-       min_sev_asid = cpuid_edx(0x8000001F);
-
-       /* Initialize SEV ASID bitmaps */
-       sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
-       if (!sev_asid_bitmap)
-               return 1;
-
-       sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
-       if (!sev_reclaim_asid_bitmap)
-               return 1;
-
-       status = kmalloc(sizeof(*status), GFP_KERNEL);
-       if (!status)
-               return 1;
-
-       /*
-        * Check SEV platform status.
-        *
-        * PLATFORM_STATUS can be called in any state. If we fail to query the
-        * platform status, then either the PSP firmware does not support the
-        * SEV feature or the SEV firmware is dead.
-        */
-       rc = sev_platform_status(status, NULL);
-       if (rc)
-               goto err;
-
-       pr_info("SEV supported\n");
-
-err:
-       kfree(status);
-       return rc;
-}
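sev_hardware_setup() above sizes its ASID bitmaps from CPUID leaf 0x8000001F: ECX reports the maximum SEV ASID and EDX the minimum ASID value that should be used for an SEV guest. The same leaf can be probed from user space; a minimal sketch assuming GCC/Clang's <cpuid.h> helper:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 0x8000001f not supported");
            return 1;
        }
        printf("max SEV ASID: %u, min SEV ASID: %u\n", ecx, edx);
        return 0;
    }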
-
 static void grow_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1383,12 +787,8 @@ static void svm_hardware_teardown(void)
 {
        int cpu;
 
-       if (svm_sev_enabled()) {
-               bitmap_free(sev_asid_bitmap);
-               bitmap_free(sev_reclaim_asid_bitmap);
-
-               sev_flush_asids();
-       }
+       if (svm_sev_enabled())
+               sev_hardware_teardown();
 
        for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
@@ -1585,24 +985,6 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        return svm->vmcb->control.tsc_offset;
 }
 
-static void avic_init_vmcb(struct vcpu_svm *svm)
-{
-       struct vmcb *vmcb = svm->vmcb;
-       struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
-       phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
-       phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
-       phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
-
-       vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
-       vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
-       vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
-       vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
-       if (kvm_apicv_activated(svm->vcpu.kvm))
-               vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
-       else
-               vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
-}
-
 static void init_vmcb(struct vcpu_svm *svm)
 {
        struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1762,465 +1144,22 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 }
 
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
-                                      unsigned int index)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
-       u64 *avic_physical_id_table;
-       struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u32 dummy;
+       u32 eax = 1;
 
-       if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
-               return NULL;
+       svm->spec_ctrl = 0;
+       svm->virt_spec_ctrl = 0;
 
-       avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
-       return &avic_physical_id_table[index];
-}
-
-/**
- * Note:
- * AVIC hardware walks the nested page table to check permissions,
- * but does not use the SPA address specified in the leaf page
- * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
- * field of the VMCB. Therefore, we set up the
- * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
- */
-static int avic_update_access_page(struct kvm *kvm, bool activate)
-{
-       int ret = 0;
-
-       mutex_lock(&kvm->slots_lock);
-       /*
-        * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
-        * an APICv mode change, which updates the APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
-        * memory region. So, we need to ensure that kvm->mm == current->mm.
-        */
-       if ((kvm->arch.apic_access_page_done == activate) ||
-           (kvm->mm != current->mm))
-               goto out;
-
-       ret = __x86_set_memory_region(kvm,
-                                     APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
-                                     APIC_DEFAULT_PHYS_BASE,
-                                     activate ? PAGE_SIZE : 0);
-       if (ret)
-               goto out;
-
-       kvm->arch.apic_access_page_done = activate;
-out:
-       mutex_unlock(&kvm->slots_lock);
-       return ret;
-}
-
-static int avic_init_backing_page(struct kvm_vcpu *vcpu)
-{
-       u64 *entry, new_entry;
-       int id = vcpu->vcpu_id;
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
-               return -EINVAL;
-
-       if (!svm->vcpu.arch.apic->regs)
-               return -EINVAL;
-
-       if (kvm_apicv_activated(vcpu->kvm)) {
-               int ret;
-
-               ret = avic_update_access_page(vcpu->kvm, true);
-               if (ret)
-                       return ret;
-       }
-
-       svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
-
-       /* Setting AVIC backing page address in the phy APIC ID table */
-       entry = avic_get_physical_id_entry(vcpu, id);
-       if (!entry)
-               return -EINVAL;
-
-       new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
-                             AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
-                             AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
-       WRITE_ONCE(*entry, new_entry);
-
-       svm->avic_physical_id_cache = entry;
-
-       return 0;
-}
-
-static void sev_asid_free(int asid)
-{
-       struct svm_cpu_data *sd;
-       int cpu, pos;
-
-       mutex_lock(&sev_bitmap_lock);
-
-       pos = asid - 1;
-       __set_bit(pos, sev_reclaim_asid_bitmap);
-
-       for_each_possible_cpu(cpu) {
-               sd = per_cpu(svm_data, cpu);
-               sd->sev_vmcbs[pos] = NULL;
-       }
-
-       mutex_unlock(&sev_bitmap_lock);
-}
-
-static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
-{
-       struct sev_data_decommission *decommission;
-       struct sev_data_deactivate *data;
-
-       if (!handle)
-               return;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               return;
-
-       /* deactivate handle */
-       data->handle = handle;
-
-       /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
-       down_read(&sev_deactivate_lock);
-       sev_guest_deactivate(data, NULL);
-       up_read(&sev_deactivate_lock);
-
-       kfree(data);
-
-       decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
-       if (!decommission)
-               return;
-
-       /* decommission handle */
-       decommission->handle = handle;
-       sev_guest_decommission(decommission, NULL);
-
-       kfree(decommission);
-}
-
-static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
-                                   unsigned long ulen, unsigned long *n,
-                                   int write)
-{
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       unsigned long npages, npinned, size;
-       unsigned long locked, lock_limit;
-       struct page **pages;
-       unsigned long first, last;
-
-       if (ulen == 0 || uaddr + ulen < uaddr)
-               return NULL;
-
-       /* Calculate number of pages. */
-       first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
-       last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
-       npages = (last - first + 1);
-
-       locked = sev->pages_locked + npages;
-       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-       if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-               pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
-               return NULL;
-       }
-
-       /* Avoid using vmalloc for smaller buffers. */
-       size = npages * sizeof(struct page *);
-       if (size > PAGE_SIZE)
-               pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
-                                 PAGE_KERNEL);
-       else
-               pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
-
-       if (!pages)
-               return NULL;
-
-       /* Pin the user virtual address. */
-       npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
-       if (npinned != npages) {
-               pr_err("SEV: Failure locking %lu pages.\n", npages);
-               goto err;
-       }
-
-       *n = npages;
-       sev->pages_locked = locked;
-
-       return pages;
-
-err:
-       if (npinned > 0)
-               release_pages(pages, npinned);
-
-       kvfree(pages);
-       return NULL;
-}
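sev_pin_memory() converts an arbitrary (uaddr, ulen) range into a page count by taking the page frame numbers of the first and last byte, which correctly handles ranges that straddle page boundaries, and it rejects a zero-length or wrapping range up front. The same arithmetic as a stand-alone check (4 KiB pages assumed):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    static uint64_t range_pages(uint64_t uaddr, uint64_t ulen)
    {
        uint64_t first, last;

        if (ulen == 0 || uaddr + ulen < uaddr)   /* empty or wrapping range */
            return 0;

        first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
        last  = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
        return last - first + 1;
    }

    int main(void)
    {
        /* A 5-byte buffer crossing a page boundary still pins two pages. */
        printf("%llu\n", (unsigned long long)range_pages(0x1ffe, 5));
        return 0;
    }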
-
-static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
-                            unsigned long npages)
-{
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
-       release_pages(pages, npages);
-       kvfree(pages);
-       sev->pages_locked -= npages;
-}
-
-static void sev_clflush_pages(struct page *pages[], unsigned long npages)
-{
-       uint8_t *page_virtual;
-       unsigned long i;
-
-       if (npages == 0 || pages == NULL)
-               return;
-
-       for (i = 0; i < npages; i++) {
-               page_virtual = kmap_atomic(pages[i]);
-               clflush_cache_range(page_virtual, PAGE_SIZE);
-               kunmap_atomic(page_virtual);
-       }
-}
-
-static void __unregister_enc_region_locked(struct kvm *kvm,
-                                          struct enc_region *region)
-{
-       sev_unpin_memory(kvm, region->pages, region->npages);
-       list_del(&region->list);
-       kfree(region);
-}
-
-static void sev_vm_destroy(struct kvm *kvm)
-{
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct list_head *head = &sev->regions_list;
-       struct list_head *pos, *q;
-
-       if (!sev_guest(kvm))
-               return;
-
-       mutex_lock(&kvm->lock);
-
-       /*
-        * Ensure that all guest tagged cache entries are flushed before
-        * releasing the pages back to the system for use. CLFLUSH will
-        * not do this, so issue a WBINVD.
-        */
-       wbinvd_on_all_cpus();
-
-       /*
-        * If userspace was terminated before unregistering the memory regions,
-        * then let's unpin all the registered memory.
-        */
-       if (!list_empty(head)) {
-               list_for_each_safe(pos, q, head) {
-                       __unregister_enc_region_locked(kvm,
-                               list_entry(pos, struct enc_region, list));
-               }
-       }
-
-       mutex_unlock(&kvm->lock);
-
-       sev_unbind_asid(kvm, sev->handle);
-       sev_asid_free(sev->asid);
-}
-
-static void avic_vm_destroy(struct kvm *kvm)
-{
-       unsigned long flags;
-       struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-
-       if (!avic)
-               return;
-
-       if (kvm_svm->avic_logical_id_table_page)
-               __free_page(kvm_svm->avic_logical_id_table_page);
-       if (kvm_svm->avic_physical_id_table_page)
-               __free_page(kvm_svm->avic_physical_id_table_page);
-
-       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
-       hash_del(&kvm_svm->hnode);
-       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-}
-
-static void svm_vm_destroy(struct kvm *kvm)
-{
-       avic_vm_destroy(kvm);
-       sev_vm_destroy(kvm);
-}
-
-static int avic_vm_init(struct kvm *kvm)
-{
-       unsigned long flags;
-       int err = -ENOMEM;
-       struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-       struct kvm_svm *k2;
-       struct page *p_page;
-       struct page *l_page;
-       u32 vm_id;
-
-       if (!avic)
-               return 0;
-
-       /* Allocating physical APIC ID table (4KB) */
-       p_page = alloc_page(GFP_KERNEL_ACCOUNT);
-       if (!p_page)
-               goto free_avic;
-
-       kvm_svm->avic_physical_id_table_page = p_page;
-       clear_page(page_address(p_page));
-
-       /* Allocating logical APIC ID table (4KB) */
-       l_page = alloc_page(GFP_KERNEL_ACCOUNT);
-       if (!l_page)
-               goto free_avic;
-
-       kvm_svm->avic_logical_id_table_page = l_page;
-       clear_page(page_address(l_page));
-
-       spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
- again:
-       vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
-       if (vm_id == 0) { /* id is 1-based, zero is not okay */
-               next_vm_id_wrapped = 1;
-               goto again;
-       }
-       /* Is it still in use? Only possible if wrapped at least once */
-       if (next_vm_id_wrapped) {
-               hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
-                       if (k2->avic_vm_id == vm_id)
-                               goto again;
-               }
-       }
-       kvm_svm->avic_vm_id = vm_id;
-       hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
-       spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-
-       return 0;
-
-free_avic:
-       avic_vm_destroy(kvm);
-       return err;
-}
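avic_vm_init() above hands every VM a non-zero AVIC VM ID bounded by AVIC_VM_ID_MASK: it increments a global counter, skips zero, and once the counter has wrapped it also checks the hash table so an ID still in use is never handed out twice. A simplified sketch of that allocation policy (it always probes for collisions instead of only after wrap-around, and the ID width is an assumption since AVIC_VM_ID_MASK is not shown here):

    #include <stdbool.h>
    #include <stdio.h>

    #define VM_ID_MASK 0xFFFFFFu      /* assumed width */

    static unsigned int next_vm_id;

    /* in_use() stands in for the hash-table lookup done under svm_vm_data_hash_lock. */
    static unsigned int alloc_vm_id(bool (*in_use)(unsigned int))
    {
        unsigned int id;

        do {
            id = next_vm_id = (next_vm_id + 1) & VM_ID_MASK;
        } while (id == 0 || in_use(id));
        return id;
    }

    static bool never_used(unsigned int id) { (void)id; return false; }

    int main(void)
    {
        printf("first id = %u\n", alloc_vm_id(never_used));
        return 0;
    }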
-
-static int svm_vm_init(struct kvm *kvm)
-{
-       if (avic) {
-               int ret = avic_vm_init(kvm);
-               if (ret)
-                       return ret;
-       }
-
-       kvm_apicv_init(kvm, avic);
-       return 0;
-}
-
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
-{
-       int ret = 0;
-       unsigned long flags;
-       struct amd_svm_iommu_ir *ir;
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm))
-               return 0;
-
-       /*
-        * Here, we go through the per-vcpu ir_list to update all existing
-        * interrupt remapping table entries targeting this vcpu.
-        */
-       spin_lock_irqsave(&svm->ir_list_lock, flags);
-
-       if (list_empty(&svm->ir_list))
-               goto out;
-
-       list_for_each_entry(ir, &svm->ir_list, node) {
-               ret = amd_iommu_update_ga(cpu, r, ir->data);
-               if (ret)
-                       break;
-       }
-out:
-       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-       return ret;
-}
-
-static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-       u64 entry;
-       /* ID = 0xff (broadcast), ID > 0xff (reserved) */
-       int h_physical_id = kvm_cpu_get_apicid(cpu);
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       /*
-        * Since the host physical APIC id is 8 bits,
-        * we can support host APIC IDs up to 255.
-        */
-       if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
-               return;
-
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
-       entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       if (svm->avic_is_running)
-               entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
-       avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
-                                       svm->avic_is_running);
-}
-
-static void avic_vcpu_put(struct kvm_vcpu *vcpu)
-{
-       u64 entry;
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       entry = READ_ONCE(*(svm->avic_physical_id_cache));
-       if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
-               avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
-
-       entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-       WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
-}
-
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->avic_is_running = is_run;
-       if (is_run)
-               avic_vcpu_load(vcpu, vcpu->cpu);
-       else
-               avic_vcpu_put(vcpu);
-}
-
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 dummy;
-       u32 eax = 1;
-
-       svm->spec_ctrl = 0;
-       svm->virt_spec_ctrl = 0;
-
-       if (!init_event) {
-               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                          MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
-       }
-       init_vmcb(svm);
+       if (!init_event) {
+               svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+                                          MSR_IA32_APICBASE_ENABLE;
+               if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+                       svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+       }
+       init_vmcb(svm);
 
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
        kvm_rdx_write(vcpu, eax);
@@ -2229,25 +1168,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
-static int avic_init_vcpu(struct vcpu_svm *svm)
-{
-       int ret;
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-
-       if (!avic || !irqchip_in_kernel(vcpu->kvm))
-               return 0;
-
-       ret = avic_init_backing_page(&svm->vcpu);
-       if (ret)
-               return ret;
-
-       INIT_LIST_HEAD(&svm->ir_list);
-       spin_lock_init(&svm->ir_list_lock);
-       svm->dfr_reg = APIC_DFR_FLAT;
-
-       return ret;
-}
-
 static int svm_create_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm;
@@ -2404,18 +1324,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 }
 
-static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
-{
-       avic_set_running(vcpu, false);
-}
-
-static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
-       if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
-               kvm_vcpu_update_apicv(vcpu);
-       avic_set_running(vcpu, true);
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -2652,7 +1560,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
        }
 }
 
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -2686,7 +1594,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        update_cr0_intercept(svm);
 }
 
-static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
@@ -3022,4256 +1930,1881 @@ static int vmmcall_interception(struct vcpu_svm *svm)
        return kvm_emulate_hypercall(&svm->vcpu);
 }
 
-static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return svm->nested.nested_cr3;
-}
-
-static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+static int vmload_interception(struct vcpu_svm *svm)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u64 cr3 = svm->nested.nested_cr3;
-       u64 pdpte;
+       struct vmcb *nested_vmcb;
+       struct kvm_host_map map;
        int ret;
 
-       ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
-                                      offset_in_page(cr3) + index * 8, 8);
-       if (ret)
-               return 0;
-       return pdpte;
-}
-
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
-                                      struct x86_exception *fault)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
+       if (nested_svm_check_permissions(svm))
+               return 1;
 
-       if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
-               /*
-                * TODO: track the cause of the nested page fault, and
-                * correctly fill in the high bits of exit_info_1.
-                */
-               svm->vmcb->control.exit_code = SVM_EXIT_NPF;
-               svm->vmcb->control.exit_code_hi = 0;
-               svm->vmcb->control.exit_info_1 = (1ULL << 32);
-               svm->vmcb->control.exit_info_2 = fault->address;
+       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+       if (ret) {
+               if (ret == -EINVAL)
+                       kvm_inject_gp(&svm->vcpu, 0);
+               return 1;
        }
 
-       svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
-       svm->vmcb->control.exit_info_1 |= fault->error_code;
+       nested_vmcb = map.hva;
 
-       /*
-        * The present bit is always zero for page structure faults on real
-        * hardware.
-        */
-       if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
-               svm->vmcb->control.exit_info_1 &= ~1;
-
-       nested_svm_vmexit(svm);
-}
+       ret = kvm_skip_emulated_instruction(&svm->vcpu);
 
-static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
-{
-       WARN_ON(mmu_is_nested(vcpu));
+       nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+       kvm_vcpu_unmap(&svm->vcpu, &map, true);
 
-       vcpu->arch.mmu = &vcpu->arch.guest_mmu;
-       kvm_init_shadow_mmu(vcpu);
-       vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
-       vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
-       vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
-       vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
-       reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
-       vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+       return ret;
 }
 
-static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+static int vmsave_interception(struct vcpu_svm *svm)
 {
-       vcpu->arch.mmu = &vcpu->arch.root_mmu;
-       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
-}
+       struct vmcb *nested_vmcb;
+       struct kvm_host_map map;
+       int ret;
 
-static int nested_svm_check_permissions(struct vcpu_svm *svm)
-{
-       if (!(svm->vcpu.arch.efer & EFER_SVME) ||
-           !is_paging(&svm->vcpu)) {
-               kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       if (nested_svm_check_permissions(svm))
                return 1;
-       }
 
-       if (svm->vmcb->save.cpl) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+       if (ret) {
+               if (ret == -EINVAL)
+                       kvm_inject_gp(&svm->vcpu, 0);
                return 1;
        }
 
-       return 0;
+       nested_vmcb = map.hva;
+
+       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+       nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+
+       return ret;
 }
 
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
-                                     bool has_error_code, u32 error_code)
+static int vmrun_interception(struct vcpu_svm *svm)
 {
-       int vmexit;
+       if (nested_svm_check_permissions(svm))
+               return 1;
 
-       if (!is_guest_mode(&svm->vcpu))
-               return 0;
+       return nested_svm_vmrun(svm);
+}
 
-       vmexit = nested_svm_intercept(svm);
-       if (vmexit != NESTED_EXIT_DONE)
-               return 0;
+static int stgi_interception(struct vcpu_svm *svm)
+{
+       int ret;
 
-       svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
-       svm->vmcb->control.exit_code_hi = 0;
-       svm->vmcb->control.exit_info_1 = error_code;
+       if (nested_svm_check_permissions(svm))
+               return 1;
 
        /*
-        * EXITINFO2 is undefined for all exception intercepts other
-        * than #PF.
+        * If VGIF is enabled, the STGI intercept is only added to
+        * detect the opening of the SMI/NMI window; remove it now.
         */
-       if (svm->vcpu.arch.exception.nested_apf)
-               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
-       else if (svm->vcpu.arch.exception.has_payload)
-               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
-       else
-               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-
-       svm->nested.exit_required = true;
-       return vmexit;
-}
+       if (vgif_enabled(svm))
+               clr_intercept(svm, INTERCEPT_STGI);
 
-static void nested_svm_intr(struct vcpu_svm *svm)
-{
-       svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
-       svm->vmcb->control.exit_info_1 = 0;
-       svm->vmcb->control.exit_info_2 = 0;
+       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 
-       /* nested_svm_vmexit() gets called afterwards from handle_exit */
-       svm->nested.exit_required = true;
-       trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-}
+       enable_gif(svm);
 
-static bool nested_exit_on_intr(struct vcpu_svm *svm)
-{
-       return (svm->nested.intercept & 1ULL);
+       return ret;
 }
 
-static int svm_check_nested_events(struct kvm_vcpu *vcpu)
+static int clgi_interception(struct vcpu_svm *svm)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       bool block_nested_events =
-               kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required;
-
-       if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) {
-               if (block_nested_events)
-                       return -EBUSY;
-               nested_svm_intr(svm);
-               return 0;
-       }
+       int ret;
 
-       return 0;
-}
+       if (nested_svm_check_permissions(svm))
+               return 1;
 
-/* This function returns true if it is safe to enable the nmi window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
-{
-       if (!is_guest_mode(&svm->vcpu))
-               return true;
+       ret = kvm_skip_emulated_instruction(&svm->vcpu);
 
-       if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
-               return true;
+       disable_gif(svm);
 
-       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
-       svm->nested.exit_required = true;
+       /* After a CLGI no interrupts should come */
+       if (!kvm_vcpu_apicv_active(&svm->vcpu))
+               svm_clear_vintr(svm);
 
-       return false;
+       return ret;
 }
 
-static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+static int invlpga_interception(struct vcpu_svm *svm)
 {
-       unsigned port, size, iopm_len;
-       u16 val, mask;
-       u8 start_bit;
-       u64 gpa;
-
-       if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
-               return NESTED_EXIT_HOST;
+       struct kvm_vcpu *vcpu = &svm->vcpu;
 
-       port = svm->vmcb->control.exit_info_1 >> 16;
-       size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
-               SVM_IOIO_SIZE_SHIFT;
-       gpa  = svm->nested.vmcb_iopm + (port / 8);
-       start_bit = port % 8;
-       iopm_len = (start_bit + size > 8) ? 2 : 1;
-       mask = (0xf >> (4 - size)) << start_bit;
-       val = 0;
+       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
+                         kvm_rax_read(&svm->vcpu));
 
-       if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
-               return NESTED_EXIT_DONE;
+       /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+       kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
 
-       return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+       return kvm_skip_emulated_instruction(&svm->vcpu);
 }
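The removed nested_svm_intercept_ioio() above consults L1's I/O permission bitmap: one bit per port, so an access of 'size' bytes tests size consecutive bits starting at port % 8 and may straddle a byte boundary, hence the one- or two-byte read and the (0xf >> (4 - size)) << start_bit mask. A self-contained version of that bit test, with the bitmap held in local memory instead of being read from L1's guest physical memory:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Does an access of 'size' bytes (1, 2 or 4) at 'port' hit a set bit? */
    static bool ioio_intercepted(const uint8_t *iopm, unsigned int port, unsigned int size)
    {
        unsigned int start_bit = port % 8;
        uint16_t val = iopm[port / 8];
        uint16_t mask = (0xf >> (4 - size)) << start_bit;

        if (start_bit + size > 8)                /* bits straddle a byte boundary */
            val |= (uint16_t)iopm[port / 8 + 1] << 8;

        return val & mask;
    }

    int main(void)
    {
        static uint8_t iopm[8192 + 1];           /* one bit per port for 64K ports */

        iopm[0x70 / 8] |= 1 << (0x70 % 8);       /* mark port 0x70 intercepted */
        printf("%d %d\n", ioio_intercepted(iopm, 0x70, 1),
                          ioio_intercepted(iopm, 0x71, 2));
        return 0;
    }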
 
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int skinit_interception(struct vcpu_svm *svm)
 {
-       u32 offset, msr, value;
-       int write, mask;
+       trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
 
-       if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
-               return NESTED_EXIT_HOST;
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
+}
 
-       msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-       offset = svm_msrpm_offset(msr);
-       write  = svm->vmcb->control.exit_info_1 & 1;
-       mask   = 1 << ((2 * (msr & 0xf)) + write);
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+       return kvm_emulate_wbinvd(&svm->vcpu);
+}
 
-       if (offset == MSR_INVALID)
-               return NESTED_EXIT_DONE;
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+       u32 index = kvm_rcx_read(&svm->vcpu);
 
-       /* Offset is in 32 bit units but we need it in 8 bit units */
-       offset *= 4;
+       if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+               return kvm_skip_emulated_instruction(&svm->vcpu);
+       }
 
-       if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
-               return NESTED_EXIT_DONE;
+       return 1;
+}
 
-       return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+static int rdpru_interception(struct vcpu_svm *svm)
+{
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
 }
 
-/* DB exceptions for our internal use must not cause vmexit */
-static int nested_svm_intercept_db(struct vcpu_svm *svm)
+static int task_switch_interception(struct vcpu_svm *svm)
 {
-       unsigned long dr6;
+       u16 tss_selector;
+       int reason;
+       int int_type = svm->vmcb->control.exit_int_info &
+               SVM_EXITINTINFO_TYPE_MASK;
+       int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+       uint32_t type =
+               svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+       uint32_t idt_v =
+               svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+       bool has_error_code = false;
+       u32 error_code = 0;
 
-       /* if we're not singlestepping, it's not ours */
-       if (!svm->nmi_singlestep)
-               return NESTED_EXIT_DONE;
+       tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
-       /* if it's not a singlestep exception, it's not ours */
-       if (kvm_get_dr(&svm->vcpu, 6, &dr6))
-               return NESTED_EXIT_DONE;
-       if (!(dr6 & DR6_BS))
-               return NESTED_EXIT_DONE;
+       if (svm->vmcb->control.exit_info_2 &
+           (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+               reason = TASK_SWITCH_IRET;
+       else if (svm->vmcb->control.exit_info_2 &
+                (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+               reason = TASK_SWITCH_JMP;
+       else if (idt_v)
+               reason = TASK_SWITCH_GATE;
+       else
+               reason = TASK_SWITCH_CALL;
 
-       /* if the guest is singlestepping, it should get the vmexit */
-       if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
-               disable_nmi_singlestep(svm);
-               return NESTED_EXIT_DONE;
+       if (reason == TASK_SWITCH_GATE) {
+               switch (type) {
+               case SVM_EXITINTINFO_TYPE_NMI:
+                       svm->vcpu.arch.nmi_injected = false;
+                       break;
+               case SVM_EXITINTINFO_TYPE_EXEPT:
+                       if (svm->vmcb->control.exit_info_2 &
+                           (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+                               has_error_code = true;
+                               error_code =
+                                       (u32)svm->vmcb->control.exit_info_2;
+                       }
+                       kvm_clear_exception_queue(&svm->vcpu);
+                       break;
+               case SVM_EXITINTINFO_TYPE_INTR:
+                       kvm_clear_interrupt_queue(&svm->vcpu);
+                       break;
+               default:
+                       break;
+               }
        }
 
-       /* it's ours, the nested hypervisor must not see this one */
-       return NESTED_EXIT_HOST;
-}
-
-static int nested_svm_exit_special(struct vcpu_svm *svm)
-{
-       u32 exit_code = svm->vmcb->control.exit_code;
-
-       switch (exit_code) {
-       case SVM_EXIT_INTR:
-       case SVM_EXIT_NMI:
-       case SVM_EXIT_EXCP_BASE + MC_VECTOR:
-               return NESTED_EXIT_HOST;
-       case SVM_EXIT_NPF:
-               /* For now we are always handling NPFs when using them */
-               if (npt_enabled)
-                       return NESTED_EXIT_HOST;
-               break;
-       case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-               /* When we're shadowing, trap PFs, but not async PF */
-               if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
-                       return NESTED_EXIT_HOST;
-               break;
-       default:
-               break;
+       if (reason != TASK_SWITCH_GATE ||
+           int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+           (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+            (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+               if (!skip_emulated_instruction(&svm->vcpu))
+                       return 0;
        }
 
-       return NESTED_EXIT_CONTINUE;
+       if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+               int_vec = -1;
+
+       return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+                              has_error_code, error_code);
 }
 
-static int nested_svm_intercept(struct vcpu_svm *svm)
+static int cpuid_interception(struct vcpu_svm *svm)
 {
-       u32 exit_code = svm->vmcb->control.exit_code;
-       int vmexit = NESTED_EXIT_HOST;
-
-       switch (exit_code) {
-       case SVM_EXIT_MSR:
-               vmexit = nested_svm_exit_handled_msr(svm);
-               break;
-       case SVM_EXIT_IOIO:
-               vmexit = nested_svm_intercept_ioio(svm);
-               break;
-       case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
-               u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
-               if (svm->nested.intercept_cr & bit)
-                       vmexit = NESTED_EXIT_DONE;
-               break;
-       }
-       case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
-               u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
-               if (svm->nested.intercept_dr & bit)
-                       vmexit = NESTED_EXIT_DONE;
-               break;
-       }
-       case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
-               u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
-               if (svm->nested.intercept_exceptions & excp_bits) {
-                       if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
-                               vmexit = nested_svm_intercept_db(svm);
-                       else
-                               vmexit = NESTED_EXIT_DONE;
-               }
-               /* async page fault always cause vmexit */
-               else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
-                        svm->vcpu.arch.exception.nested_apf != 0)
-                       vmexit = NESTED_EXIT_DONE;
-               break;
-       }
-       case SVM_EXIT_ERR: {
-               vmexit = NESTED_EXIT_DONE;
-               break;
-       }
-       default: {
-               u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
-               if (svm->nested.intercept & exit_bits)
-                       vmexit = NESTED_EXIT_DONE;
-       }
-       }
+       return kvm_emulate_cpuid(&svm->vcpu);
+}
 
-       return vmexit;
+static int iret_interception(struct vcpu_svm *svm)
+{
+       ++svm->vcpu.stat.nmi_window_exits;
+       clr_intercept(svm, INTERCEPT_IRET);
+       svm->vcpu.arch.hflags |= HF_IRET_MASK;
+       svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       return 1;
 }
 
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int invlpg_interception(struct vcpu_svm *svm)
 {
-       int vmexit;
+       if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+               return kvm_emulate_instruction(&svm->vcpu, 0);
 
-       vmexit = nested_svm_intercept(svm);
+       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+       return kvm_skip_emulated_instruction(&svm->vcpu);
+}
 
-       if (vmexit == NESTED_EXIT_DONE)
-               nested_svm_vmexit(svm);
+static int emulate_on_interception(struct vcpu_svm *svm)
+{
+       return kvm_emulate_instruction(&svm->vcpu, 0);
+}
 
-       return vmexit;
-}
-
-static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
-{
-       struct vmcb_control_area *dst  = &dst_vmcb->control;
-       struct vmcb_control_area *from = &from_vmcb->control;
-
-       dst->intercept_cr         = from->intercept_cr;
-       dst->intercept_dr         = from->intercept_dr;
-       dst->intercept_exceptions = from->intercept_exceptions;
-       dst->intercept            = from->intercept;
-       dst->iopm_base_pa         = from->iopm_base_pa;
-       dst->msrpm_base_pa        = from->msrpm_base_pa;
-       dst->tsc_offset           = from->tsc_offset;
-       dst->asid                 = from->asid;
-       dst->tlb_ctl              = from->tlb_ctl;
-       dst->int_ctl              = from->int_ctl;
-       dst->int_vector           = from->int_vector;
-       dst->int_state            = from->int_state;
-       dst->exit_code            = from->exit_code;
-       dst->exit_code_hi         = from->exit_code_hi;
-       dst->exit_info_1          = from->exit_info_1;
-       dst->exit_info_2          = from->exit_info_2;
-       dst->exit_int_info        = from->exit_int_info;
-       dst->exit_int_info_err    = from->exit_int_info_err;
-       dst->nested_ctl           = from->nested_ctl;
-       dst->event_inj            = from->event_inj;
-       dst->event_inj_err        = from->event_inj_err;
-       dst->nested_cr3           = from->nested_cr3;
-       dst->virt_ext              = from->virt_ext;
-       dst->pause_filter_count   = from->pause_filter_count;
-       dst->pause_filter_thresh  = from->pause_filter_thresh;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
-       int rc;
-       struct vmcb *nested_vmcb;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
-       struct kvm_host_map map;
+static int rsm_interception(struct vcpu_svm *svm)
+{
+       return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
+}
 
-       trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
-                                      vmcb->control.exit_info_1,
-                                      vmcb->control.exit_info_2,
-                                      vmcb->control.exit_int_info,
-                                      vmcb->control.exit_int_info_err,
-                                      KVM_ISA_SVM);
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+       int err;
 
-       rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
-       if (rc) {
-               if (rc == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
+       if (!nrips)
+               return emulate_on_interception(svm);
 
-       nested_vmcb = map.hva;
+       err = kvm_rdpmc(&svm->vcpu);
+       return kvm_complete_insn_gp(&svm->vcpu, err);
+}
 
-       /* Exit Guest-Mode */
-       leave_guest_mode(&svm->vcpu);
-       svm->nested.vmcb = 0;
+static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+                                           unsigned long val)
+{
+       unsigned long cr0 = svm->vcpu.arch.cr0;
+       bool ret = false;
+       u64 intercept;
 
-       /* Give the current vmcb to the guest */
-       disable_gif(svm);
+       intercept = svm->nested.intercept;
 
-       nested_vmcb->save.es     = vmcb->save.es;
-       nested_vmcb->save.cs     = vmcb->save.cs;
-       nested_vmcb->save.ss     = vmcb->save.ss;
-       nested_vmcb->save.ds     = vmcb->save.ds;
-       nested_vmcb->save.gdtr   = vmcb->save.gdtr;
-       nested_vmcb->save.idtr   = vmcb->save.idtr;
-       nested_vmcb->save.efer   = svm->vcpu.arch.efer;
-       nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
-       nested_vmcb->save.cr2    = vmcb->save.cr2;
-       nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
-       nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
-       nested_vmcb->save.rip    = vmcb->save.rip;
-       nested_vmcb->save.rsp    = vmcb->save.rsp;
-       nested_vmcb->save.rax    = vmcb->save.rax;
-       nested_vmcb->save.dr7    = vmcb->save.dr7;
-       nested_vmcb->save.dr6    = vmcb->save.dr6;
-       nested_vmcb->save.cpl    = vmcb->save.cpl;
-
-       nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
-       nested_vmcb->control.int_vector        = vmcb->control.int_vector;
-       nested_vmcb->control.int_state         = vmcb->control.int_state;
-       nested_vmcb->control.exit_code         = vmcb->control.exit_code;
-       nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
-       nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
-       nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
-       nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
-       nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
-
-       if (svm->nrips_enabled)
-               nested_vmcb->control.next_rip  = vmcb->control.next_rip;
+       if (!is_guest_mode(&svm->vcpu) ||
+           (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+               return false;
 
-       /*
-        * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
-        * to make sure that we do not lose injected events. So check event_inj
-        * here and copy it to exit_int_info if it is valid.
-        * Exit_int_info and event_inj can't be both valid because the case
-        * below only happens on a VMRUN instruction intercept which has
-        * no valid exit_int_info set.
-        */
-       if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
-               struct vmcb_control_area *nc = &nested_vmcb->control;
+       cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+       val &= ~SVM_CR0_SELECTIVE_MASK;
 
-               nc->exit_int_info     = vmcb->control.event_inj;
-               nc->exit_int_info_err = vmcb->control.event_inj_err;
+       if (cr0 ^ val) {
+               svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+               ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
        }
 
-       nested_vmcb->control.tlb_ctl           = 0;
-       nested_vmcb->control.event_inj         = 0;
-       nested_vmcb->control.event_inj_err     = 0;
+       return ret;
+}
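check_selective_cr0_intercepted() implements the INTERCEPT_SELECTIVE_CR0 semantics: a MOV to CR0 only needs to be reflected to L1 if it changes bits outside SVM_CR0_SELECTIVE_MASK, so both the old and the new value are stripped of those bits and compared. The core test is a masked XOR; a minimal sketch, assuming the selective mask covers CR0.TS and CR0.MP:

    #include <stdbool.h>
    #include <stdio.h>

    #define CR0_MP (1ul << 1)
    #define CR0_TS (1ul << 3)
    #define SELECTIVE_MASK (CR0_TS | CR0_MP)   /* assumption for this sketch */

    /* Would this MOV-to-CR0 escape a selective intercept and need a full exit? */
    static bool needs_selective_exit(unsigned long old_cr0, unsigned long new_cr0)
    {
        return ((old_cr0 ^ new_cr0) & ~SELECTIVE_MASK) != 0;
    }

    int main(void)
    {
        unsigned long cr0 = 0x80000011;   /* PG | ET | PE */

        printf("%d\n", needs_selective_exit(cr0, cr0 | CR0_TS));   /* TS only -> 0 */
        printf("%d\n", needs_selective_exit(cr0, cr0 & ~1ul));     /* PE cleared -> 1 */
        return 0;
    }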
 
-       nested_vmcb->control.pause_filter_count =
-               svm->vmcb->control.pause_filter_count;
-       nested_vmcb->control.pause_filter_thresh =
-               svm->vmcb->control.pause_filter_thresh;
+#define CR_VALID (1ULL << 63)
 
-       /* We always set V_INTR_MASKING and remember the old value in hflags */
-       if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-               nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+static int cr_interception(struct vcpu_svm *svm)
+{
+       int reg, cr;
+       unsigned long val;
+       int err;
 
-       /* Restore the original control entries */
-       copy_vmcb_control_area(vmcb, hsave);
+       if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+               return emulate_on_interception(svm);
 
-       svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
-
-       svm->nested.nested_cr3 = 0;
-
-       /* Restore selected save entries */
-       svm->vmcb->save.es = hsave->save.es;
-       svm->vmcb->save.cs = hsave->save.cs;
-       svm->vmcb->save.ss = hsave->save.ss;
-       svm->vmcb->save.ds = hsave->save.ds;
-       svm->vmcb->save.gdtr = hsave->save.gdtr;
-       svm->vmcb->save.idtr = hsave->save.idtr;
-       kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
-       svm_set_efer(&svm->vcpu, hsave->save.efer);
-       svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
-       svm_set_cr4(&svm->vcpu, hsave->save.cr4);
-       if (npt_enabled) {
-               svm->vmcb->save.cr3 = hsave->save.cr3;
-               svm->vcpu.arch.cr3 = hsave->save.cr3;
-       } else {
-               (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
-       }
-       kvm_rax_write(&svm->vcpu, hsave->save.rax);
-       kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
-       kvm_rip_write(&svm->vcpu, hsave->save.rip);
-       svm->vmcb->save.dr7 = 0;
-       svm->vmcb->save.cpl = 0;
-       svm->vmcb->control.exit_int_info = 0;
-
-       mark_all_dirty(svm->vmcb);
-
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+               return emulate_on_interception(svm);
 
-       nested_svm_uninit_mmu_context(&svm->vcpu);
-       kvm_mmu_reset_context(&svm->vcpu);
-       kvm_mmu_load(&svm->vcpu);
+       reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+       if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+               cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+       else
+               cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
 
-       /*
-        * Drop what we picked up for L2 via svm_complete_interrupts() so it
-        * doesn't end up in L1.
-        */
-       svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       err = 0;
+       if (cr >= 16) { /* mov to cr */
+               cr -= 16;
+               val = kvm_register_read(&svm->vcpu, reg);
+               switch (cr) {
+               case 0:
+                       if (!check_selective_cr0_intercepted(svm, val))
+                               err = kvm_set_cr0(&svm->vcpu, val);
+                       else
+                               return 1;
 
-       return 0;
+                       break;
+               case 3:
+                       err = kvm_set_cr3(&svm->vcpu, val);
+                       break;
+               case 4:
+                       err = kvm_set_cr4(&svm->vcpu, val);
+                       break;
+               case 8:
+                       err = kvm_set_cr8(&svm->vcpu, val);
+                       break;
+               default:
+                       WARN(1, "unhandled write to CR%d", cr);
+                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       return 1;
+               }
+       } else { /* mov from cr */
+               switch (cr) {
+               case 0:
+                       val = kvm_read_cr0(&svm->vcpu);
+                       break;
+               case 2:
+                       val = svm->vcpu.arch.cr2;
+                       break;
+               case 3:
+                       val = kvm_read_cr3(&svm->vcpu);
+                       break;
+               case 4:
+                       val = kvm_read_cr4(&svm->vcpu);
+                       break;
+               case 8:
+                       val = kvm_get_cr8(&svm->vcpu);
+                       break;
+               default:
+                       WARN(1, "unhandled read from CR%d", cr);
+                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+                       return 1;
+               }
+               kvm_register_write(&svm->vcpu, reg, val);
+       }
+       return kvm_complete_insn_gp(&svm->vcpu, err);
 }
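With decode assists available, cr_interception() never invokes the emulator: the CR number falls out of the exit code (reads occupy 0x00-0x0f, writes 0x10-0x1f) and the GPR index out of the low bits of exit_info_1. The standalone sketch below restates that arithmetic; the EX_* and REG_MASK constants are local stand-ins mirroring the AMD-defined layout, not the kernel's macros.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_READ_CR0   0x000u            /* mirrors SVM_EXIT_READ_CR0 */
#define EX_WRITE_CR0  0x010u            /* mirrors SVM_EXIT_WRITE_CR0 */
#define REG_MASK      0x0fu             /* GPR index in exit_info_1[3:0] (assumed) */

struct cr_access {
        unsigned int cr;                /* which control register */
        unsigned int gpr;               /* which general-purpose register */
        bool write;                     /* mov-to-CR vs. mov-from-CR */
};

static struct cr_access decode_cr_exit(uint64_t exit_code, uint64_t exit_info_1)
{
        struct cr_access a;
        unsigned int cr = (unsigned int)(exit_code - EX_READ_CR0);

        a.write = cr >= 16;             /* write exit codes sit 16 above the reads */
        a.cr = a.write ? cr - 16 : cr;
        a.gpr = (unsigned int)(exit_info_1 & REG_MASK);
        return a;
}

int main(void)
{
        /* e.g. "mov %rbx, %cr4" -> exit code 0x014, GPR 3 in exit_info_1 */
        struct cr_access a = decode_cr_exit(EX_WRITE_CR0 + 4, 3);

        printf("CR%u %s via GPR %u\n", a.cr, a.write ? "write" : "read", a.gpr);
        return 0;
}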
 
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static int dr_interception(struct vcpu_svm *svm)
 {
-       /*
-        * This function merges the msr permission bitmaps of kvm and the
-        * nested vmcb. It is optimized in that it only merges the parts where
-        * the kvm msr permission bitmap may contain zero bits
-        */
-       int i;
-
-       if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
-               return true;
-
-       for (i = 0; i < MSRPM_OFFSETS; i++) {
-               u32 value, p;
-               u64 offset;
+       int reg, dr;
+       unsigned long val;
 
-               if (msrpm_offsets[i] == 0xffffffff)
-                       break;
+       if (svm->vcpu.guest_debug == 0) {
+               /*
+                * No more DR vmexits; force a reload of the debug registers
+                * and reenter on this instruction.  The next vmexit will
+                * retrieve the full state of the debug registers.
+                */
+               clr_dr_intercepts(svm);
+               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+               return 1;
+       }
 
-               p      = msrpm_offsets[i];
-               offset = svm->nested.vmcb_msrpm + (p * 4);
+       if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+               return emulate_on_interception(svm);
 
-               if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
-                       return false;
+       reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+       dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
 
-               svm->nested.msrpm[p] = svm->msrpm[p] | value;
+       if (dr >= 16) { /* mov to DRn */
+               if (!kvm_require_dr(&svm->vcpu, dr - 16))
+                       return 1;
+               val = kvm_register_read(&svm->vcpu, reg);
+               kvm_set_dr(&svm->vcpu, dr - 16, val);
+       } else {
+               if (!kvm_require_dr(&svm->vcpu, dr))
+                       return 1;
+               kvm_get_dr(&svm->vcpu, dr, &val);
+               kvm_register_write(&svm->vcpu, reg, val);
        }
 
-       svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
-
-       return true;
+       return kvm_skip_emulated_instruction(&svm->vcpu);
 }
 
-static bool nested_vmcb_checks(struct vmcb *vmcb)
+static int cr8_write_interception(struct vcpu_svm *svm)
 {
-       if ((vmcb->save.efer & EFER_SVME) == 0)
-               return false;
-
-       if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
-               return false;
-
-       if (vmcb->control.asid == 0)
-               return false;
-
-       if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
-           !npt_enabled)
-               return false;
+       struct kvm_run *kvm_run = svm->vcpu.run;
+       int r;
 
-       return true;
+       u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+       /* instruction emulation calls kvm_set_cr8() */
+       r = cr_interception(svm);
+       if (lapic_in_kernel(&svm->vcpu))
+               return r;
+       if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+               return r;
+       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+       return 0;
 }
 
-static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
-                                struct vmcb *nested_vmcb, struct kvm_host_map *map)
+static int svm_get_msr_feature(struct kvm_msr_entry *msr)
 {
-       bool evaluate_pending_interrupts =
-               is_intercept(svm, INTERCEPT_VINTR) ||
-               is_intercept(svm, INTERCEPT_IRET);
-
-       if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
-               svm->vcpu.arch.hflags |= HF_HIF_MASK;
-       else
-               svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+       msr->data = 0;
 
-       if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
-               svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
-               nested_svm_init_mmu_context(&svm->vcpu);
+       switch (msr->index) {
+       case MSR_F10H_DECFG:
+               if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+                       msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+               break;
+       default:
+               return 1;
        }
 
-       /* Load the nested guest state */
-       svm->vmcb->save.es = nested_vmcb->save.es;
-       svm->vmcb->save.cs = nested_vmcb->save.cs;
-       svm->vmcb->save.ss = nested_vmcb->save.ss;
-       svm->vmcb->save.ds = nested_vmcb->save.ds;
-       svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
-       svm->vmcb->save.idtr = nested_vmcb->save.idtr;
-       kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
-       svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
-       svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
-       svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
-       if (npt_enabled) {
-               svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
-               svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
-       } else
-               (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-
-       /* Guest paging mode is active - reset mmu */
-       kvm_mmu_reset_context(&svm->vcpu);
+       return 0;
+}
 
-       svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
-       kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
-       kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
-       kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
-
-       /* In case we don't even reach vcpu_run, the fields are not updated */
-       svm->vmcb->save.rax = nested_vmcb->save.rax;
-       svm->vmcb->save.rsp = nested_vmcb->save.rsp;
-       svm->vmcb->save.rip = nested_vmcb->save.rip;
-       svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
-       svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
-       svm->vmcb->save.cpl = nested_vmcb->save.cpl;
-
-       svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
-       svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
-
-       /* cache intercepts */
-       svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
-       svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
-       svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
-       svm->nested.intercept            = nested_vmcb->control.intercept;
-
-       svm_flush_tlb(&svm->vcpu, true);
-       svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
-       if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
-               svm->vcpu.arch.hflags |= HF_VINTR_MASK;
-       else
-               svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
-       svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+       switch (msr_info->index) {
+       case MSR_STAR:
+               msr_info->data = svm->vmcb->save.star;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_LSTAR:
+               msr_info->data = svm->vmcb->save.lstar;
+               break;
+       case MSR_CSTAR:
+               msr_info->data = svm->vmcb->save.cstar;
+               break;
+       case MSR_KERNEL_GS_BASE:
+               msr_info->data = svm->vmcb->save.kernel_gs_base;
+               break;
+       case MSR_SYSCALL_MASK:
+               msr_info->data = svm->vmcb->save.sfmask;
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               msr_info->data = svm->vmcb->save.sysenter_cs;
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               msr_info->data = svm->sysenter_eip;
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               msr_info->data = svm->sysenter_esp;
+               break;
+       case MSR_TSC_AUX:
+               if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+                       return 1;
+               msr_info->data = svm->tsc_aux;
+               break;
+       /*
+        * Nobody will change the following 5 values in the VMCB so we can
+        * safely return them on rdmsr. They will always be 0 until LBRV is
+        * implemented.
+        */
+       case MSR_IA32_DEBUGCTLMSR:
+               msr_info->data = svm->vmcb->save.dbgctl;
+               break;
+       case MSR_IA32_LASTBRANCHFROMIP:
+               msr_info->data = svm->vmcb->save.br_from;
+               break;
+       case MSR_IA32_LASTBRANCHTOIP:
+               msr_info->data = svm->vmcb->save.br_to;
+               break;
+       case MSR_IA32_LASTINTFROMIP:
+               msr_info->data = svm->vmcb->save.last_excp_from;
+               break;
+       case MSR_IA32_LASTINTTOIP:
+               msr_info->data = svm->vmcb->save.last_excp_to;
+               break;
+       case MSR_VM_HSAVE_PA:
+               msr_info->data = svm->nested.hsave_msr;
+               break;
+       case MSR_VM_CR:
+               msr_info->data = svm->nested.vm_cr_msr;
+               break;
+       case MSR_IA32_SPEC_CTRL:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+                       return 1;
 
-       svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
-       svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
-       svm->vmcb->control.int_state = nested_vmcb->control.int_state;
-       svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
-       svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+               msr_info->data = svm->spec_ctrl;
+               break;
+       case MSR_AMD64_VIRT_SPEC_CTRL:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+                       return 1;
 
-       svm->vmcb->control.pause_filter_count =
-               nested_vmcb->control.pause_filter_count;
-       svm->vmcb->control.pause_filter_thresh =
-               nested_vmcb->control.pause_filter_thresh;
+               msr_info->data = svm->virt_spec_ctrl;
+               break;
+       case MSR_F15H_IC_CFG: {
 
-       kvm_vcpu_unmap(&svm->vcpu, map, true);
+               int family, model;
 
-       /* Enter Guest-Mode */
-       enter_guest_mode(&svm->vcpu);
+               family = guest_cpuid_family(vcpu);
+               model  = guest_cpuid_model(vcpu);
 
-       /*
-        * Merge guest and host intercepts - must be called with vcpu in
-        * guest-mode to take effect here
-        */
-       recalc_intercepts(svm);
+               if (family < 0 || model < 0)
+                       return kvm_get_msr_common(vcpu, msr_info);
 
-       svm->nested.vmcb = vmcb_gpa;
+               msr_info->data = 0;
 
-       /*
-        * If L1 had a pending IRQ/NMI before executing VMRUN,
-        * which wasn't delivered because it was disallowed (e.g.
-        * interrupts disabled), L0 needs to evaluate if this pending
-        * event should cause an exit from L2 to L1 or be delivered
-        * directly to L2.
-        *
-        * Usually this would be handled by the processor noticing an
-        * IRQ/NMI window request.  However, VMRUN can unblock interrupts
-        * by implicitly setting GIF, so force L0 to perform pending event
-        * evaluation by requesting a KVM_REQ_EVENT.
-        */
-       enable_gif(svm);
-       if (unlikely(evaluate_pending_interrupts))
-               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+               if (family == 0x15 &&
+                   (model >= 0x2 && model < 0x20))
+                       msr_info->data = 0x1E;
+               }
+               break;
+       case MSR_F10H_DECFG:
+               msr_info->data = svm->msr_decfg;
+               break;
+       default:
+               return kvm_get_msr_common(vcpu, msr_info);
+       }
+       return 0;
+}
 
-       mark_all_dirty(svm->vmcb);
+static int rdmsr_interception(struct vcpu_svm *svm)
+{
+       return kvm_emulate_rdmsr(&svm->vcpu);
 }
 
-static int nested_svm_vmrun(struct vcpu_svm *svm)
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
 {
-       int ret;
-       struct vmcb *nested_vmcb;
-       struct vmcb *hsave = svm->nested.hsave;
-       struct vmcb *vmcb = svm->vmcb;
-       struct kvm_host_map map;
-       u64 vmcb_gpa;
-
-       vmcb_gpa = svm->vmcb->save.rax;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int svm_dis, chg_mask;
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
-       if (ret == -EINVAL) {
-               kvm_inject_gp(&svm->vcpu, 0);
+       if (data & ~SVM_VM_CR_VALID_MASK)
                return 1;
-       } else if (ret) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
-       }
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+       chg_mask = SVM_VM_CR_VALID_MASK;
 
-       nested_vmcb = map.hva;
+       if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+               chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
 
-       if (!nested_vmcb_checks(nested_vmcb)) {
-               nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
-               nested_vmcb->control.exit_code_hi = 0;
-               nested_vmcb->control.exit_info_1  = 0;
-               nested_vmcb->control.exit_info_2  = 0;
+       svm->nested.vm_cr_msr &= ~chg_mask;
+       svm->nested.vm_cr_msr |= (data & chg_mask);
 
-               kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
 
-               return ret;
-       }
+       /* check for svm_disable while efer.svme is set */
+       if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+               return 1;
 
-       trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
-                              nested_vmcb->save.rip,
-                              nested_vmcb->control.int_ctl,
-                              nested_vmcb->control.event_inj,
-                              nested_vmcb->control.nested_ctl);
+       return 0;
+}
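The change-mask handling above makes MSR_VM_CR behave like a latch: once SVMDIS is set, neither it nor LOCK can be modified again, and setting SVMDIS while EFER.SVME is still enabled is rejected outright. A small standalone sketch of just the latch, with bit positions assumed to follow the architectural layout (LOCK = bit 3, SVMDIS = bit 4):

#include <stdint.h>

#define VM_CR_LOCK   (1u << 3)          /* assumed bit position */
#define VM_CR_SVMDIS (1u << 4)          /* assumed bit position */
#define VM_CR_VALID  0x1fu              /* assumed: the five defined low bits */

/* Apply a write to the shadow VM_CR value.  Out-of-mask bits and the
 * EFER.SVME conflict are rejected separately; this models the latch only. */
static uint32_t vm_cr_apply(uint32_t old, uint32_t data)
{
        uint32_t chg = VM_CR_VALID;

        if (old & VM_CR_SVMDIS)         /* latched: LOCK and SVMDIS become read-only */
                chg &= ~(VM_CR_LOCK | VM_CR_SVMDIS);

        return (old & ~chg) | (data & chg);
}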
 
-       trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
-                                   nested_vmcb->control.intercept_cr >> 16,
-                                   nested_vmcb->control.intercept_exceptions,
-                                   nested_vmcb->control.intercept);
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       /* Clear internal status */
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
+       u32 ecx = msr->index;
+       u64 data = msr->data;
+       switch (ecx) {
+       case MSR_IA32_CR_PAT:
+               if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+                       return 1;
+               vcpu->arch.pat = data;
+               svm->vmcb->save.g_pat = data;
+               mark_dirty(svm->vmcb, VMCB_NPT);
+               break;
+       case MSR_IA32_SPEC_CTRL:
+               if (!msr->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+                       return 1;
 
-       /*
-        * Save the old vmcb, so we don't need to pick what we save, but can
-        * restore everything when a VMEXIT occurs
-        */
-       hsave->save.es     = vmcb->save.es;
-       hsave->save.cs     = vmcb->save.cs;
-       hsave->save.ss     = vmcb->save.ss;
-       hsave->save.ds     = vmcb->save.ds;
-       hsave->save.gdtr   = vmcb->save.gdtr;
-       hsave->save.idtr   = vmcb->save.idtr;
-       hsave->save.efer   = svm->vcpu.arch.efer;
-       hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
-       hsave->save.cr4    = svm->vcpu.arch.cr4;
-       hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
-       hsave->save.rip    = kvm_rip_read(&svm->vcpu);
-       hsave->save.rsp    = vmcb->save.rsp;
-       hsave->save.rax    = vmcb->save.rax;
-       if (npt_enabled)
-               hsave->save.cr3    = vmcb->save.cr3;
-       else
-               hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
+               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+                       return 1;
 
-       copy_vmcb_control_area(hsave, vmcb);
+               svm->spec_ctrl = data;
+               if (!data)
+                       break;
 
-       enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
+               /*
+                * For non-nested:
+                * When it's written (to non-zero) for the first time, pass
+                * it through.
+                *
+                * For nested:
+                * The handling of the MSR bitmap for L2 guests is done in
+                * nested_svm_vmrun_msrpm.
+                * We update the L1 MSR bit as well since it will end up
+                * touching the MSR anyway now.
+                */
+               set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+               break;
+       case MSR_IA32_PRED_CMD:
+               if (!msr->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+                       return 1;
 
-       if (!nested_svm_vmrun_msrpm(svm)) {
-               svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
-               svm->vmcb->control.exit_code_hi = 0;
-               svm->vmcb->control.exit_info_1  = 0;
-               svm->vmcb->control.exit_info_2  = 0;
+               if (data & ~PRED_CMD_IBPB)
+                       return 1;
+               if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
+                       return 1;
+               if (!data)
+                       break;
 
-               nested_svm_vmexit(svm);
-       }
+               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+               set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+               break;
+       case MSR_AMD64_VIRT_SPEC_CTRL:
+               if (!msr->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+                       return 1;
 
-       return ret;
-}
+               if (data & ~SPEC_CTRL_SSBD)
+                       return 1;
 
-static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
-{
-       to_vmcb->save.fs = from_vmcb->save.fs;
-       to_vmcb->save.gs = from_vmcb->save.gs;
-       to_vmcb->save.tr = from_vmcb->save.tr;
-       to_vmcb->save.ldtr = from_vmcb->save.ldtr;
-       to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
-       to_vmcb->save.star = from_vmcb->save.star;
-       to_vmcb->save.lstar = from_vmcb->save.lstar;
-       to_vmcb->save.cstar = from_vmcb->save.cstar;
-       to_vmcb->save.sfmask = from_vmcb->save.sfmask;
-       to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
-       to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
-       to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-}
+               svm->virt_spec_ctrl = data;
+               break;
+       case MSR_STAR:
+               svm->vmcb->save.star = data;
+               break;
+#ifdef CONFIG_X86_64
+       case MSR_LSTAR:
+               svm->vmcb->save.lstar = data;
+               break;
+       case MSR_CSTAR:
+               svm->vmcb->save.cstar = data;
+               break;
+       case MSR_KERNEL_GS_BASE:
+               svm->vmcb->save.kernel_gs_base = data;
+               break;
+       case MSR_SYSCALL_MASK:
+               svm->vmcb->save.sfmask = data;
+               break;
+#endif
+       case MSR_IA32_SYSENTER_CS:
+               svm->vmcb->save.sysenter_cs = data;
+               break;
+       case MSR_IA32_SYSENTER_EIP:
+               svm->sysenter_eip = data;
+               svm->vmcb->save.sysenter_eip = data;
+               break;
+       case MSR_IA32_SYSENTER_ESP:
+               svm->sysenter_esp = data;
+               svm->vmcb->save.sysenter_esp = data;
+               break;
+       case MSR_TSC_AUX:
+               if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+                       return 1;
 
-static int vmload_interception(struct vcpu_svm *svm)
-{
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       int ret;
+               /*
+                * This is rare, so we update the MSR here instead of using
+                * direct_access_msrs.  Doing that would require a rdmsr in
+                * svm_vcpu_put.
+                */
+               svm->tsc_aux = data;
+               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+               break;
+       case MSR_IA32_DEBUGCTLMSR:
+               if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+                       vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+                                   __func__, data);
+                       break;
+               }
+               if (data & DEBUGCTL_RESERVED_BITS)
+                       return 1;
 
-       if (nested_svm_check_permissions(svm))
-               return 1;
+               svm->vmcb->save.dbgctl = data;
+               mark_dirty(svm->vmcb, VMCB_LBR);
+               if (data & (1ULL<<0))
+                       svm_enable_lbrv(svm);
+               else
+                       svm_disable_lbrv(svm);
+               break;
+       case MSR_VM_HSAVE_PA:
+               svm->nested.hsave_msr = data;
+               break;
+       case MSR_VM_CR:
+               return svm_set_vm_cr(vcpu, data);
+       case MSR_VM_IGNNE:
+               vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+               break;
+       case MSR_F10H_DECFG: {
+               struct kvm_msr_entry msr_entry;
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-       if (ret) {
-               if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
+               msr_entry.index = msr->index;
+               if (svm_get_msr_feature(&msr_entry))
+                       return 1;
 
-       nested_vmcb = map.hva;
+               /* Check the supported bits */
+               if (data & ~msr_entry.data)
+                       return 1;
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+               /* Don't allow the guest to change a bit, #GP */
+               if (!msr->host_initiated && (data ^ msr_entry.data))
+                       return 1;
 
-       nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+               svm->msr_decfg = data;
+               break;
+       }
+       case MSR_IA32_APICBASE:
+               if (kvm_vcpu_apicv_active(vcpu))
+                       avic_update_vapic_bar(to_svm(vcpu), data);
+               /* Fall through */
+       default:
+               return kvm_set_msr_common(vcpu, msr);
+       }
+       return 0;
+}
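One detail worth spelling out from the MSR_F10H_DECFG branch above: the feature value reported by svm_get_msr_feature() is both the capability mask and, for guest-initiated writes, the only acceptable value, so a guest can neither set unsupported bits nor clear supported ones. A hedged restatement as a standalone predicate ('supported' stands in for that feature value):

#include <stdbool.h>
#include <stdint.h>

/* True when a DECFG write should be accepted. */
static bool decfg_write_ok(uint64_t data, uint64_t supported, bool host_initiated)
{
        if (data & ~supported)                      /* unsupported bits -> reject (#GP on the guest path) */
                return false;
        if (!host_initiated && (data ^ supported))  /* guests may not change any bit */
                return false;
        return true;
}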
 
-       return ret;
+static int wrmsr_interception(struct vcpu_svm *svm)
+{
+       return kvm_emulate_wrmsr(&svm->vcpu);
 }
 
-static int vmsave_interception(struct vcpu_svm *svm)
+static int msr_interception(struct vcpu_svm *svm)
 {
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       int ret;
+       if (svm->vmcb->control.exit_info_1)
+               return wrmsr_interception(svm);
+       else
+               return rdmsr_interception(svm);
+}
 
-       if (nested_svm_check_permissions(svm))
-               return 1;
+static int interrupt_window_interception(struct vcpu_svm *svm)
+{
+       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       svm_clear_vintr(svm);
 
-       ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
-       if (ret) {
-               if (ret == -EINVAL)
-                       kvm_inject_gp(&svm->vcpu, 0);
-               return 1;
-       }
+       /*
+        * For AVIC, the only reason to end up here is ExtINTs.
+        * In this case AVIC was temporarily disabled for
+        * requesting the IRQ window and we have to re-enable it.
+        */
+       svm_toggle_avic_for_irq_window(&svm->vcpu, true);
 
-       nested_vmcb = map.hva;
+       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+       mark_dirty(svm->vmcb, VMCB_INTR);
+       ++svm->vcpu.stat.irq_window_exits;
+       return 1;
+}
 
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
+static int pause_interception(struct vcpu_svm *svm)
+{
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+       bool in_kernel = (svm_get_cpl(vcpu) == 0);
 
-       nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-       kvm_vcpu_unmap(&svm->vcpu, &map, true);
+       if (pause_filter_thresh)
+               grow_ple_window(vcpu);
 
-       return ret;
+       kvm_vcpu_on_spin(vcpu, in_kernel);
+       return 1;
 }
 
-static int vmrun_interception(struct vcpu_svm *svm)
+static int nop_interception(struct vcpu_svm *svm)
 {
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       return nested_svm_vmrun(svm);
+       return kvm_skip_emulated_instruction(&(svm->vcpu));
 }
 
-static int stgi_interception(struct vcpu_svm *svm)
+static int monitor_interception(struct vcpu_svm *svm)
 {
-       int ret;
-
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       /*
-        * If VGIF is enabled, the STGI intercept is only added to
-        * detect the opening of the SMI/NMI window; remove it now.
-        */
-       if (vgif_enabled(svm))
-               clr_intercept(svm, INTERCEPT_STGI);
-
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
-       enable_gif(svm);
-
-       return ret;
-}
-
-static int clgi_interception(struct vcpu_svm *svm)
-{
-       int ret;
-
-       if (nested_svm_check_permissions(svm))
-               return 1;
-
-       ret = kvm_skip_emulated_instruction(&svm->vcpu);
-
-       disable_gif(svm);
-
-       /* After a CLGI no interrupts should come */
-       if (!kvm_vcpu_apicv_active(&svm->vcpu))
-               svm_clear_vintr(svm);
-
-       return ret;
-}
-
-static int invlpga_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-
-       trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
-                         kvm_rax_read(&svm->vcpu));
-
-       /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-       kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
-
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int skinit_interception(struct vcpu_svm *svm)
-{
-       trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
-
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
-
-static int wbinvd_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wbinvd(&svm->vcpu);
+       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+       return nop_interception(svm);
 }
 
-static int xsetbv_interception(struct vcpu_svm *svm)
+static int mwait_interception(struct vcpu_svm *svm)
 {
-       u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
-       u32 index = kvm_rcx_read(&svm->vcpu);
-
-       if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
-               return kvm_skip_emulated_instruction(&svm->vcpu);
-       }
-
-       return 1;
+       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+       return nop_interception(svm);
 }
 
-static int rdpru_interception(struct vcpu_svm *svm)
-{
-       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-       return 1;
-}
+static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+       [SVM_EXIT_READ_CR0]                     = cr_interception,
+       [SVM_EXIT_READ_CR3]                     = cr_interception,
+       [SVM_EXIT_READ_CR4]                     = cr_interception,
+       [SVM_EXIT_READ_CR8]                     = cr_interception,
+       [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
+       [SVM_EXIT_WRITE_CR0]                    = cr_interception,
+       [SVM_EXIT_WRITE_CR3]                    = cr_interception,
+       [SVM_EXIT_WRITE_CR4]                    = cr_interception,
+       [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
+       [SVM_EXIT_READ_DR0]                     = dr_interception,
+       [SVM_EXIT_READ_DR1]                     = dr_interception,
+       [SVM_EXIT_READ_DR2]                     = dr_interception,
+       [SVM_EXIT_READ_DR3]                     = dr_interception,
+       [SVM_EXIT_READ_DR4]                     = dr_interception,
+       [SVM_EXIT_READ_DR5]                     = dr_interception,
+       [SVM_EXIT_READ_DR6]                     = dr_interception,
+       [SVM_EXIT_READ_DR7]                     = dr_interception,
+       [SVM_EXIT_WRITE_DR0]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR1]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR2]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR3]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR4]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR5]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR6]                    = dr_interception,
+       [SVM_EXIT_WRITE_DR7]                    = dr_interception,
+       [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
+       [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
+       [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
+       [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
+       [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
+       [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
+       [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
+       [SVM_EXIT_INTR]                         = intr_interception,
+       [SVM_EXIT_NMI]                          = nmi_interception,
+       [SVM_EXIT_SMI]                          = nop_on_interception,
+       [SVM_EXIT_INIT]                         = nop_on_interception,
+       [SVM_EXIT_VINTR]                        = interrupt_window_interception,
+       [SVM_EXIT_RDPMC]                        = rdpmc_interception,
+       [SVM_EXIT_CPUID]                        = cpuid_interception,
+       [SVM_EXIT_IRET]                         = iret_interception,
+       [SVM_EXIT_INVD]                         = emulate_on_interception,
+       [SVM_EXIT_PAUSE]                        = pause_interception,
+       [SVM_EXIT_HLT]                          = halt_interception,
+       [SVM_EXIT_INVLPG]                       = invlpg_interception,
+       [SVM_EXIT_INVLPGA]                      = invlpga_interception,
+       [SVM_EXIT_IOIO]                         = io_interception,
+       [SVM_EXIT_MSR]                          = msr_interception,
+       [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
+       [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
+       [SVM_EXIT_VMRUN]                        = vmrun_interception,
+       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
+       [SVM_EXIT_VMLOAD]                       = vmload_interception,
+       [SVM_EXIT_VMSAVE]                       = vmsave_interception,
+       [SVM_EXIT_STGI]                         = stgi_interception,
+       [SVM_EXIT_CLGI]                         = clgi_interception,
+       [SVM_EXIT_SKINIT]                       = skinit_interception,
+       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
+       [SVM_EXIT_MONITOR]                      = monitor_interception,
+       [SVM_EXIT_MWAIT]                        = mwait_interception,
+       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
+       [SVM_EXIT_RDPRU]                        = rdpru_interception,
+       [SVM_EXIT_NPF]                          = npf_interception,
+       [SVM_EXIT_RSM]                          = rsm_interception,
+       [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
+       [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
+};
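The table above is the entire exit-dispatch mechanism: the numeric exit code indexes the array, and anything out of range or without a handler takes an error path. A minimal sketch of that consumption pattern, with invented types standing in for struct vcpu_svm and for the real error handling (which also dumps the VMCB):

#include <stddef.h>

struct vcpu_ctx;                                   /* stand-in for struct vcpu_svm */
typedef int (*exit_handler_t)(struct vcpu_ctx *);

static int handle_unexpected_exit(struct vcpu_ctx *ctx)
{
        (void)ctx;                                 /* the real path dumps the VMCB and reports to userspace */
        return 0;
}

static int dispatch_exit(const exit_handler_t *handlers, size_t nr_handlers,
                         unsigned long exit_code, struct vcpu_ctx *ctx)
{
        if (exit_code >= nr_handlers || !handlers[exit_code])
                return handle_unexpected_exit(ctx);
        return handlers[exit_code](ctx);
}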
 
-static int task_switch_interception(struct vcpu_svm *svm)
+static void dump_vmcb(struct kvm_vcpu *vcpu)
 {
-       u16 tss_selector;
-       int reason;
-       int int_type = svm->vmcb->control.exit_int_info &
-               SVM_EXITINTINFO_TYPE_MASK;
-       int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
-       uint32_t type =
-               svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
-       uint32_t idt_v =
-               svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
-       bool has_error_code = false;
-       u32 error_code = 0;
-
-       tss_selector = (u16)svm->vmcb->control.exit_info_1;
-
-       if (svm->vmcb->control.exit_info_2 &
-           (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
-               reason = TASK_SWITCH_IRET;
-       else if (svm->vmcb->control.exit_info_2 &
-                (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
-               reason = TASK_SWITCH_JMP;
-       else if (idt_v)
-               reason = TASK_SWITCH_GATE;
-       else
-               reason = TASK_SWITCH_CALL;
-
-       if (reason == TASK_SWITCH_GATE) {
-               switch (type) {
-               case SVM_EXITINTINFO_TYPE_NMI:
-                       svm->vcpu.arch.nmi_injected = false;
-                       break;
-               case SVM_EXITINTINFO_TYPE_EXEPT:
-                       if (svm->vmcb->control.exit_info_2 &
-                           (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
-                               has_error_code = true;
-                               error_code =
-                                       (u32)svm->vmcb->control.exit_info_2;
-                       }
-                       kvm_clear_exception_queue(&svm->vcpu);
-                       break;
-               case SVM_EXITINTINFO_TYPE_INTR:
-                       kvm_clear_interrupt_queue(&svm->vcpu);
-                       break;
-               default:
-                       break;
-               }
-       }
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb_control_area *control = &svm->vmcb->control;
+       struct vmcb_save_area *save = &svm->vmcb->save;
 
-       if (reason != TASK_SWITCH_GATE ||
-           int_type == SVM_EXITINTINFO_TYPE_SOFT ||
-           (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
-            (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-               if (!skip_emulated_instruction(&svm->vcpu))
-                       return 0;
+       if (!dump_invalid_vmcb) {
+               pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+               return;
        }
 
-       if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
-               int_vec = -1;
-
-       return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
-                              has_error_code, error_code);
-}
-
-static int cpuid_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_cpuid(&svm->vcpu);
-}
-
-static int iret_interception(struct vcpu_svm *svm)
-{
-       ++svm->vcpu.stat.nmi_window_exits;
-       clr_intercept(svm, INTERCEPT_IRET);
-       svm->vcpu.arch.hflags |= HF_IRET_MASK;
-       svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       return 1;
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
-{
-       if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return kvm_emulate_instruction(&svm->vcpu, 0);
-
-       kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_instruction(&svm->vcpu, 0);
-}
-
-static int rsm_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
-}
-
-static int rdpmc_interception(struct vcpu_svm *svm)
-{
-       int err;
-
-       if (!nrips)
-               return emulate_on_interception(svm);
-
-       err = kvm_rdpmc(&svm->vcpu);
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
-                                           unsigned long val)
-{
-       unsigned long cr0 = svm->vcpu.arch.cr0;
-       bool ret = false;
-       u64 intercept;
-
-       intercept = svm->nested.intercept;
-
-       if (!is_guest_mode(&svm->vcpu) ||
-           (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
-               return false;
-
-       cr0 &= ~SVM_CR0_SELECTIVE_MASK;
-       val &= ~SVM_CR0_SELECTIVE_MASK;
-
-       if (cr0 ^ val) {
-               svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-               ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
-       }
-
-       return ret;
-}
-
-#define CR_VALID (1ULL << 63)
-
-static int cr_interception(struct vcpu_svm *svm)
-{
-       int reg, cr;
-       unsigned long val;
-       int err;
-
-       if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
-
-       if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
-               return emulate_on_interception(svm);
-
-       reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
-       if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
-               cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
-       else
-               cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
-
-       err = 0;
-       if (cr >= 16) { /* mov to cr */
-               cr -= 16;
-               val = kvm_register_read(&svm->vcpu, reg);
-               switch (cr) {
-               case 0:
-                       if (!check_selective_cr0_intercepted(svm, val))
-                               err = kvm_set_cr0(&svm->vcpu, val);
-                       else
-                               return 1;
-
-                       break;
-               case 3:
-                       err = kvm_set_cr3(&svm->vcpu, val);
-                       break;
-               case 4:
-                       err = kvm_set_cr4(&svm->vcpu, val);
-                       break;
-               case 8:
-                       err = kvm_set_cr8(&svm->vcpu, val);
-                       break;
-               default:
-                       WARN(1, "unhandled write to CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-                       return 1;
-               }
-       } else { /* mov from cr */
-               switch (cr) {
-               case 0:
-                       val = kvm_read_cr0(&svm->vcpu);
-                       break;
-               case 2:
-                       val = svm->vcpu.arch.cr2;
-                       break;
-               case 3:
-                       val = kvm_read_cr3(&svm->vcpu);
-                       break;
-               case 4:
-                       val = kvm_read_cr4(&svm->vcpu);
-                       break;
-               case 8:
-                       val = kvm_get_cr8(&svm->vcpu);
-                       break;
-               default:
-                       WARN(1, "unhandled read from CR%d", cr);
-                       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
-                       return 1;
-               }
-               kvm_register_write(&svm->vcpu, reg, val);
-       }
-       return kvm_complete_insn_gp(&svm->vcpu, err);
-}
-
-static int dr_interception(struct vcpu_svm *svm)
-{
-       int reg, dr;
-       unsigned long val;
-
-       if (svm->vcpu.guest_debug == 0) {
-               /*
-                * No more DR vmexits; force a reload of the debug registers
-                * and reenter on this instruction.  The next vmexit will
-                * retrieve the full state of the debug registers.
-                */
-               clr_dr_intercepts(svm);
-               svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
-               return 1;
-       }
-
-       if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
-               return emulate_on_interception(svm);
-
-       reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
-       dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
-       if (dr >= 16) { /* mov to DRn */
-               if (!kvm_require_dr(&svm->vcpu, dr - 16))
-                       return 1;
-               val = kvm_register_read(&svm->vcpu, reg);
-               kvm_set_dr(&svm->vcpu, dr - 16, val);
-       } else {
-               if (!kvm_require_dr(&svm->vcpu, dr))
-                       return 1;
-               kvm_get_dr(&svm->vcpu, dr, &val);
-               kvm_register_write(&svm->vcpu, reg, val);
-       }
-
-       return kvm_skip_emulated_instruction(&svm->vcpu);
-}
-
-static int cr8_write_interception(struct vcpu_svm *svm)
-{
-       struct kvm_run *kvm_run = svm->vcpu.run;
-       int r;
-
-       u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
-       /* instruction emulation calls kvm_set_cr8() */
-       r = cr_interception(svm);
-       if (lapic_in_kernel(&svm->vcpu))
-               return r;
-       if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
-               return r;
-       kvm_run->exit_reason = KVM_EXIT_SET_TPR;
-       return 0;
-}
-
-static int svm_get_msr_feature(struct kvm_msr_entry *msr)
-{
-       msr->data = 0;
-
-       switch (msr->index) {
-       case MSR_F10H_DECFG:
-               if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
-                       msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
-               break;
-       default:
-               return 1;
-       }
-
-       return 0;
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       switch (msr_info->index) {
-       case MSR_STAR:
-               msr_info->data = svm->vmcb->save.star;
-               break;
-#ifdef CONFIG_X86_64
-       case MSR_LSTAR:
-               msr_info->data = svm->vmcb->save.lstar;
-               break;
-       case MSR_CSTAR:
-               msr_info->data = svm->vmcb->save.cstar;
-               break;
-       case MSR_KERNEL_GS_BASE:
-               msr_info->data = svm->vmcb->save.kernel_gs_base;
-               break;
-       case MSR_SYSCALL_MASK:
-               msr_info->data = svm->vmcb->save.sfmask;
-               break;
-#endif
-       case MSR_IA32_SYSENTER_CS:
-               msr_info->data = svm->vmcb->save.sysenter_cs;
-               break;
-       case MSR_IA32_SYSENTER_EIP:
-               msr_info->data = svm->sysenter_eip;
-               break;
-       case MSR_IA32_SYSENTER_ESP:
-               msr_info->data = svm->sysenter_esp;
-               break;
-       case MSR_TSC_AUX:
-               if (!boot_cpu_has(X86_FEATURE_RDTSCP))
-                       return 1;
-               msr_info->data = svm->tsc_aux;
-               break;
-       /*
-        * Nobody will change the following 5 values in the VMCB so we can
-        * safely return them on rdmsr. They will always be 0 until LBRV is
-        * implemented.
-        */
-       case MSR_IA32_DEBUGCTLMSR:
-               msr_info->data = svm->vmcb->save.dbgctl;
-               break;
-       case MSR_IA32_LASTBRANCHFROMIP:
-               msr_info->data = svm->vmcb->save.br_from;
-               break;
-       case MSR_IA32_LASTBRANCHTOIP:
-               msr_info->data = svm->vmcb->save.br_to;
-               break;
-       case MSR_IA32_LASTINTFROMIP:
-               msr_info->data = svm->vmcb->save.last_excp_from;
-               break;
-       case MSR_IA32_LASTINTTOIP:
-               msr_info->data = svm->vmcb->save.last_excp_to;
-               break;
-       case MSR_VM_HSAVE_PA:
-               msr_info->data = svm->nested.hsave_msr;
-               break;
-       case MSR_VM_CR:
-               msr_info->data = svm->nested.vm_cr_msr;
-               break;
-       case MSR_IA32_SPEC_CTRL:
-               if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
-                       return 1;
-
-               msr_info->data = svm->spec_ctrl;
-               break;
-       case MSR_AMD64_VIRT_SPEC_CTRL:
-               if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
-                       return 1;
-
-               msr_info->data = svm->virt_spec_ctrl;
-               break;
-       case MSR_F15H_IC_CFG: {
-
-               int family, model;
-
-               family = guest_cpuid_family(vcpu);
-               model  = guest_cpuid_model(vcpu);
-
-               if (family < 0 || model < 0)
-                       return kvm_get_msr_common(vcpu, msr_info);
-
-               msr_info->data = 0;
-
-               if (family == 0x15 &&
-                   (model >= 0x2 && model < 0x20))
-                       msr_info->data = 0x1E;
-               }
-               break;
-       case MSR_F10H_DECFG:
-               msr_info->data = svm->msr_decfg;
-               break;
-       default:
-               return kvm_get_msr_common(vcpu, msr_info);
-       }
-       return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_rdmsr(&svm->vcpu);
-}
-
-static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int svm_dis, chg_mask;
-
-       if (data & ~SVM_VM_CR_VALID_MASK)
-               return 1;
-
-       chg_mask = SVM_VM_CR_VALID_MASK;
-
-       if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
-               chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
-
-       svm->nested.vm_cr_msr &= ~chg_mask;
-       svm->nested.vm_cr_msr |= (data & chg_mask);
-
-       svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
-
-       /* check for svm_disable while efer.svme is set */
-       if (svm_dis && (vcpu->arch.efer & EFER_SVME))
-               return 1;
-
-       return 0;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       u32 ecx = msr->index;
-       u64 data = msr->data;
-       switch (ecx) {
-       case MSR_IA32_CR_PAT:
-               if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
-                       return 1;
-               vcpu->arch.pat = data;
-               svm->vmcb->save.g_pat = data;
-               mark_dirty(svm->vmcb, VMCB_NPT);
-               break;
-       case MSR_IA32_SPEC_CTRL:
-               if (!msr->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
-                       return 1;
-
-               if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
-                       return 1;
-
-               svm->spec_ctrl = data;
-               if (!data)
-                       break;
-
-               /*
-                * For non-nested:
-                * When it's written (to non-zero) for the first time, pass
-                * it through.
-                *
-                * For nested:
-                * The handling of the MSR bitmap for L2 guests is done in
-                * nested_svm_vmrun_msrpm.
-                * We update the L1 MSR bit as well since it will end up
-                * touching the MSR anyway now.
-                */
-               set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
-               break;
-       case MSR_IA32_PRED_CMD:
-               if (!msr->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
-                       return 1;
-
-               if (data & ~PRED_CMD_IBPB)
-                       return 1;
-               if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
-                       return 1;
-               if (!data)
-                       break;
-
-               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
-               set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
-               break;
-       case MSR_AMD64_VIRT_SPEC_CTRL:
-               if (!msr->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
-                       return 1;
-
-               if (data & ~SPEC_CTRL_SSBD)
-                       return 1;
-
-               svm->virt_spec_ctrl = data;
-               break;
-       case MSR_STAR:
-               svm->vmcb->save.star = data;
-               break;
-#ifdef CONFIG_X86_64
-       case MSR_LSTAR:
-               svm->vmcb->save.lstar = data;
-               break;
-       case MSR_CSTAR:
-               svm->vmcb->save.cstar = data;
-               break;
-       case MSR_KERNEL_GS_BASE:
-               svm->vmcb->save.kernel_gs_base = data;
-               break;
-       case MSR_SYSCALL_MASK:
-               svm->vmcb->save.sfmask = data;
-               break;
-#endif
-       case MSR_IA32_SYSENTER_CS:
-               svm->vmcb->save.sysenter_cs = data;
-               break;
-       case MSR_IA32_SYSENTER_EIP:
-               svm->sysenter_eip = data;
-               svm->vmcb->save.sysenter_eip = data;
-               break;
-       case MSR_IA32_SYSENTER_ESP:
-               svm->sysenter_esp = data;
-               svm->vmcb->save.sysenter_esp = data;
-               break;
-       case MSR_TSC_AUX:
-               if (!boot_cpu_has(X86_FEATURE_RDTSCP))
-                       return 1;
-
-               /*
-                * This is rare, so we update the MSR here instead of using
-                * direct_access_msrs.  Doing that would require a rdmsr in
-                * svm_vcpu_put.
-                */
-               svm->tsc_aux = data;
-               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
-               break;
-       case MSR_IA32_DEBUGCTLMSR:
-               if (!boot_cpu_has(X86_FEATURE_LBRV)) {
-                       vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
-                                   __func__, data);
-                       break;
-               }
-               if (data & DEBUGCTL_RESERVED_BITS)
-                       return 1;
-
-               svm->vmcb->save.dbgctl = data;
-               mark_dirty(svm->vmcb, VMCB_LBR);
-               if (data & (1ULL<<0))
-                       svm_enable_lbrv(svm);
-               else
-                       svm_disable_lbrv(svm);
-               break;
-       case MSR_VM_HSAVE_PA:
-               svm->nested.hsave_msr = data;
-               break;
-       case MSR_VM_CR:
-               return svm_set_vm_cr(vcpu, data);
-       case MSR_VM_IGNNE:
-               vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
-               break;
-       case MSR_F10H_DECFG: {
-               struct kvm_msr_entry msr_entry;
-
-               msr_entry.index = msr->index;
-               if (svm_get_msr_feature(&msr_entry))
-                       return 1;
-
-               /* Check the supported bits */
-               if (data & ~msr_entry.data)
-                       return 1;
-
-               /* Don't allow the guest to change a bit, #GP */
-               if (!msr->host_initiated && (data ^ msr_entry.data))
-                       return 1;
-
-               svm->msr_decfg = data;
-               break;
-       }
-       case MSR_IA32_APICBASE:
-               if (kvm_vcpu_apicv_active(vcpu))
-                       avic_update_vapic_bar(to_svm(vcpu), data);
-               /* Fall through */
-       default:
-               return kvm_set_msr_common(vcpu, msr);
-       }
-       return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
-       return kvm_emulate_wrmsr(&svm->vcpu);
-}
-
-static int msr_interception(struct vcpu_svm *svm)
-{
-       if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm);
-       else
-               return rdmsr_interception(svm);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm)
-{
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       svm_clear_vintr(svm);
-
-       /*
-        * For AVIC, the only reason to end up here is ExtINTs.
-        * In this case AVIC was temporarily disabled for
-        * requesting the IRQ window and we have to re-enable it.
-        */
-       svm_toggle_avic_for_irq_window(&svm->vcpu, true);
-
-       svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-       mark_dirty(svm->vmcb, VMCB_INTR);
-       ++svm->vcpu.stat.irq_window_exits;
-       return 1;
-}
-
-static int pause_interception(struct vcpu_svm *svm)
-{
-       struct kvm_vcpu *vcpu = &svm->vcpu;
-       bool in_kernel = (svm_get_cpl(vcpu) == 0);
-
-       if (pause_filter_thresh)
-               grow_ple_window(vcpu);
-
-       kvm_vcpu_on_spin(vcpu, in_kernel);
-       return 1;
-}
-
-static int nop_interception(struct vcpu_svm *svm)
-{
-       return kvm_skip_emulated_instruction(&(svm->vcpu));
-}
-
-static int monitor_interception(struct vcpu_svm *svm)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-static int mwait_interception(struct vcpu_svm *svm)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return nop_interception(svm);
-}
-
-enum avic_ipi_failure_cause {
-       AVIC_IPI_FAILURE_INVALID_INT_TYPE,
-       AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
-       AVIC_IPI_FAILURE_INVALID_TARGET,
-       AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
-};
-
-static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
-{
-       u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
-       u32 icrl = svm->vmcb->control.exit_info_1;
-       u32 id = svm->vmcb->control.exit_info_2 >> 32;
-       u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
-       struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
-       trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
-
-       switch (id) {
-       case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
-               /*
-                * AVIC hardware handles the generation of
-                * IPIs when the specified Message Type is Fixed
-                * (also known as fixed delivery mode) and
-                * the Trigger Mode is edge-triggered. The hardware
-                * also supports self and broadcast delivery modes
-                * specified via the Destination Shorthand (DSH)
-                * field of the ICRL. Logical and physical APIC ID
-                * formats are supported. All other IPI types cause
-                * a #VMEXIT, which needs to be emulated.
-                */
-               kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
-               kvm_lapic_reg_write(apic, APIC_ICR, icrl);
-               break;
-       case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
-               int i;
-               struct kvm_vcpu *vcpu;
-               struct kvm *kvm = svm->vcpu.kvm;
-               struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
-               /*
-                * At this point, we expect that the AVIC HW has already
-                * set the appropriate IRR bits on the valid target
-                * vcpus. So, we just need to kick the appropriate vcpu.
-                */
-               kvm_for_each_vcpu(i, vcpu, kvm) {
-                       bool m = kvm_apic_match_dest(vcpu, apic,
-                                                    icrl & APIC_SHORT_MASK,
-                                                    GET_APIC_DEST_FIELD(icrh),
-                                                    icrl & APIC_DEST_MASK);
-
-                       if (m && !avic_vcpu_is_running(vcpu))
-                               kvm_vcpu_wake_up(vcpu);
-               }
-               break;
-       }
-       case AVIC_IPI_FAILURE_INVALID_TARGET:
-               WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
-                         index, svm->vcpu.vcpu_id, icrh, icrl);
-               break;
-       case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
-               WARN_ONCE(1, "Invalid backing page\n");
-               break;
-       default:
-               pr_err("Unknown IPI interception\n");
-       }
-
-       return 1;
-}
-
-static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
-{
-       struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-       int index;
-       u32 *logical_apic_id_table;
-       int dlid = GET_APIC_LOGICAL_ID(ldr);
-
-       if (!dlid)
-               return NULL;
-
-       if (flat) { /* flat */
-               index = ffs(dlid) - 1;
-               if (index > 7)
-                       return NULL;
-       } else { /* cluster */
-               int cluster = (dlid & 0xf0) >> 4;
-               int apic = ffs(dlid & 0x0f) - 1;
-
-               if ((apic < 0) || (apic > 7) ||
-                   (cluster >= 0xf))
-                       return NULL;
-               index = (cluster << 2) + apic;
-       }
-
-       logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
-       return &logical_apic_id_table[index];
-}
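
The flat/cluster index arithmetic in avic_get_logical_id_entry() can be checked in isolation. A small user-space sketch of the same computation, with made-up test values (not kernel code):

#include <stdio.h>
#include <strings.h>	/* ffs() */

/*
 * Mirrors the index math above: flat mode uses the lowest set bit of the
 * logical ID directly; cluster mode packs four APICs per cluster (upper
 * nibble selects the cluster, lower nibble the APIC within it).
 */
static int logical_id_index(unsigned int dlid, int flat)
{
	if (!dlid)
		return -1;

	if (flat) {
		int index = ffs(dlid) - 1;
		return index > 7 ? -1 : index;
	} else {
		int cluster = (dlid & 0xf0) >> 4;
		int apic = ffs(dlid & 0x0f) - 1;

		if (apic < 0 || cluster >= 0xf)
			return -1;
		return (cluster << 2) + apic;
	}
}

int main(void)
{
	printf("flat    0x04 -> %d\n", logical_id_index(0x04, 1));	/* 2 */
	printf("cluster 0x32 -> %d\n", logical_id_index(0x32, 0));	/* 13 */
	return 0;
}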
-
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
-{
-       bool flat;
-       u32 *entry, new_entry;
-
-       flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
-       entry = avic_get_logical_id_entry(vcpu, ldr, flat);
-       if (!entry)
-               return -EINVAL;
-
-       new_entry = READ_ONCE(*entry);
-       new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
-       new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
-       new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
-       WRITE_ONCE(*entry, new_entry);
-
-       return 0;
-}
-
-static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       bool flat = svm->dfr_reg == APIC_DFR_FLAT;
-       u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
-
-       if (entry)
-               clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
-}
-
-static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
-{
-       int ret = 0;
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
-       u32 id = kvm_xapic_id(vcpu->arch.apic);
-
-       if (ldr == svm->ldr_reg)
-               return 0;
-
-       avic_invalidate_logical_id_entry(vcpu);
-
-       if (ldr)
-               ret = avic_ldr_write(vcpu, id, ldr);
-
-       if (!ret)
-               svm->ldr_reg = ldr;
-
-       return ret;
-}
-
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
-       u64 *old, *new;
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 id = kvm_xapic_id(vcpu->arch.apic);
-
-       if (vcpu->vcpu_id == id)
-               return 0;
-
-       old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
-       new = avic_get_physical_id_entry(vcpu, id);
-       if (!new || !old)
-               return 1;
-
-       /* We need to move physical_id_entry to new offset */
-       *new = *old;
-       *old = 0ULL;
-       to_svm(vcpu)->avic_physical_id_cache = new;
-
-       /*
-        * Also update the guest physical APIC ID in the logical
-        * APIC ID table entry if the LDR has already been set up.
-        */
-       if (svm->ldr_reg)
-               avic_handle_ldr_update(vcpu);
-
-       return 0;
-}
-
-static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
-
-       if (svm->dfr_reg == dfr)
-               return;
-
-       avic_invalidate_logical_id_entry(vcpu);
-       svm->dfr_reg = dfr;
-}
-
-static int avic_unaccel_trap_write(struct vcpu_svm *svm)
-{
-       struct kvm_lapic *apic = svm->vcpu.arch.apic;
-       u32 offset = svm->vmcb->control.exit_info_1 &
-                               AVIC_UNACCEL_ACCESS_OFFSET_MASK;
-
-       switch (offset) {
-       case APIC_ID:
-               if (avic_handle_apic_id_update(&svm->vcpu))
-                       return 0;
-               break;
-       case APIC_LDR:
-               if (avic_handle_ldr_update(&svm->vcpu))
-                       return 0;
-               break;
-       case APIC_DFR:
-               avic_handle_dfr_update(&svm->vcpu);
-               break;
-       default:
-               break;
-       }
-
-       kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
-
-       return 1;
-}
-
-static bool is_avic_unaccelerated_access_trap(u32 offset)
-{
-       bool ret = false;
-
-       switch (offset) {
-       case APIC_ID:
-       case APIC_EOI:
-       case APIC_RRR:
-       case APIC_LDR:
-       case APIC_DFR:
-       case APIC_SPIV:
-       case APIC_ESR:
-       case APIC_ICR:
-       case APIC_LVTT:
-       case APIC_LVTTHMR:
-       case APIC_LVTPC:
-       case APIC_LVT0:
-       case APIC_LVT1:
-       case APIC_LVTERR:
-       case APIC_TMICT:
-       case APIC_TDCR:
-               ret = true;
-               break;
-       default:
-               break;
-       }
-       return ret;
-}
-
-static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
-{
-       int ret = 0;
-       u32 offset = svm->vmcb->control.exit_info_1 &
-                    AVIC_UNACCEL_ACCESS_OFFSET_MASK;
-       u32 vector = svm->vmcb->control.exit_info_2 &
-                    AVIC_UNACCEL_ACCESS_VECTOR_MASK;
-       bool write = (svm->vmcb->control.exit_info_1 >> 32) &
-                    AVIC_UNACCEL_ACCESS_WRITE_MASK;
-       bool trap = is_avic_unaccelerated_access_trap(offset);
-
-       trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
-                                           trap, write, vector);
-       if (trap) {
-               /* Handling Trap */
-               WARN_ONCE(!write, "svm: Handling trap read.\n");
-               ret = avic_unaccel_trap_write(svm);
-       } else {
-               /* Handling Fault */
-               ret = kvm_emulate_instruction(&svm->vcpu, 0);
-       }
-
-       return ret;
-}
-
-static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
-       [SVM_EXIT_READ_CR0]                     = cr_interception,
-       [SVM_EXIT_READ_CR3]                     = cr_interception,
-       [SVM_EXIT_READ_CR4]                     = cr_interception,
-       [SVM_EXIT_READ_CR8]                     = cr_interception,
-       [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
-       [SVM_EXIT_WRITE_CR0]                    = cr_interception,
-       [SVM_EXIT_WRITE_CR3]                    = cr_interception,
-       [SVM_EXIT_WRITE_CR4]                    = cr_interception,
-       [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
-       [SVM_EXIT_READ_DR0]                     = dr_interception,
-       [SVM_EXIT_READ_DR1]                     = dr_interception,
-       [SVM_EXIT_READ_DR2]                     = dr_interception,
-       [SVM_EXIT_READ_DR3]                     = dr_interception,
-       [SVM_EXIT_READ_DR4]                     = dr_interception,
-       [SVM_EXIT_READ_DR5]                     = dr_interception,
-       [SVM_EXIT_READ_DR6]                     = dr_interception,
-       [SVM_EXIT_READ_DR7]                     = dr_interception,
-       [SVM_EXIT_WRITE_DR0]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR1]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR2]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR3]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR4]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR5]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR6]                    = dr_interception,
-       [SVM_EXIT_WRITE_DR7]                    = dr_interception,
-       [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
-       [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
-       [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
-       [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
-       [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
-       [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
-       [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
-       [SVM_EXIT_INTR]                         = intr_interception,
-       [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = nop_on_interception,
-       [SVM_EXIT_INIT]                         = nop_on_interception,
-       [SVM_EXIT_VINTR]                        = interrupt_window_interception,
-       [SVM_EXIT_RDPMC]                        = rdpmc_interception,
-       [SVM_EXIT_CPUID]                        = cpuid_interception,
-       [SVM_EXIT_IRET]                         = iret_interception,
-       [SVM_EXIT_INVD]                         = emulate_on_interception,
-       [SVM_EXIT_PAUSE]                        = pause_interception,
-       [SVM_EXIT_HLT]                          = halt_interception,
-       [SVM_EXIT_INVLPG]                       = invlpg_interception,
-       [SVM_EXIT_INVLPGA]                      = invlpga_interception,
-       [SVM_EXIT_IOIO]                         = io_interception,
-       [SVM_EXIT_MSR]                          = msr_interception,
-       [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
-       [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
-       [SVM_EXIT_VMRUN]                        = vmrun_interception,
-       [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
-       [SVM_EXIT_VMLOAD]                       = vmload_interception,
-       [SVM_EXIT_VMSAVE]                       = vmsave_interception,
-       [SVM_EXIT_STGI]                         = stgi_interception,
-       [SVM_EXIT_CLGI]                         = clgi_interception,
-       [SVM_EXIT_SKINIT]                       = skinit_interception,
-       [SVM_EXIT_WBINVD]                       = wbinvd_interception,
-       [SVM_EXIT_MONITOR]                      = monitor_interception,
-       [SVM_EXIT_MWAIT]                        = mwait_interception,
-       [SVM_EXIT_XSETBV]                       = xsetbv_interception,
-       [SVM_EXIT_RDPRU]                        = rdpru_interception,
-       [SVM_EXIT_NPF]                          = npf_interception,
-       [SVM_EXIT_RSM]                          = rsm_interception,
-       [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
-       [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
-};
-
-static void dump_vmcb(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb_control_area *control = &svm->vmcb->control;
-       struct vmcb_save_area *save = &svm->vmcb->save;
-
-       if (!dump_invalid_vmcb) {
-               pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
-               return;
-       }
-
-       pr_err("VMCB Control Area:\n");
-       pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
-       pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
-       pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
-       pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
-       pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
-       pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
-       pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
-       pr_err("%-20s%d\n", "pause filter threshold:",
-              control->pause_filter_thresh);
-       pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
-       pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
-       pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
-       pr_err("%-20s%d\n", "asid:", control->asid);
-       pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
-       pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
-       pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
-       pr_err("%-20s%08x\n", "int_state:", control->int_state);
-       pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
-       pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
-       pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
-       pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
-       pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
-       pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
-       pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
-       pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
-       pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
-       pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
-       pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
-       pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
-       pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
-       pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
-       pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
-       pr_err("VMCB State Save Area:\n");
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "es:",
-              save->es.selector, save->es.attrib,
-              save->es.limit, save->es.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "cs:",
-              save->cs.selector, save->cs.attrib,
-              save->cs.limit, save->cs.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "ss:",
-              save->ss.selector, save->ss.attrib,
-              save->ss.limit, save->ss.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "ds:",
-              save->ds.selector, save->ds.attrib,
-              save->ds.limit, save->ds.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "fs:",
-              save->fs.selector, save->fs.attrib,
-              save->fs.limit, save->fs.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "gs:",
-              save->gs.selector, save->gs.attrib,
-              save->gs.limit, save->gs.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "gdtr:",
-              save->gdtr.selector, save->gdtr.attrib,
-              save->gdtr.limit, save->gdtr.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "ldtr:",
-              save->ldtr.selector, save->ldtr.attrib,
-              save->ldtr.limit, save->ldtr.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "idtr:",
-              save->idtr.selector, save->idtr.attrib,
-              save->idtr.limit, save->idtr.base);
-       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
-              "tr:",
-              save->tr.selector, save->tr.attrib,
-              save->tr.limit, save->tr.base);
-       pr_err("cpl:            %d                efer:         %016llx\n",
-               save->cpl, save->efer);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "cr0:", save->cr0, "cr2:", save->cr2);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "cr3:", save->cr3, "cr4:", save->cr4);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "dr6:", save->dr6, "dr7:", save->dr7);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "rip:", save->rip, "rflags:", save->rflags);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "rsp:", save->rsp, "rax:", save->rax);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "star:", save->star, "lstar:", save->lstar);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "cstar:", save->cstar, "sfmask:", save->sfmask);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "kernel_gs_base:", save->kernel_gs_base,
-              "sysenter_cs:", save->sysenter_cs);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "sysenter_esp:", save->sysenter_esp,
-              "sysenter_eip:", save->sysenter_eip);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "br_from:", save->br_from, "br_to:", save->br_to);
-       pr_err("%-15s %016llx %-13s %016llx\n",
-              "excp_from:", save->last_excp_from,
-              "excp_to:", save->last_excp_to);
-}
-
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
-{
-       struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
-
-       *info1 = control->exit_info_1;
-       *info2 = control->exit_info_2;
-}
-
-static int handle_exit(struct kvm_vcpu *vcpu,
-       enum exit_fastpath_completion exit_fastpath)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct kvm_run *kvm_run = vcpu->run;
-       u32 exit_code = svm->vmcb->control.exit_code;
-
-       trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
-
-       if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
-               vcpu->arch.cr0 = svm->vmcb->save.cr0;
-       if (npt_enabled)
-               vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
-       if (unlikely(svm->nested.exit_required)) {
-               nested_svm_vmexit(svm);
-               svm->nested.exit_required = false;
-
-               return 1;
-       }
-
-       if (is_guest_mode(vcpu)) {
-               int vmexit;
-
-               trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
-                                       svm->vmcb->control.exit_info_1,
-                                       svm->vmcb->control.exit_info_2,
-                                       svm->vmcb->control.exit_int_info,
-                                       svm->vmcb->control.exit_int_info_err,
-                                       KVM_ISA_SVM);
-
-               vmexit = nested_svm_exit_special(svm);
-
-               if (vmexit == NESTED_EXIT_CONTINUE)
-                       vmexit = nested_svm_exit_handled(svm);
-
-               if (vmexit == NESTED_EXIT_DONE)
-                       return 1;
-       }
-
-       svm_complete_interrupts(svm);
-
-       if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
-               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-               kvm_run->fail_entry.hardware_entry_failure_reason
-                       = svm->vmcb->control.exit_code;
-               dump_vmcb(vcpu);
-               return 0;
-       }
-
-       if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
-           exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
-           exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
-           exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
-               printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
-                      "exit_code 0x%x\n",
-                      __func__, svm->vmcb->control.exit_int_info,
-                      exit_code);
-
-       if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
-               kvm_skip_emulated_instruction(vcpu);
-               return 1;
-       } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
-           || !svm_exit_handlers[exit_code]) {
-               vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
-               dump_vmcb(vcpu);
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror =
-                       KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
-               vcpu->run->internal.ndata = 1;
-               vcpu->run->internal.data[0] = exit_code;
-               return 0;
-       }
-
-#ifdef CONFIG_RETPOLINE
-       if (exit_code == SVM_EXIT_MSR)
-               return msr_interception(svm);
-       else if (exit_code == SVM_EXIT_VINTR)
-               return interrupt_window_interception(svm);
-       else if (exit_code == SVM_EXIT_INTR)
-               return intr_interception(svm);
-       else if (exit_code == SVM_EXIT_HLT)
-               return halt_interception(svm);
-       else if (exit_code == SVM_EXIT_NPF)
-               return npf_interception(svm);
-#endif
-       return svm_exit_handlers[exit_code](svm);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
-       int cpu = raw_smp_processor_id();
-
-       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-       sd->tss_desc->type = 9; /* available 32/64-bit TSS */
-       load_TR_desc();
-}
-
-static void pre_sev_run(struct vcpu_svm *svm, int cpu)
-{
-       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-       int asid = sev_get_asid(svm->vcpu.kvm);
-
-       /* Assign the asid allocated with this SEV guest */
-       svm->vmcb->control.asid = asid;
-
-       /*
-        * Flush guest TLB:
-        *
-        * 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
-        * 2) this VMCB was executed on a different host CPU in previous VMRUNs.
-        */
-       if (sd->sev_vmcbs[asid] == svm->vmcb &&
-           svm->last_cpu == cpu)
-               return;
-
-       svm->last_cpu = cpu;
-       sd->sev_vmcbs[asid] = svm->vmcb;
-       svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
-       mark_dirty(svm->vmcb, VMCB_ASID);
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
-       int cpu = raw_smp_processor_id();
-
-       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-
-       if (sev_guest(svm->vcpu.kvm))
-               return pre_sev_run(svm, cpu);
-
-       /* FIXME: handle wraparound of asid_generation */
-       if (svm->asid_generation != sd->asid_generation)
-               new_asid(svm, sd);
-}
-
-static void svm_inject_nmi(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
-       vcpu->arch.hflags |= HF_NMI_MASK;
-       set_intercept(svm, INTERCEPT_IRET);
-       ++vcpu->stat.nmi_injections;
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       BUG_ON(!(gif_set(svm)));
-
-       trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
-       ++vcpu->stat.irq_injections;
-
-       svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
-               SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
-static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
-{
-       return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (svm_nested_virtualize_tpr(vcpu))
-               return;
-
-       clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-
-       if (irr == -1)
-               return;
-
-       if (tpr >= irr)
-               set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
-}
-
-static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
-       return;
-}
-
-static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-}
-
-static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-}
-
-static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
-{
-       if (!avic || !lapic_in_kernel(vcpu))
-               return;
-
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-       kvm_request_apicv_update(vcpu->kvm, activate,
-                                APICV_INHIBIT_REASON_IRQWIN);
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-}
-
-static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
-{
-       int ret = 0;
-       unsigned long flags;
-       struct amd_svm_iommu_ir *ir;
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm))
-               return 0;
-
-       /*
-        * Here, we go through the per-vcpu ir_list to update all existing
-        * interrupt remapping table entries targeting this vcpu.
-        */
-       spin_lock_irqsave(&svm->ir_list_lock, flags);
-
-       if (list_empty(&svm->ir_list))
-               goto out;
-
-       list_for_each_entry(ir, &svm->ir_list, node) {
-               if (activate)
-                       ret = amd_iommu_activate_guest_mode(ir->data);
-               else
-                       ret = amd_iommu_deactivate_guest_mode(ir->data);
-               if (ret)
-                       break;
-       }
-out:
-       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-       return ret;
-}
-
-static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *vmcb = svm->vmcb;
-       bool activated = kvm_vcpu_apicv_active(vcpu);
-
-       if (!avic)
-               return;
-
-       if (activated) {
-               /**
-                * During AVIC temporary deactivation, the guest could update
-                * APIC ID, DFR and LDR registers, which would not be trapped
-                * by avic_unaccelerated_access_interception(). In this case,
-                * we need to check and update the AVIC logical APIC ID table
-                * accordingly before re-activating.
-                */
-               avic_post_state_restore(vcpu);
-               vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
-       } else {
-               vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
-       }
-       mark_dirty(vmcb, VMCB_AVIC);
-
-       svm_set_pi_irte_mode(vcpu, activated);
-}
-
-static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
-       return;
-}
-
-static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
-{
-       if (!vcpu->arch.apicv_active)
-               return -1;
-
-       kvm_lapic_set_irr(vec, vcpu->arch.apic);
-       smp_mb__after_atomic();
-
-       if (avic_vcpu_is_running(vcpu)) {
-               int cpuid = vcpu->cpu;
-
-               if (cpuid != get_cpu())
-                       wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
-               put_cpu();
-       } else
-               kvm_vcpu_wake_up(vcpu);
-
-       return 0;
-}
-
-static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
-       return false;
-}
-
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
-       unsigned long flags;
-       struct amd_svm_iommu_ir *cur;
-
-       spin_lock_irqsave(&svm->ir_list_lock, flags);
-       list_for_each_entry(cur, &svm->ir_list, node) {
-               if (cur->data != pi->ir_data)
-                       continue;
-               list_del(&cur->node);
-               kfree(cur);
-               break;
-       }
-       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
-       int ret = 0;
-       unsigned long flags;
-       struct amd_svm_iommu_ir *ir;
-
-       /**
-        * In some cases, the existing irte is updated and re-set,
-        * so we need to check here if it's already been added
-        * to the ir_list.
-        */
-       if (pi->ir_data && (pi->prev_ga_tag != 0)) {
-               struct kvm *kvm = svm->vcpu.kvm;
-               u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
-               struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
-               struct vcpu_svm *prev_svm;
-
-               if (!prev_vcpu) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-
-               prev_svm = to_svm(prev_vcpu);
-               svm_ir_list_del(prev_svm, pi);
-       }
-
-       /**
-        * Allocate a new amd_svm_iommu_ir entry, which will get
-        * added to the per-vcpu ir_list.
-        */
-       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
-       if (!ir) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       ir->data = pi->ir_data;
-
-       spin_lock_irqsave(&svm->ir_list_lock, flags);
-       list_add(&ir->node, &svm->ir_list);
-       spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
-       return ret;
-}
-
-/**
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kinds of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with a single CPU as the destination, e.g. the user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
-                struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
-       struct kvm_lapic_irq irq;
-       struct kvm_vcpu *vcpu = NULL;
-
-       kvm_set_msi_irq(kvm, e, &irq);
-
-       if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-           !kvm_irq_is_postable(&irq)) {
-               pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
-                        __func__, irq.vector);
-               return -1;
-       }
-
-       pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
-                irq.vector);
-       *svm = to_svm(vcpu);
-       vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
-       vcpu_info->vector = irq.vector;
-
-       return 0;
-}
-
-/*
- * svm_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
-                             uint32_t guest_irq, bool set)
-{
-       struct kvm_kernel_irq_routing_entry *e;
-       struct kvm_irq_routing_table *irq_rt;
-       int idx, ret = -EINVAL;
-
-       if (!kvm_arch_has_assigned_device(kvm) ||
-           !irq_remapping_cap(IRQ_POSTING_CAP))
-               return 0;
-
-       pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
-                __func__, host_irq, guest_irq, set);
-
-       idx = srcu_read_lock(&kvm->irq_srcu);
-       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-       WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
-
-       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
-               struct vcpu_data vcpu_info;
-               struct vcpu_svm *svm = NULL;
-
-               if (e->type != KVM_IRQ_ROUTING_MSI)
-                       continue;
-
-               /**
-                * Here, we set up legacy mode in the following cases:
-                * 1. When the interrupt cannot be targeted to a specific vcpu.
-                * 2. Unsetting the posted interrupt.
-                * 3. APIC virtualization is disabled for the vcpu.
-                * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc.).
-                */
-               if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
-                   kvm_vcpu_apicv_active(&svm->vcpu)) {
-                       struct amd_iommu_pi_data pi;
-
-                       /* Try to enable guest_mode in IRTE */
-                       pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
-                                           AVIC_HPA_MASK);
-                       pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
-                                                    svm->vcpu.vcpu_id);
-                       pi.is_guest_mode = true;
-                       pi.vcpu_data = &vcpu_info;
-                       ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-                       /**
-                        * Here, we have successfully set up vcpu affinity in
-                        * IOMMU guest mode. Now, we need to store the posted
-                        * interrupt information in a per-vcpu ir_list so that
-                        * we can reference it directly when we update vcpu
-                        * scheduling information in the IOMMU irte.
-                        */
-                       if (!ret && pi.is_guest_mode)
-                               svm_ir_list_add(svm, &pi);
-               } else {
-                       /* Use legacy mode in IRTE */
-                       struct amd_iommu_pi_data pi;
-
-                       /**
-                        * Here, pi is used to:
-                        * - Tell IOMMU to use legacy mode for this interrupt.
-                        * - Retrieve ga_tag of prior interrupt remapping data.
-                        */
-                       pi.is_guest_mode = false;
-                       ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-                       /**
-                        * Check if the posted interrupt was previously
-                        * set up with guest_mode by checking if the ga_tag
-                        * was cached. If so, we need to clean up the per-vcpu
-                        * ir_list.
-                        */
-                       if (!ret && pi.prev_ga_tag) {
-                               int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
-                               struct kvm_vcpu *vcpu;
-
-                               vcpu = kvm_get_vcpu_by_id(kvm, id);
-                               if (vcpu)
-                                       svm_ir_list_del(to_svm(vcpu), &pi);
-                       }
-               }
-
-               if (!ret && svm) {
-                       trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
-                                                e->gsi, vcpu_info.vector,
-                                                vcpu_info.pi_desc_addr, set);
-               }
-
-               if (ret < 0) {
-                       pr_err("%s: failed to update PI IRTE\n", __func__);
-                       goto out;
-               }
-       }
-
-       ret = 0;
-out:
-       srcu_read_unlock(&kvm->irq_srcu, idx);
-       return ret;
-}
-
-static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *vmcb = svm->vmcb;
-       int ret;
-       ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-             !(svm->vcpu.arch.hflags & HF_NMI_MASK);
-       ret = ret && gif_set(svm) && nested_svm_nmi(svm);
-
-       return ret;
-}
-
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (masked) {
-               svm->vcpu.arch.hflags |= HF_NMI_MASK;
-               set_intercept(svm, INTERCEPT_IRET);
-       } else {
-               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-               clr_intercept(svm, INTERCEPT_IRET);
-       }
-}
-
-static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *vmcb = svm->vmcb;
-
-       if (!gif_set(svm) ||
-            (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
-               return 0;
-
-       if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
-               return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
-       else
-               return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       /*
-        * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
-        * 1, because that's a separate STGI/VMRUN intercept.  The next time we
-        * get that intercept, this function will be called again though and
-        * we'll get the vintr intercept. However, if the vGIF feature is
-        * enabled, the STGI interception will not occur. Enable the irq
-        * window under the assumption that the hardware will set the GIF.
-        */
-       if (vgif_enabled(svm) || gif_set(svm)) {
-               /*
-                * IRQ window is not needed when AVIC is enabled,
-                * unless we have pending ExtINT since it cannot be injected
-                * via AVIC. In that case, we need to temporarily disable AVIC
-                * and fall back to injecting the IRQ via V_IRQ.
-                */
-               svm_toggle_avic_for_irq_window(vcpu, false);
-               svm_set_vintr(svm);
-       }
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
-           == HF_NMI_MASK)
-               return; /* IRET will cause a vm exit */
-
-       if (!gif_set(svm)) {
-               if (vgif_enabled(svm))
-                       set_intercept(svm, INTERCEPT_STGI);
-               return; /* STGI will cause a vm exit */
-       }
-
-       if (svm->nested.exit_required)
-               return; /* we're not going to run the guest yet */
-
-       /*
-        * Something prevents the NMI from being injected. Single-step over the
-        * possible problem (IRET, exception injection, or interrupt shadow).
-        */
-       svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
-       svm->nmi_singlestep = true;
-       svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
-}
-
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
-       return 0;
-}
-
-static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
-       return 0;
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
-               svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
-       else
-               svm->asid_generation--;
-}
-
-static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       invlpga(gva, svm->vmcb->control.asid);
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       if (svm_nested_virtualize_tpr(vcpu))
-               return;
-
-       if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
-               int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
-               kvm_set_cr8(vcpu, cr8);
-       }
-}
-
-static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       u64 cr8;
-
-       if (svm_nested_virtualize_tpr(vcpu) ||
-           kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       cr8 = kvm_get_cr8(vcpu);
-       svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
-       svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-}
-
-static void svm_complete_interrupts(struct vcpu_svm *svm)
-{
-       u8 vector;
-       int type;
-       u32 exitintinfo = svm->vmcb->control.exit_int_info;
-       unsigned int3_injected = svm->int3_injected;
-
-       svm->int3_injected = 0;
-
-       /*
-        * If we've made progress since setting HF_IRET_MASK, we've
-        * executed an IRET and can allow NMI injection.
-        */
-       if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
-           && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
-               svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-       }
-
-       svm->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&svm->vcpu);
-       kvm_clear_interrupt_queue(&svm->vcpu);
-
-       if (!(exitintinfo & SVM_EXITINTINFO_VALID))
-               return;
-
-       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
-
-       vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
-       type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
-
-       switch (type) {
-       case SVM_EXITINTINFO_TYPE_NMI:
-               svm->vcpu.arch.nmi_injected = true;
-               break;
-       case SVM_EXITINTINFO_TYPE_EXEPT:
-               /*
-                * In case of software exceptions, do not reinject the vector,
-                * but re-execute the instruction instead. Rewind RIP first
-                * if we emulated INT3 before.
-                */
-               if (kvm_exception_is_soft(vector)) {
-                       if (vector == BP_VECTOR && int3_injected &&
-                           kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
-                               kvm_rip_write(&svm->vcpu,
-                                             kvm_rip_read(&svm->vcpu) -
-                                             int3_injected);
-                       break;
-               }
-               if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
-                       u32 err = svm->vmcb->control.exit_int_info_err;
-                       kvm_requeue_exception_e(&svm->vcpu, vector, err);
-
-               } else
-                       kvm_requeue_exception(&svm->vcpu, vector);
-               break;
-       case SVM_EXITINTINFO_TYPE_INTR:
-               kvm_queue_interrupt(&svm->vcpu, vector, false);
-               break;
-       default:
-               break;
-       }
-}
-
-static void svm_cancel_injection(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb_control_area *control = &svm->vmcb->control;
-
-       control->exit_int_info = control->event_inj;
-       control->exit_int_info_err = control->event_inj_err;
-       control->event_inj = 0;
-       svm_complete_interrupts(svm);
-}
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-       svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
-       /*
-        * A vmexit emulation is required before the vcpu can be executed
-        * again.
-        */
-       if (unlikely(svm->nested.exit_required))
-               return;
-
-       /*
-        * Disable singlestep if we're injecting an interrupt/exception.
-        * We don't want our modified rflags to be pushed on the stack where
-        * we might not be able to easily reset them if we disabled NMI
-        * singlestep later.
-        */
-       if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
-               /*
-                * Event injection happens before external interrupts cause a
-                * vmexit and interrupts are disabled here, so smp_send_reschedule
-                * is enough to force an immediate vmexit.
-                */
-               disable_nmi_singlestep(svm);
-               smp_send_reschedule(vcpu->cpu);
-       }
-
-       pre_svm_run(svm);
-
-       sync_lapic_to_cr8(vcpu);
-
-       svm->vmcb->save.cr2 = vcpu->arch.cr2;
-
-       clgi();
-       kvm_load_guest_xsave_state(vcpu);
-
-       if (lapic_in_kernel(vcpu) &&
-               vcpu->arch.apic->lapic_timer.timer_advance_ns)
-               kvm_wait_lapic_expire(vcpu);
-
-       /*
-        * If this vCPU has touched SPEC_CTRL, restore the guest's value if
-        * it's non-zero. Since vmentry is serialising on affected CPUs, there
-        * is no need to worry about the conditional branch over the wrmsr
-        * being speculatively taken.
-        */
-       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
-
-       local_irq_enable();
-
-       asm volatile (
-               "push %%" _ASM_BP "; \n\t"
-               "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
-               "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
-               "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
-               "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
-               "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
-               "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
-#ifdef CONFIG_X86_64
-               "mov %c[r8](%[svm]),  %%r8  \n\t"
-               "mov %c[r9](%[svm]),  %%r9  \n\t"
-               "mov %c[r10](%[svm]), %%r10 \n\t"
-               "mov %c[r11](%[svm]), %%r11 \n\t"
-               "mov %c[r12](%[svm]), %%r12 \n\t"
-               "mov %c[r13](%[svm]), %%r13 \n\t"
-               "mov %c[r14](%[svm]), %%r14 \n\t"
-               "mov %c[r15](%[svm]), %%r15 \n\t"
-#endif
-
-               /* Enter guest mode */
-               "push %%" _ASM_AX " \n\t"
-               "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
-               __ex("vmload %%" _ASM_AX) "\n\t"
-               __ex("vmrun %%" _ASM_AX) "\n\t"
-               __ex("vmsave %%" _ASM_AX) "\n\t"
-               "pop %%" _ASM_AX " \n\t"
-
-               /* Save guest registers, load host registers */
-               "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
-               "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
-               "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
-               "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
-               "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
-               "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
-#ifdef CONFIG_X86_64
-               "mov %%r8,  %c[r8](%[svm]) \n\t"
-               "mov %%r9,  %c[r9](%[svm]) \n\t"
-               "mov %%r10, %c[r10](%[svm]) \n\t"
-               "mov %%r11, %c[r11](%[svm]) \n\t"
-               "mov %%r12, %c[r12](%[svm]) \n\t"
-               "mov %%r13, %c[r13](%[svm]) \n\t"
-               "mov %%r14, %c[r14](%[svm]) \n\t"
-               "mov %%r15, %c[r15](%[svm]) \n\t"
-               /*
-               * Clear host registers marked as clobbered to prevent
-               * speculative use.
-               */
-               "xor %%r8d, %%r8d \n\t"
-               "xor %%r9d, %%r9d \n\t"
-               "xor %%r10d, %%r10d \n\t"
-               "xor %%r11d, %%r11d \n\t"
-               "xor %%r12d, %%r12d \n\t"
-               "xor %%r13d, %%r13d \n\t"
-               "xor %%r14d, %%r14d \n\t"
-               "xor %%r15d, %%r15d \n\t"
-#endif
-               "xor %%ebx, %%ebx \n\t"
-               "xor %%ecx, %%ecx \n\t"
-               "xor %%edx, %%edx \n\t"
-               "xor %%esi, %%esi \n\t"
-               "xor %%edi, %%edi \n\t"
-               "pop %%" _ASM_BP
-               :
-               : [svm]"a"(svm),
-                 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
-                 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
-                 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
-                 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
-                 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
-                 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
-                 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
-                 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
-                 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
-                 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
-                 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
-                 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
-                 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
-                 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
-                 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
-#endif
-               : "cc", "memory"
-#ifdef CONFIG_X86_64
-               , "rbx", "rcx", "rdx", "rsi", "rdi"
-               , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
-               , "ebx", "ecx", "edx", "esi", "edi"
-#endif
-               );
-
-       /* Eliminate branch target predictions from guest mode */
-       vmexit_fill_RSB();
-
-#ifdef CONFIG_X86_64
-       wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
-       loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
-       loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
-       /*
-        * We do not use IBRS in the kernel. If this vCPU has used the
-        * SPEC_CTRL MSR it may have left it on; save the value and
-        * turn it off. This is much more efficient than blindly adding
-        * it to the atomic save/restore list. Especially as the former
-        * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
-        *
-        * For non-nested case:
-        * If the L01 MSR bitmap does not intercept the MSR, then we need to
-        * save it.
-        *
-        * For nested case:
-        * If the L02 MSR bitmap does not intercept the MSR, then we need to
-        * save it.
-        */
-       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
-               svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
-       reload_tss(vcpu);
-
-       local_irq_disable();
-
-       x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
-
-       vcpu->arch.cr2 = svm->vmcb->save.cr2;
-       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-       vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
-       if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_before_interrupt(&svm->vcpu);
-
-       kvm_load_host_xsave_state(vcpu);
-       stgi();
-
-       /* Any pending NMI will happen here */
-
-       if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
-               kvm_after_interrupt(&svm->vcpu);
-
-       sync_cr8_to_lapic(vcpu);
-
-       svm->next_rip = 0;
-
-       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
-
-       /* if exit due to PF check for async PF */
-       if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
-               svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
-       if (npt_enabled) {
-               vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
-               vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
-       }
-
-       /*
-        * We need to handle MC intercepts here before the vcpu has a chance to
-        * change the physical cpu
-        */
-       if (unlikely(svm->vmcb->control.exit_code ==
-                    SVM_EXIT_EXCP_BASE + MC_VECTOR))
-               svm_handle_mce(svm);
-
-       mark_all_clean(svm->vmcb);
-}
-STACK_FRAME_NON_STANDARD(svm_vcpu_run);
-
-static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       bool update_guest_cr3 = true;
-       unsigned long cr3;
-
-       cr3 = __sme_set(root);
-       if (npt_enabled) {
-               svm->vmcb->control.nested_cr3 = cr3;
-               mark_dirty(svm->vmcb, VMCB_NPT);
-
-               /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
-               if (is_guest_mode(vcpu))
-                       update_guest_cr3 = false;
-               else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
-                       cr3 = vcpu->arch.cr3;
-               else /* CR3 is already up-to-date.  */
-                       update_guest_cr3 = false;
-       }
-
-       if (update_guest_cr3) {
-               svm->vmcb->save.cr3 = cr3;
-               mark_dirty(svm->vmcb, VMCB_CR);
-       }
-}
-
-static int is_disabled(void)
-{
-       u64 vm_cr;
-
-       rdmsrl(MSR_VM_CR, vm_cr);
-       if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
-               return 1;
-
-       return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
-       /*
-        * Patch in the VMMCALL instruction:
-        */
-       hypercall[0] = 0x0f;
-       hypercall[1] = 0x01;
-       hypercall[2] = 0xd9;
-}
-
-static int __init svm_check_processor_compat(void)
-{
-       return 0;
-}
-
-static bool svm_cpu_has_accelerated_tpr(void)
-{
-       return false;
-}
-
-static bool svm_has_emulated_msr(int index)
-{
-       switch (index) {
-       case MSR_IA32_MCG_EXT_CTL:
-       case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-               return false;
-       default:
-               break;
-       }
-
-       return true;
-}
-
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
-       return 0;
-}
-
-static void svm_cpuid_update(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-                                   boot_cpu_has(X86_FEATURE_XSAVE) &&
-                                   boot_cpu_has(X86_FEATURE_XSAVES);
-
-       /* Update nrips enabled cache */
-       svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
-                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
-
-       if (!kvm_vcpu_apicv_active(vcpu))
-               return;
-
-       /*
-        * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
-        * is exposed to the guest, disable AVIC.
-        */
-       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_X2APIC);
-
-       /*
-        * Currently, AVIC does not work with nested virtualization.
-        * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
-        */
-       if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
-               kvm_request_apicv_update(vcpu->kvm, false,
-                                        APICV_INHIBIT_REASON_NESTED);
-}
-
-static bool svm_has_wbinvd_exit(void)
-{
-       return true;
-}
-
-#define PRE_EX(exit)  { .exit_code = (exit), \
-                       .stage = X86_ICPT_PRE_EXCEPT, }
-#define POST_EX(exit) { .exit_code = (exit), \
-                       .stage = X86_ICPT_POST_EXCEPT, }
-#define POST_MEM(exit) { .exit_code = (exit), \
-                       .stage = X86_ICPT_POST_MEMACCESS, }
-
-static const struct __x86_intercept {
-       u32 exit_code;
-       enum x86_intercept_stage stage;
-} x86_intercept_map[] = {
-       [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
-       [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
-       [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
-       [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
-       [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
-       [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
-       [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
-       [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
-       [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
-       [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
-       [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
-       [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
-       [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
-       [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
-       [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
-       [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
-       [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
-       [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
-       [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
-       [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
-       [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
-       [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
-       [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
-       [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
-       [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
-       [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
-       [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
-       [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
-       [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
-       [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
-       [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
-       [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
-       [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
-       [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
-       [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
-       [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
-       [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
-       [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
-       [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
-       [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
-       [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
-       [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
-       [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
-       [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
-       [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
-       [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
-       [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
-};
-
-#undef PRE_EX
-#undef POST_EX
-#undef POST_MEM
+       pr_err("VMCB Control Area:\n");
+       pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
+       pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
+       pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
+       pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
+       pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
+       pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+       pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+       pr_err("%-20s%d\n", "pause filter threshold:",
+              control->pause_filter_thresh);
+       pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+       pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+       pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+       pr_err("%-20s%d\n", "asid:", control->asid);
+       pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+       pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+       pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+       pr_err("%-20s%08x\n", "int_state:", control->int_state);
+       pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+       pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+       pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+       pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+       pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+       pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+       pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+       pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+       pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+       pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+       pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+       pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+       pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+       pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+       pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+       pr_err("VMCB State Save Area:\n");
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "es:",
+              save->es.selector, save->es.attrib,
+              save->es.limit, save->es.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "cs:",
+              save->cs.selector, save->cs.attrib,
+              save->cs.limit, save->cs.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "ss:",
+              save->ss.selector, save->ss.attrib,
+              save->ss.limit, save->ss.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "ds:",
+              save->ds.selector, save->ds.attrib,
+              save->ds.limit, save->ds.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "fs:",
+              save->fs.selector, save->fs.attrib,
+              save->fs.limit, save->fs.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "gs:",
+              save->gs.selector, save->gs.attrib,
+              save->gs.limit, save->gs.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "gdtr:",
+              save->gdtr.selector, save->gdtr.attrib,
+              save->gdtr.limit, save->gdtr.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "ldtr:",
+              save->ldtr.selector, save->ldtr.attrib,
+              save->ldtr.limit, save->ldtr.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "idtr:",
+              save->idtr.selector, save->idtr.attrib,
+              save->idtr.limit, save->idtr.base);
+       pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+              "tr:",
+              save->tr.selector, save->tr.attrib,
+              save->tr.limit, save->tr.base);
+       pr_err("cpl:            %d                efer:         %016llx\n",
+               save->cpl, save->efer);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "cr0:", save->cr0, "cr2:", save->cr2);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "cr3:", save->cr3, "cr4:", save->cr4);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "dr6:", save->dr6, "dr7:", save->dr7);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "rip:", save->rip, "rflags:", save->rflags);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "rsp:", save->rsp, "rax:", save->rax);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "star:", save->star, "lstar:", save->lstar);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "cstar:", save->cstar, "sfmask:", save->sfmask);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "kernel_gs_base:", save->kernel_gs_base,
+              "sysenter_cs:", save->sysenter_cs);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "sysenter_esp:", save->sysenter_esp,
+              "sysenter_eip:", save->sysenter_eip);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "br_from:", save->br_from, "br_to:", save->br_to);
+       pr_err("%-15s %016llx %-13s %016llx\n",
+              "excp_from:", save->last_excp_from,
+              "excp_to:", save->last_excp_to);
+}
 
-static int svm_check_intercept(struct kvm_vcpu *vcpu,
-                              struct x86_instruction_info *info,
-                              enum x86_intercept_stage stage,
-                              struct x86_exception *exception)
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
-       struct vcpu_svm *svm = to_svm(vcpu);
-       int vmexit, ret = X86EMUL_CONTINUE;
-       struct __x86_intercept icpt_info;
-       struct vmcb *vmcb = svm->vmcb;
-
-       if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
-               goto out;
-
-       icpt_info = x86_intercept_map[info->intercept];
-
-       if (stage != icpt_info.stage)
-               goto out;
-
-       switch (icpt_info.exit_code) {
-       case SVM_EXIT_READ_CR0:
-               if (info->intercept == x86_intercept_cr_read)
-                       icpt_info.exit_code += info->modrm_reg;
-               break;
-       case SVM_EXIT_WRITE_CR0: {
-               unsigned long cr0, val;
-               u64 intercept;
-
-               if (info->intercept == x86_intercept_cr_write)
-                       icpt_info.exit_code += info->modrm_reg;
-
-               if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
-                   info->intercept == x86_intercept_clts)
-                       break;
+       struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
 
-               intercept = svm->nested.intercept;
+       *info1 = control->exit_info_1;
+       *info2 = control->exit_info_2;
+}
 
-               if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
-                       break;
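+/*
+ * Top-level #VMEXIT dispatcher: sync CR0/CR3 back from the VMCB, give a
+ * nested guest a chance to reflect the exit, then hand the exit code to
+ * the matching handler in svm_exit_handlers[].
+ */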
+static int handle_exit(struct kvm_vcpu *vcpu,
+       enum exit_fastpath_completion exit_fastpath)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
+       u32 exit_code = svm->vmcb->control.exit_code;
 
-               cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
-               val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
+       trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
 
-               if (info->intercept == x86_intercept_lmsw) {
-                       cr0 &= 0xfUL;
-                       val &= 0xfUL;
-                       /* lmsw can't clear PE - catch this here */
-                       if (cr0 & X86_CR0_PE)
-                               val |= X86_CR0_PE;
-               }
+       if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+               vcpu->arch.cr0 = svm->vmcb->save.cr0;
+       if (npt_enabled)
+               vcpu->arch.cr3 = svm->vmcb->save.cr3;
 
-               if (cr0 ^ val)
-                       icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+       if (unlikely(svm->nested.exit_required)) {
+               nested_svm_vmexit(svm);
+               svm->nested.exit_required = false;
 
-               break;
+               return 1;
        }
-       case SVM_EXIT_READ_DR0:
-       case SVM_EXIT_WRITE_DR0:
-               icpt_info.exit_code += info->modrm_reg;
-               break;
-       case SVM_EXIT_MSR:
-               if (info->intercept == x86_intercept_wrmsr)
-                       vmcb->control.exit_info_1 = 1;
-               else
-                       vmcb->control.exit_info_1 = 0;
-               break;
-       case SVM_EXIT_PAUSE:
-               /*
-                * We get this for NOP only, but PAUSE is
-                * REP NOP, so check for the REP prefix here.
-                */
-               if (info->rep_prefix != REPE_PREFIX)
-                       goto out;
-               break;
-       case SVM_EXIT_IOIO: {
-               u64 exit_info;
-               u32 bytes;
 
-               if (info->intercept == x86_intercept_in ||
-                   info->intercept == x86_intercept_ins) {
-                       exit_info = ((info->src_val & 0xffff) << 16) |
-                               SVM_IOIO_TYPE_MASK;
-                       bytes = info->dst_bytes;
-               } else {
-                       exit_info = (info->dst_val & 0xffff) << 16;
-                       bytes = info->src_bytes;
-               }
-
-               if (info->intercept == x86_intercept_outs ||
-                   info->intercept == x86_intercept_ins)
-                       exit_info |= SVM_IOIO_STR_MASK;
+       if (is_guest_mode(vcpu)) {
+               int vmexit;
 
-               if (info->rep_prefix)
-                       exit_info |= SVM_IOIO_REP_MASK;
+               trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+                                       svm->vmcb->control.exit_info_1,
+                                       svm->vmcb->control.exit_info_2,
+                                       svm->vmcb->control.exit_int_info,
+                                       svm->vmcb->control.exit_int_info_err,
+                                       KVM_ISA_SVM);
 
-               bytes = min(bytes, 4u);
+               vmexit = nested_svm_exit_special(svm);
 
-               exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+               if (vmexit == NESTED_EXIT_CONTINUE)
+                       vmexit = nested_svm_exit_handled(svm);
 
-               exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+               if (vmexit == NESTED_EXIT_DONE)
+                       return 1;
+       }
 
-               vmcb->control.exit_info_1 = exit_info;
-               vmcb->control.exit_info_2 = info->next_rip;
+       svm_complete_interrupts(svm);
 
-               break;
-       }
-       default:
-               break;
+       if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+               kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               kvm_run->fail_entry.hardware_entry_failure_reason
+                       = svm->vmcb->control.exit_code;
+               dump_vmcb(vcpu);
+               return 0;
        }
 
-       /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
-       if (static_cpu_has(X86_FEATURE_NRIPS))
-               vmcb->control.next_rip  = info->next_rip;
-       vmcb->control.exit_code = icpt_info.exit_code;
-       vmexit = nested_svm_exit_handled(svm);
+       if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+           exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+           exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+           exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+               printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+                      "exit_code 0x%x\n",
+                      __func__, svm->vmcb->control.exit_int_info,
+                      exit_code);
 
-       ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
-                                          : X86EMUL_CONTINUE;
+       if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
+               kvm_skip_emulated_instruction(vcpu);
+               return 1;
+       } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+           || !svm_exit_handlers[exit_code]) {
+               vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
+               dump_vmcb(vcpu);
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+               vcpu->run->internal.ndata = 1;
+               vcpu->run->internal.data[0] = exit_code;
+               return 0;
+       }
 
-out:
-       return ret;
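+       /*
+        * With retpolines enabled, the indirect call through
+        * svm_exit_handlers[] is comparatively expensive, so the most
+        * frequent exit reasons are dispatched directly here.
+        */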
+#ifdef CONFIG_RETPOLINE
+       if (exit_code == SVM_EXIT_MSR)
+               return msr_interception(svm);
+       else if (exit_code == SVM_EXIT_VINTR)
+               return interrupt_window_interception(svm);
+       else if (exit_code == SVM_EXIT_INTR)
+               return intr_interception(svm);
+       else if (exit_code == SVM_EXIT_HLT)
+               return halt_interception(svm);
+       else if (exit_code == SVM_EXIT_NPF)
+               return npf_interception(svm);
+#endif
+       return svm_exit_handlers[exit_code](svm);
 }
 
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
-       enum exit_fastpath_completion *exit_fastpath)
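+/*
+ * Reload the host TR after a guest run; the TSS descriptor must be marked
+ * available again first, since LTR faults on a busy TSS.
+ */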
+static void reload_tss(struct kvm_vcpu *vcpu)
 {
-       if (!is_guest_mode(vcpu) &&
-           to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
-           to_svm(vcpu)->vmcb->control.exit_info_1)
-               *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
-}
+       int cpu = raw_smp_processor_id();
 
-static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
-       if (pause_filter_thresh)
-               shrink_ple_window(vcpu);
+       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+       sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+       load_TR_desc();
 }
 
-static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
+static void pre_svm_run(struct vcpu_svm *svm)
 {
-       if (avic_handle_apic_id_update(vcpu) != 0)
-               return;
-       avic_handle_dfr_update(vcpu);
-       avic_handle_ldr_update(vcpu);
-}
+       int cpu = raw_smp_processor_id();
 
-static void svm_setup_mce(struct kvm_vcpu *vcpu)
-{
-       /* [63:9] are reserved. */
-       vcpu->arch.mcg_cap &= 0x1ff;
+       struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+       if (sev_guest(svm->vcpu.kvm))
+               return pre_sev_run(svm, cpu);
+
+       /* FIXME: handle wraparound of asid_generation */
+       if (svm->asid_generation != sd->asid_generation)
+               new_asid(svm, sd);
 }
 
-static int svm_smi_allowed(struct kvm_vcpu *vcpu)
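+/* Queue an NMI via EVENTINJ and mask further NMIs until the guest executes IRET. */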
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       /* Per APM Vol.2 15.22.2 "Response to SMI" */
-       if (!gif_set(svm))
-               return 0;
-
-       if (is_guest_mode(&svm->vcpu) &&
-           svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
-               /* TODO: Might need to set exit_info_1 and exit_info_2 here */
-               svm->vmcb->control.exit_code = SVM_EXIT_SMI;
-               svm->nested.exit_required = true;
-               return 0;
-       }
-
-       return 1;
+       svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+       vcpu->arch.hflags |= HF_NMI_MASK;
+       set_intercept(svm, INTERCEPT_IRET);
+       ++vcpu->stat.nmi_injections;
 }
 
-static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static void svm_set_irq(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       int ret;
 
-       if (is_guest_mode(vcpu)) {
-               /* FED8h - SVM Guest */
-               put_smstate(u64, smstate, 0x7ed8, 1);
-               /* FEE0h - SVM Guest VMCB Physical Address */
-               put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+       BUG_ON(!(gif_set(svm)));
 
-               svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-               svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-               svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+       trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+       ++vcpu->stat.irq_injections;
 
-               ret = nested_svm_vmexit(svm);
-               if (ret)
-                       return ret;
-       }
-       return 0;
+       svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+               SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
 
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
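+/*
+ * Intercept CR8 writes only while the highest pending interrupt is blocked
+ * by the current TPR, so KVM notices when the guest lowers its TPR.
+ */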
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct vmcb *nested_vmcb;
-       struct kvm_host_map map;
-       u64 guest;
-       u64 vmcb;
 
-       guest = GET_SMSTATE(u64, smstate, 0x7ed8);
-       vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
+       if (svm_nested_virtualize_tpr(vcpu))
+               return;
 
-       if (guest) {
-               if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
-                       return 1;
-               nested_vmcb = map.hva;
-               enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
-       }
-       return 0;
+       clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+       if (irr == -1)
+               return;
+
+       if (tpr >= irr)
+               set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
 }
 
-static int enable_smi_window(struct kvm_vcpu *vcpu)
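+/*
+ * An NMI may be injected only when no interrupt shadow is active, NMIs are
+ * not masked, GIF is set, and a nested hypervisor does not intercept it.
+ */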
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
+       int ret;
+       ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+             !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+       ret = ret && gif_set(svm) && nested_svm_nmi(svm);
 
-       if (!gif_set(svm)) {
-               if (vgif_enabled(svm))
-                       set_intercept(svm, INTERCEPT_STGI);
-               /* STGI will cause a vm exit */
-               return 1;
-       }
-       return 0;
+       return ret;
 }
 
-static int sev_flush_asids(void)
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       int ret, error;
-
-       /*
-        * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
-        * so it must be guarded.
-        */
-       down_write(&sev_deactivate_lock);
-
-       wbinvd_on_all_cpus();
-       ret = sev_guest_df_flush(&error);
-
-       up_write(&sev_deactivate_lock);
-
-       if (ret)
-               pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       return ret;
+       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
-/* Must be called with the sev_bitmap_lock held */
-static bool __sev_recycle_asids(void)
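+/* Mask or unmask NMIs; the IRET intercept tracks when the mask can be lifted. */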
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 {
-       int pos;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       /* Check if there are any ASIDs to reclaim before performing a flush */
-       pos = find_next_bit(sev_reclaim_asid_bitmap,
-                           max_sev_asid, min_sev_asid - 1);
-       if (pos >= max_sev_asid)
-               return false;
+       if (masked) {
+               svm->vcpu.arch.hflags |= HF_NMI_MASK;
+               set_intercept(svm, INTERCEPT_IRET);
+       } else {
+               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+               clr_intercept(svm, INTERCEPT_IRET);
+       }
+}
 
-       if (sev_flush_asids())
-               return false;
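+/*
+ * External interrupts can be delivered only when GIF is set and no interrupt
+ * shadow is active; while running a nested guest with V_INTR masking, the
+ * saved host RFLAGS.IF (HF_HIF_MASK) is checked instead of the guest's IF.
+ */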
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *vmcb = svm->vmcb;
 
-       bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
-                  max_sev_asid);
-       bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+       if (!gif_set(svm) ||
+            (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+               return 0;
 
-       return true;
+       if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK))
+               return !!(svm->vcpu.arch.hflags & HF_HIF_MASK);
+       else
+               return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
 }
 
-static int sev_asid_new(void)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-       bool retry = true;
-       int pos;
-
-       mutex_lock(&sev_bitmap_lock);
+       struct vcpu_svm *svm = to_svm(vcpu);
 
        /*
-        * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+        * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+        * 1, because that's a separate STGI/VMRUN intercept.  The next time we
+        * get that intercept, this function will be called again though and
+        * we'll get the vintr intercept. However, if the vGIF feature is
+        * enabled, the STGI interception will not occur. Enable the irq
+        * window under the assumption that the hardware will set the GIF.
         */
-again:
-       pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
-       if (pos >= max_sev_asid) {
-               if (retry && __sev_recycle_asids()) {
-                       retry = false;
-                       goto again;
-               }
-               mutex_unlock(&sev_bitmap_lock);
-               return -EBUSY;
+       if (vgif_enabled(svm) || gif_set(svm)) {
+               /*
+                * IRQ window is not needed when AVIC is enabled,
+                * unless we have pending ExtINT since it cannot be injected
+                * via AVIC. In such case, we need to temporarily disable AVIC,
+                * and fallback to injecting IRQ via V_IRQ.
+                */
+               svm_toggle_avic_for_irq_window(vcpu, false);
+               svm_set_vintr(svm);
        }
-
-       __set_bit(pos, sev_asid_bitmap);
-
-       mutex_unlock(&sev_bitmap_lock);
-
-       return pos + 1;
 }
 
-static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
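+/*
+ * SVM has no dedicated NMI-window exit, so rely on the IRET/STGI intercepts
+ * or single-stepping below to learn when a pending NMI becomes injectable.
+ */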
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       int asid, ret;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       ret = -EBUSY;
-       if (unlikely(sev->active))
-               return ret;
+       if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+           == HF_NMI_MASK)
+               return; /* IRET will cause a vm exit */
 
-       asid = sev_asid_new();
-       if (asid < 0)
-               return ret;
+       if (!gif_set(svm)) {
+               if (vgif_enabled(svm))
+                       set_intercept(svm, INTERCEPT_STGI);
+               return; /* STGI will cause a vm exit */
+       }
 
-       ret = sev_platform_init(&argp->error);
-       if (ret)
-               goto e_free;
+       if (svm->nested.exit_required)
+               return; /* we're not going to run the guest yet */
 
-       sev->active = true;
-       sev->asid = asid;
-       INIT_LIST_HEAD(&sev->regions_list);
+       /*
+        * Something prevents the NMI from being injected. Single-step over the
+        * possible problem (IRET, exception injection, or interrupt shadow).
+        */
+       svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
+       svm->nmi_singlestep = true;
+       svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
 
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
        return 0;
+}
 
-e_free:
-       sev_asid_free(asid);
-       return ret;
+static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+       return 0;
 }
 
-static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
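+/* Flush the guest's TLB entries: by ASID if supported, else by forcing a new ASID. */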
+void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
-       struct sev_data_activate *data;
-       int asid = sev_get_asid(kvm);
-       int ret;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+               svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+       else
+               svm->asid_generation--;
+}
 
-       /* activate ASID on the given handle */
-       data->handle = handle;
-       data->asid   = asid;
-       ret = sev_guest_activate(data, error);
-       kfree(data);
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       return ret;
+       invlpga(gva, svm->vmcb->control.asid);
 }
 
-static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
-       struct fd f;
-       int ret;
+}
 
-       f = fdget(fd);
-       if (!f.file)
-               return -EBADF;
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       ret = sev_issue_cmd_external_user(f.file, id, data, error);
+       if (svm_nested_virtualize_tpr(vcpu))
+               return;
 
-       fdput(f);
-       return ret;
+       if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+               int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+               kvm_set_cr8(vcpu, cr8);
+       }
 }
 
-static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 cr8;
+
+       if (svm_nested_virtualize_tpr(vcpu) ||
+           kvm_vcpu_apicv_active(vcpu))
+               return;
 
-       return __sev_issue_cmd(sev->fd, id, data, error);
+       cr8 = kvm_get_cr8(vcpu);
+       svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+       svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
-static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
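+/*
+ * Turn EXITINTINFO (an event that was in flight when the #VMEXIT occurred)
+ * back into KVM's pending-event state so it is re-delivered on the next entry.
+ */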
+static void svm_complete_interrupts(struct vcpu_svm *svm)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_start *start;
-       struct kvm_sev_launch_start params;
-       void *dh_blob, *session_blob;
-       int *error = &argp->error;
-       int ret;
+       u8 vector;
+       int type;
+       u32 exitintinfo = svm->vmcb->control.exit_int_info;
+       unsigned int3_injected = svm->int3_injected;
 
-       if (!sev_guest(kvm))
-               return -ENOTTY;
+       svm->int3_injected = 0;
 
-       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
-               return -EFAULT;
+       /*
+        * If we've made progress since setting HF_IRET_MASK, we've
+        * executed an IRET and can allow NMI injection.
+        */
+       if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
+           && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
+               svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+               kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+       }
 
-       start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
-       if (!start)
-               return -ENOMEM;
+       svm->vcpu.arch.nmi_injected = false;
+       kvm_clear_exception_queue(&svm->vcpu);
+       kvm_clear_interrupt_queue(&svm->vcpu);
 
-       dh_blob = NULL;
-       if (params.dh_uaddr) {
-               dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
-               if (IS_ERR(dh_blob)) {
-                       ret = PTR_ERR(dh_blob);
-                       goto e_free;
-               }
+       if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+               return;
 
-               start->dh_cert_address = __sme_set(__pa(dh_blob));
-               start->dh_cert_len = params.dh_len;
-       }
+       kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 
-       session_blob = NULL;
-       if (params.session_uaddr) {
-               session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
-               if (IS_ERR(session_blob)) {
-                       ret = PTR_ERR(session_blob);
-                       goto e_free_dh;
-               }
+       vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+       type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
 
-               start->session_address = __sme_set(__pa(session_blob));
-               start->session_len = params.session_len;
-       }
+       switch (type) {
+       case SVM_EXITINTINFO_TYPE_NMI:
+               svm->vcpu.arch.nmi_injected = true;
+               break;
+       case SVM_EXITINTINFO_TYPE_EXEPT:
+               /*
+                * In case of software exceptions, do not reinject the vector,
+                * but re-execute the instruction instead. Rewind RIP first
+                * if we emulated INT3 before.
+                */
+               if (kvm_exception_is_soft(vector)) {
+                       if (vector == BP_VECTOR && int3_injected &&
+                           kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+                               kvm_rip_write(&svm->vcpu,
+                                             kvm_rip_read(&svm->vcpu) -
+                                             int3_injected);
+                       break;
+               }
+               if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+                       u32 err = svm->vmcb->control.exit_int_info_err;
+                       kvm_requeue_exception_e(&svm->vcpu, vector, err);
 
-       start->handle = params.handle;
-       start->policy = params.policy;
-
-       /* create memory encryption context */
-       ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
-       if (ret)
-               goto e_free_session;
-
-       /* Bind ASID to this guest */
-       ret = sev_bind_asid(kvm, start->handle, error);
-       if (ret)
-               goto e_free_session;
-
-       /* return handle to userspace */
-       params.handle = start->handle;
-       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
-               sev_unbind_asid(kvm, start->handle);
-               ret = -EFAULT;
-               goto e_free_session;
+               } else
+                       kvm_requeue_exception(&svm->vcpu, vector);
+               break;
+       case SVM_EXITINTINFO_TYPE_INTR:
+               kvm_queue_interrupt(&svm->vcpu, vector, false);
+               break;
+       default:
+               break;
        }
+}
 
-       sev->handle = start->handle;
-       sev->fd = argp->sev_fd;
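+/*
+ * Withdraw an event that was queued in EVENTINJ but never delivered because
+ * the entry was aborted, and feed it back through svm_complete_interrupts().
+ */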
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb_control_area *control = &svm->vmcb->control;
 
-e_free_session:
-       kfree(session_blob);
-e_free_dh:
-       kfree(dh_blob);
-e_free:
-       kfree(start);
-       return ret;
+       control->exit_int_info = control->event_inj;
+       control->exit_int_info_err = control->event_inj_err;
+       control->event_inj = 0;
+       svm_complete_interrupts(svm);
 }
 
-static unsigned long get_num_contig_pages(unsigned long idx,
-                               struct page **inpages, unsigned long npages)
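+/* Out-of-line (assembly) helper that executes VMRUN with this VMCB and the guest GPR array. */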
+bool __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+
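+/*
+ * Enter the guest: sync RAX/RSP/RIP into the VMCB, set up speculation
+ * controls and lazy state around the __svm_vcpu_run() call, then pull the
+ * exit state back out for handle_exit().
+ */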
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
-       unsigned long paddr, next_paddr;
-       unsigned long i = idx + 1, pages = 1;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       /* find the number of contiguous pages starting from idx */
-       paddr = __sme_page_pa(inpages[idx]);
-       while (i < npages) {
-               next_paddr = __sme_page_pa(inpages[i++]);
-               if ((paddr + PAGE_SIZE) == next_paddr) {
-                       pages++;
-                       paddr = next_paddr;
-                       continue;
-               }
-               break;
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+       /*
+        * A vmexit emulation is required before the vcpu can be executed
+        * again.
+        */
+       if (unlikely(svm->nested.exit_required))
+               return;
+
+       /*
+        * Disable singlestep if we're injecting an interrupt/exception.
+        * We don't want our modified rflags to be pushed on the stack where
+        * we might not be able to easily reset them if we disabled NMI
+        * singlestep later.
+        */
+       if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+               /*
+                * Event injection happens before external interrupts cause a
+                * vmexit and interrupts are disabled here, so smp_send_reschedule
+                * is enough to force an immediate vmexit.
+                */
+               disable_nmi_singlestep(svm);
+               smp_send_reschedule(vcpu->cpu);
        }
 
-       return pages;
-}
-
-static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct kvm_sev_launch_update_data params;
-       struct sev_data_launch_update_data *data;
-       struct page **inpages;
-       int ret;
-
-       if (!sev_guest(kvm))
-               return -ENOTTY;
+       pre_svm_run(svm);
 
-       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
-               return -EFAULT;
+       sync_lapic_to_cr8(vcpu);
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
-       vaddr = params.uaddr;
-       size = params.len;
-       vaddr_end = vaddr + size;
+       clgi();
+       kvm_load_guest_xsave_state(vcpu);
 
-       /* Lock the user memory. */
-       inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
-       if (!inpages) {
-               ret = -ENOMEM;
-               goto e_free;
-       }
+       if (lapic_in_kernel(vcpu) &&
+               vcpu->arch.apic->lapic_timer.timer_advance_ns)
+               kvm_wait_lapic_expire(vcpu);
 
        /*
-        * The LAUNCH_UPDATE command will perform in-place encryption of the
-        * memory content (i.e. it will write the same memory region with C=1).
-        * It's possible that the cache may contain the data with C=0, i.e.
-        * unencrypted, so invalidate it first.
+        * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+        * it's non-zero. Since vmentry is serialising on affected CPUs, there
+        * is no need to worry about the conditional branch over the wrmsr
+        * being speculatively taken.
         */
-       sev_clflush_pages(inpages, npages);
-
-       for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
-               int offset, len;
+       x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-               /*
-                * If the user buffer is not page-aligned, calculate the offset
-                * within the page.
-                */
-               offset = vaddr & (PAGE_SIZE - 1);
+       local_irq_enable();
 
-               /* Calculate the number of pages that can be encrypted in one go. */
-               pages = get_num_contig_pages(i, inpages, npages);
+       __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
 
-               len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+       /* Eliminate branch target predictions from guest mode */
+       vmexit_fill_RSB();
 
-               data->handle = sev->handle;
-               data->len = len;
-               data->address = __sme_page_pa(inpages[i]) + offset;
-               ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
-               if (ret)
-                       goto e_unpin;
+#ifdef CONFIG_X86_64
+       wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+       loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+       loadsegment(gs, svm->host.gs);
+#endif
+#endif
 
-               size -= len;
-               next_vaddr = vaddr + len;
-       }
+       /*
+        * We do not use IBRS in the kernel. If this vCPU has used the
+        * SPEC_CTRL MSR it may have left it on; save the value and
+        * turn it off. This is much more efficient than blindly adding
+        * it to the atomic save/restore list. Especially as the former
+        * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+        *
+        * For non-nested case:
+        * If the L01 MSR bitmap does not intercept the MSR, then we need to
+        * save it.
+        *
+        * For nested case:
+        * If the L02 MSR bitmap does not intercept the MSR, then we need to
+        * save it.
+        */
+       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+               svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-e_unpin:
-       /* content of memory is updated, mark pages dirty */
-       for (i = 0; i < npages; i++) {
-               set_page_dirty_lock(inpages[i]);
-               mark_page_accessed(inpages[i]);
-       }
-       /* unlock the user pages */
-       sev_unpin_memory(kvm, inpages, npages);
-e_free:
-       kfree(data);
-       return ret;
-}
+       reload_tss(vcpu);
 
-static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       void __user *measure = (void __user *)(uintptr_t)argp->data;
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_measure *data;
-       struct kvm_sev_launch_measure params;
-       void __user *p = NULL;
-       void *blob = NULL;
-       int ret;
+       local_irq_disable();
 
-       if (!sev_guest(kvm))
-               return -ENOTTY;
+       x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
 
-       if (copy_from_user(&params, measure, sizeof(params)))
-               return -EFAULT;
+       vcpu->arch.cr2 = svm->vmcb->save.cr2;
+       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+       vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+               kvm_before_interrupt(&svm->vcpu);
 
-       /* User wants to query the blob length */
-       if (!params.len)
-               goto cmd;
+       kvm_load_host_xsave_state(vcpu);
+       stgi();
 
-       p = (void __user *)(uintptr_t)params.uaddr;
-       if (p) {
-               if (params.len > SEV_FW_BLOB_MAX_SIZE) {
-                       ret = -EINVAL;
-                       goto e_free;
-               }
+       /* Any pending NMI will happen here */
 
-               ret = -ENOMEM;
-               blob = kmalloc(params.len, GFP_KERNEL);
-               if (!blob)
-                       goto e_free;
+       if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+               kvm_after_interrupt(&svm->vcpu);
 
-               data->address = __psp_pa(blob);
-               data->len = params.len;
-       }
+       sync_cr8_to_lapic(vcpu);
 
-cmd:
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+       svm->next_rip = 0;
 
-       /*
-        * If we only queried the measurement blob length, the FW has already
-        * responded with the expected data, so we are done.
-        */
-       if (!params.len)
-               goto done;
+       svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 
-       if (ret)
-               goto e_free_blob;
+       /* if exit due to PF check for async PF */
+       if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+               svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 
-       if (blob) {
-               if (copy_to_user(p, blob, params.len))
-                       ret = -EFAULT;
+       if (npt_enabled) {
+               vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+               vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
        }
 
-done:
-       params.len = data->len;
-       if (copy_to_user(measure, &params, sizeof(params)))
-               ret = -EFAULT;
-e_free_blob:
-       kfree(blob);
-e_free:
-       kfree(data);
-       return ret;
-}
-
-static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_finish *data;
-       int ret;
-
-       if (!sev_guest(kvm))
-               return -ENOTTY;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
-
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+       /*
+        * We need to handle MC intercepts here before the vcpu has a chance to
+        * change the physical cpu
+        */
+       if (unlikely(svm->vmcb->control.exit_code ==
+                    SVM_EXIT_EXCP_BASE + MC_VECTOR))
+               svm_handle_mce(svm);
 
-       kfree(data);
-       return ret;
+       mark_all_clean(svm->vmcb);
 }
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
 
-static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
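+/*
+ * Install a new MMU root: with NPT it becomes the nested CR3, and the guest
+ * CR3 is only rewritten when it has actually changed; without NPT the root
+ * is written straight into the guest CR3 field.
+ */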
+static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct kvm_sev_guest_status params;
-       struct sev_data_guest_status *data;
-       int ret;
-
-       if (!sev_guest(kvm))
-               return -ENOTTY;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       bool update_guest_cr3 = true;
+       unsigned long cr3;
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
-       if (ret)
-               goto e_free;
+       cr3 = __sme_set(root);
+       if (npt_enabled) {
+               svm->vmcb->control.nested_cr3 = cr3;
+               mark_dirty(svm->vmcb, VMCB_NPT);
 
-       params.policy = data->policy;
-       params.state = data->state;
-       params.handle = data->handle;
+               /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
+               if (is_guest_mode(vcpu))
+                       update_guest_cr3 = false;
+               else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+                       cr3 = vcpu->arch.cr3;
+               else /* CR3 is already up-to-date.  */
+                       update_guest_cr3 = false;
+       }
 
-       if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
-               ret = -EFAULT;
-e_free:
-       kfree(data);
-       return ret;
+       if (update_guest_cr3) {
+               svm->vmcb->save.cr3 = cr3;
+               mark_dirty(svm->vmcb, VMCB_CR);
+       }
 }
 
-static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
-                              unsigned long dst, int size,
-                              int *error, bool enc)
+static int is_disabled(void)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_dbg *data;
-       int ret;
-
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               return -ENOMEM;
+       u64 vm_cr;
 
-       data->handle = sev->handle;
-       data->dst_addr = dst;
-       data->src_addr = src;
-       data->len = size;
+       rdmsrl(MSR_VM_CR, vm_cr);
+       if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+               return 1;
 
-       ret = sev_issue_cmd(kvm,
-                           enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
-                           data, error);
-       kfree(data);
-       return ret;
+       return 0;
 }
 
-static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
-                            unsigned long dst_paddr, int sz, int *err)
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 {
-       int offset;
-
        /*
-        * It's safe to read more than we were asked for; the caller should
-        * ensure that the destination has enough space.
+        * Patch in the VMMCALL instruction:
         */
-       src_paddr = round_down(src_paddr, 16);
-       offset = src_paddr & 15;
-       sz = round_up(sz + offset, 16);
-
-       return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+       hypercall[0] = 0x0f;
+       hypercall[1] = 0x01;
+       hypercall[2] = 0xd9;
 }
 
-static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
-                                 unsigned long __user dst_uaddr,
-                                 unsigned long dst_paddr,
-                                 int size, int *err)
+static int __init svm_check_processor_compat(void)
 {
-       struct page *tpage = NULL;
-       int ret, offset;
-
-       /* If the inputs are not 16-byte aligned then use an intermediate buffer */
-       if (!IS_ALIGNED(dst_paddr, 16) ||
-           !IS_ALIGNED(paddr,     16) ||
-           !IS_ALIGNED(size,      16)) {
-               tpage = (void *)alloc_page(GFP_KERNEL);
-               if (!tpage)
-                       return -ENOMEM;
-
-               dst_paddr = __sme_page_pa(tpage);
-       }
+       return 0;
+}
 
-       ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
-       if (ret)
-               goto e_free;
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+       return false;
+}
 
-       if (tpage) {
-               offset = paddr & 15;
-               if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
-                                page_address(tpage) + offset, size))
-                       ret = -EFAULT;
+static bool svm_has_emulated_msr(int index)
+{
+       switch (index) {
+       case MSR_IA32_MCG_EXT_CTL:
+       case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+               return false;
+       default:
+               break;
        }
 
-e_free:
-       if (tpage)
-               __free_page(tpage);
+       return true;
+}
 
-       return ret;
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+       return 0;
 }
 
-static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
-                                 unsigned long __user vaddr,
-                                 unsigned long dst_paddr,
-                                 unsigned long __user dst_vaddr,
-                                 int size, int *error)
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 {
-       struct page *src_tpage = NULL;
-       struct page *dst_tpage = NULL;
-       int ret, len = size;
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       /* If source buffer is not aligned then use an intermediate buffer */
-       if (!IS_ALIGNED(vaddr, 16)) {
-               src_tpage = alloc_page(GFP_KERNEL);
-               if (!src_tpage)
-                       return -ENOMEM;
+       vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+                                   boot_cpu_has(X86_FEATURE_XSAVE) &&
+                                   boot_cpu_has(X86_FEATURE_XSAVES);
 
-               if (copy_from_user(page_address(src_tpage),
-                               (void __user *)(uintptr_t)vaddr, size)) {
-                       __free_page(src_tpage);
-                       return -EFAULT;
-               }
+       /* Update nrips enabled cache */
+       svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
+                            guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
 
-               paddr = __sme_page_pa(src_tpage);
-       }
+       if (!kvm_vcpu_apicv_active(vcpu))
+               return;
+
+       /*
+        * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
+        * is exposed to the guest, disable AVIC.
+        */
+       if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
+               kvm_request_apicv_update(vcpu->kvm, false,
+                                        APICV_INHIBIT_REASON_X2APIC);
 
        /*
-        *  If destination buffer or length is not aligned then do read-modify-write:
-        *   - decrypt destination in an intermediate buffer
-        *   - copy the source buffer in an intermediate buffer
-        *   - use the intermediate buffer as source buffer
+        * Currently, AVIC does not work with nested virtualization.
+        * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
         */
-       if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
-               int dst_offset;
+       if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+               kvm_request_apicv_update(vcpu->kvm, false,
+                                        APICV_INHIBIT_REASON_NESTED);
+}
 
-               dst_tpage = alloc_page(GFP_KERNEL);
-               if (!dst_tpage) {
-                       ret = -ENOMEM;
-                       goto e_free;
-               }
+static bool svm_has_wbinvd_exit(void)
+{
+       return true;
+}
 
-               ret = __sev_dbg_decrypt(kvm, dst_paddr,
-                                       __sme_page_pa(dst_tpage), size, error);
-               if (ret)
-                       goto e_free;
+#define PRE_EX(exit)  { .exit_code = (exit), \
+                       .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+                       .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+                       .stage = X86_ICPT_POST_MEMACCESS, }
 
-               /*
-                *  If source is kernel buffer then use memcpy() otherwise
-                *  copy_from_user().
-                */
-               dst_offset = dst_paddr & 15;
+static const struct __x86_intercept {
+       u32 exit_code;
+       enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+       [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
+       [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
+       [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
+       [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
+       [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
+       [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
+       [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
+       [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
+       [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
+       [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
+       [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
+       [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
+       [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
+       [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
+       [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
+       [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
+       [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
+       [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
+       [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
+       [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
+       [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
+       [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
+       [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
+       [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
+       [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
+       [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
+       [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
+       [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
+       [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
+       [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
+       [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
+       [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
+       [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
+       [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
+       [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
+       [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
+       [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
+       [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
+       [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
+       [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
+       [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
+       [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
+       [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
+       [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
+       [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
+       [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
+       [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
+};
 
-               if (src_tpage)
-                       memcpy(page_address(dst_tpage) + dst_offset,
-                              page_address(src_tpage), size);
-               else {
-                       if (copy_from_user(page_address(dst_tpage) + dst_offset,
-                                          (void __user *)(uintptr_t)vaddr, size)) {
-                               ret = -EFAULT;
-                               goto e_free;
-                       }
-               }
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
 
-               paddr = __sme_page_pa(dst_tpage);
-               dst_paddr = round_down(dst_paddr, 16);
-               len = round_up(size, 16);
-       }
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+                              struct x86_instruction_info *info,
+                              enum x86_intercept_stage stage,
+                              struct x86_exception *exception)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int vmexit, ret = X86EMUL_CONTINUE;
+       struct __x86_intercept icpt_info;
+       struct vmcb *vmcb = svm->vmcb;
 
-       ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+       if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+               goto out;
 
-e_free:
-       if (src_tpage)
-               __free_page(src_tpage);
-       if (dst_tpage)
-               __free_page(dst_tpage);
-       return ret;
-}
+       icpt_info = x86_intercept_map[info->intercept];
 
-static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
-{
-       unsigned long vaddr, vaddr_end, next_vaddr;
-       unsigned long dst_vaddr;
-       struct page **src_p, **dst_p;
-       struct kvm_sev_dbg debug;
-       unsigned long n;
-       unsigned int size;
-       int ret;
+       if (stage != icpt_info.stage)
+               goto out;
 
-       if (!sev_guest(kvm))
-               return -ENOTTY;
+       switch (icpt_info.exit_code) {
+       case SVM_EXIT_READ_CR0:
+               if (info->intercept == x86_intercept_cr_read)
+                       icpt_info.exit_code += info->modrm_reg;
+               break;
+       case SVM_EXIT_WRITE_CR0: {
+               unsigned long cr0, val;
+               u64 intercept;
 
-       if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
-               return -EFAULT;
+               if (info->intercept == x86_intercept_cr_write)
+                       icpt_info.exit_code += info->modrm_reg;
 
-       if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
-               return -EINVAL;
-       if (!debug.dst_uaddr)
-               return -EINVAL;
+               if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+                   info->intercept == x86_intercept_clts)
+                       break;
 
-       vaddr = debug.src_uaddr;
-       size = debug.len;
-       vaddr_end = vaddr + size;
-       dst_vaddr = debug.dst_uaddr;
+               intercept = svm->nested.intercept;
 
-       for (; vaddr < vaddr_end; vaddr = next_vaddr) {
-               int len, s_off, d_off;
+               if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+                       break;
 
-               /* lock userspace source and destination page */
-               src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
-               if (!src_p)
-                       return -EFAULT;
+               cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+               val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
 
-               dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
-               if (!dst_p) {
-                       sev_unpin_memory(kvm, src_p, n);
-                       return -EFAULT;
+               if (info->intercept == x86_intercept_lmsw) {
+                       cr0 &= 0xfUL;
+                       val &= 0xfUL;
+                       /* lmsw can't clear PE - catch this here */
+                       if (cr0 & X86_CR0_PE)
+                               val |= X86_CR0_PE;
                }
 
-               /*
-                * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
-                * memory content (i.e it will write the same memory region with C=1).
-                * It's possible that the cache may contain the data with C=0, i.e.,
-                * unencrypted so invalidate it first.
-                */
-               sev_clflush_pages(src_p, 1);
-               sev_clflush_pages(dst_p, 1);
+               if (cr0 ^ val)
+                       icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
 
+               break;
+       }
+       case SVM_EXIT_READ_DR0:
+       case SVM_EXIT_WRITE_DR0:
+               icpt_info.exit_code += info->modrm_reg;
+               break;
+       case SVM_EXIT_MSR:
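+               /* Per the SVM exit-info convention, exit_info_1 is 1 for WRMSR and 0 for RDMSR. */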
+               if (info->intercept == x86_intercept_wrmsr)
+                       vmcb->control.exit_info_1 = 1;
+               else
+                       vmcb->control.exit_info_1 = 0;
+               break;
+       case SVM_EXIT_PAUSE:
                /*
-                * Since user buffer may not be page aligned, calculate the
-                * offset within the page.
+                * We only get this intercept for NOP, but PAUSE is encoded
+                * as "rep nop", so check for the REP prefix here.
                 */
-               s_off = vaddr & ~PAGE_MASK;
-               d_off = dst_vaddr & ~PAGE_MASK;
-               len = min_t(size_t, (PAGE_SIZE - s_off), size);
-
-               if (dec)
-                       ret = __sev_dbg_decrypt_user(kvm,
-                                                    __sme_page_pa(src_p[0]) + s_off,
-                                                    dst_vaddr,
-                                                    __sme_page_pa(dst_p[0]) + d_off,
-                                                    len, &argp->error);
-               else
-                       ret = __sev_dbg_encrypt_user(kvm,
-                                                    __sme_page_pa(src_p[0]) + s_off,
-                                                    vaddr,
-                                                    __sme_page_pa(dst_p[0]) + d_off,
-                                                    dst_vaddr,
-                                                    len, &argp->error);
-
-               sev_unpin_memory(kvm, src_p, n);
-               sev_unpin_memory(kvm, dst_p, n);
-
-               if (ret)
-                       goto err;
-
-               next_vaddr = vaddr + len;
-               dst_vaddr = dst_vaddr + len;
-               size -= len;
-       }
-err:
-       return ret;
-}
+               if (info->rep_prefix != REPE_PREFIX)
+                       goto out;
+               break;
+       case SVM_EXIT_IOIO: {
+               u64 exit_info;
+               u32 bytes;
 
-static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
-{
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct sev_data_launch_secret *data;
-       struct kvm_sev_launch_secret params;
-       struct page **pages;
-       void *blob, *hdr;
-       unsigned long n;
-       int ret, offset;
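+               /* Build exit_info_1 as hardware would: port number, direction, string/rep flags and operand size. */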
+               if (info->intercept == x86_intercept_in ||
+                   info->intercept == x86_intercept_ins) {
+                       exit_info = ((info->src_val & 0xffff) << 16) |
+                               SVM_IOIO_TYPE_MASK;
+                       bytes = info->dst_bytes;
+               } else {
+                       exit_info = (info->dst_val & 0xffff) << 16;
+                       bytes = info->src_bytes;
+               }
 
-       if (!sev_guest(kvm))
-               return -ENOTTY;
+               if (info->intercept == x86_intercept_outs ||
+                   info->intercept == x86_intercept_ins)
+                       exit_info |= SVM_IOIO_STR_MASK;
 
-       if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
-               return -EFAULT;
+               if (info->rep_prefix)
+                       exit_info |= SVM_IOIO_REP_MASK;
 
-       pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
-       if (!pages)
-               return -ENOMEM;
+               bytes = min(bytes, 4u);
 
-       /*
-        * The secret must be copied into contiguous memory region, lets verify
-        * that userspace memory pages are contiguous before we issue command.
-        */
-       if (get_num_contig_pages(0, pages, n) != n) {
-               ret = -EINVAL;
-               goto e_unpin_memory;
-       }
+               exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
 
-       ret = -ENOMEM;
-       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
-       if (!data)
-               goto e_unpin_memory;
+               exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
 
-       offset = params.guest_uaddr & (PAGE_SIZE - 1);
-       data->guest_address = __sme_page_pa(pages[0]) + offset;
-       data->guest_len = params.guest_len;
+               vmcb->control.exit_info_1 = exit_info;
+               vmcb->control.exit_info_2 = info->next_rip;
 
-       blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
-       if (IS_ERR(blob)) {
-               ret = PTR_ERR(blob);
-               goto e_free;
+               break;
        }
-
-       data->trans_address = __psp_pa(blob);
-       data->trans_len = params.trans_len;
-
-       hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
-       if (IS_ERR(hdr)) {
-               ret = PTR_ERR(hdr);
-               goto e_free_blob;
+       default:
+               break;
        }
-       data->hdr_address = __psp_pa(hdr);
-       data->hdr_len = params.hdr_len;
 
-       data->handle = sev->handle;
-       ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+       /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+       if (static_cpu_has(X86_FEATURE_NRIPS))
+               vmcb->control.next_rip  = info->next_rip;
+       vmcb->control.exit_code = icpt_info.exit_code;
+       vmexit = nested_svm_exit_handled(svm);
 
-       kfree(hdr);
+       ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+                                          : X86EMUL_CONTINUE;
 
-e_free_blob:
-       kfree(blob);
-e_free:
-       kfree(data);
-e_unpin_memory:
-       sev_unpin_memory(kvm, pages, n);
+out:
        return ret;
 }
 
-static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+       enum exit_fastpath_completion *exit_fastpath)
 {
-       struct kvm_sev_cmd sev_cmd;
-       int r;
+       if (!is_guest_mode(vcpu) &&
+           to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+           to_svm(vcpu)->vmcb->control.exit_info_1)
+               *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
+}
 
-       if (!svm_sev_enabled())
-               return -ENOTTY;
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+       if (pause_filter_thresh)
+               shrink_ple_window(vcpu);
+}
 
-       if (!argp)
-               return 0;
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+       /* [63:9] are reserved. */
+       vcpu->arch.mcg_cap &= 0x1ff;
+}
 
-       if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
-               return -EFAULT;
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       mutex_lock(&kvm->lock);
+       /* Per APM Vol.2 15.22.2 "Response to SMI" */
+       if (!gif_set(svm))
+               return 0;
 
-       switch (sev_cmd.id) {
-       case KVM_SEV_INIT:
-               r = sev_guest_init(kvm, &sev_cmd);
-               break;
-       case KVM_SEV_LAUNCH_START:
-               r = sev_launch_start(kvm, &sev_cmd);
-               break;
-       case KVM_SEV_LAUNCH_UPDATE_DATA:
-               r = sev_launch_update_data(kvm, &sev_cmd);
-               break;
-       case KVM_SEV_LAUNCH_MEASURE:
-               r = sev_launch_measure(kvm, &sev_cmd);
-               break;
-       case KVM_SEV_LAUNCH_FINISH:
-               r = sev_launch_finish(kvm, &sev_cmd);
-               break;
-       case KVM_SEV_GUEST_STATUS:
-               r = sev_guest_status(kvm, &sev_cmd);
-               break;
-       case KVM_SEV_DBG_DECRYPT:
-               r = sev_dbg_crypt(kvm, &sev_cmd, true);
-               break;
-       case KVM_SEV_DBG_ENCRYPT:
-               r = sev_dbg_crypt(kvm, &sev_cmd, false);
-               break;
-       case KVM_SEV_LAUNCH_SECRET:
-               r = sev_launch_secret(kvm, &sev_cmd);
-               break;
-       default:
-               r = -EINVAL;
-               goto out;
+       if (is_guest_mode(&svm->vcpu) &&
+           svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+               /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+               svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+               svm->nested.exit_required = true;
+               return 0;
        }
 
-       if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
-               r = -EFAULT;
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
+       return 1;
 }
 
-static int svm_register_enc_region(struct kvm *kvm,
-                                  struct kvm_enc_region *range)
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct enc_region *region;
-       int ret = 0;
-
-       if (!sev_guest(kvm))
-               return -ENOTTY;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       int ret;
 
-       if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
-               return -EINVAL;
+       if (is_guest_mode(vcpu)) {
+               /* FED8h - SVM Guest */
+               put_smstate(u64, smstate, 0x7ed8, 1);
+               /* FEE0h - SVM Guest VMCB Physical Address */
+               put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
 
-       region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
-       if (!region)
-               return -ENOMEM;
+               svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+               svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+               svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
 
-       region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
-       if (!region->pages) {
-               ret = -ENOMEM;
-               goto e_free;
+               ret = nested_svm_vmexit(svm);
+               if (ret)
+                       return ret;
        }
-
-       /*
-        * The guest may change the memory encryption attribute from C=0 -> C=1
-        * or vice versa for this memory range. Lets make sure caches are
-        * flushed to ensure that guest data gets written into memory with
-        * correct C-bit.
-        */
-       sev_clflush_pages(region->pages, region->npages);
-
-       region->uaddr = range->addr;
-       region->size = range->size;
-
-       mutex_lock(&kvm->lock);
-       list_add_tail(&region->list, &sev->regions_list);
-       mutex_unlock(&kvm->lock);
-
-       return ret;
-
-e_free:
-       kfree(region);
-       return ret;
+       return 0;
 }
 
-static struct enc_region *
-find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 {
-       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-       struct list_head *head = &sev->regions_list;
-       struct enc_region *i;
+       struct vcpu_svm *svm = to_svm(vcpu);
+       struct vmcb *nested_vmcb;
+       struct kvm_host_map map;
+       u64 guest;
+       u64 vmcb;
 
-       list_for_each_entry(i, head, list) {
-               if (i->uaddr == range->addr &&
-                   i->size == range->size)
-                       return i;
-       }
+       guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+       vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
 
-       return NULL;
+       if (guest) {
+               if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
+                       return 1;
+               nested_vmcb = map.hva;
+               enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
+       }
+       return 0;
 }
 
-
-static int svm_unregister_enc_region(struct kvm *kvm,
-                                    struct kvm_enc_region *range)
+static int enable_smi_window(struct kvm_vcpu *vcpu)
 {
-       struct enc_region *region;
-       int ret;
-
-       mutex_lock(&kvm->lock);
-
-       if (!sev_guest(kvm)) {
-               ret = -ENOTTY;
-               goto failed;
-       }
+       struct vcpu_svm *svm = to_svm(vcpu);
 
-       region = find_enc_region(kvm, range);
-       if (!region) {
-               ret = -EINVAL;
-               goto failed;
+       if (!gif_set(svm)) {
+               if (vgif_enabled(svm))
+                       set_intercept(svm, INTERCEPT_STGI);
+               /* STGI will cause a vm exit */
+               return 1;
        }
-
-       /*
-        * Ensure that all guest tagged cache entries are flushed before
-        * releasing the pages back to the system for use. CLFLUSH will
-        * not do this, so issue a WBINVD.
-        */
-       wbinvd_on_all_cpus();
-
-       __unregister_enc_region_locked(kvm, region);
-
-       mutex_unlock(&kvm->lock);
        return 0;
-
-failed:
-       mutex_unlock(&kvm->lock);
-       return ret;
 }
 
 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
@@ -7347,21 +3880,22 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
                   (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
 }
 
-static bool svm_check_apicv_inhibit_reasons(ulong bit)
+static void svm_vm_destroy(struct kvm *kvm)
 {
-       ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
-                         BIT(APICV_INHIBIT_REASON_HYPERV) |
-                         BIT(APICV_INHIBIT_REASON_NESTED) |
-                         BIT(APICV_INHIBIT_REASON_IRQWIN) |
-                         BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
-                         BIT(APICV_INHIBIT_REASON_X2APIC);
-
-       return supported & BIT(bit);
+       avic_vm_destroy(kvm);
+       sev_vm_destroy(kvm);
 }
 
-static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
+static int svm_vm_init(struct kvm *kvm)
 {
-       avic_update_access_page(kvm, activate);
+       if (avic) {
+               int ret = avic_vm_init(kvm);
+               if (ret)
+                       return ret;
+       }
+
+       kvm_apicv_init(kvm, avic);
+       return 0;
 }
 
 static struct kvm_x86_ops svm_x86_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
new file mode 100644 (file)
index 0000000..df3474f
--- /dev/null
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ */
+
+#ifndef __SVM_SVM_H
+#define __SVM_SVM_H
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+
+#include <asm/svm.h>
+
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+       MSR_FS_BASE,
+#endif
+       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+       MSR_TSC_AUX,
+};
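+
+/*
+ * Host values of these MSRs are stashed in vcpu_svm::host_user_msrs (see
+ * below) so they can be restored once the vCPU stops running on this CPU.
+ */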
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+#define MSRPM_OFFSETS  16
+extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+extern bool npt_enabled;
+
+enum {
+       VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+                           pause filter count */
+       VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
+       VMCB_ASID,       /* ASID */
+       VMCB_INTR,       /* int_ctl, int_vector */
+       VMCB_NPT,        /* npt_en, nCR3, gPAT */
+       VMCB_CR,         /* CR0, CR3, CR4, EFER */
+       VMCB_DR,         /* DR6, DR7 */
+       VMCB_DT,         /* GDT, IDT */
+       VMCB_SEG,        /* CS, DS, SS, ES, CPL */
+       VMCB_CR2,        /* CR2 only */
+       VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+       VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+                         * AVIC PHYSICAL_TABLE pointer,
+                         * AVIC LOGICAL_TABLE pointer
+                         */
+       VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
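+
+/*
+ * The VMCB "clean" bits tell hardware which of the groups above are unchanged
+ * since the last VMRUN; mark_dirty()/mark_all_dirty() below clear the
+ * relevant bits whenever KVM touches that state.
+ */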
+
+struct kvm_sev_info {
+       bool active;            /* SEV enabled guest */
+       unsigned int asid;      /* ASID used for this guest */
+       unsigned int handle;    /* SEV firmware handle */
+       int fd;                 /* SEV device fd */
+       unsigned long pages_locked; /* Number of pages locked */
+       struct list_head regions_list;  /* List of registered regions */
+};
+
+struct kvm_svm {
+       struct kvm kvm;
+
+       /* Struct members for AVIC */
+       u32 avic_vm_id;
+       struct page *avic_logical_id_table_page;
+       struct page *avic_physical_id_table_page;
+       struct hlist_node hnode;
+
+       struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct nested_state {
+       struct vmcb *hsave;
+       u64 hsave_msr;
+       u64 vm_cr_msr;
+       u64 vmcb;
+
+       /* These are the merged vectors */
+       u32 *msrpm;
+
+       /* gpa pointers to the real vectors */
+       u64 vmcb_msrpm;
+       u64 vmcb_iopm;
+
+       /* A VMEXIT is required but not yet emulated */
+       bool exit_required;
+
+       /* cache for intercepts of the guest */
+       u32 intercept_cr;
+       u32 intercept_dr;
+       u32 intercept_exceptions;
+       u64 intercept;
+
+       /* Nested Paging related state */
+       u64 nested_cr3;
+};
+
+struct vcpu_svm {
+       struct kvm_vcpu vcpu;
+       struct vmcb *vmcb;
+       unsigned long vmcb_pa;
+       struct svm_cpu_data *svm_data;
+       uint64_t asid_generation;
+       uint64_t sysenter_esp;
+       uint64_t sysenter_eip;
+       uint64_t tsc_aux;
+
+       u64 msr_decfg;
+
+       u64 next_rip;
+
+       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+       struct {
+               u16 fs;
+               u16 gs;
+               u16 ldt;
+               u64 gs_base;
+       } host;
+
+       u64 spec_ctrl;
+       /*
+        * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+        * translated into the appropriate L2_CFG bits on the host to
+        * perform speculative control.
+        */
+       u64 virt_spec_ctrl;
+
+       u32 *msrpm;
+
+       ulong nmi_iret_rip;
+
+       struct nested_state nested;
+
+       bool nmi_singlestep;
+       u64 nmi_singlestep_guest_rflags;
+
+       unsigned int3_injected;
+       unsigned long int3_rip;
+
+       /* cached guest cpuid flags for faster access */
+       bool nrips_enabled      : 1;
+
+       u32 ldr_reg;
+       u32 dfr_reg;
+       struct page *avic_backing_page;
+       u64 *avic_physical_id_cache;
+       bool avic_is_running;
+
+       /*
+        * Per-vcpu list of struct amd_svm_iommu_ir:
+        * This is used mainly to store interrupt remapping information used
+        * when updating the vcpu affinity. This avoids the need to scan for
+        * IRTE and try to match ga_tag in the IOMMU driver.
+        */
+       struct list_head ir_list;
+       spinlock_t ir_list_lock;
+
+       /* which host CPU was used for running this vcpu */
+       unsigned int last_cpu;
+};
+
+struct svm_cpu_data {
+       int cpu;
+
+       u64 asid_generation;
+       u32 max_asid;
+       u32 next_asid;
+       u32 min_asid;
+       struct kvm_ldttss_desc *tss_desc;
+
+       struct page *save_area;
+       struct vmcb *current_vmcb;
+
+       /* index = sev_asid, value = vmcb pointer */
+       struct vmcb **sev_vmcbs;
+};
+
+DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
+
+void recalc_intercepts(struct vcpu_svm *svm);
+
+static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+       return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+       vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+       vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+                              & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+       vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+       return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+       if (is_guest_mode(&svm->vcpu))
+               return svm->nested.hsave;
+       else
+               return svm->vmcb;
+}
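+
+/*
+ * The intercept helpers below record KVM's own intercepts in the "host" VMCB
+ * (hsave while a nested guest is running) and rely on recalc_intercepts() to
+ * fold them into the VMCB that is actually executed.
+ */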
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_cr |= (1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_cr &= ~(1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+               | (1 << INTERCEPT_DR1_READ)
+               | (1 << INTERCEPT_DR2_READ)
+               | (1 << INTERCEPT_DR3_READ)
+               | (1 << INTERCEPT_DR4_READ)
+               | (1 << INTERCEPT_DR5_READ)
+               | (1 << INTERCEPT_DR6_READ)
+               | (1 << INTERCEPT_DR7_READ)
+               | (1 << INTERCEPT_DR0_WRITE)
+               | (1 << INTERCEPT_DR1_WRITE)
+               | (1 << INTERCEPT_DR2_WRITE)
+               | (1 << INTERCEPT_DR3_WRITE)
+               | (1 << INTERCEPT_DR4_WRITE)
+               | (1 << INTERCEPT_DR5_WRITE)
+               | (1 << INTERCEPT_DR6_WRITE)
+               | (1 << INTERCEPT_DR7_WRITE);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_dr = 0;
+
+       recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_exceptions |= (1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept |= (1ULL << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+       struct vmcb *vmcb = get_host_vmcb(svm);
+
+       vmcb->control.intercept &= ~(1ULL << bit);
+
+       recalc_intercepts(svm);
+}
+
+static inline bool is_intercept(struct vcpu_svm *svm, int bit)
+{
+       return (svm->vmcb->control.intercept & (1ULL << bit)) != 0;
+}
+
+static inline bool vgif_enabled(struct vcpu_svm *svm)
+{
+       return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+       if (vgif_enabled(svm))
+               svm->vmcb->control.int_ctl |= V_GIF_MASK;
+       else
+               svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+       if (vgif_enabled(svm))
+               svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+       else
+               svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+       if (vgif_enabled(svm))
+               return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+       else
+               return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
+/* svm.c */
+#define MSR_INVALID                    0xffffffffU
+
+u32 svm_msrpm_offset(u32 msr);
+void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+void disable_nmi_singlestep(struct vcpu_svm *svm);
+
+/* nested.c */
+
+#define NESTED_EXIT_HOST       0       /* Exit handled on host level */
+#define NESTED_EXIT_DONE       1       /* Exit caused nested vmexit  */
+#define NESTED_EXIT_CONTINUE   2       /* Further checks needed      */
+
+/* This function returns true if it is safe to enable the NMI window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+       if (!is_guest_mode(&svm->vcpu))
+               return true;
+
+       if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+               return true;
+
+       svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+       svm->nested.exit_required = true;
+
+       return false;
+}
+
+static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+       return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+}
+
+void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+                         struct vmcb *nested_vmcb, struct kvm_host_map *map);
+int nested_svm_vmrun(struct vcpu_svm *svm);
+void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
+int nested_svm_vmexit(struct vcpu_svm *svm);
+int nested_svm_exit_handled(struct vcpu_svm *svm);
+int nested_svm_check_permissions(struct vcpu_svm *svm);
+int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+                              bool has_error_code, u32 error_code);
+int svm_check_nested_events(struct kvm_vcpu *vcpu);
+int nested_svm_exit_special(struct vcpu_svm *svm);
+
+/* avic.c */
+
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK   (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT                        31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK               (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK   (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK       (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK         (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK              (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK                0xFFFFFFFFFF000ULL
+
+extern int avic;
+
+static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+{
+       svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+       mark_dirty(svm->vmcb, VMCB_AVIC);
+}
+
+static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+       u64 *entry = svm->avic_physical_id_cache;
+
+       if (!entry)
+               return false;
+
+       return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+}
+
+int avic_ga_log_notifier(u32 ga_tag);
+void avic_vm_destroy(struct kvm *kvm);
+int avic_vm_init(struct kvm *kvm);
+void avic_init_vmcb(struct vcpu_svm *svm);
+void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
+int avic_incomplete_ipi_interception(struct vcpu_svm *svm);
+int avic_unaccelerated_access_interception(struct vcpu_svm *svm);
+int avic_init_vcpu(struct vcpu_svm *svm);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_post_state_restore(struct kvm_vcpu *vcpu);
+void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+bool svm_check_apicv_inhibit_reasons(ulong bit);
+void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate);
+void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
+bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                      uint32_t guest_irq, bool set);
+void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
+void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
+
+/* sev.c */
+
+extern unsigned int max_sev_asid;
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+       struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+       return sev->active;
+#else
+       return false;
+#endif
+}
+
+static inline bool svm_sev_enabled(void)
+{
+       return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
+}
+
+void sev_vm_destroy(struct kvm *kvm);
+int svm_mem_enc_op(struct kvm *kvm, void __user *argp);
+int svm_register_enc_region(struct kvm *kvm,
+                           struct kvm_enc_region *range);
+int svm_unregister_enc_region(struct kvm *kvm,
+                             struct kvm_enc_region *range);
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+int __init sev_hardware_setup(void);
+void sev_hardware_teardown(void);
+
+#endif
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
new file mode 100644 (file)
index 0000000..fa1af90
--- /dev/null
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+/* Intentionally omit RAX as it's context switched by hardware */
+#define VCPU_RCX       __VCPU_REGS_RCX * WORD_SIZE
+#define VCPU_RDX       __VCPU_REGS_RDX * WORD_SIZE
+#define VCPU_RBX       __VCPU_REGS_RBX * WORD_SIZE
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP       __VCPU_REGS_RBP * WORD_SIZE
+#define VCPU_RSI       __VCPU_REGS_RSI * WORD_SIZE
+#define VCPU_RDI       __VCPU_REGS_RDI * WORD_SIZE
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8                __VCPU_REGS_R8  * WORD_SIZE
+#define VCPU_R9                __VCPU_REGS_R9  * WORD_SIZE
+#define VCPU_R10       __VCPU_REGS_R10 * WORD_SIZE
+#define VCPU_R11       __VCPU_REGS_R11 * WORD_SIZE
+#define VCPU_R12       __VCPU_REGS_R12 * WORD_SIZE
+#define VCPU_R13       __VCPU_REGS_R13 * WORD_SIZE
+#define VCPU_R14       __VCPU_REGS_R14 * WORD_SIZE
+#define VCPU_R15       __VCPU_REGS_R15 * WORD_SIZE
+#endif
+
+       .text
+
+/**
+ * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode
+ * @vmcb_pa:   unsigned long physical address of the guest VMCB
+ * @regs:      unsigned long * (to guest registers)
+ */
+SYM_FUNC_START(__svm_vcpu_run)
+       push %_ASM_BP
+       mov  %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+       push %r15
+       push %r14
+       push %r13
+       push %r12
+#else
+       push %edi
+       push %esi
+#endif
+       push %_ASM_BX
+
+       /* Save @regs. */
+       push %_ASM_ARG2
+
+       /* Save @vmcb. */
+       push %_ASM_ARG1
+
+       /* Move @regs to RAX. */
+       mov %_ASM_ARG2, %_ASM_AX
+
+       /* Load guest registers. */
+       mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+       mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+       mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+       mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+       mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+       mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+#ifdef CONFIG_X86_64
+       mov VCPU_R8 (%_ASM_AX),  %r8
+       mov VCPU_R9 (%_ASM_AX),  %r9
+       mov VCPU_R10(%_ASM_AX), %r10
+       mov VCPU_R11(%_ASM_AX), %r11
+       mov VCPU_R12(%_ASM_AX), %r12
+       mov VCPU_R13(%_ASM_AX), %r13
+       mov VCPU_R14(%_ASM_AX), %r14
+       mov VCPU_R15(%_ASM_AX), %r15
+#endif
+
+       /* "POP" @vmcb to RAX. */
+       pop %_ASM_AX
+
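+       /*
+        * Each of vmload/vmrun/vmsave below is covered by an exception table
+        * entry: on a fault, control jumps to the matching kvm_rebooting
+        * check, which skips ahead if the host is rebooting and hits ud2
+        * otherwise.
+        */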
+       /* Enter guest mode */
+1:     vmload %_ASM_AX
+       jmp 3f
+2:     cmpb $0, kvm_rebooting
+       jne 3f
+       ud2
+       _ASM_EXTABLE(1b, 2b)
+
+3:     vmrun %_ASM_AX
+       jmp 5f
+4:     cmpb $0, kvm_rebooting
+       jne 5f
+       ud2
+       _ASM_EXTABLE(3b, 4b)
+
+5:     vmsave %_ASM_AX
+       jmp 7f
+6:     cmpb $0, kvm_rebooting
+       jne 7f
+       ud2
+       _ASM_EXTABLE(5b, 6b)
+7:
+       /* "POP" @regs to RAX. */
+       pop %_ASM_AX
+
+       /* Save all guest registers.  */
+       mov %_ASM_CX,   VCPU_RCX(%_ASM_AX)
+       mov %_ASM_DX,   VCPU_RDX(%_ASM_AX)
+       mov %_ASM_BX,   VCPU_RBX(%_ASM_AX)
+       mov %_ASM_BP,   VCPU_RBP(%_ASM_AX)
+       mov %_ASM_SI,   VCPU_RSI(%_ASM_AX)
+       mov %_ASM_DI,   VCPU_RDI(%_ASM_AX)
+#ifdef CONFIG_X86_64
+       mov %r8,  VCPU_R8 (%_ASM_AX)
+       mov %r9,  VCPU_R9 (%_ASM_AX)
+       mov %r10, VCPU_R10(%_ASM_AX)
+       mov %r11, VCPU_R11(%_ASM_AX)
+       mov %r12, VCPU_R12(%_ASM_AX)
+       mov %r13, VCPU_R13(%_ASM_AX)
+       mov %r14, VCPU_R14(%_ASM_AX)
+       mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+       /*
+        * Clear all general purpose registers except RSP and RAX to prevent
+        * speculative use of the guest's values, even those that are reloaded
+        * via the stack.  In theory, an L1 cache miss when restoring registers
+        * could lead to speculative execution with the guest's values.
+        * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+        * free.  RSP and RAX are exempt as they are restored by hardware
+        * during VM-Exit.
+        */
+       xor %ecx, %ecx
+       xor %edx, %edx
+       xor %ebx, %ebx
+       xor %ebp, %ebp
+       xor %esi, %esi
+       xor %edi, %edi
+#ifdef CONFIG_X86_64
+       xor %r8d,  %r8d
+       xor %r9d,  %r9d
+       xor %r10d, %r10d
+       xor %r11d, %r11d
+       xor %r12d, %r12d
+       xor %r13d, %r13d
+       xor %r14d, %r14d
+       xor %r15d, %r15d
+#endif
+
+       pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+       pop %r12
+       pop %r13
+       pop %r14
+       pop %r15
+#else
+       pop %esi
+       pop %edi
+#endif
+       pop %_ASM_BP
+       ret
+SYM_FUNC_END(__svm_vcpu_run)
index de23230..cbc9ea2 100644 (file)
@@ -3645,7 +3645,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
         * Clear the MTF state. If a higher priority VM-exit is delivered first,
         * this state is discarded.
         */
-       vmx->nested.mtf_pending = false;
+       if (!block_nested_events)
+               vmx->nested.mtf_pending = false;
 
        if (lapic_in_kernel(vcpu) &&
                test_bit(KVM_APIC_INIT, &apic->pending_events)) {
index 9651ba3..87f3f24 100644 (file)
@@ -58,12 +58,8 @@ SYM_FUNC_START(vmx_vmenter)
        ret
 4:     ud2
 
-       .pushsection .fixup, "ax"
-5:     jmp 3b
-       .popsection
-
-       _ASM_EXTABLE(1b, 5b)
-       _ASM_EXTABLE(2b, 5b)
+       _ASM_EXTABLE(1b, 3b)
+       _ASM_EXTABLE(2b, 3b)
 
 SYM_FUNC_END(vmx_vmenter)
 
index 91749f1..8959514 100644 (file)
@@ -2261,10 +2261,6 @@ static int hardware_enable(void)
            !hv_get_vp_assist_page(cpu))
                return -EFAULT;
 
-       INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-       INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
-       spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
        r = kvm_cpu_vmxon(phys_addr);
        if (r)
                return r;
@@ -8044,7 +8040,7 @@ module_exit(vmx_exit);
 
 static int __init vmx_init(void)
 {
-       int r;
+       int r, cpu;
 
 #if IS_ENABLED(CONFIG_HYPERV)
        /*
@@ -8098,6 +8094,12 @@ static int __init vmx_init(void)
                return r;
        }
 
+       for_each_possible_cpu(cpu) {
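+               /*
+                * Moved from hardware_enable(): set up the per-CPU VMCS/vCPU
+                * tracking lists and lock once at module load.
+                */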
+               INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+               INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+               spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+       }
+
 #ifdef CONFIG_KEXEC_CORE
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
index b8124b5..027dfd2 100644 (file)
@@ -1586,7 +1586,8 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data
 
        if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
                ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
-               ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+               ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+               ((u32)(data >> 32) != X2APIC_BROADCAST)) {
 
                data &= ~(1 << 12);
                kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
index 859519f..a51df51 100644 (file)
@@ -1222,7 +1222,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
                return 1;
 
        /* read, not present: */
-       if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+       if (unlikely(!vma_is_accessible(vma)))
                return 1;
 
        return 0;
index 99f7a68..59ba008 100644 (file)
@@ -25,11 +25,8 @@ nodemask_t numa_nodes_parsed __initdata;
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-static struct numa_meminfo numa_meminfo
-#ifndef CONFIG_MEMORY_HOTPLUG
-__initdata
-#endif
-;
+static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
 
 static int numa_distance_cnt;
 static u8 *numa_distance;
@@ -169,6 +166,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
 }
 
 /**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+                                        struct numa_meminfo *src)
+{
+       dst->blk[dst->nr_blks++] = src->blk[idx];
+       numa_remove_memblk_from(idx, src);
+}
+
+/**
  * numa_add_memblk - Add one numa_memblk to numa_meminfo
  * @nid: NUMA node ID of the new memblk
  * @start: Start address of the new memblk
@@ -237,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
        for (i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];
 
-               /* make sure all blocks are inside the limits */
+               /* move / save reserved memory ranges */
+               if (!memblock_overlaps_region(&memblock.memory,
+                                       bi->start, bi->end - bi->start)) {
+                       numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+                       continue;
+               }
+
+               /* make sure all non-reserved blocks are inside the limits */
                bi->start = max(bi->start, low);
                bi->end = min(bi->end, high);
 
-               /* and there's no empty or non-exist block */
-               if (bi->start >= bi->end ||
-                   !memblock_overlaps_region(&memblock.memory,
-                       bi->start, bi->end - bi->start))
+               /* and there's no empty block */
+               if (bi->start >= bi->end)
                        numa_remove_memblk_from(i--, mi);
        }
 
@@ -881,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node);
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 start)
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
 {
-       struct numa_meminfo *mi = &numa_meminfo;
-       int nid = mi->blk[0].nid;
        int i;
 
        for (i = 0; i < mi->nr_blks; i++)
                if (mi->blk[i].start <= start && mi->blk[i].end > start)
-                       nid = mi->blk[i].nid;
+                       return mi->blk[i].nid;
+       return NUMA_NO_NODE;
+}
+
+int phys_to_target_node(phys_addr_t start)
+{
+       int nid = meminfo_to_nid(&numa_meminfo, start);
+
+       /*
+        * Prefer online nodes, but if reserved memory might be
+        * hot-added, continue the search with reserved ranges.
+        */
+       if (nid != NUMA_NO_NODE)
+               return nid;
+
+       return meminfo_to_nid(&numa_reserved_meminfo, start);
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+       int nid = meminfo_to_nid(&numa_meminfo, start);
+
+       if (nid == NUMA_NO_NODE)
+               nid = numa_meminfo.blk[0].nid;
        return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
index d37db64..eebfb8a 100644 (file)
@@ -32,5 +32,5 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
        printf "#define __NR_syscalls\t%s\n" "${nxt}"
        printf "#endif\n"
        printf "\n"
-       printf "#endif /* %s */" "${fileguard}"
+       printf "#endif /* %s */\n" "${fileguard}"
 ) > "$out"
index c15a260..c5dc833 100644 (file)
@@ -883,8 +883,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
        /* this prevents anyone from attaching or migrating to this blkcg */
        wb_blkcg_offline(blkcg);
 
-       /* put the base cgwb reference allowing step 2 to be triggered */
-       blkcg_cgwb_put(blkcg);
+       /* put the base online pin allowing step 2 to be triggered */
+       blkcg_unpin_online(blkcg);
 }
 
 /**
@@ -983,11 +983,11 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
        }
 
        spin_lock_init(&blkcg->lock);
+       refcount_set(&blkcg->online_pin, 1);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
-       refcount_set(&blkcg->cgwb_refcnt, 1);
 #endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
 
@@ -1006,6 +1006,21 @@ unlock:
        return ret;
 }
 
+static int blkcg_css_online(struct cgroup_subsys_state *css)
+{
+       struct blkcg *blkcg = css_to_blkcg(css);
+       struct blkcg *parent = blkcg_parent(blkcg);
+
+       /*
+        * blkcg_pin_online() is used to delay blkcg offline so that blkgs
+        * don't go offline while cgwbs are still active on them.  Pin the
+        * parent so that offline always happens towards the root.
+        */
+       if (parent)
+               blkcg_pin_online(parent);
+       return 0;
+}
+
 /**
  * blkcg_init_queue - initialize blkcg part of request queue
  * @q: request_queue to initialize
@@ -1199,6 +1214,7 @@ static void blkcg_exit(struct task_struct *tsk)
 
 struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
+       .css_online = blkcg_css_online,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
index f6291ce..8e56884 100644 (file)
@@ -1289,7 +1289,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 * the driver there was more coming, but that turned out to
                 * be a lie.
                 */
-               if (q->mq_ops->commit_rqs)
+               if (q->mq_ops->commit_rqs && queued)
                        q->mq_ops->commit_rqs(hctx);
 
                spin_lock(&hctx->lock);
@@ -1911,6 +1911,8 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list)
 {
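+       /* Count requests successfully issued so commit_rqs() is only called when something was actually queued. */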
+       int queued = 0;
+
        while (!list_empty(list)) {
                blk_status_t ret;
                struct request *rq = list_first_entry(list, struct request,
@@ -1926,7 +1928,8 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                                break;
                        }
                        blk_mq_end_request(rq, ret);
-               }
+               } else
+                       queued++;
        }
 
        /*
@@ -1934,7 +1937,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
         * the driver there was more coming, but that turned out to
         * be a lie.
         */
-       if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs)
+       if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued)
                hctx->queue->mq_ops->commit_rqs(hctx);
 }
 
index b79c451..bc1ded1 100644 (file)
@@ -496,7 +496,7 @@ int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev)
 
        if (!disk_part_scan_enabled(disk))
                return 0;
-       if (bdev->bd_part_count || bdev->bd_super)
+       if (bdev->bd_part_count || bdev->bd_openers > 1)
                return -EBUSY;
        res = invalidate_partition(disk, 0);
        if (res)
index 74920bd..dcecc9f 100644 (file)
@@ -138,6 +138,10 @@ source "drivers/virt/Kconfig"
 
 source "drivers/virtio/Kconfig"
 
+source "drivers/vdpa/Kconfig"
+
+source "drivers/vhost/Kconfig"
+
 source "drivers/hv/Kconfig"
 
 source "drivers/xen/Kconfig"
index 7646549..c0cd1b9 100644 (file)
@@ -42,6 +42,7 @@ obj-$(CONFIG_DMADEVICES)      += dma/
 obj-y                          += soc/
 
 obj-$(CONFIG_VIRTIO)           += virtio/
+obj-$(CONFIG_VDPA)             += vdpa/
 obj-$(CONFIG_XEN)              += xen/
 
 # regulators early, since some subsystems rely on them to initialize
index ed3d2d1..7d04424 100644 (file)
@@ -1015,6 +1015,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev)
                return ops;
 
        if (dev_is_pci(dev)) {
+               struct iommu_fwspec *fwspec;
                struct pci_bus *bus = to_pci_dev(dev)->bus;
                struct iort_pci_alias_info info = { .dev = dev };
 
@@ -1027,8 +1028,9 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev)
                err = pci_for_each_dma_alias(to_pci_dev(dev),
                                             iort_pci_iommu_init, &info);
 
-               if (!err && iort_pci_rc_supports_ats(node))
-                       dev->iommu_fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
+               fwspec = dev_iommu_fwspec_get(dev);
+               if (fwspec && iort_pci_rc_supports_ats(node))
+                       fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
        } else {
                int i = 0;
 
index 4816df5..b4c0152 100644 (file)
@@ -1589,8 +1589,8 @@ static int acpi_ec_add(struct acpi_device *device)
        strcpy(acpi_device_name(device), ACPI_EC_DEVICE_NAME);
        strcpy(acpi_device_class(device), ACPI_EC_CLASS);
 
-       if ((boot_ec && boot_ec->handle == device->handle) ||
-           !strcmp(acpi_device_hid(device), ACPI_ECDT_HID)) {
+       if (boot_ec && (boot_ec->handle == device->handle ||
+           !strcmp(acpi_device_hid(device), ACPI_ECDT_HID))) {
                /* Fast path: this device corresponds to the boot EC. */
                ec = boot_ec;
        } else {
index a3320f9..fa4500f 100644 (file)
@@ -360,7 +360,7 @@ static union acpi_object *acpi_label_info(acpi_handle handle)
 
 static u8 nfit_dsm_revid(unsigned family, unsigned func)
 {
-       static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
+       static const u8 revid_table[NVDIMM_FAMILY_MAX+1][NVDIMM_CMD_MAX+1] = {
                [NVDIMM_FAMILY_INTEL] = {
                        [NVDIMM_INTEL_GET_MODES] = 2,
                        [NVDIMM_INTEL_GET_FWINFO] = 2,
@@ -386,7 +386,7 @@ static u8 nfit_dsm_revid(unsigned family, unsigned func)
 
        if (family > NVDIMM_FAMILY_MAX)
                return 0;
-       if (func > 31)
+       if (func > NVDIMM_CMD_MAX)
                return 0;
        id = revid_table[family][func];
        if (id == 0)
@@ -492,7 +492,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
         * Check for a valid command.  For ND_CMD_CALL, we also have to
         * make sure that the DSM function is supported.
         */
-       if (cmd == ND_CMD_CALL && !test_bit(func, &dsm_mask))
+       if (cmd == ND_CMD_CALL &&
+           (func > NVDIMM_CMD_MAX || !test_bit(func, &dsm_mask)))
                return -ENOTTY;
        else if (!test_bit(cmd, &cmd_mask))
                return -ENOTTY;
@@ -2026,8 +2027,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
                        continue;
                }
 
-               if (nfit_mem->bdw && nfit_mem->memdev_pmem)
+               if (nfit_mem->bdw && nfit_mem->memdev_pmem) {
                        set_bit(NDD_ALIASING, &flags);
+                       set_bit(NDD_LABELING, &flags);
+               }
 
                /* collate flags across all memdevs for this dimm */
                list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
@@ -3492,7 +3495,8 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
        if (nvdimm && cmd == ND_CMD_CALL &&
                        call_pkg->nd_family == NVDIMM_FAMILY_INTEL) {
                func = call_pkg->nd_command;
-               if ((1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK)
+               if (func > NVDIMM_CMD_MAX ||
+                   (1 << func) & NVDIMM_INTEL_SECURITY_CMDMASK)
                        return -EOPNOTSUPP;
        }
 
index 2424194..f5525f8 100644 (file)
@@ -34,6 +34,7 @@
                | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
 
 #define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV
+#define NVDIMM_CMD_MAX 31
 
 #define NVDIMM_STANDARD_CMDMASK \
 (1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
@@ -144,32 +145,32 @@ struct nfit_spa {
        unsigned long ars_state;
        u32 clear_err_unit;
        u32 max_ars;
-       struct acpi_nfit_system_address spa[0];
+       struct acpi_nfit_system_address spa[];
 };
 
 struct nfit_dcr {
        struct list_head list;
-       struct acpi_nfit_control_region dcr[0];
+       struct acpi_nfit_control_region dcr[];
 };
 
 struct nfit_bdw {
        struct list_head list;
-       struct acpi_nfit_data_region bdw[0];
+       struct acpi_nfit_data_region bdw[];
 };
 
 struct nfit_idt {
        struct list_head list;
-       struct acpi_nfit_interleave idt[0];
+       struct acpi_nfit_interleave idt[];
 };
 
 struct nfit_flush {
        struct list_head list;
-       struct acpi_nfit_flush_address flush[0];
+       struct acpi_nfit_flush_address flush[];
 };
 
 struct nfit_memdev {
        struct list_head list;
-       struct acpi_nfit_memory_map memdev[0];
+       struct acpi_nfit_memory_map memdev[];
 };
 
 enum nfit_mem_flags {
index eadbf90..47b4969 100644 (file)
@@ -72,47 +72,6 @@ int acpi_map_pxm_to_node(int pxm)
 }
 EXPORT_SYMBOL(acpi_map_pxm_to_node);
 
-/**
- * acpi_map_pxm_to_online_node - Map proximity ID to online node
- * @pxm: ACPI proximity ID
- *
- * This is similar to acpi_map_pxm_to_node(), but always returns an online
- * node.  When the mapped node from a given proximity ID is offline, it
- * looks up the node distance table and returns the nearest online node.
- *
- * ACPI device drivers, which are called after the NUMA initialization has
- * completed in the kernel, can call this interface to obtain their device
- * NUMA topology from ACPI tables.  Such drivers do not have to deal with
- * offline nodes.  A node may be offline when a device proximity ID is
- * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
- * "numa=off" on x86.
- */
-int acpi_map_pxm_to_online_node(int pxm)
-{
-       int node, min_node;
-
-       node = acpi_map_pxm_to_node(pxm);
-
-       if (node == NUMA_NO_NODE)
-               node = 0;
-
-       min_node = node;
-       if (!node_online(node)) {
-               int min_dist = INT_MAX, dist, n;
-
-               for_each_online_node(n) {
-                       dist = node_distance(node, n);
-                       if (dist < min_dist) {
-                               min_dist = dist;
-                               min_node = n;
-                       }
-               }
-       }
-
-       return min_node;
-}
-EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
-
 static void __init
 acpi_table_print_srat_entry(struct acpi_subtable_header *header)
 {
index 4086718..dbec3a0 100644 (file)
 
 #define MEMORY_CLASS_NAME      "memory"
 
+static const char *const online_type_to_str[] = {
+       [MMOP_OFFLINE] = "offline",
+       [MMOP_ONLINE] = "online",
+       [MMOP_ONLINE_KERNEL] = "online_kernel",
+       [MMOP_ONLINE_MOVABLE] = "online_movable",
+};
+
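+/*
+ * Translate a memory block "state" string (e.g. "online_kernel") into the
+ * matching MMOP_* constant; returns -EINVAL for anything unrecognized.
+ */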
+int memhp_online_type_from_str(const char *str)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
+               if (sysfs_streq(str, online_type_to_str[i]))
+                       return i;
+       }
+       return -EINVAL;
+}
+
 #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
 
 static int sections_per_block;
@@ -145,45 +163,6 @@ int memory_notify(unsigned long val, void *v)
 }
 
 /*
- * The probe routines leave the pages uninitialized, just as the bootmem code
- * does. Make sure we do not access them, but instead use only information from
- * within sections.
- */
-static bool pages_correctly_probed(unsigned long start_pfn)
-{
-       unsigned long section_nr = pfn_to_section_nr(start_pfn);
-       unsigned long section_nr_end = section_nr + sections_per_block;
-       unsigned long pfn = start_pfn;
-
-       /*
-        * memmap between sections is not contiguous except with
-        * SPARSEMEM_VMEMMAP. We lookup the page once per section
-        * and assume memmap is contiguous within each section
-        */
-       for (; section_nr < section_nr_end; section_nr++) {
-               if (WARN_ON_ONCE(!pfn_valid(pfn)))
-                       return false;
-
-               if (!present_section_nr(section_nr)) {
-                       pr_warn("section %ld pfn[%lx, %lx) not present\n",
-                               section_nr, pfn, pfn + PAGES_PER_SECTION);
-                       return false;
-               } else if (!valid_section_nr(section_nr)) {
-                       pr_warn("section %ld pfn[%lx, %lx) no valid memmap\n",
-                               section_nr, pfn, pfn + PAGES_PER_SECTION);
-                       return false;
-               } else if (online_section_nr(section_nr)) {
-                       pr_warn("section %ld pfn[%lx, %lx) is already online\n",
-                               section_nr, pfn, pfn + PAGES_PER_SECTION);
-                       return false;
-               }
-               pfn += PAGES_PER_SECTION;
-       }
-
-       return true;
-}
-
-/*
  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  * OK to have direct references to sparsemem variables in here.
  */
@@ -199,9 +178,6 @@ memory_block_action(unsigned long start_section_nr, unsigned long action,
 
        switch (action) {
        case MEM_ONLINE:
-               if (!pages_correctly_probed(start_pfn))
-                       return -EBUSY;
-
                ret = online_pages(start_pfn, nr_pages, online_type, nid);
                break;
        case MEM_OFFLINE:
@@ -245,17 +221,14 @@ static int memory_subsys_online(struct device *dev)
                return 0;
 
        /*
-        * If we are called from state_store(), online_type will be
-        * set >= 0 Otherwise we were called from the device online
-        * attribute and need to set the online_type.
+        * When called via device_online() without configuring the online_type,
+        * we want to default to MMOP_ONLINE.
         */
-       if (mem->online_type < 0)
-               mem->online_type = MMOP_ONLINE_KEEP;
+       if (mem->online_type == MMOP_OFFLINE)
+               mem->online_type = MMOP_ONLINE;
 
        ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
-
-       /* clear online_type */
-       mem->online_type = -1;
+       mem->online_type = MMOP_OFFLINE;
 
        return ret;
 }
@@ -267,40 +240,27 @@ static int memory_subsys_offline(struct device *dev)
        if (mem->state == MEM_OFFLINE)
                return 0;
 
-       /* Can't offline block with non-present sections */
-       if (mem->section_count != sections_per_block)
-               return -EINVAL;
-
        return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
 }
 
 static ssize_t state_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
 {
+       const int online_type = memhp_online_type_from_str(buf);
        struct memory_block *mem = to_memory_block(dev);
-       int ret, online_type;
+       int ret;
+
+       if (online_type < 0)
+               return -EINVAL;
 
        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;
 
-       if (sysfs_streq(buf, "online_kernel"))
-               online_type = MMOP_ONLINE_KERNEL;
-       else if (sysfs_streq(buf, "online_movable"))
-               online_type = MMOP_ONLINE_MOVABLE;
-       else if (sysfs_streq(buf, "online"))
-               online_type = MMOP_ONLINE_KEEP;
-       else if (sysfs_streq(buf, "offline"))
-               online_type = MMOP_OFFLINE;
-       else {
-               ret = -EINVAL;
-               goto err;
-       }
-
        switch (online_type) {
        case MMOP_ONLINE_KERNEL:
        case MMOP_ONLINE_MOVABLE:
-       case MMOP_ONLINE_KEEP:
+       case MMOP_ONLINE:
                /* mem->online_type is protected by device_hotplug_lock */
                mem->online_type = online_type;
                ret = device_online(&mem->dev);
@@ -312,7 +272,6 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr,
                ret = -EINVAL; /* should never happen */
        }
 
-err:
        unlock_device_hotplug();
 
        if (ret < 0)
@@ -380,7 +339,8 @@ static ssize_t valid_zones_show(struct device *dev,
        }
 
        nid = mem->nid;
-       default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages);
+       default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, start_pfn,
+                                         nr_pages);
        strcat(buf, default_zone->name);
 
        print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL,
@@ -418,23 +378,20 @@ static DEVICE_ATTR_RO(block_size_bytes);
 static ssize_t auto_online_blocks_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
 {
-       if (memhp_auto_online)
-               return sprintf(buf, "online\n");
-       else
-               return sprintf(buf, "offline\n");
+       return sprintf(buf, "%s\n",
+                      online_type_to_str[memhp_default_online_type]);
 }
 
 static ssize_t auto_online_blocks_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
 {
-       if (sysfs_streq(buf, "online"))
-               memhp_auto_online = true;
-       else if (sysfs_streq(buf, "offline"))
-               memhp_auto_online = false;
-       else
+       const int online_type = memhp_online_type_from_str(buf);
+
+       if (online_type < 0)
                return -EINVAL;
 
+       memhp_default_online_type = online_type;
        return count;
 }
 
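For illustration only, the attribute reworked above can be driven from userspace like any other sysfs file. This sketch assumes the usual /sys/devices/system/memory/auto_online_blocks path, root privileges, and a kernel built with memory hotplug:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/devices/system/memory/auto_online_blocks", "w");

        if (!f) {
                perror("auto_online_blocks");
                return 1;
        }
        /* any string accepted by memhp_online_type_from_str() */
        fputs("online_movable\n", f);
        return fclose(f) ? 1 : 0;
}
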
@@ -627,7 +584,7 @@ static int init_memory_block(struct memory_block **memory,
 
 static int add_memory_block(unsigned long base_section_nr)
 {
-       int ret, section_count = 0;
+       int section_count = 0;
        struct memory_block *mem;
        unsigned long nr;
 
@@ -638,12 +595,8 @@ static int add_memory_block(unsigned long base_section_nr)
 
        if (section_count == 0)
                return 0;
-       ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
-                               MEM_ONLINE);
-       if (ret)
-               return ret;
-       mem->section_count = section_count;
-       return 0;
+       return init_memory_block(&mem, base_memory_block_id(base_section_nr),
+                                MEM_ONLINE);
 }
 
 static void unregister_memory(struct memory_block *memory)
@@ -679,7 +632,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
                ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
                if (ret)
                        break;
-               mem->section_count = sections_per_block;
        }
        if (ret) {
                end_block_id = block_id;
@@ -688,7 +640,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size)
                        mem = find_memory_block_by_id(block_id);
                        if (WARN_ON_ONCE(!mem))
                                continue;
-                       mem->section_count = 0;
                        unregister_memory(mem);
                }
        }
@@ -717,7 +668,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size)
                mem = find_memory_block_by_id(block_id);
                if (WARN_ON_ONCE(!mem))
                        continue;
-               mem->section_count = 0;
                unregister_memory_block_under_nodes(mem);
                unregister_memory(mem);
        }
index a42c49e..da693e6 100644 (file)
@@ -429,11 +429,12 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
         * information.
         */
        struct file *file = lo->lo_backing_file;
+       struct request_queue *q = lo->lo_queue;
        int ret;
 
        mode |= FALLOC_FL_KEEP_SIZE;
 
-       if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+       if (!blk_queue_discard(q)) {
                ret = -EOPNOTSUPP;
                goto out;
        }
@@ -463,7 +464,7 @@ static void lo_complete_rq(struct request *rq)
        if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
            req_op(rq) != REQ_OP_READ) {
                if (cmd->ret < 0)
-                       ret = BLK_STS_IOERR;
+                       ret = errno_to_blk_status(cmd->ret);
                goto end_io;
        }
 
@@ -868,27 +869,46 @@ static void loop_config_discard(struct loop_device *lo)
        struct request_queue *q = lo->lo_queue;
 
        /*
+        * If the backing device is a block device, mirror its zeroing
+        * capability. Set the discard sectors to the block device's zeroing
+        * capabilities because loop discards result in blkdev_issue_zeroout(),
+        * not blkdev_issue_discard(). This maintains consistent behavior with
+        * file-backed loop devices: discarded regions read back as zero.
+        */
+       if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) {
+               struct request_queue *backingq;
+
+               backingq = bdev_get_queue(inode->i_bdev);
+               blk_queue_max_discard_sectors(q,
+                       backingq->limits.max_write_zeroes_sectors);
+
+               blk_queue_max_write_zeroes_sectors(q,
+                       backingq->limits.max_write_zeroes_sectors);
+
+       /*
         * We use punch hole to reclaim the free space used by the
         * image a.k.a. discard. However we do not support discard if
         * encryption is enabled, because it may give an attacker
         * useful information.
         */
-       if ((!file->f_op->fallocate) ||
-           lo->lo_encrypt_key_size) {
+       } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) {
                q->limits.discard_granularity = 0;
                q->limits.discard_alignment = 0;
                blk_queue_max_discard_sectors(q, 0);
                blk_queue_max_write_zeroes_sectors(q, 0);
-               blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
-               return;
-       }
 
-       q->limits.discard_granularity = inode->i_sb->s_blocksize;
-       q->limits.discard_alignment = 0;
+       } else {
+               q->limits.discard_granularity = inode->i_sb->s_blocksize;
+               q->limits.discard_alignment = 0;
+
+               blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+               blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
+       }
 
-       blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
-       blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-       blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+       if (q->limits.max_write_zeroes_sectors)
+               blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+       else
+               blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 }
 
 static void loop_unprepare_queue(struct loop_device *lo)
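
As a simplified model of the decision tree the rewritten loop_config_discard() implements, the sketch below is plain userspace C with hypothetical inputs; the real function derives them from the backing inode and the backing device's request queue:

#include <limits.h>
#include <stdbool.h>

struct limits_model {
        unsigned int max_discard_sectors;
        unsigned int max_write_zeroes_sectors;
};

struct limits_model loop_discard_limits(bool backing_is_blockdev, bool has_fallocate,
                                        bool encrypted, unsigned int backing_write_zeroes)
{
        struct limits_model l = { 0, 0 };

        if (backing_is_blockdev && !encrypted) {
                /* mirror the backing block device's write-zeroes limit */
                l.max_discard_sectors = backing_write_zeroes;
                l.max_write_zeroes_sectors = backing_write_zeroes;
        } else if (has_fallocate && !encrypted) {
                /* file-backed, unencrypted: punch-hole based discard */
                l.max_discard_sectors = UINT_MAX >> 9;
                l.max_write_zeroes_sectors = UINT_MAX >> 9;
        }
        /* QUEUE_FLAG_DISCARD is then set iff max_write_zeroes_sectors != 0 */
        return l;
}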
@@ -1955,7 +1975,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
        /* complete non-aio request */
        if (!cmd->use_aio || ret) {
-               cmd->ret = ret ? -EIO : 0;
+               if (ret == -EOPNOTSUPP)
+                       cmd->ret = ret;
+               else
+                       cmd->ret = ret ? -EIO : 0;
                blk_mq_complete_request(rq);
        }
 }
index 6343402..1e0a6b1 100644 (file)
@@ -337,10 +337,7 @@ struct rbd_img_request {
                u64                     snap_id;        /* for reads */
                struct ceph_snap_context *snapc;        /* for writes */
        };
-       union {
-               struct request          *rq;            /* block request */
-               struct rbd_obj_request  *obj_request;   /* obj req initiator */
-       };
+       struct rbd_obj_request  *obj_request;   /* obj req initiator */
 
        struct list_head        lock_item;
        struct list_head        object_extents; /* obj_req.ex structs */
@@ -349,7 +346,6 @@ struct rbd_img_request {
        struct pending_result   pending;
        struct work_struct      work;
        int                     work_result;
-       struct kref             kref;
 };
 
 #define for_each_obj_request(ireq, oreq) \
@@ -1320,15 +1316,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
        kref_put(&obj_request->kref, rbd_obj_request_destroy);
 }
 
-static void rbd_img_request_destroy(struct kref *kref);
-static void rbd_img_request_put(struct rbd_img_request *img_request)
-{
-       rbd_assert(img_request != NULL);
-       dout("%s: img %p (was %d)\n", __func__, img_request,
-               kref_read(&img_request->kref));
-       kref_put(&img_request->kref, rbd_img_request_destroy);
-}
-
 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
                                        struct rbd_obj_request *obj_request)
 {
@@ -1366,18 +1353,10 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
 static void img_request_layered_set(struct rbd_img_request *img_request)
 {
        set_bit(IMG_REQ_LAYERED, &img_request->flags);
-       smp_mb();
-}
-
-static void img_request_layered_clear(struct rbd_img_request *img_request)
-{
-       clear_bit(IMG_REQ_LAYERED, &img_request->flags);
-       smp_mb();
 }
 
 static bool img_request_layered_test(struct rbd_img_request *img_request)
 {
-       smp_mb();
        return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
 }
 
@@ -1619,10 +1598,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
        if (!rbd_dev->parent_spec)
                return false;
 
-       down_read(&rbd_dev->header_rwsem);
        if (rbd_dev->parent_overlap)
                counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
-       up_read(&rbd_dev->header_rwsem);
 
        if (counter < 0)
                rbd_warn(rbd_dev, "parent reference overflow");
@@ -1630,63 +1607,54 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
        return counter > 0;
 }
 
-/*
- * Caller is responsible for filling in the list of object requests
- * that comprises the image request, and the Linux request pointer
- * (if there is one).
- */
-static struct rbd_img_request *rbd_img_request_create(
-                                       struct rbd_device *rbd_dev,
-                                       enum obj_operation_type op_type,
-                                       struct ceph_snap_context *snapc)
+static void rbd_img_request_init(struct rbd_img_request *img_request,
+                                struct rbd_device *rbd_dev,
+                                enum obj_operation_type op_type)
 {
-       struct rbd_img_request *img_request;
-
-       img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
-       if (!img_request)
-               return NULL;
+       memset(img_request, 0, sizeof(*img_request));
 
        img_request->rbd_dev = rbd_dev;
        img_request->op_type = op_type;
-       if (!rbd_img_is_write(img_request))
-               img_request->snap_id = rbd_dev->spec->snap_id;
-       else
-               img_request->snapc = snapc;
-
-       if (rbd_dev_parent_get(rbd_dev))
-               img_request_layered_set(img_request);
 
        INIT_LIST_HEAD(&img_request->lock_item);
        INIT_LIST_HEAD(&img_request->object_extents);
        mutex_init(&img_request->state_mutex);
-       kref_init(&img_request->kref);
+}
+
+static void rbd_img_capture_header(struct rbd_img_request *img_req)
+{
+       struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+       lockdep_assert_held(&rbd_dev->header_rwsem);
 
-       return img_request;
+       if (rbd_img_is_write(img_req))
+               img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+       else
+               img_req->snap_id = rbd_dev->spec->snap_id;
+
+       if (rbd_dev_parent_get(rbd_dev))
+               img_request_layered_set(img_req);
 }
 
-static void rbd_img_request_destroy(struct kref *kref)
+static void rbd_img_request_destroy(struct rbd_img_request *img_request)
 {
-       struct rbd_img_request *img_request;
        struct rbd_obj_request *obj_request;
        struct rbd_obj_request *next_obj_request;
 
-       img_request = container_of(kref, struct rbd_img_request, kref);
-
        dout("%s: img %p\n", __func__, img_request);
 
        WARN_ON(!list_empty(&img_request->lock_item));
        for_each_obj_request_safe(img_request, obj_request, next_obj_request)
                rbd_img_obj_request_del(img_request, obj_request);
 
-       if (img_request_layered_test(img_request)) {
-               img_request_layered_clear(img_request);
+       if (img_request_layered_test(img_request))
                rbd_dev_parent_put(img_request->rbd_dev);
-       }
 
        if (rbd_img_is_write(img_request))
                ceph_put_snap_context(img_request->snapc);
 
-       kmem_cache_free(rbd_img_request_cache, img_request);
+       if (test_bit(IMG_REQ_CHILD, &img_request->flags))
+               kmem_cache_free(rbd_img_request_cache, img_request);
 }
 
 #define BITS_PER_OBJ   2
@@ -2849,17 +2817,22 @@ static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
 static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
 {
        struct rbd_img_request *img_req = obj_req->img_request;
+       struct rbd_device *parent = img_req->rbd_dev->parent;
        struct rbd_img_request *child_img_req;
        int ret;
 
-       child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
-                                              OBJ_OP_READ, NULL);
+       child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
        if (!child_img_req)
                return -ENOMEM;
 
+       rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
        __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
        child_img_req->obj_request = obj_req;
 
+       down_read(&parent->header_rwsem);
+       rbd_img_capture_header(child_img_req);
+       up_read(&parent->header_rwsem);
+
        dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
             obj_req);
 
@@ -2888,7 +2861,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
                                              obj_req->copyup_bvecs);
        }
        if (ret) {
-               rbd_img_request_put(child_img_req);
+               rbd_img_request_destroy(child_img_req);
                return ret;
        }
 
@@ -3647,15 +3620,15 @@ again:
        if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
                struct rbd_obj_request *obj_req = img_req->obj_request;
 
-               rbd_img_request_put(img_req);
+               rbd_img_request_destroy(img_req);
                if (__rbd_obj_handle_request(obj_req, &result)) {
                        img_req = obj_req->img_request;
                        goto again;
                }
        } else {
-               struct request *rq = img_req->rq;
+               struct request *rq = blk_mq_rq_from_pdu(img_req);
 
-               rbd_img_request_put(img_req);
+               rbd_img_request_destroy(img_req);
                blk_mq_end_request(rq, errno_to_blk_status(result));
        }
 }
@@ -4707,84 +4680,36 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
 
 static void rbd_queue_workfn(struct work_struct *work)
 {
-       struct request *rq = blk_mq_rq_from_pdu(work);
-       struct rbd_device *rbd_dev = rq->q->queuedata;
-       struct rbd_img_request *img_request;
-       struct ceph_snap_context *snapc = NULL;
+       struct rbd_img_request *img_request =
+           container_of(work, struct rbd_img_request, work);
+       struct rbd_device *rbd_dev = img_request->rbd_dev;
+       enum obj_operation_type op_type = img_request->op_type;
+       struct request *rq = blk_mq_rq_from_pdu(img_request);
        u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
        u64 length = blk_rq_bytes(rq);
-       enum obj_operation_type op_type;
        u64 mapping_size;
        int result;
 
-       switch (req_op(rq)) {
-       case REQ_OP_DISCARD:
-               op_type = OBJ_OP_DISCARD;
-               break;
-       case REQ_OP_WRITE_ZEROES:
-               op_type = OBJ_OP_ZEROOUT;
-               break;
-       case REQ_OP_WRITE:
-               op_type = OBJ_OP_WRITE;
-               break;
-       case REQ_OP_READ:
-               op_type = OBJ_OP_READ;
-               break;
-       default:
-               dout("%s: non-fs request type %d\n", __func__, req_op(rq));
-               result = -EIO;
-               goto err;
-       }
-
        /* Ignore/skip any zero-length requests */
-
        if (!length) {
                dout("%s: zero-length request\n", __func__);
                result = 0;
-               goto err_rq;
-       }
-
-       if (op_type != OBJ_OP_READ) {
-               if (rbd_is_ro(rbd_dev)) {
-                       rbd_warn(rbd_dev, "%s on read-only mapping",
-                                obj_op_name(op_type));
-                       result = -EIO;
-                       goto err;
-               }
-               rbd_assert(!rbd_is_snap(rbd_dev));
-       }
-
-       if (offset && length > U64_MAX - offset + 1) {
-               rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
-                        length);
-               result = -EINVAL;
-               goto err_rq;    /* Shouldn't happen */
+               goto err_img_request;
        }
 
        blk_mq_start_request(rq);
 
        down_read(&rbd_dev->header_rwsem);
        mapping_size = rbd_dev->mapping.size;
-       if (op_type != OBJ_OP_READ) {
-               snapc = rbd_dev->header.snapc;
-               ceph_get_snap_context(snapc);
-       }
+       rbd_img_capture_header(img_request);
        up_read(&rbd_dev->header_rwsem);
 
        if (offset + length > mapping_size) {
                rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
                         length, mapping_size);
                result = -EIO;
-               goto err_rq;
-       }
-
-       img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
-       if (!img_request) {
-               result = -ENOMEM;
-               goto err_rq;
+               goto err_img_request;
        }
-       img_request->rq = rq;
-       snapc = NULL; /* img_request consumes a ref */
 
        dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
             img_request, obj_op_name(op_type), offset, length);
@@ -4801,23 +4726,51 @@ static void rbd_queue_workfn(struct work_struct *work)
        return;
 
 err_img_request:
-       rbd_img_request_put(img_request);
-err_rq:
+       rbd_img_request_destroy(img_request);
        if (result)
                rbd_warn(rbd_dev, "%s %llx at %llx result %d",
                         obj_op_name(op_type), length, offset, result);
-       ceph_put_snap_context(snapc);
-err:
        blk_mq_end_request(rq, errno_to_blk_status(result));
 }
 
 static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
 {
-       struct request *rq = bd->rq;
-       struct work_struct *work = blk_mq_rq_to_pdu(rq);
+       struct rbd_device *rbd_dev = hctx->queue->queuedata;
+       struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
+       enum obj_operation_type op_type;
 
-       queue_work(rbd_wq, work);
+       switch (req_op(bd->rq)) {
+       case REQ_OP_DISCARD:
+               op_type = OBJ_OP_DISCARD;
+               break;
+       case REQ_OP_WRITE_ZEROES:
+               op_type = OBJ_OP_ZEROOUT;
+               break;
+       case REQ_OP_WRITE:
+               op_type = OBJ_OP_WRITE;
+               break;
+       case REQ_OP_READ:
+               op_type = OBJ_OP_READ;
+               break;
+       default:
+               rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
+               return BLK_STS_IOERR;
+       }
+
+       rbd_img_request_init(img_req, rbd_dev, op_type);
+
+       if (rbd_img_is_write(img_req)) {
+               if (rbd_is_ro(rbd_dev)) {
+                       rbd_warn(rbd_dev, "%s on read-only mapping",
+                                obj_op_name(img_req->op_type));
+                       return BLK_STS_IOERR;
+               }
+               rbd_assert(!rbd_is_snap(rbd_dev));
+       }
+
+       INIT_WORK(&img_req->work, rbd_queue_workfn);
+       queue_work(rbd_wq, &img_req->work);
        return BLK_STS_OK;
 }
 
@@ -4984,18 +4937,8 @@ out:
        return ret;
 }
 
-static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
-               unsigned int hctx_idx, unsigned int numa_node)
-{
-       struct work_struct *work = blk_mq_rq_to_pdu(rq);
-
-       INIT_WORK(work, rbd_queue_workfn);
-       return 0;
-}
-
 static const struct blk_mq_ops rbd_mq_ops = {
        .queue_rq       = rbd_queue_rq,
-       .init_request   = rbd_init_request,
 };
 
 static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -5027,8 +4970,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
        rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
        rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
        rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-       rbd_dev->tag_set.nr_hw_queues = 1;
-       rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+       rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
+       rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
 
        err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
        if (err)
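
With the tag-set change above, each block request carries a struct rbd_img_request right behind it (cmd_size), and the pdu conversions are simple pointer arithmetic. A small userspace model of that layout, using stand-in struct definitions rather than the real blk-mq types:

#include <stdlib.h>

struct request { int tag; };                    /* stand-in, not the real struct */
struct img_request { int op_type; };            /* driver PDU, like rbd_img_request */

static void *rq_to_pdu(struct request *rq)      /* models blk_mq_rq_to_pdu() */
{
        return rq + 1;                          /* PDU sits right after the request */
}

static struct request *rq_from_pdu(void *pdu)   /* models blk_mq_rq_from_pdu() */
{
        return (struct request *)pdu - 1;
}

int main(void)
{
        struct request *rq = malloc(sizeof(*rq) + sizeof(struct img_request));
        struct img_request *img;

        if (!rq)
                return 1;
        img = rq_to_pdu(rq);
        img->op_type = 0;
        free(rq_from_pdu(img));                 /* recovers the original pointer */
        return 0;
}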
index db124bc..fcc5321 100644 (file)
@@ -94,7 +94,7 @@ static void haltpoll_uninit(void)
        haltpoll_cpuidle_devices = NULL;
 }
 
-static bool haltpool_want(void)
+static bool haltpoll_want(void)
 {
        return kvm_para_has_hint(KVM_HINTS_REALTIME) || force;
 }
@@ -110,7 +110,7 @@ static int __init haltpoll_init(void)
 
        cpuidle_poll_state_init(drv);
 
-       if (!kvm_para_available() || !haltpool_want())
+       if (!kvm_para_available() || !haltpoll_want())
                return -ENODEV;
 
        ret = cpuidle_register_driver(drv);
index 095850d..f09c6cf 100644 (file)
@@ -27,6 +27,7 @@ config CRYPTO_DEV_HISI_SEC2
        select CRYPTO_SHA256
        select CRYPTO_SHA512
        depends on PCI && PCI_MSI
+       depends on UACCE || UACCE=n
        depends on ARM64 || (COMPILE_TEST && 64BIT)
        help
          Support for HiSilicon SEC Engine of version 2 in crypto subsystem.
@@ -58,6 +59,7 @@ config CRYPTO_DEV_HISI_ZIP
 config CRYPTO_DEV_HISI_HPRE
        tristate "Support for HISI HPRE accelerator"
        depends on PCI && PCI_MSI
+       depends on UACCE || UACCE=n
        depends on ARM64 || (COMPILE_TEST && 64BIT)
        select CRYPTO_DEV_HISI_QM
        select CRYPTO_DH
index 946fb62..06202bc 100644 (file)
@@ -1161,13 +1161,13 @@ static inline u32 create_aead_null_output_list(struct aead_request *req,
                                           inputlen);
                if (status != inputlen) {
                        status = -EINVAL;
-                       goto error;
+                       goto error_free;
                }
                status = sg_copy_from_buffer(req->dst, sg_nents(req->dst), ptr,
                                             inputlen);
                if (status != inputlen) {
                        status = -EINVAL;
-                       goto error;
+                       goto error_free;
                }
                kfree(ptr);
        }
@@ -1209,8 +1209,10 @@ static inline u32 create_aead_null_output_list(struct aead_request *req,
 
        req_info->outcnt = argcnt;
        return 0;
-error:
+
+error_free:
        kfree(ptr);
+error:
        return status;
 }
 
index 46e4604..df238c8 100644 (file)
@@ -421,8 +421,10 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
         * device outside of mmap of the resulting character device.
         */
        dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
-       if (!dax_dev)
+       if (IS_ERR(dax_dev)) {
+               rc = PTR_ERR(dax_dev);
                goto err;
+       }
 
        /* a device_dax instance is dead while the driver is not attached */
        kill_dax(dax_dev);
index 0aa4b6b..8e32345 100644 (file)
@@ -344,6 +344,23 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 }
 EXPORT_SYMBOL_GPL(dax_copy_to_iter);
 
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                       size_t nr_pages)
+{
+       if (!dax_alive(dax_dev))
+               return -ENXIO;
+       /*
+        * There are no callers that want to zero more than one page as of now.
+        * Once users are there, this check can be removed after the
+        * device mapper code has been updated to split ranges across targets.
+        */
+       if (nr_pages != 1)
+               return -EIO;
+
+       return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
@@ -551,9 +568,16 @@ struct dax_device *alloc_dax(void *private, const char *__host,
        dev_t devt;
        int minor;
 
+       if (ops && !ops->zero_page_range) {
+               pr_debug("%s: error: device does not provide dax"
+                        " operation zero_page_range()\n",
+                        __host ? __host : "Unknown");
+               return ERR_PTR(-EINVAL);
+       }
+
        host = kstrdup(__host, GFP_KERNEL);
        if (__host && !host)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
        if (minor < 0)
@@ -576,7 +600,7 @@ struct dax_device *alloc_dax(void *private, const char *__host,
        ida_simple_remove(&dax_minor_ida, minor);
  err_minor:
        kfree(host);
-       return NULL;
+       return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL_GPL(alloc_dax);
 
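alloc_dax() now reports failure as an encoded error pointer instead of NULL, so callers use IS_ERR()/PTR_ERR() as in the dev_dax hunk further up. A self-contained sketch of that convention, with a minimal reimplementation of the helpers that <linux/err.h> provides in the kernel:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *get_buffer(int simulate_failure)
{
        static char buf[64];

        if (simulate_failure)
                return ERR_PTR(-ENOMEM);        /* the errno rides in the pointer */
        return buf;
}

int main(void)
{
        void *p = get_buffer(1);

        if (IS_ERR(p))
                fprintf(stderr, "allocation failed: %ld\n", PTR_ERR(p));
        return 0;
}
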
index ef73b67..9626673 100644 (file)
@@ -43,11 +43,12 @@ config DMABUF_MOVE_NOTIFY
        bool "Move notify between drivers (EXPERIMENTAL)"
        default n
        help
-         Don''t pin buffers if the dynamic DMA-buf interface is available on both the
-         exporter as well as the importer. This fixes a security problem where
-         userspace is able to pin unrestricted amounts of memory through DMA-buf.
-         But marked experimental because we don''t jet have a consistent execution
-         context and memory management between drivers.
+         Don't pin buffers if the dynamic DMA-buf interface is available on
+         both the exporter as well as the importer. This fixes a security
+         problem where userspace is able to pin unrestricted amounts of memory
+         through DMA-buf.
+         This is marked experimental because we don't yet have a consistent
+         execution context and memory management between drivers.
 
 config DMABUF_SELFTESTS
        tristate "Selftests for the dma-buf interfaces"
index db0c1a9..fc9f8ab 100644 (file)
@@ -75,14 +75,12 @@ efi_status_t handle_kernel_image(unsigned long *image_addr,
 
        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) {
                /*
-                * If CONFIG_DEBUG_ALIGN_RODATA is not set, produce a
-                * displacement in the interval [0, MIN_KIMG_ALIGN) that
-                * doesn't violate this kernel's de-facto alignment
+                * Produce a displacement in the interval [0, MIN_KIMG_ALIGN)
+                * that doesn't violate this kernel's de-facto alignment
                 * constraints.
                 */
                u32 mask = (MIN_KIMG_ALIGN - 1) & ~(EFI_KIMG_ALIGN - 1);
-               u32 offset = !IS_ENABLED(CONFIG_DEBUG_ALIGN_RODATA) ?
-                            (phys_seed >> 32) & mask : TEXT_OFFSET;
+               u32 offset = (phys_seed >> 32) & mask;
 
                /*
                 * With CONFIG_RANDOMIZE_TEXT_OFFSET=y, TEXT_OFFSET may not
index c8f2aa1..f6e3f59 100644 (file)
@@ -1113,7 +1113,7 @@ static int gfx_v10_0_mec_init(struct amdgpu_device *adev)
                return r;
        }
 
-       memset(hpd, 0, adev->gfx.mec.hpd_eop_obj->tbo.mem.size);
+       memset(hpd, 0, mec_hpd_size);
 
        amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
        amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
index 37c8231..608ffe3 100644 (file)
@@ -1946,7 +1946,7 @@ static int gfx_v9_0_mec_init(struct amdgpu_device *adev)
                return r;
        }
 
-       memset(hpd, 0, adev->gfx.mec.hpd_eop_obj->tbo.mem.size);
+       memset(hpd, 0, mec_hpd_size);
 
        amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
        amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
index d5386f1..05bc6d9 100644 (file)
@@ -1112,9 +1112,9 @@ kfd_gtt_out:
        return 0;
 
 kfd_gtt_no_free_chunk:
-       pr_debug("Allocation failed with mem_obj = %p\n", mem_obj);
+       pr_debug("Allocation failed with mem_obj = %p\n", *mem_obj);
        mutex_unlock(&kfd->gtt_sa_lock);
-       kfree(mem_obj);
+       kfree(*mem_obj);
        return -ENOMEM;
 }
 
index d3674d8..bab587a 100644 (file)
@@ -3639,6 +3639,9 @@ fill_dc_plane_info_and_addr(struct amdgpu_device *adev,
        case DRM_FORMAT_NV12:
                plane_info->format = SURFACE_PIXEL_FORMAT_VIDEO_420_YCrCb;
                break;
+       case DRM_FORMAT_P010:
+               plane_info->format = SURFACE_PIXEL_FORMAT_VIDEO_420_10bpc_YCrCb;
+               break;
        default:
                DRM_ERROR(
                        "Unsupported screen format %s\n",
@@ -5535,6 +5538,8 @@ static int get_plane_formats(const struct drm_plane *plane,
 
                if (plane_cap && plane_cap->pixel_format_support.nv12)
                        formats[num_formats++] = DRM_FORMAT_NV12;
+               if (plane_cap && plane_cap->pixel_format_support.p010)
+                       formats[num_formats++] = DRM_FORMAT_P010;
                break;
 
        case DRM_PLANE_TYPE_OVERLAY:
@@ -5587,12 +5592,15 @@ static int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
        }
 
        if (plane->type == DRM_PLANE_TYPE_PRIMARY &&
-           plane_cap && plane_cap->pixel_format_support.nv12) {
+           plane_cap &&
+           (plane_cap->pixel_format_support.nv12 ||
+            plane_cap->pixel_format_support.p010)) {
                /* This only affects YUV formats. */
                drm_plane_create_color_properties(
                        plane,
                        BIT(DRM_COLOR_YCBCR_BT601) |
-                       BIT(DRM_COLOR_YCBCR_BT709),
+                       BIT(DRM_COLOR_YCBCR_BT709) |
+                       BIT(DRM_COLOR_YCBCR_BT2020),
                        BIT(DRM_COLOR_YCBCR_LIMITED_RANGE) |
                        BIT(DRM_COLOR_YCBCR_FULL_RANGE),
                        DRM_COLOR_YCBCR_BT709, DRM_COLOR_YCBCR_LIMITED_RANGE);
index 5b70ed3..78e1c11 100644 (file)
@@ -192,10 +192,13 @@ void hdcp_update_display(struct hdcp_workqueue *hdcp_work,
                                            &hdcp_work->srm_version);
 
                        display->adjust.disable = 0;
-                       if (content_type == DRM_MODE_HDCP_CONTENT_TYPE0)
+                       if (content_type == DRM_MODE_HDCP_CONTENT_TYPE0) {
+                               hdcp_w->link.adjust.hdcp1.disable = 0;
                                hdcp_w->link.adjust.hdcp2.force_type = MOD_HDCP_FORCE_TYPE_0;
-                       else if (content_type == DRM_MODE_HDCP_CONTENT_TYPE1)
+                       } else if (content_type == DRM_MODE_HDCP_CONTENT_TYPE1) {
+                               hdcp_w->link.adjust.hdcp1.disable = 1;
                                hdcp_w->link.adjust.hdcp2.force_type = MOD_HDCP_FORCE_TYPE_1;
+                       }
 
                        schedule_delayed_work(&hdcp_w->property_validate_dwork,
                                              msecs_to_jiffies(DRM_HDCP_CHECK_PERIOD_MS));
@@ -263,7 +266,7 @@ static void event_callback(struct work_struct *work)
 
        mutex_lock(&hdcp_work->mutex);
 
-       cancel_delayed_work(&hdcp_work->watchdog_timer_dwork);
+       cancel_delayed_work(&hdcp_work->callback_dwork);
 
        mod_hdcp_process_event(&hdcp_work->hdcp, MOD_HDCP_EVENT_CALLBACK,
                               &hdcp_work->output);
@@ -344,6 +347,8 @@ static void event_watchdog_timer(struct work_struct *work)
 
        mutex_lock(&hdcp_work->mutex);
 
+       cancel_delayed_work(&hdcp_work->watchdog_timer_dwork);
+
        mod_hdcp_process_event(&hdcp_work->hdcp,
                               MOD_HDCP_EVENT_WATCHDOG_TIMEOUT,
                               &hdcp_work->output);
@@ -414,7 +419,8 @@ static void update_config(void *handle, struct cp_psp_stream_config *config)
        link->dp.rev = aconnector->dc_link->dpcd_caps.dpcd_rev.raw;
        link->dp.mst_supported = config->mst_supported;
        display->adjust.disable = 1;
-       link->adjust.auth_delay = 2;
+       link->adjust.auth_delay = 3;
+       link->adjust.hdcp1.disable = 0;
 
        hdcp_update_display(hdcp_work, link_index, aconnector, DRM_MODE_HDCP_CONTENT_TYPE0, false);
 }
index 2ffb221..f21bbb2 100644 (file)
@@ -1360,6 +1360,26 @@ bool dc_commit_state(struct dc *dc, struct dc_state *context)
        return (result == DC_OK);
 }
 
+static bool is_flip_pending_in_pipes(struct dc *dc, struct dc_state *context)
+{
+       int i;
+       struct pipe_ctx *pipe;
+
+       for (i = 0; i < MAX_PIPES; i++) {
+               pipe = &context->res_ctx.pipe_ctx[i];
+
+               if (!pipe->plane_state)
+                       continue;
+
+               /* Must set to false to start with, due to OR in update function */
+               pipe->plane_state->status.is_flip_pending = false;
+               dc->hwss.update_pending_status(pipe);
+               if (pipe->plane_state->status.is_flip_pending)
+                       return true;
+       }
+       return false;
+}
+
 bool dc_post_update_surfaces_to_stream(struct dc *dc)
 {
        int i;
@@ -1370,6 +1390,9 @@ bool dc_post_update_surfaces_to_stream(struct dc *dc)
 
        post_surface_trace(dc);
 
+       if (is_flip_pending_in_pipes(dc, context))
+               return true;
+
        for (i = 0; i < dc->res_pool->pipe_count; i++)
                if (context->res_ctx.pipe_ctx[i].stream == NULL ||
                    context->res_ctx.pipe_ctx[i].plane_state == NULL) {
@@ -1703,6 +1726,9 @@ static enum surface_update_type det_surface_update(const struct dc *dc,
        if (u->coeff_reduction_factor)
                update_flags->bits.coeff_reduction_change = 1;
 
+       if (u->gamut_remap_matrix)
+               update_flags->bits.gamut_remap_change = 1;
+
        if (u->gamma) {
                enum surface_pixel_format format = SURFACE_PIXEL_FORMAT_GRPH_BEGIN;
 
@@ -1728,7 +1754,8 @@ static enum surface_update_type det_surface_update(const struct dc *dc,
 
        if (update_flags->bits.input_csc_change
                        || update_flags->bits.coeff_reduction_change
-                       || update_flags->bits.gamma_change) {
+                       || update_flags->bits.gamma_change
+                       || update_flags->bits.gamut_remap_change) {
                type = UPDATE_TYPE_FULL;
                elevate_update_type(&overall_type, type);
        }
@@ -1973,6 +2000,10 @@ static void copy_surface_update_to_plane(
        if (srf_update->coeff_reduction_factor)
                surface->coeff_reduction_factor =
                        *srf_update->coeff_reduction_factor;
+
+       if (srf_update->gamut_remap_matrix)
+               surface->gamut_remap_matrix =
+                       *srf_update->gamut_remap_matrix;
 }
 
 static void copy_stream_update_to_stream(struct dc *dc,
index 75c7ce4..f4bcc71 100644 (file)
@@ -1077,6 +1077,7 @@ bool resource_build_scaling_params(struct pipe_ctx *pipe_ctx)
         * on certain displays, such as the Sharp 4k
         */
        pipe_ctx->plane_res.scl_data.lb_params.depth = LB_PIXEL_DEPTH_30BPP;
+       pipe_ctx->plane_res.scl_data.lb_params.alpha_en = plane_state->per_pixel_alpha;
 
        pipe_ctx->plane_res.scl_data.recout.x += timing->h_border_left;
        pipe_ctx->plane_res.scl_data.recout.y += timing->v_border_top;
index d3ceb39..1935cf6 100644 (file)
@@ -726,6 +726,7 @@ union surface_update_flags {
                uint32_t output_tf_change:1;
                uint32_t pixel_format_change:1;
                uint32_t plane_size_change:1;
+               uint32_t gamut_remap_change:1;
 
                /* Full updates */
                uint32_t new_plane:1;
@@ -760,6 +761,7 @@ struct dc_plane_state {
        struct dc_csc_transform input_csc_color_matrix;
        struct fixed31_32 coeff_reduction_factor;
        struct fixed31_32 hdr_mult;
+       struct colorspace_transform gamut_remap_matrix;
 
        // TODO: No longer used, remove
        struct dc_hdr_static_metadata hdr_static_ctx;
@@ -839,6 +841,7 @@ struct dc_surface_update {
        const struct dc_transfer_func *func_shaper;
        const struct dc_3dlut *lut3d_func;
        const struct dc_transfer_func *blend_tf;
+       const struct colorspace_transform *gamut_remap_matrix;
 };
 
 /*
index 9cc3314..0be0100 100644 (file)
@@ -2004,6 +2004,12 @@ void dcn10_program_gamut_remap(struct pipe_ctx *pipe_ctx)
                for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
                        adjust.temperature_matrix[i] =
                                pipe_ctx->stream->gamut_remap_matrix.matrix[i];
+       } else if (pipe_ctx->plane_state &&
+                  pipe_ctx->plane_state->gamut_remap_matrix.enable_remap == true) {
+               adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW;
+               for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++)
+                       adjust.temperature_matrix[i] =
+                               pipe_ctx->plane_state->gamut_remap_matrix.matrix[i];
        }
 
        pipe_ctx->plane_res.dpp->funcs->dpp_set_gamut_remap(pipe_ctx->plane_res.dpp, &adjust);
index 63acb8f..17d96ec 100644 (file)
@@ -343,6 +343,23 @@ void optc1_set_blank_data_double_buffer(struct timing_generator *optc, bool enab
 }
 
 /**
+ * optc1_set_timing_double_buffer() - DRR double buffering control
+ *
+ * Sets double buffer point for V_TOTAL, H_TOTAL, VTOTAL_MIN,
+ * VTOTAL_MAX, VTOTAL_MIN_SEL and VTOTAL_MAX_SEL registers.
+ *
+ * Options: any time,  start of frame, dp start of frame (range timing)
+ */
+void optc1_set_timing_double_buffer(struct timing_generator *optc, bool enable)
+{
+       struct optc *optc1 = DCN10TG_FROM_TG(optc);
+       uint32_t mode = enable ? 2 : 0;
+
+       REG_UPDATE(OTG_DOUBLE_BUFFER_CONTROL,
+                  OTG_RANGE_TIMING_DBUF_UPDATE_MODE, mode);
+}
+
+/**
  * unblank_crtc
  * Call ASIC Control Object to UnBlank CRTC.
  */
@@ -1353,6 +1370,7 @@ void optc1_clear_optc_underflow(struct timing_generator *optc)
 void optc1_tg_init(struct timing_generator *optc)
 {
        optc1_set_blank_data_double_buffer(optc, true);
+       optc1_set_timing_double_buffer(optc, true);
        optc1_clear_optc_underflow(optc);
 }
 
index f277656..9a459a8 100644 (file)
@@ -185,6 +185,7 @@ struct dcn_optc_registers {
        SF(OTG0_OTG_GLOBAL_CONTROL0, OTG_MASTER_UPDATE_LOCK_SEL, mask_sh),\
        SF(OTG0_OTG_DOUBLE_BUFFER_CONTROL, OTG_UPDATE_PENDING, mask_sh),\
        SF(OTG0_OTG_DOUBLE_BUFFER_CONTROL, OTG_BLANK_DATA_DOUBLE_BUFFER_EN, mask_sh),\
+       SF(OTG0_OTG_DOUBLE_BUFFER_CONTROL, OTG_RANGE_TIMING_DBUF_UPDATE_MODE, mask_sh),\
        SF(OTG0_OTG_H_TOTAL, OTG_H_TOTAL, mask_sh),\
        SF(OTG0_OTG_H_BLANK_START_END, OTG_H_BLANK_START, mask_sh),\
        SF(OTG0_OTG_H_BLANK_START_END, OTG_H_BLANK_END, mask_sh),\
@@ -643,6 +644,8 @@ bool optc1_is_optc_underflow_occurred(struct timing_generator *optc);
 
 void optc1_set_blank_data_double_buffer(struct timing_generator *optc, bool enable);
 
+void optc1_set_timing_double_buffer(struct timing_generator *optc, bool enable);
+
 bool optc1_get_otg_active_size(struct timing_generator *optc,
                uint32_t *otg_active_width,
                uint32_t *otg_active_height);
index 261bdc3..8b71222 100644 (file)
@@ -552,7 +552,8 @@ static const struct dc_plane_cap plane_cap = {
        .pixel_format_support = {
                        .argb8888 = true,
                        .nv12 = true,
-                       .fp16 = true
+                       .fp16 = true,
+                       .p010 = true
        },
 
        .max_upscale_factor = {
index a673952..5cdbba0 100644 (file)
@@ -1012,7 +1012,8 @@ static const struct dc_plane_cap plane_cap = {
        .pixel_format_support = {
                        .argb8888 = true,
                        .nv12 = true,
-                       .fp16 = true
+                       .fp16 = true,
+                       .p010 = true
        },
 
        .max_upscale_factor = {
@@ -3342,7 +3343,7 @@ void dcn20_cap_soc_clocks(
 void dcn20_update_bounding_box(struct dc *dc, struct _vcs_dpi_soc_bounding_box_st *bb,
                struct pp_smu_nv_clock_table *max_clocks, unsigned int *uclk_states, unsigned int num_states)
 {
-       struct _vcs_dpi_voltage_scaling_st calculated_states[MAX_CLOCK_LIMIT_STATES];
+       struct _vcs_dpi_voltage_scaling_st calculated_states[DC__VOLTAGE_STATES];
        int i;
        int num_calculated_states = 0;
        int min_dcfclk = 0;
index 51b5910..b25484a 100644 (file)
@@ -300,7 +300,7 @@ struct _vcs_dpi_soc_bounding_box_st dcn2_1_soc = {
        .xfc_bus_transport_time_us = 4,
        .xfc_xbuf_latency_tolerance_us = 4,
        .use_urgent_burst_bw = 1,
-       .num_states = 9
+       .num_states = 8
 };
 
 #ifndef MAX
@@ -838,7 +838,8 @@ static const struct dc_plane_cap plane_cap = {
        .pixel_format_support = {
                        .argb8888 = true,
                        .nv12 = true,
-                       .fp16 = true
+                       .fp16 = true,
+                       .p010 = true
        },
 
        .max_upscale_factor = {
@@ -1376,21 +1377,8 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
        unsigned int i, j, k;
        int closest_clk_lvl;
 
-       // diags does not retrieve proper values from SMU
-       // cap states to 5 and make state 5 the max state
-       if (IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment) || IS_DIAG_DC(dc->ctx->dce_environment)) {
-               dcn2_1_soc.num_states = 5;
-
-               dcn2_1_soc.clock_limits[5].state = 5;
-               dcn2_1_soc.clock_limits[5].dcfclk_mhz = 810.0;
-               dcn2_1_soc.clock_limits[5].fabricclk_mhz = 1600.0;
-               dcn2_1_soc.clock_limits[5].dispclk_mhz = 1395.0;
-               dcn2_1_soc.clock_limits[5].dppclk_mhz = 1285.0;
-               dcn2_1_soc.clock_limits[5].phyclk_mhz = 1325.0;
-               dcn2_1_soc.clock_limits[5].socclk_mhz = 953.0;
-               dcn2_1_soc.clock_limits[5].dscclk_mhz = 489.0;
-               dcn2_1_soc.clock_limits[5].dram_speed_mts = 4266.0;
-       } else {
+       // Default clock levels are used for diags, which may lead to overclocking.
+       if (!IS_FPGA_MAXIMUS_DC(dc->ctx->dce_environment) && !IS_DIAG_DC(dc->ctx->dce_environment)) {
                dcn2_1_ip.max_num_otg = pool->base.res_cap->num_timing_generator;
                dcn2_1_ip.max_num_dpp = pool->base.pipe_count;
                dcn2_1_soc.num_chans = bw_params->num_channels;
@@ -1403,16 +1391,16 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
                dcn2_1_soc.clock_limits[0].dram_speed_mts = clk_table->entries[0].memclk_mhz * 2;
 
                /*
-                * Other levels: find cloest DCN clocks that fit the given clock limit using dcfclk
-                * as indicater
+                * Other levels: find closest DCN clocks that fit the given clock limit using dcfclk
+                * as indicator
                 */
 
                closest_clk_lvl = -1;
                /* index currently being filled */
                k = 1;
                for (i = 1; i < clk_table->num_entries; i++) {
-                       /* loop backwards, skip duplicate state, +1 because SMU has precision issue */
-                       for (j = dcn2_1_soc.num_states - 2; j >= k; j--) {
+                       /* loop backwards, skip duplicate state*/
+                       for (j = dcn2_1_soc.num_states - 1; j >= k; j--) {
                                if ((unsigned int) dcn2_1_soc.clock_limits[j].dcfclk_mhz <= clk_table->entries[i].dcfclk_mhz) {
                                        closest_clk_lvl = j;
                                        break;
@@ -1437,13 +1425,13 @@ static void update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw_param
                                k++;
                        }
                }
-
-               /* duplicate last level */
-               dcn2_1_soc.clock_limits[k] = dcn2_1_soc.clock_limits[k - 1];
-               dcn2_1_soc.clock_limits[k].state = k;
-               dcn2_1_soc.num_states = k + 1;
+               dcn2_1_soc.num_states = k;
        }
 
+       /* duplicate last level */
+       dcn2_1_soc.clock_limits[dcn2_1_soc.num_states] = dcn2_1_soc.clock_limits[dcn2_1_soc.num_states - 1];
+       dcn2_1_soc.clock_limits[dcn2_1_soc.num_states].state = dcn2_1_soc.num_states;
+
        dml_init_instance(&dc->dml, &dcn2_1_soc, &dcn2_1_ip, DML_PROJECT_DCN21);
 }
 
index ea4cde9..2a19833 100644 (file)
@@ -29,7 +29,7 @@
 #define DC__PRESENT 1
 #define DC__PRESENT__1 1
 #define DC__NUM_DPP 4
-#define DC__VOLTAGE_STATES 7
+#define DC__VOLTAGE_STATES 9
 #define DC__NUM_DPP__4 1
 #define DC__NUM_DPP__0_PRESENT 1
 #define DC__NUM_DPP__1_PRESENT 1
index dfd3be4..687010c 100644 (file)
  * Authors: AMD
  *
  */
+
+#include "dc_features.h"
+
 #ifndef __DISPLAY_MODE_STRUCTS_H__
 #define __DISPLAY_MODE_STRUCTS_H__
 
-#define MAX_CLOCK_LIMIT_STATES 9
-
 typedef struct _vcs_dpi_voltage_scaling_st voltage_scaling_st;
 typedef struct _vcs_dpi_soc_bounding_box_st soc_bounding_box_st;
 typedef struct _vcs_dpi_ip_params_st ip_params_st;
@@ -68,7 +69,7 @@ struct _vcs_dpi_voltage_scaling_st {
 };
 
 struct _vcs_dpi_soc_bounding_box_st {
-       struct _vcs_dpi_voltage_scaling_st clock_limits[MAX_CLOCK_LIMIT_STATES];
+       struct _vcs_dpi_voltage_scaling_st clock_limits[DC__VOLTAGE_STATES];
        unsigned int num_states;
        double sr_exit_time_us;
        double sr_enter_plus_exit_time_us;
index 4e54282..c33454a 100644 (file)
@@ -734,6 +734,7 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
 {
        struct core_freesync *core_freesync = NULL;
        unsigned long long nominal_field_rate_in_uhz = 0;
+       unsigned long long rounded_nominal_in_uhz = 0;
        unsigned int refresh_range = 0;
        unsigned long long min_refresh_in_uhz = 0;
        unsigned long long max_refresh_in_uhz = 0;
@@ -750,17 +751,20 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
        min_refresh_in_uhz = in_config->min_refresh_in_uhz;
        max_refresh_in_uhz = in_config->max_refresh_in_uhz;
 
-       // Don't allow min > max
-       if (min_refresh_in_uhz > max_refresh_in_uhz)
-               min_refresh_in_uhz = max_refresh_in_uhz;
-
        // Full range may be larger than current video timing, so cap at nominal
        if (max_refresh_in_uhz > nominal_field_rate_in_uhz)
                max_refresh_in_uhz = nominal_field_rate_in_uhz;
 
        // Full range may be larger than current video timing, so cap at nominal
-       if (min_refresh_in_uhz > nominal_field_rate_in_uhz)
-               min_refresh_in_uhz = nominal_field_rate_in_uhz;
+       if (min_refresh_in_uhz > max_refresh_in_uhz)
+               min_refresh_in_uhz = max_refresh_in_uhz;
+
+       // If a monitor reports exactly max refresh of 2x of min, enforce it on nominal
+       rounded_nominal_in_uhz =
+                       div_u64(nominal_field_rate_in_uhz + 50000, 100000) * 100000;
+       if (in_config->max_refresh_in_uhz == (2 * in_config->min_refresh_in_uhz) &&
+               in_config->max_refresh_in_uhz == rounded_nominal_in_uhz)
+               min_refresh_in_uhz = div_u64(nominal_field_rate_in_uhz, 2);
 
        if (!vrr_settings_require_update(core_freesync,
                        in_config, (unsigned int)min_refresh_in_uhz, (unsigned int)max_refresh_in_uhz,
@@ -792,11 +796,6 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
                refresh_range = in_out_vrr->max_refresh_in_uhz -
                                in_out_vrr->min_refresh_in_uhz;
 
-               in_out_vrr->btr.margin_in_us = in_out_vrr->max_duration_in_us -
-                               2 * in_out_vrr->min_duration_in_us;
-               if (in_out_vrr->btr.margin_in_us > BTR_MAX_MARGIN)
-                       in_out_vrr->btr.margin_in_us = BTR_MAX_MARGIN;
-
                in_out_vrr->supported = true;
        }
 
@@ -804,9 +803,14 @@ void mod_freesync_build_vrr_params(struct mod_freesync *mod_freesync,
 
        in_out_vrr->btr.btr_enabled = in_config->btr;
 
-       if (in_out_vrr->max_refresh_in_uhz <
-                       2 * in_out_vrr->min_refresh_in_uhz)
+       if (in_out_vrr->max_refresh_in_uhz < (2 * in_out_vrr->min_refresh_in_uhz))
                in_out_vrr->btr.btr_enabled = false;
+       else {
+               in_out_vrr->btr.margin_in_us = in_out_vrr->max_duration_in_us -
+                               2 * in_out_vrr->min_duration_in_us;
+               if (in_out_vrr->btr.margin_in_us > BTR_MAX_MARGIN)
+                       in_out_vrr->btr.margin_in_us = BTR_MAX_MARGIN;
+       }
 
        in_out_vrr->btr.btr_active = false;
        in_out_vrr->btr.inserted_duration_in_us = 0;
@@ -1008,8 +1012,8 @@ unsigned long long mod_freesync_calc_nominal_field_rate(
        unsigned int total = stream->timing.h_total * stream->timing.v_total;
 
        /* Calculate nominal field rate for stream, rounded up to nearest integer */
-       nominal_field_rate_in_uhz = stream->timing.pix_clk_100hz / 10;
-       nominal_field_rate_in_uhz *= 1000ULL * 1000ULL * 1000ULL;
+       nominal_field_rate_in_uhz = stream->timing.pix_clk_100hz;
+       nominal_field_rate_in_uhz *= 100000000ULL;
 
        nominal_field_rate_in_uhz =     div_u64(nominal_field_rate_in_uhz, total);
 
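The unit fix above scales the pixel clock (stored in units of 100 Hz) by 100,000,000 to reach micro-Hz before dividing by h_total * v_total. A quick standalone check with illustrative 1080p-class timings (assumed numbers, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t pix_clk_100hz = 1485000;               /* 148.5 MHz pixel clock */
        uint64_t total = 2200ULL * 1125ULL;             /* h_total * v_total */
        uint64_t field_rate_uhz = pix_clk_100hz * 100000000ULL / total;

        /* prints 60000000 uHz, i.e. a 60 Hz nominal field rate */
        printf("%llu uHz\n", (unsigned long long)field_rate_uhz);
        return 0;
}
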
index e9fbd94..cc1d3f4 100644 (file)
@@ -328,8 +328,7 @@ enum mod_hdcp_status mod_hdcp_add_display(struct mod_hdcp *hdcp,
        /* add display to connection */
        hdcp->connection.link = *link;
        *display_container = *display;
-       status = mod_hdcp_add_display_to_topology(hdcp, display_container);
-
+       status = mod_hdcp_add_display_to_topology(hdcp, display->index);
        if (status != MOD_HDCP_STATUS_SUCCESS)
                goto out;
 
@@ -375,7 +374,7 @@ enum mod_hdcp_status mod_hdcp_remove_display(struct mod_hdcp *hdcp,
        status = mod_hdcp_remove_display_from_topology(hdcp, index);
        if (status != MOD_HDCP_STATUS_SUCCESS)
                goto out;
-       memset(display, 0, sizeof(struct mod_hdcp_display));
+       display->state = MOD_HDCP_DISPLAY_INACTIVE;
 
        /* request authentication when connection is not reset */
        if (current_state(hdcp) != HDCP_UNINITIALIZED)
index 60ff1a0..5cb4546 100644 (file)
@@ -328,7 +328,7 @@ void mod_hdcp_dump_binary_message(uint8_t *msg, uint32_t msg_size,
 
 /* psp functions */
 enum mod_hdcp_status mod_hdcp_add_display_to_topology(
-               struct mod_hdcp *hdcp, struct mod_hdcp_display *display);
+               struct mod_hdcp *hdcp, uint8_t index);
 enum mod_hdcp_status mod_hdcp_remove_display_from_topology(
                struct mod_hdcp *hdcp, uint8_t index);
 enum mod_hdcp_status mod_hdcp_hdcp1_create_session(struct mod_hdcp *hdcp);
@@ -503,6 +503,11 @@ static inline uint8_t is_display_active(struct mod_hdcp_display *display)
        return display->state >= MOD_HDCP_DISPLAY_ACTIVE;
 }
 
+static inline uint8_t is_display_added(struct mod_hdcp_display *display)
+{
+       return display->state >= MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
+}
+
 static inline uint8_t is_display_encryption_enabled(struct mod_hdcp_display *display)
 {
        return display->state >= MOD_HDCP_DISPLAY_ENCRYPTION_ENABLED;
@@ -510,23 +515,34 @@ static inline uint8_t is_display_encryption_enabled(struct mod_hdcp_display *dis
 
 static inline uint8_t get_active_display_count(struct mod_hdcp *hdcp)
 {
-       uint8_t active_count = 0;
+       uint8_t added_count = 0;
        uint8_t i;
 
        for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++)
                if (is_display_active(&hdcp->displays[i]))
-                       active_count++;
-       return active_count;
+                       added_count++;
+       return added_count;
+}
+
+static inline uint8_t get_added_display_count(struct mod_hdcp *hdcp)
+{
+       uint8_t added_count = 0;
+       uint8_t i;
+
+       for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++)
+               if (is_display_added(&hdcp->displays[i]))
+                       added_count++;
+       return added_count;
 }
 
-static inline struct mod_hdcp_display *get_first_active_display(
+static inline struct mod_hdcp_display *get_first_added_display(
                struct mod_hdcp *hdcp)
 {
        uint8_t i;
        struct mod_hdcp_display *display = NULL;
 
        for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++)
-               if (is_display_active(&hdcp->displays[i])) {
+               if (is_display_added(&hdcp->displays[i])) {
                        display = &hdcp->displays[i];
                        break;
                }
index f244b72..37c8c05 100644 (file)
@@ -129,7 +129,7 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp)
 static inline enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp)
 {
        /* device count must be greater than or equal to tracked hdcp displays */
-       return (get_device_count(hdcp) < get_active_display_count(hdcp)) ?
+       return (get_device_count(hdcp) < get_added_display_count(hdcp)) ?
                        MOD_HDCP_STATUS_HDCP1_DEVICE_COUNT_MISMATCH_FAILURE :
                        MOD_HDCP_STATUS_SUCCESS;
 }
index 549c113..491c00f 100644 (file)
@@ -208,7 +208,7 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp)
 static enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp)
 {
        /* device count must be greater than or equal to tracked hdcp displays */
-       return (get_device_count(hdcp) < get_active_display_count(hdcp)) ?
+       return (get_device_count(hdcp) < get_added_display_count(hdcp)) ?
                        MOD_HDCP_STATUS_HDCP2_DEVICE_COUNT_MISMATCH_FAILURE :
                        MOD_HDCP_STATUS_SUCCESS;
 }
index 836e479..c292981 100644 (file)
@@ -54,7 +54,7 @@ enum mod_hdcp_status mod_hdcp_remove_display_from_topology(
 
        dtm_cmd = (struct ta_dtm_shared_memory *)psp->dtm_context.dtm_shared_buf;
 
-       if (!display || !is_display_active(display))
+       if (!display || !is_display_added(display))
                return MOD_HDCP_STATUS_DISPLAY_NOT_FOUND;
 
        memset(dtm_cmd, 0, sizeof(struct ta_dtm_shared_memory));
@@ -73,21 +73,25 @@ enum mod_hdcp_status mod_hdcp_remove_display_from_topology(
        HDCP_TOP_REMOVE_DISPLAY_TRACE(hdcp, display->index);
  
        return MOD_HDCP_STATUS_SUCCESS;
- }
-
-enum mod_hdcp_status mod_hdcp_add_display_to_topology(
-               struct mod_hdcp *hdcp, struct mod_hdcp_display *display)
+}
+enum mod_hdcp_status mod_hdcp_add_display_to_topology(struct mod_hdcp *hdcp,
+                                                     uint8_t index)
 {
        struct psp_context *psp = hdcp->config.psp.handle;
        struct ta_dtm_shared_memory *dtm_cmd;
+       struct mod_hdcp_display *display =
+                       get_active_display_at_index(hdcp, index);
        struct mod_hdcp_link *link = &hdcp->connection.link;
 
        if (!psp->dtm_context.dtm_initialized) {
                DRM_ERROR("Failed to add display topology, DTM TA is not initialized.");
-               display->state = MOD_HDCP_DISPLAY_INACTIVE;
                return MOD_HDCP_STATUS_FAILURE;
        }
 
+       if (!display || is_display_added(display))
+               return MOD_HDCP_STATUS_UPDATE_TOPOLOGY_FAILURE;
+
        dtm_cmd = (struct ta_dtm_shared_memory *)psp->dtm_context.dtm_shared_buf;
 
        memset(dtm_cmd, 0, sizeof(struct ta_dtm_shared_memory));
@@ -109,11 +113,10 @@ enum mod_hdcp_status mod_hdcp_add_display_to_topology(
 
        psp_dtm_invoke(psp, dtm_cmd->cmd_id);
 
-       if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS) {
-               display->state = MOD_HDCP_DISPLAY_INACTIVE;
+       if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS)
                return MOD_HDCP_STATUS_UPDATE_TOPOLOGY_FAILURE;
-       }
 
+       display->state = MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
        HDCP_TOP_ADD_DISPLAY_TRACE(hdcp, display->index);
 
        return MOD_HDCP_STATUS_SUCCESS;
@@ -123,7 +126,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_create_session(struct mod_hdcp *hdcp)
 {
 
        struct psp_context *psp = hdcp->config.psp.handle;
-       struct mod_hdcp_display *display = get_first_active_display(hdcp);
+       struct mod_hdcp_display *display = get_first_added_display(hdcp);
        struct ta_hdcp_shared_memory *hdcp_cmd;
 
        if (!psp->hdcp_context.hdcp_initialized) {
@@ -176,7 +179,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_destroy_session(struct mod_hdcp *hdcp)
                if (is_display_encryption_enabled(
                                &hdcp->displays[i])) {
                        hdcp->displays[i].state =
-                                                               MOD_HDCP_DISPLAY_ACTIVE;
+                                       MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
                        HDCP_HDCP1_DISABLED_TRACE(hdcp,
                                        hdcp->displays[i].index);
                }
@@ -228,7 +231,7 @@ enum mod_hdcp_status mod_hdcp_hdcp1_enable_encryption(struct mod_hdcp *hdcp)
 {
        struct psp_context *psp = hdcp->config.psp.handle;
        struct ta_hdcp_shared_memory *hdcp_cmd;
-       struct mod_hdcp_display *display = get_first_active_display(hdcp);
+       struct mod_hdcp_display *display = get_first_added_display(hdcp);
 
        hdcp_cmd = (struct ta_hdcp_shared_memory *)psp->hdcp_context.hdcp_shared_buf;
        memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
@@ -298,7 +301,8 @@ enum mod_hdcp_status mod_hdcp_hdcp1_enable_dp_stream_encryption(struct mod_hdcp
 
        for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++) {
 
-               if (hdcp->displays[i].adjust.disable)
+               if (hdcp->displays[i].state != MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED ||
+                   hdcp->displays[i].adjust.disable)
                        continue;
 
                memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
@@ -360,7 +364,7 @@ enum mod_hdcp_status mod_hdcp_hdcp2_create_session(struct mod_hdcp *hdcp)
 {
        struct psp_context *psp = hdcp->config.psp.handle;
        struct ta_hdcp_shared_memory *hdcp_cmd;
-       struct mod_hdcp_display *display = get_first_active_display(hdcp);
+       struct mod_hdcp_display *display = get_first_added_display(hdcp);
 
        if (!psp->hdcp_context.hdcp_initialized) {
                DRM_ERROR("Failed to create hdcp session, HDCP TA is not initialized");
@@ -419,7 +423,7 @@ enum mod_hdcp_status mod_hdcp_hdcp2_destroy_session(struct mod_hdcp *hdcp)
                if (is_display_encryption_enabled(
                                &hdcp->displays[i])) {
                        hdcp->displays[i].state =
-                                                               MOD_HDCP_DISPLAY_ACTIVE;
+                                       MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED;
                        HDCP_HDCP2_DISABLED_TRACE(hdcp,
                                        hdcp->displays[i].index);
                }
@@ -658,7 +662,7 @@ enum mod_hdcp_status mod_hdcp_hdcp2_enable_encryption(struct mod_hdcp *hdcp)
 {
        struct psp_context *psp = hdcp->config.psp.handle;
        struct ta_hdcp_shared_memory *hdcp_cmd;
-       struct mod_hdcp_display *display = get_first_active_display(hdcp);
+       struct mod_hdcp_display *display = get_first_added_display(hdcp);
 
        hdcp_cmd = (struct ta_hdcp_shared_memory *)psp->hdcp_context.hdcp_shared_buf;
        memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
@@ -743,7 +747,8 @@ enum mod_hdcp_status mod_hdcp_hdcp2_enable_dp_stream_encryption(struct mod_hdcp
 
 
        for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++) {
-               if (hdcp->displays[i].adjust.disable)
+               if (hdcp->displays[i].state != MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED ||
+                   hdcp->displays[i].adjust.disable)
                        continue;
                hdcp_cmd->in_msg.hdcp2_enable_dp_stream_encryption.display_handle = hdcp->displays[i].index;
                hdcp_cmd->in_msg.hdcp2_enable_dp_stream_encryption.session_handle = hdcp->auth.id;
index eae9309..c088602 100644 (file)
@@ -117,6 +117,7 @@ enum mod_hdcp_operation_mode {
 enum mod_hdcp_display_state {
        MOD_HDCP_DISPLAY_INACTIVE = 0,
        MOD_HDCP_DISPLAY_ACTIVE,
+       MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED,
        MOD_HDCP_DISPLAY_ENCRYPTION_ENABLED
 };
 
index c6d3bef..5db8c56 100644 (file)
@@ -35,6 +35,7 @@
 #include "arcturus_ppt.h"
 #include "smu_v11_0_pptable.h"
 #include "arcturus_ppsmc.h"
+#include "nbio/nbio_7_4_offset.h"
 #include "nbio/nbio_7_4_sh_mask.h"
 #include "amdgpu_xgmi.h"
 #include <linux/i2c.h>
@@ -2210,6 +2211,18 @@ static void arcturus_i2c_eeprom_control_fini(struct i2c_adapter *control)
        i2c_del_adapter(control);
 }
 
+static bool arcturus_is_baco_supported(struct smu_context *smu)
+{
+       struct amdgpu_device *adev = smu->adev;
+       uint32_t val;
+
+       if (!smu_v11_0_baco_is_support(smu))
+               return false;
+
+       val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
+       return (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : false;
+}
+
 static uint32_t arcturus_get_pptable_power_limit(struct smu_context *smu)
 {
        PPTable_t *pptable = smu->smu_table.driver_pptable;
@@ -2321,7 +2334,7 @@ static const struct pptable_funcs arcturus_ppt_funcs = {
        .register_irq_handler = smu_v11_0_register_irq_handler,
        .set_azalia_d3_pme = smu_v11_0_set_azalia_d3_pme,
        .get_max_sustainable_clocks_by_dc = smu_v11_0_get_max_sustainable_clocks_by_dc,
-       .baco_is_support= smu_v11_0_baco_is_support,
+       .baco_is_support= arcturus_is_baco_supported,
        .baco_get_state = smu_v11_0_baco_get_state,
        .baco_set_state = smu_v11_0_baco_set_state,
        .baco_enter = smu_v11_0_baco_enter,
index 9c60b38..1503028 100644 (file)
 #include "smu_internal.h"
 #include "atomfirmware.h"
 #include "amdgpu_atomfirmware.h"
+#include "soc15_common.h"
 #include "smu_v11_0.h"
 #include "smu11_driver_if_navi10.h"
 #include "atom.h"
 #include "navi10_ppt.h"
 #include "smu_v11_0_pptable.h"
 #include "smu_v11_0_ppsmc.h"
-#include "nbio/nbio_7_4_sh_mask.h"
+#include "nbio/nbio_2_3_offset.h"
+#include "nbio/nbio_2_3_sh_mask.h"
 
 #include "asic_reg/mp/mp_11_0_sh_mask.h"
 
@@ -1985,6 +1987,18 @@ static int navi10_setup_od_limits(struct smu_context *smu) {
        return 0;
 }
 
+static bool navi10_is_baco_supported(struct smu_context *smu)
+{
+       struct amdgpu_device *adev = smu->adev;
+       uint32_t val;
+
+       if (!smu_v11_0_baco_is_support(smu))
+               return false;
+
+       val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
+       return (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : false;
+}
+
 static int navi10_set_default_od_settings(struct smu_context *smu, bool initialize) {
        OverDriveTable_t *od_table, *boot_od_table;
        int ret = 0;
@@ -2361,7 +2375,7 @@ static const struct pptable_funcs navi10_ppt_funcs = {
        .register_irq_handler = smu_v11_0_register_irq_handler,
        .set_azalia_d3_pme = smu_v11_0_set_azalia_d3_pme,
        .get_max_sustainable_clocks_by_dc = smu_v11_0_get_max_sustainable_clocks_by_dc,
-       .baco_is_support= smu_v11_0_baco_is_support,
+       .baco_is_support= navi10_is_baco_supported,
        .baco_get_state = smu_v11_0_baco_get_state,
        .baco_set_state = smu_v11_0_baco_set_state,
        .baco_enter = smu_v11_0_baco_enter,
index d19e1d0..541c932 100644 (file)
@@ -42,8 +42,6 @@
 #include "asic_reg/thm/thm_11_0_2_sh_mask.h"
 #include "asic_reg/mp/mp_11_0_offset.h"
 #include "asic_reg/mp/mp_11_0_sh_mask.h"
-#include "asic_reg/nbio/nbio_7_4_offset.h"
-#include "asic_reg/nbio/nbio_7_4_sh_mask.h"
 #include "asic_reg/smuio/smuio_11_0_0_offset.h"
 #include "asic_reg/smuio/smuio_11_0_0_sh_mask.h"
 
@@ -1662,9 +1660,7 @@ static int smu_v11_0_baco_set_armd3_sequence(struct smu_context *smu, enum smu_v
 
 bool smu_v11_0_baco_is_support(struct smu_context *smu)
 {
-       struct amdgpu_device *adev = smu->adev;
        struct smu_baco_context *smu_baco = &smu->smu_baco;
-       uint32_t val;
        bool baco_support;
 
        mutex_lock(&smu_baco->mutex);
@@ -1679,11 +1675,7 @@ bool smu_v11_0_baco_is_support(struct smu_context *smu)
           !smu_feature_is_enabled(smu, SMU_FEATURE_BACO_BIT))
                return false;
 
-       val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
-       if (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK)
-               return true;
-
-       return false;
+       return true;
 }
 
 enum smu_baco_state smu_v11_0_baco_get_state(struct smu_context *smu)
@@ -1700,11 +1692,9 @@ enum smu_baco_state smu_v11_0_baco_get_state(struct smu_context *smu)
 
 int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
 {
-
        struct smu_baco_context *smu_baco = &smu->smu_baco;
        struct amdgpu_device *adev = smu->adev;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-       uint32_t bif_doorbell_intr_cntl;
        uint32_t data;
        int ret = 0;
 
@@ -1713,14 +1703,7 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
 
        mutex_lock(&smu_baco->mutex);
 
-       bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
-
        if (state == SMU_BACO_STATE_ENTER) {
-               bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
-                                               BIF_DOORBELL_INT_CNTL,
-                                               DOORBELL_INTERRUPT_DISABLE, 1);
-               WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
-
                if (!ras || !ras->supported) {
                        data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL);
                        data |= 0x80000000;
@@ -1735,11 +1718,6 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state)
                if (ret)
                        goto out;
 
-               bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
-                                               BIF_DOORBELL_INT_CNTL,
-                                               DOORBELL_INTERRUPT_DISABLE, 0);
-               WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
-
                /* clear vbios scratch 6 and 7 for coming asic reinit */
                WREG32(adev->bios_scratch_reg_offset + 6, 0);
                WREG32(adev->bios_scratch_reg_offset + 7, 0);
index 49ff375..3f10443 100644 (file)
@@ -35,6 +35,7 @@
 #include "vega20_ppt.h"
 #include "vega20_pptable.h"
 #include "vega20_ppsmc.h"
+#include "nbio/nbio_7_4_offset.h"
 #include "nbio/nbio_7_4_sh_mask.h"
 #include "asic_reg/thm/thm_11_0_2_offset.h"
 #include "asic_reg/thm/thm_11_0_2_sh_mask.h"
@@ -3174,6 +3175,17 @@ static int vega20_update_pcie_parameters(struct smu_context *smu,
        return ret;
 }
 
+static bool vega20_is_baco_supported(struct smu_context *smu)
+{
+       struct amdgpu_device *adev = smu->adev;
+       uint32_t val;
+
+       if (!smu_v11_0_baco_is_support(smu))
+               return false;
+
+       val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0);
+       return (val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true : false;
+}
 
 static const struct pptable_funcs vega20_ppt_funcs = {
        .tables_init = vega20_tables_init,
@@ -3262,7 +3274,7 @@ static const struct pptable_funcs vega20_ppt_funcs = {
        .register_irq_handler = smu_v11_0_register_irq_handler,
        .set_azalia_d3_pme = smu_v11_0_set_azalia_d3_pme,
        .get_max_sustainable_clocks_by_dc = smu_v11_0_get_max_sustainable_clocks_by_dc,
-       .baco_is_support= smu_v11_0_baco_is_support,
+       .baco_is_support= vega20_is_baco_supported,
        .baco_get_state = smu_v11_0_baco_get_state,
        .baco_set_state = smu_v11_0_baco_set_state,
        .baco_enter = smu_v11_0_baco_enter,
index bc6e208..8981abe 100644 (file)
@@ -45,7 +45,6 @@
 #include <linux/export.h>
 #include <linux/interval_tree_generic.h>
 #include <linux/seq_file.h>
-#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/stacktrace.h>
 
@@ -367,11 +366,6 @@ next_hole(struct drm_mm *mm,
          struct drm_mm_node *node,
          enum drm_mm_insert_mode mode)
 {
-       /* Searching is slow; check if we ran out of time/patience */
-       cond_resched();
-       if (fatal_signal_pending(current))
-               return NULL;
-
        switch (mode) {
        default:
        case DRM_MM_INSERT_BEST:
@@ -563,7 +557,7 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm,
                return 0;
        }
 
-       return signal_pending(current) ? -ERESTARTSYS : -ENOSPC;
+       return -ENOSPC;
 }
 EXPORT_SYMBOL(drm_mm_insert_node_in_range);
 
index 9e065ad..a3cc080 100644 (file)
@@ -164,6 +164,7 @@ struct decode_info {
 #define OP_STATE_BASE_ADDRESS                   OP_3D_MEDIA(0x0, 0x1, 0x01)
 #define OP_STATE_SIP                            OP_3D_MEDIA(0x0, 0x1, 0x02)
 #define OP_3D_MEDIA_0_1_4                      OP_3D_MEDIA(0x0, 0x1, 0x04)
+#define OP_SWTESS_BASE_ADDRESS                 OP_3D_MEDIA(0x0, 0x1, 0x03)
 
 #define OP_3DSTATE_VF_STATISTICS_GM45           OP_3D_MEDIA(0x1, 0x0, 0x0B)
 
@@ -967,18 +968,6 @@ static int cmd_handler_lri(struct parser_exec_state *s)
 {
        int i, ret = 0;
        int cmd_len = cmd_length(s);
-       u32 valid_len = CMD_LEN(1);
-
-       /*
-        * Official intel docs are somewhat sloppy , check the definition of
-        * MI_LOAD_REGISTER_IMM.
-        */
-       #define MAX_VALID_LEN 127
-       if ((cmd_len < valid_len) || (cmd_len > MAX_VALID_LEN)) {
-               gvt_err("len is not valid:  len=%u  valid_len=%u\n",
-                       cmd_len, valid_len);
-               return -EFAULT;
-       }
 
        for (i = 1; i < cmd_len; i += 2) {
                if (IS_BROADWELL(s->engine->i915) && s->engine->id != RCS0) {
@@ -2485,6 +2474,9 @@ static const struct cmd_info cmd_info[] = {
        {"OP_3D_MEDIA_0_1_4", OP_3D_MEDIA_0_1_4, F_LEN_VAR, R_RCS, D_ALL,
                ADDR_FIX_1(1), 8, NULL},
 
+       {"OP_SWTESS_BASE_ADDRESS", OP_SWTESS_BASE_ADDRESS,
+               F_LEN_VAR, R_RCS, D_ALL, ADDR_FIX_2(1, 2), 3, NULL},
+
        {"3DSTATE_VS", OP_3DSTATE_VS, F_LEN_VAR, R_RCS, D_ALL, 0, 8, NULL},
 
        {"3DSTATE_SF", OP_3DSTATE_SF, F_LEN_VAR, R_RCS, D_ALL, 0, 8, NULL},
index 6e5c988..a83df2f 100644 (file)
@@ -221,7 +221,7 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
                        ~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK |
                        TRANS_DDI_PORT_MASK);
                vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |=
-                       (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DVI |
+                       (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST |
                        (PORT_B << TRANS_DDI_PORT_SHIFT) |
                        TRANS_DDI_FUNC_ENABLE);
                if (IS_BROADWELL(dev_priv)) {
@@ -241,7 +241,7 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
                        ~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK |
                        TRANS_DDI_PORT_MASK);
                vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |=
-                       (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DVI |
+                       (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST |
                        (PORT_C << TRANS_DDI_PORT_SHIFT) |
                        TRANS_DDI_FUNC_ENABLE);
                if (IS_BROADWELL(dev_priv)) {
@@ -261,7 +261,7 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu)
                        ~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK |
                        TRANS_DDI_PORT_MASK);
                vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |=
-                       (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DVI |
+                       (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST |
                        (PORT_D << TRANS_DDI_PORT_SHIFT) |
                        TRANS_DDI_FUNC_ENABLE);
                if (IS_BROADWELL(dev_priv)) {
index 0182e2a..2faf50e 100644 (file)
@@ -462,11 +462,14 @@ static int pipeconf_mmio_write(struct intel_vgpu *vgpu, unsigned int offset,
        return 0;
 }
 
-/* ascendingly sorted */
+/* sorted in ascending order */
 static i915_reg_t force_nonpriv_white_list[] = {
+       _MMIO(0xd80),
        GEN9_CS_DEBUG_MODE1, //_MMIO(0x20ec)
        GEN9_CTX_PREEMPT_REG,//_MMIO(0x2248)
-       PS_INVOCATION_COUNT,//_MMIO(0x2348)
+       CL_PRIMITIVES_COUNT, //_MMIO(0x2340)
+       PS_INVOCATION_COUNT, //_MMIO(0x2348)
+       PS_DEPTH_COUNT, //_MMIO(0x2350)
        GEN8_CS_CHICKEN1,//_MMIO(0x2580)
        _MMIO(0x2690),
        _MMIO(0x2694),
@@ -491,6 +494,7 @@ static i915_reg_t force_nonpriv_white_list[] = {
        _MMIO(0xe18c),
        _MMIO(0xe48c),
        _MMIO(0xe5f4),
+       _MMIO(0x64844),
 };
 
 /* a simple bsearch */
index 1c95bf8..cb11c31 100644 (file)
@@ -296,8 +296,8 @@ shadow_context_descriptor_update(struct intel_context *ce,
         * Update bits 0-11 of the context descriptor which includes flags
         * like GEN8_CTX_* cached in desc_template
         */
-       desc &= ~(0x3 << GEN8_CTX_ADDRESSING_MODE_SHIFT);
-       desc |= workload->ctx_desc.addressing_mode <<
+       desc &= ~(0x3ull << GEN8_CTX_ADDRESSING_MODE_SHIFT);
+       desc |= (u64)workload->ctx_desc.addressing_mode <<
                GEN8_CTX_ADDRESSING_MODE_SHIFT;
 
        ce->lrc_desc = desc;
index 6650f47..47b9898 100644 (file)
@@ -633,7 +633,7 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
 
        if (config->platform.iommu) {
                iommu_dev = &pdev->dev;
-               if (!iommu_dev->iommu_fwspec)
+               if (!dev_iommu_fwspec_get(iommu_dev))
                        iommu_dev = iommu_dev->parent;
 
                aspace = msm_gem_address_space_create(iommu_dev,
index e8eef88..ffdd447 100644 (file)
@@ -35,7 +35,8 @@
 
 #include <subdev/bios/gpio.h>
 #include <subdev/gpio.h>
-#include <subdev/timer.h>
+
+#include <nvif/timer.h>
 
 int nv04_dac_output_offset(struct drm_encoder *encoder)
 {
index 3fdfafa..b674d68 100644 (file)
@@ -26,6 +26,7 @@
 #include "hw.h"
 
 #include <subdev/bios/pll.h>
+#include <nvif/timer.h>
 
 #define CHIPSET_NFORCE 0x01a0
 #define CHIPSET_NFORCE2 0x01f0
index 00a85f1..ee78215 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <nvif/cl507c.h>
 #include <nvif/event.h>
+#include <nvif/timer.h>
 
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_fourcc.h>
index e7fcfa6..c5152c3 100644 (file)
@@ -23,6 +23,7 @@
 #include "head.h"
 
 #include <nvif/cl507d.h>
+#include <nvif/timer.h>
 
 #include "nouveau_bo.h"
 
index 3b36dc8..c03cb98 100644 (file)
@@ -24,6 +24,8 @@
 
 #include <nouveau_bo.h>
 
+#include <nvif/timer.h>
+
 void
 corec37d_wndw_owner(struct nv50_core *core)
 {
index 397143b..8c5cf09 100644 (file)
 #include "head.h"
 
 #include <nvif/cl507a.h>
+#include <nvif/timer.h>
 
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_plane_helper.h>
 
+bool
+curs507a_space(struct nv50_wndw *wndw)
+{
+       nvif_msec(&nouveau_drm(wndw->plane.dev)->client.device, 2,
+               if (nvif_rd32(&wndw->wimm.base.user, 0x0008) >= 4)
+                       return true;
+       );
+       WARN_ON(1);
+       return false;
+}
+
 static void
 curs507a_update(struct nv50_wndw *wndw, u32 *interlock)
 {
-       nvif_wr32(&wndw->wimm.base.user, 0x0080, 0x00000000);
+       if (curs507a_space(wndw))
+               nvif_wr32(&wndw->wimm.base.user, 0x0080, 0x00000000);
 }
 
 static void
 curs507a_point(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw)
 {
-       nvif_wr32(&wndw->wimm.base.user, 0x0084, asyw->point.y << 16 |
-                                                asyw->point.x);
+       if (curs507a_space(wndw)) {
+               nvif_wr32(&wndw->wimm.base.user, 0x0084, asyw->point.y << 16 |
+                                                        asyw->point.x);
+       }
 }
 
 const struct nv50_wimm_func
index 23fb29d..96dff4f 100644 (file)
 static void
 cursc37a_update(struct nv50_wndw *wndw, u32 *interlock)
 {
-       nvif_wr32(&wndw->wimm.base.user, 0x0200, 0x00000001);
+       if (curs507a_space(wndw))
+               nvif_wr32(&wndw->wimm.base.user, 0x0200, 0x00000001);
 }
 
 static void
 cursc37a_point(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw)
 {
-       nvif_wr32(&wndw->wimm.base.user, 0x0208, asyw->point.y << 16 |
-                                                asyw->point.x);
+       if (curs507a_space(wndw)) {
+               nvif_wr32(&wndw->wimm.base.user, 0x0208, asyw->point.y << 16 |
+                                                        asyw->point.x);
+       }
 }
 
 static const struct nv50_wimm_func
index 4d1c584..6be9df1 100644 (file)
@@ -45,6 +45,7 @@
 #include <nvif/cl5070.h>
 #include <nvif/cl507d.h>
 #include <nvif/event.h>
+#include <nvif/timer.h>
 
 #include "nouveau_drv.h"
 #include "nouveau_dma.h"
index 2e68fc7..4f7ce57 100644 (file)
@@ -24,6 +24,8 @@
 
 #include <nouveau_bo.h>
 
+#include <nvif/timer.h>
+
 static void
 ovly827e_image_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw)
 {
index caf3974..a7412b9 100644 (file)
@@ -97,6 +97,7 @@ struct nv50_wimm_func {
 };
 
 extern const struct nv50_wimm_func curs507a;
+bool curs507a_space(struct nv50_wndw *);
 
 int wndwc37e_new(struct nouveau_drm *, enum drm_plane_type, int, s32,
                 struct nv50_wndw **);
index 25d969d..c2a572c 100644 (file)
@@ -23,27 +23,6 @@ int  nvif_device_init(struct nvif_object *, u32 handle, s32 oclass, void *, u32,
 void nvif_device_fini(struct nvif_device *);
 u64  nvif_device_time(struct nvif_device *);
 
-/* Delay based on GPU time (ie. PTIMER).
- *
- * Will return -ETIMEDOUT unless the loop was terminated with 'break',
- * where it will return the number of nanoseconds taken instead.
- */
-#define nvif_nsec(d,n,cond...) ({                                              \
-       struct nvif_device *_device = (d);                                     \
-       u64 _nsecs = (n), _time0 = nvif_device_time(_device);                  \
-       s64 _taken = 0;                                                        \
-                                                                               \
-       do {                                                                   \
-               cond                                                           \
-       } while (_taken = nvif_device_time(_device) - _time0, _taken < _nsecs);\
-                                                                               \
-       if (_taken >= _nsecs)                                                  \
-               _taken = -ETIMEDOUT;                                           \
-       _taken;                                                                \
-})
-#define nvif_usec(d,u,cond...) nvif_nsec((d), (u) * 1000, ##cond)
-#define nvif_msec(d,m,cond...) nvif_usec((d), (m) * 1000, ##cond)
-
 /*XXX*/
 #include <subdev/bios.h>
 #include <subdev/fb.h>
diff --git a/drivers/gpu/drm/nouveau/include/nvif/timer.h b/drivers/gpu/drm/nouveau/include/nvif/timer.h
new file mode 100644 (file)
index 0000000..57587a9
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef __NVIF_TIMER_H__
+#define __NVIF_TIMER_H__
+#include <nvif/os.h>
+
+struct nvif_timer_wait {
+       struct nvif_device *device;
+       u64 limit;
+       u64 time0;
+       u64 time1;
+       int reads;
+};
+
+void nvif_timer_wait_init(struct nvif_device *, u64 nsec,
+                         struct nvif_timer_wait *);
+s64 nvif_timer_wait_test(struct nvif_timer_wait *);
+
+/* Delay based on GPU time (ie. PTIMER).
+ *
+ * Will return -ETIMEDOUT unless the loop was terminated with 'break',
+ * where it will return the number of nanoseconds taken instead.
+ */
+#define nvif_nsec(d,n,cond...) ({                                              \
+       struct nvif_timer_wait _wait;                                          \
+       s64 _taken = 0;                                                        \
+                                                                               \
+       nvif_timer_wait_init((d), (n), &_wait);                                \
+       do {                                                                   \
+               cond                                                           \
+       } while ((_taken = nvif_timer_wait_test(&_wait)) >= 0);                \
+                                                                               \
+       _taken;                                                                \
+})
+#define nvif_usec(d,u,cond...) nvif_nsec((d), (u) * 1000, ##cond)
+#define nvif_msec(d,m,cond...) nvif_usec((d), (m) * 1000, ##cond)
+#endif
index 03c1182..6825574 100644 (file)
@@ -10,6 +10,7 @@ struct nvif_user {
 
 struct nvif_user_func {
        void (*doorbell)(struct nvif_user *, u32 token);
+       u64 (*time)(struct nvif_user *);
 };
 
 int nvif_user_init(struct nvif_device *);
index 2b4b21b..c40f127 100644 (file)
@@ -1494,8 +1494,13 @@ nouveau_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_reg *reg)
                        ret = nvif_object_map_handle(&mem->mem.object,
                                                     &args, argc,
                                                     &handle, &length);
-                       if (ret != 1)
-                               return ret ? ret : -EINVAL;
+                       if (ret != 1) {
+                               if (WARN_ON(ret == 0))
+                                       return -EINVAL;
+                               if (ret == -ENOSPC)
+                                       return -EAGAIN;
+                               return ret;
+                       }
 
                        reg->bus.base = 0;
                        reg->bus.offset = handle;
index 7dfbbbc..15a3d40 100644 (file)
@@ -222,22 +222,18 @@ nouveau_drm_debugfs_init(struct drm_minor *minor)
 {
        struct nouveau_drm *drm = nouveau_drm(minor->dev);
        struct dentry *dentry;
-       int i, ret;
+       int i;
 
        for (i = 0; i < ARRAY_SIZE(nouveau_debugfs_files); i++) {
-               dentry = debugfs_create_file(nouveau_debugfs_files[i].name,
-                                            S_IRUGO | S_IWUSR,
-                                            minor->debugfs_root, minor->dev,
-                                            nouveau_debugfs_files[i].fops);
-               if (!dentry)
-                       return -ENOMEM;
+               debugfs_create_file(nouveau_debugfs_files[i].name,
+                                   S_IRUGO | S_IWUSR,
+                                   minor->debugfs_root, minor->dev,
+                                   nouveau_debugfs_files[i].fops);
        }
 
-       ret = drm_debugfs_create_files(nouveau_debugfs_list,
-                                      NOUVEAU_DEBUGFS_ENTRIES,
-                                      minor->debugfs_root, minor);
-       if (ret)
-               return ret;
+       drm_debugfs_create_files(nouveau_debugfs_list,
+                                NOUVEAU_DEBUGFS_ENTRIES,
+                                minor->debugfs_root, minor);
 
        /* Set the size of the vbios since we know it, and it's confusing to
         * userspace if it wants to seek() but the file has a length of 0
index 6b1629c..ca4087f 100644 (file)
@@ -618,6 +618,64 @@ nouveau_drm_device_fini(struct drm_device *dev)
        kfree(drm);
 }
 
+/*
+ * On some Intel PCIe bridge controllers doing a
+ * D0 -> D3hot -> D3cold -> D0 sequence causes Nvidia GPUs to not reappear.
+ * Skipping the intermediate D3hot step seems to make it work again. This is
+ * probably because putting the GPU into the D3hot state before invoking the
+ * involved AML code violates an expectation that code relies on.
+ *
+ * This leads to various manifestations of this issue:
+ *  - AML code execution to power on the GPU hits an infinite loop (as the
+ *    code waits on device memory to change).
+ *  - kernel crashes, as all PCI reads return -1, which most code isn't able
+ *    to handle well enough.
+ *
+ * In all cases dmesg will contain at least one line like this:
+ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3'
+ * followed by a lot of nouveau timeouts.
+ *
+ * Deeper down, the \_SB.PCI0.PEG0.PG00._OFF code writes bit 0x80 to the
+ * undocumented PCI config space register 0x248 of the Intel PCIe bridge
+ * controller (0x1901) in order to change the state of the PCIe link between
+ * the PCIe port and the GPU. There are alternative code paths using other
+ * registers, which seem to work fine (executed pre Windows 8):
+ *  - 0xbc bit 0x20 (publicly available documentation claims 'reserved')
+ *  - 0xb0 bit 0x10 (link disable)
+ * Changing the conditions inside the firmware by poking into the relevant
+ * addresses does resolve the issue, but it seemed to be ACPI private memory
+ * and not any device accessible memory at all, so there is no portable way of
+ * changing the conditions.
+ * On an XPS 9560 that means bits [0,3] on \CPEX need to be cleared.
+ *
+ * The only systems where this behavior can be seen are hybrid graphics laptops
+ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It's unclear whether
+ * this issue only occurs in combination with listed Intel PCIe bridge
+ * controllers and the mentioned GPUs or other devices as well.
+ *
+ * Documentation on the PCIe bridge controller can be found in the
+ * "7th Generation Intel® Processor Families for H Platforms Datasheet Volume 2"
+ * Section "12 PCI Express* Controller (x16) Registers"
+ */
+
+static void quirk_broken_nv_runpm(struct pci_dev *pdev)
+{
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct nouveau_drm *drm = nouveau_drm(dev);
+       struct pci_dev *bridge = pci_upstream_bridge(pdev);
+
+       if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL)
+               return;
+
+       switch (bridge->device) {
+       case 0x1901:
+               drm->old_pm_cap = pdev->pm_cap;
+               pdev->pm_cap = 0;
+               NV_INFO(drm, "Disabling PCI power management to avoid bug\n");
+               break;
+       }
+}
+
 static int nouveau_drm_probe(struct pci_dev *pdev,
                             const struct pci_device_id *pent)
 {
@@ -699,6 +757,7 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
        if (ret)
                goto fail_drm_dev_init;
 
+       quirk_broken_nv_runpm(pdev);
        return 0;
 
 fail_drm_dev_init:
@@ -734,7 +793,11 @@ static void
 nouveau_drm_remove(struct pci_dev *pdev)
 {
        struct drm_device *dev = pci_get_drvdata(pdev);
+       struct nouveau_drm *drm = nouveau_drm(dev);
 
+       /* revert our workaround */
+       if (drm->old_pm_cap)
+               pdev->pm_cap = drm->old_pm_cap;
        nouveau_drm_device_remove(dev);
        pci_disable_device(pdev);
 }
index c2c332f..2a65197 100644 (file)
@@ -140,6 +140,8 @@ struct nouveau_drm {
 
        struct list_head clients;
 
+       u8 old_pm_cap;
+
        struct {
                struct agp_bridge_data *bridge;
                u32 base;
index e3797b2..645fedd 100644 (file)
@@ -171,6 +171,11 @@ nouveau_svmm_bind(struct drm_device *dev, void *data,
        mm = get_task_mm(current);
        down_read(&mm->mmap_sem);
 
+       if (!cli->svm.svmm) {
+               up_read(&mm->mmap_sem);
+               return -EINVAL;
+       }
+
        for (addr = args->va_start, end = args->va_start + size; addr < end;) {
                struct vm_area_struct *vma;
                unsigned long next;
@@ -179,6 +184,7 @@ nouveau_svmm_bind(struct drm_device *dev, void *data,
                if (!vma)
                        break;
 
+               addr = max(addr, vma->vm_start);
                next = min(vma->vm_end, end);
                /* This is a best effort so we ignore errors */
                nouveau_dmem_migrate_vma(cli->drm, vma, addr, next);
@@ -656,9 +662,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
                limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT);
                if (start < svmm->unmanaged.limit)
                        limit = min_t(u64, limit, svmm->unmanaged.start);
-               else
-               if (limit > svmm->unmanaged.start)
-                       start = max_t(u64, start, svmm->unmanaged.limit);
                SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
 
                mm = svmm->notifier.mm;
index 50d583d..f194d35 100644 (file)
@@ -8,6 +8,7 @@ nvif-y += nvif/fifo.o
 nvif-y += nvif/mem.o
 nvif-y += nvif/mmu.o
 nvif-y += nvif/notify.o
+nvif-y += nvif/timer.o
 nvif-y += nvif/vmm.o
 
 # Usermode classes
index 1ec101b..0e92db4 100644 (file)
 u64
 nvif_device_time(struct nvif_device *device)
 {
-       struct nv_device_time_v0 args = {};
-       int ret = nvif_object_mthd(&device->object, NV_DEVICE_V0_TIME,
-                                  &args, sizeof(args));
-       WARN_ON_ONCE(ret != 0);
-       return args.time;
+       if (!device->user.func) {
+               struct nv_device_time_v0 args = {};
+               int ret = nvif_object_mthd(&device->object, NV_DEVICE_V0_TIME,
+                                          &args, sizeof(args));
+               WARN_ON_ONCE(ret != 0);
+               return args.time;
+       }
+
+       return device->user.func->time(&device->user);
 }
 
 void
diff --git a/drivers/gpu/drm/nouveau/nvif/timer.c b/drivers/gpu/drm/nouveau/nvif/timer.c
new file mode 100644 (file)
index 0000000..602c1a2
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2020 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <nvif/timer.h>
+#include <nvif/device.h>
+
+s64
+nvif_timer_wait_test(struct nvif_timer_wait *wait)
+{
+       u64 time = nvif_device_time(wait->device);
+
+       if (wait->reads == 0) {
+               wait->time0 = time;
+               wait->time1 = time;
+       }
+
+       if (wait->time1 == time) {
+               if (WARN_ON(wait->reads++ == 16))
+                       return -ETIMEDOUT;
+       } else {
+               wait->time1 = time;
+               wait->reads = 1;
+       }
+
+       if (wait->time1 - wait->time0 > wait->limit)
+               return -ETIMEDOUT;
+
+       return wait->time1 - wait->time0;
+}
+
+void
+nvif_timer_wait_init(struct nvif_device *device, u64 nsec,
+                    struct nvif_timer_wait *wait)
+{
+       wait->device = device;
+       wait->limit = nsec;
+       wait->reads = 0;
+}
index 19f9958..1116f87 100644 (file)
  */
 #include <nvif/user.h>
 
+static u64
+nvif_userc361_time(struct nvif_user *user)
+{
+       u32 hi, lo;
+
+       do {
+               hi = nvif_rd32(&user->object, 0x084);
+               lo = nvif_rd32(&user->object, 0x080);
+       } while (hi != nvif_rd32(&user->object, 0x084));
+
+       return ((u64)hi << 32 | lo);
+}
+
 static void
 nvif_userc361_doorbell(struct nvif_user *user, u32 token)
 {
@@ -30,4 +43,5 @@ nvif_userc361_doorbell(struct nvif_user *user, u32 token)
 const struct nvif_user_func
 nvif_userc361 = {
        .doorbell = nvif_userc361_doorbell,
+       .time = nvif_userc361_time,
 };
index dd8f85b..f2f5636 100644 (file)
@@ -1981,8 +1981,34 @@ gf100_gr_init_(struct nvkm_gr *base)
 {
        struct gf100_gr *gr = gf100_gr(base);
        struct nvkm_subdev *subdev = &base->engine.subdev;
+       struct nvkm_device *device = subdev->device;
+       bool reset = device->chipset == 0x137 || device->chipset == 0x138;
        u32 ret;
 
+       /* On certain GP107/GP108 boards, we trigger a weird issue where
+        * GR will stop responding to PRI accesses after we've asked the
+        * SEC2 RTOS to boot the GR falcons.  This happens far more
+        * frequently when cold-booting a board (i.e. returning from D3).
+        *
+        * The root cause for this is not known and has proven difficult
+        * to isolate, with many avenues being dead-ends.
+        *
+        * A workaround was discovered by Karol, whereby putting GR into
+        * reset for an extended period right before initialisation
+        * prevents the problem from occurring.
+        *
+        * XXX: As RM does not require any such workaround, this is more
+        *      of a hack than a true fix.
+        */
+       reset = nvkm_boolopt(device->cfgopt, "NvGrResetWar", reset);
+       if (reset) {
+               nvkm_mask(device, 0x000200, 0x00001000, 0x00000000);
+               nvkm_rd32(device, 0x000200);
+               msleep(50);
+               nvkm_mask(device, 0x000200, 0x00001000, 0x00001000);
+               nvkm_rd32(device, 0x000200);
+       }
+
        nvkm_pmu_pgob(gr->base.engine.subdev.device->pmu, false);
 
        ret = nvkm_falcon_get(&gr->fecs.falcon, subdev);
index 0ce81b1..3ad828e 100644 (file)
@@ -361,7 +361,6 @@ static int panel_dpi_probe(struct device *dev,
        struct panel_desc *desc;
        unsigned int bus_flags;
        struct videomode vm;
-       const char *mapping;
        int ret;
 
        np = dev->of_node;
@@ -386,16 +385,6 @@ static int panel_dpi_probe(struct device *dev,
        of_property_read_u32(np, "width-mm", &desc->size.width);
        of_property_read_u32(np, "height-mm", &desc->size.height);
 
-       of_property_read_string(np, "data-mapping", &mapping);
-       if (!strcmp(mapping, "rgb24"))
-               desc->bus_format = MEDIA_BUS_FMT_RGB888_1X24;
-       else if (!strcmp(mapping, "rgb565"))
-               desc->bus_format = MEDIA_BUS_FMT_RGB565_1X16;
-       else if (!strcmp(mapping, "bgr666"))
-               desc->bus_format = MEDIA_BUS_FMT_RGB666_1X18;
-       else if (!strcmp(mapping, "lvds666"))
-               desc->bus_format = MEDIA_BUS_FMT_RGB666_1X24_CPADHI;
-
        /* Extract bus_flags from display_timing */
        bus_flags = 0;
        vm.flags = timing->flags;
index 8512d97..ac8f75d 100644 (file)
@@ -41,6 +41,10 @@ static int vbox_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (!vbox_check_supported(VBE_DISPI_ID_HGSMI))
                return -ENODEV;
 
+       ret = drm_fb_helper_remove_conflicting_pci_framebuffers(pdev, "vboxvideodrmfb");
+       if (ret)
+               return ret;
+
        vbox = kzalloc(sizeof(*vbox), GFP_KERNEL);
        if (!vbox)
                return -ENOMEM;
index cea18dc..3407192 100644 (file)
@@ -681,11 +681,23 @@ static enum drm_mode_status
 vc4_hdmi_encoder_mode_valid(struct drm_encoder *crtc,
                            const struct drm_display_mode *mode)
 {
-       /* HSM clock must be 108% of the pixel clock.  Additionally,
-        * the AXI clock needs to be at least 25% of pixel clock, but
-        * HSM ends up being the limiting factor.
+       /*
+        * As stated in RPi's vc4 firmware: "HDMI state machine (HSM) clock must
+        * be faster than pixel clock, infinitesimally faster, tested in
+        * simulation. Otherwise, exact value is unimportant for HDMI
+        * operation." This conflicts with bcm2835's vc4 documentation, which
+        * states HSM's clock has to be at least 108% of the pixel clock.
+        *
+        * Real life tests reveal that vc4's firmware statement holds up, and
+        * users are able to use pixel clocks closer to HSM's, namely for
+        * 1920x1200@60Hz. So it was decided to leave a 1% margin between
+        * both clocks, which, for RPi0-3, implies a maximum pixel clock of
+        * 162MHz.
+        *
+        * Additionally, the AXI clock needs to be at least 25% of
+        * pixel clock, but HSM ends up being the limiting factor.
         */
-       if (mode->clock > HSM_CLOCK_FREQ / (1000 * 108 / 100))
+       if (mode->clock > HSM_CLOCK_FREQ / (1000 * 101 / 100))
                return MODE_CLOCK_HIGH;
 
        return MODE_OK;
index 4be49c1..3741420 100644 (file)
@@ -401,7 +401,7 @@ static int xen_drm_drv_dumb_create(struct drm_file *filp,
 
        obj = xen_drm_front_gem_create(dev, args->size);
        if (IS_ERR_OR_NULL(obj)) {
-               ret = PTR_ERR(obj);
+               ret = PTR_ERR_OR_ZERO(obj);
                goto fail;
        }
 
index a02ce43..32e3bc0 100644 (file)
@@ -533,7 +533,6 @@ struct hv_dynmem_device {
         * State to synchronize hot-add.
         */
        struct completion  ol_waitevent;
-       bool ha_waiting;
        /*
         * This thread handles hot-add
         * requests from the host as well as notifying
@@ -634,10 +633,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
        switch (val) {
        case MEM_ONLINE:
        case MEM_CANCEL_ONLINE:
-               if (dm_device.ha_waiting) {
-                       dm_device.ha_waiting = false;
-                       complete(&dm_device.ol_waitevent);
-               }
+               complete(&dm_device.ol_waitevent);
                break;
 
        case MEM_OFFLINE:
@@ -726,8 +722,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
                has->covered_end_pfn +=  processed_pfn;
                spin_unlock_irqrestore(&dm_device.ha_lock, flags);
 
-               init_completion(&dm_device.ol_waitevent);
-               dm_device.ha_waiting = !memhp_auto_online;
+               reinit_completion(&dm_device.ol_waitevent);
 
                nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
                ret = add_memory(nid, PFN_PHYS((start_pfn)),
@@ -753,15 +748,14 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
                }
 
                /*
-                * Wait for the memory block to be onlined when memory onlining
-                * is done outside of kernel (memhp_auto_online). Since the hot
-                * add has succeeded, it is ok to proceed even if the pages in
-                * the hot added region have not been "onlined" within the
-                * allowed time.
+                * Wait for memory to get onlined. If the kernel onlined the
+                * memory when adding it, this will return directly. Otherwise,
+                * it will wait for user space to online the memory. This helps
+                * to avoid adding memory faster than it is getting onlined. As
+                * adding succeeded, it is ok to proceed even if the memory was
+                * not onlined in time.
                 */
-               if (dm_device.ha_waiting)
-                       wait_for_completion_timeout(&dm_device.ol_waitevent,
-                                                   5*HZ);
+               wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
                post_status(&dm_device);
        }
 }
@@ -1706,6 +1700,7 @@ static int balloon_probe(struct hv_device *dev,
 
 #ifdef CONFIG_MEMORY_HOTPLUG
        set_online_page_callback(&hv_online_page);
+       init_completion(&dm_device.ol_waitevent);
        register_memory_notifier(&hv_memory_nb);
 #endif
 
index 5bd5185..d5c073a 100644 (file)
@@ -88,6 +88,7 @@ source "drivers/iio/orientation/Kconfig"
 if IIO_TRIGGER
    source "drivers/iio/trigger/Kconfig"
 endif #IIO_TRIGGER
+source "drivers/iio/position/Kconfig"
 source "drivers/iio/potentiometer/Kconfig"
 source "drivers/iio/potentiostat/Kconfig"
 source "drivers/iio/pressure/Kconfig"
index bff682a..1712011 100644 (file)
@@ -31,6 +31,7 @@ obj-y += light/
 obj-y += magnetometer/
 obj-y += multiplexer/
 obj-y += orientation/
+obj-y += position/
 obj-y += potentiometer/
 obj-y += potentiostat/
 obj-y += pressure/
index 68e847c..2532b9a 100644 (file)
@@ -170,7 +170,8 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
        if (!indio_dev)
                return -ENOMEM;
 
-       ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+       ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+                                       cros_ec_sensors_capture, NULL);
        if (ret)
                return ret;
 
@@ -190,11 +191,6 @@ static int cros_ec_accel_legacy_probe(struct platform_device *pdev)
                state->sign[CROS_EC_SENSOR_Z] = -1;
        }
 
-       ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
-                       cros_ec_sensors_capture, NULL);
-       if (ret)
-               return ret;
-
        return devm_iio_device_register(dev, indio_dev);
 }
 
index f4da821..12bb8b7 100644 (file)
@@ -795,6 +795,16 @@ config RCAR_GYRO_ADC
          To compile this driver as a module, choose M here: the
          module will be called rcar-gyroadc.
 
+config RN5T618_ADC
+       tristate "ADC for the RN5T618/RC5T619 family of chips"
+       depends on MFD_RN5T618
+       help
+         Say yes here to build support for the integrated ADC inside the
+         RN5T618/619 series PMICs.
+
+         This driver can also be built as a module. If so, the module
+         will be called rn5t618-adc.
+
 config ROCKCHIP_SARADC
        tristate "Rockchip SARADC driver"
        depends on ARCH_ROCKCHIP || (ARM && COMPILE_TEST)
index 8462455..6378078 100644 (file)
@@ -75,6 +75,7 @@ obj-$(CONFIG_QCOM_VADC_COMMON) += qcom-vadc-common.o
 obj-$(CONFIG_QCOM_SPMI_VADC) += qcom-spmi-vadc.o
 obj-$(CONFIG_QCOM_PM8XXX_XOADC) += qcom-pm8xxx-xoadc.o
 obj-$(CONFIG_RCAR_GYRO_ADC) += rcar-gyroadc.o
+obj-$(CONFIG_RN5T618_ADC) += rn5t618-adc.o
 obj-$(CONFIG_ROCKCHIP_SARADC) += rockchip_saradc.o
 obj-$(CONFIG_SC27XX_ADC) += sc27xx_adc.o
 obj-$(CONFIG_SPEAR_ADC) += spear_adc.o
diff --git a/drivers/iio/adc/rn5t618-adc.c b/drivers/iio/adc/rn5t618-adc.c
new file mode 100644 (file)
index 0000000..f21027e
--- /dev/null
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * ADC driver for the RICOH RN5T618 power management chip family
+ *
+ * Copyright (C) 2019 Andreas Kemnade
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mfd/rn5t618.h>
+#include <linux/platform_device.h>
+#include <linux/completion.h>
+#include <linux/regmap.h>
+#include <linux/iio/iio.h>
+#include <linux/slab.h>
+
+#define RN5T618_ADC_CONVERSION_TIMEOUT   (msecs_to_jiffies(500))
+#define RN5T618_REFERENCE_VOLT 2500
+
+/* mask for selecting channels for single conversion */
+#define RN5T618_ADCCNT3_CHANNEL_MASK 0x7
+/* average 4-time conversion mode */
+#define RN5T618_ADCCNT3_AVG BIT(3)
+/* set for starting a single conversion, gets cleared by hw when done */
+#define RN5T618_ADCCNT3_GODONE BIT(4)
+/* automatic conversion, period is in ADCCNT2, selected channels are
+ * in ADCCNT1
+ */
+#define RN5T618_ADCCNT3_AUTO BIT(5)
+#define RN5T618_ADCEND_IRQ BIT(0)
+
+struct rn5t618_adc_data {
+       struct device *dev;
+       struct rn5t618 *rn5t618;
+       struct completion conv_completion;
+       int irq;
+};
+
+struct rn5t618_channel_ratios {
+       u16 numerator;
+       u16 denominator;
+};
+
+enum rn5t618_channels {
+       LIMMON = 0,
+       VBAT,
+       VADP,
+       VUSB,
+       VSYS,
+       VTHM,
+       AIN1,
+       AIN0
+};
+
+static const struct rn5t618_channel_ratios rn5t618_ratios[8] = {
+       [LIMMON] = {50, 32}, /* measured across 20mOhm, amplified by 32 */
+       [VBAT] = {2, 1},
+       [VADP] = {3, 1},
+       [VUSB] = {3, 1},
+       [VSYS] = {3, 1},
+       [VTHM] = {1, 1},
+       [AIN1] = {1, 1},
+       [AIN0] = {1, 1},
+};
+
+static int rn5t618_read_adc_reg(struct rn5t618 *rn5t618, int reg, u16 *val)
+{
+       u8 data[2];
+       int ret;
+
+       ret = regmap_bulk_read(rn5t618->regmap, reg, data, sizeof(data));
+       if (ret < 0)
+               return ret;
+
+       *val = (data[0] << 4) | (data[1] & 0xF);
+
+       return 0;
+}
+
+static irqreturn_t rn5t618_adc_irq(int irq, void *data)
+{
+       struct rn5t618_adc_data *adc = data;
+       unsigned int r = 0;
+       int ret;
+
+       /* clear low & high threshold irqs */
+       regmap_write(adc->rn5t618->regmap, RN5T618_IR_ADC1, 0);
+       regmap_write(adc->rn5t618->regmap, RN5T618_IR_ADC2, 0);
+
+       ret = regmap_read(adc->rn5t618->regmap, RN5T618_IR_ADC3, &r);
+       if (ret < 0)
+               dev_err(adc->dev, "failed to read IRQ status: %d\n", ret);
+
+       regmap_write(adc->rn5t618->regmap, RN5T618_IR_ADC3, 0);
+
+       if (r & RN5T618_ADCEND_IRQ)
+               complete(&adc->conv_completion);
+
+       return IRQ_HANDLED;
+}
+
+static int rn5t618_adc_read(struct iio_dev *iio_dev,
+                           const struct iio_chan_spec *chan,
+                           int *val, int *val2, long mask)
+{
+       struct rn5t618_adc_data *adc = iio_priv(iio_dev);
+       u16 raw;
+       int ret;
+
+       if (mask == IIO_CHAN_INFO_SCALE) {
+               *val = RN5T618_REFERENCE_VOLT *
+                      rn5t618_ratios[chan->channel].numerator;
+               *val2 = rn5t618_ratios[chan->channel].denominator * 4095;
+
+               return IIO_VAL_FRACTIONAL;
+       }
+
+       /* select channel */
+       ret = regmap_update_bits(adc->rn5t618->regmap, RN5T618_ADCCNT3,
+                                RN5T618_ADCCNT3_CHANNEL_MASK,
+                                chan->channel);
+       if (ret < 0)
+               return ret;
+
+       ret = regmap_write(adc->rn5t618->regmap, RN5T618_EN_ADCIR3,
+                          RN5T618_ADCEND_IRQ);
+       if (ret < 0)
+               return ret;
+
+       ret = regmap_update_bits(adc->rn5t618->regmap, RN5T618_ADCCNT3,
+                                RN5T618_ADCCNT3_AVG,
+                                mask == IIO_CHAN_INFO_AVERAGE_RAW ?
+                                RN5T618_ADCCNT3_AVG : 0);
+       if (ret < 0)
+               return ret;
+
+       init_completion(&adc->conv_completion);
+       /* single conversion */
+       ret = regmap_update_bits(adc->rn5t618->regmap, RN5T618_ADCCNT3,
+                                RN5T618_ADCCNT3_GODONE,
+                                RN5T618_ADCCNT3_GODONE);
+       if (ret < 0)
+               return ret;
+
+       ret = wait_for_completion_timeout(&adc->conv_completion,
+                                         RN5T618_ADC_CONVERSION_TIMEOUT);
+       if (ret == 0) {
+               dev_warn(adc->dev, "timeout waiting for adc result\n");
+               return -ETIMEDOUT;
+       }
+
+       ret = rn5t618_read_adc_reg(adc->rn5t618,
+                                  RN5T618_ILIMDATAH + 2 * chan->channel,
+                                  &raw);
+       if (ret < 0)
+               return ret;
+
+       *val = raw;
+
+       return IIO_VAL_INT;
+}
+
+static const struct iio_info rn5t618_adc_iio_info = {
+       .read_raw = &rn5t618_adc_read,
+};
+
+#define RN5T618_ADC_CHANNEL(_channel, _type, _name) { \
+       .type = _type, \
+       .channel = _channel, \
+       .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \
+                             BIT(IIO_CHAN_INFO_AVERAGE_RAW) | \
+                             BIT(IIO_CHAN_INFO_SCALE), \
+       .datasheet_name = _name, \
+       .indexed = 1, \
+}
+
+static const struct iio_chan_spec rn5t618_adc_iio_channels[] = {
+       RN5T618_ADC_CHANNEL(LIMMON, IIO_CURRENT, "LIMMON"),
+       RN5T618_ADC_CHANNEL(VBAT, IIO_VOLTAGE, "VBAT"),
+       RN5T618_ADC_CHANNEL(VADP, IIO_VOLTAGE, "VADP"),
+       RN5T618_ADC_CHANNEL(VUSB, IIO_VOLTAGE, "VUSB"),
+       RN5T618_ADC_CHANNEL(VSYS, IIO_VOLTAGE, "VSYS"),
+       RN5T618_ADC_CHANNEL(VTHM, IIO_VOLTAGE, "VTHM"),
+       RN5T618_ADC_CHANNEL(AIN1, IIO_VOLTAGE, "AIN1"),
+       RN5T618_ADC_CHANNEL(AIN0, IIO_VOLTAGE, "AIN0")
+};
+
+static int rn5t618_adc_probe(struct platform_device *pdev)
+{
+       int ret;
+       struct iio_dev *iio_dev;
+       struct rn5t618_adc_data *adc;
+       struct rn5t618 *rn5t618 = dev_get_drvdata(pdev->dev.parent);
+
+       iio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*adc));
+       if (!iio_dev) {
+               dev_err(&pdev->dev, "failed allocating iio device\n");
+               return -ENOMEM;
+       }
+
+       adc = iio_priv(iio_dev);
+       adc->dev = &pdev->dev;
+       adc->rn5t618 = rn5t618;
+
+       if (rn5t618->irq_data)
+               adc->irq = regmap_irq_get_virq(rn5t618->irq_data,
+                                              RN5T618_IRQ_ADC);
+
+       if (adc->irq <= 0) {
+               dev_err(&pdev->dev, "get virq failed\n");
+               return -EINVAL;
+       }
+
+       init_completion(&adc->conv_completion);
+
+       iio_dev->name = dev_name(&pdev->dev);
+       iio_dev->dev.parent = &pdev->dev;
+       iio_dev->info = &rn5t618_adc_iio_info;
+       iio_dev->modes = INDIO_DIRECT_MODE;
+       iio_dev->channels = rn5t618_adc_iio_channels;
+       iio_dev->num_channels = ARRAY_SIZE(rn5t618_adc_iio_channels);
+
+       /* stop any auto-conversion */
+       ret = regmap_write(rn5t618->regmap, RN5T618_ADCCNT3, 0);
+       if (ret < 0)
+               return ret;
+
+       platform_set_drvdata(pdev, iio_dev);
+
+       ret = devm_request_threaded_irq(adc->dev, adc->irq, NULL,
+                                       rn5t618_adc_irq,
+                                       IRQF_ONESHOT, dev_name(adc->dev),
+                                       adc);
+       if (ret < 0) {
+               dev_err(adc->dev, "request irq %d failed: %d\n", adc->irq, ret);
+               return ret;
+       }
+
+       return devm_iio_device_register(adc->dev, iio_dev);
+}
+
+static struct platform_driver rn5t618_adc_driver = {
+       .driver = {
+               .name   = "rn5t618-adc",
+       },
+       .probe = rn5t618_adc_probe,
+};
+
+module_platform_driver(rn5t618_adc_driver);
+MODULE_ALIAS("platform:rn5t618-adc");
+MODULE_DESCRIPTION("RICOH RN5T618 ADC driver");
+MODULE_LICENSE("GPL");
index 1dcc2a1..af801e2 100644 (file)
@@ -97,7 +97,7 @@ static int cros_ec_lid_angle_probe(struct platform_device *pdev)
        if (!indio_dev)
                return -ENOMEM;
 
-       ret = cros_ec_sensors_core_init(pdev, indio_dev, false);
+       ret = cros_ec_sensors_core_init(pdev, indio_dev, false, NULL, NULL);
        if (ret)
                return ret;
 
@@ -127,7 +127,6 @@ MODULE_DEVICE_TABLE(platform, cros_ec_lid_angle_ids);
 static struct platform_driver cros_ec_lid_angle_platform_driver = {
        .driver = {
                .name   = DRV_NAME,
-               .pm     = &cros_ec_sensors_pm_ops,
        },
        .probe          = cros_ec_lid_angle_probe,
        .id_table       = cros_ec_lid_angle_ids,
index 576e45f..a66941f 100644 (file)
@@ -230,10 +230,14 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
        if (!indio_dev)
                return -ENOMEM;
 
-       ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+       ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+                                       cros_ec_sensors_capture,
+                                       cros_ec_sensors_push_data);
        if (ret)
                return ret;
 
+       iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);
+
        indio_dev->info = &ec_sensors_info;
        state = iio_priv(indio_dev);
        for (channel = state->channels, i = CROS_EC_SENSOR_X;
@@ -245,7 +249,6 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
                        BIT(IIO_CHAN_INFO_CALIBSCALE);
                channel->info_mask_shared_by_all =
                        BIT(IIO_CHAN_INFO_SCALE) |
-                       BIT(IIO_CHAN_INFO_FREQUENCY) |
                        BIT(IIO_CHAN_INFO_SAMP_FREQ);
                channel->info_mask_shared_by_all_available =
                        BIT(IIO_CHAN_INFO_SAMP_FREQ);
@@ -292,11 +295,6 @@ static int cros_ec_sensors_probe(struct platform_device *pdev)
        else
                state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
 
-       ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
-                       cros_ec_sensors_capture, NULL);
-       if (ret)
-               return ret;
-
        return devm_iio_device_register(dev, indio_dev);
 }
 
@@ -317,7 +315,6 @@ MODULE_DEVICE_TABLE(platform, cros_ec_sensors_ids);
 static struct platform_driver cros_ec_sensors_platform_driver = {
        .driver = {
                .name   = "cros-ec-sensors",
-               .pm     = &cros_ec_sensors_pm_ops,
        },
        .probe          = cros_ec_sensors_probe,
        .id_table       = cros_ec_sensors_ids,
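The probe changes above show the new contract for the ChromeOS EC leaf drivers:
instead of registering their own triggered buffer and PM ops, they hand their
capture and push-data callbacks to the core and attach the shared FIFO
attributes. A condensed sketch of that pattern, using only the calls visible in
this diff (error handling and channel setup trimmed; the private-data size is
whatever the leaf driver actually needs):

	static int example_cros_ec_probe(struct platform_device *pdev)
	{
		struct iio_dev *indio_dev;
		int ret;

		indio_dev = devm_iio_device_alloc(&pdev->dev,
						  sizeof(struct cros_ec_sensors_core_state));
		if (!indio_dev)
			return -ENOMEM;

		/* The core now owns buffer setup; callbacks pick legacy vs FIFO path. */
		ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
						cros_ec_sensors_capture,
						cros_ec_sensors_push_data);
		if (ret)
			return ret;

		/* Expose hwfifo_timeout and hwfifo_watermark_max on the buffer. */
		iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);

		return devm_iio_device_register(&pdev->dev, indio_dev);
	}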
index d3a3626..c831915 100644 (file)
@@ -11,7 +11,9 @@
 #include <linux/iio/common/cros_ec_sensors_core.h>
 #include <linux/iio/iio.h>
 #include <linux/iio/kfifo_buf.h>
+#include <linux/iio/sysfs.h>
 #include <linux/iio/trigger_consumer.h>
+#include <linux/iio/triggered_buffer.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/platform_data/cros_ec_sensorhub.h>
 #include <linux/platform_device.h>
 
+/*
+ * Hard coded to the first device to support sensor fifo.  The EC has a 2048
+ * byte fifo and will trigger an interrupt when fifo is 2/3 full.
+ */
+#define CROS_EC_FIFO_SIZE (2048 * 2 / 3)
+
 static char *cros_ec_loc[] = {
        [MOTIONSENSE_LOC_BASE] = "base",
        [MOTIONSENSE_LOC_LID] = "lid",
@@ -53,8 +61,15 @@ static int cros_ec_get_host_cmd_version_mask(struct cros_ec_device *ec_dev,
 
 static void get_default_min_max_freq(enum motionsensor_type type,
                                     u32 *min_freq,
-                                    u32 *max_freq)
+                                    u32 *max_freq,
+                                    u32 *max_fifo_events)
 {
+       /*
+        * We don't know the FIFO size; fall back to the size used by
+        * older hardware.
+        */
+       *max_fifo_events = CROS_EC_FIFO_SIZE;
+
        switch (type) {
        case MOTIONSENSE_TYPE_ACCEL:
        case MOTIONSENSE_TYPE_GYRO:
@@ -82,9 +97,155 @@ static void get_default_min_max_freq(enum motionsensor_type type,
        }
 }
 
+static int cros_ec_sensor_set_ec_rate(struct cros_ec_sensors_core_state *st,
+                                     int rate)
+{
+       int ret;
+
+       if (rate > U16_MAX)
+               rate = U16_MAX;
+
+       mutex_lock(&st->cmd_lock);
+       st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
+       st->param.ec_rate.data = rate;
+       ret = cros_ec_motion_send_host_cmd(st, 0);
+       mutex_unlock(&st->cmd_lock);
+       return ret;
+}
+
+static ssize_t cros_ec_sensor_set_report_latency(struct device *dev,
+                                                struct device_attribute *attr,
+                                                const char *buf, size_t len)
+{
+       struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+       int integer, fract, ret;
+       int latency;
+
+       ret = iio_str_to_fixpoint(buf, 100000, &integer, &fract);
+       if (ret)
+               return ret;
+
+       /* EC rate is in ms. */
+       latency = integer * 1000 + fract / 1000;
+       ret = cros_ec_sensor_set_ec_rate(st, latency);
+       if (ret < 0)
+               return ret;
+
+       return len;
+}
+
+static ssize_t cros_ec_sensor_get_report_latency(struct device *dev,
+                                                struct device_attribute *attr,
+                                                char *buf)
+{
+       struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+       int latency, ret;
+
+       mutex_lock(&st->cmd_lock);
+       st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
+       st->param.ec_rate.data = EC_MOTION_SENSE_NO_VALUE;
+
+       ret = cros_ec_motion_send_host_cmd(st, 0);
+       latency = st->resp->ec_rate.ret;
+       mutex_unlock(&st->cmd_lock);
+       if (ret < 0)
+               return ret;
+
+       return sprintf(buf, "%d.%06u\n",
+                      latency / 1000,
+                      (latency % 1000) * 1000);
+}
+
+static IIO_DEVICE_ATTR(hwfifo_timeout, 0644,
+                      cros_ec_sensor_get_report_latency,
+                      cros_ec_sensor_set_report_latency, 0);
+
+static ssize_t hwfifo_watermark_max_show(struct device *dev,
+                                        struct device_attribute *attr,
+                                        char *buf)
+{
+       struct iio_dev *indio_dev = dev_to_iio_dev(dev);
+       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+
+       return sprintf(buf, "%d\n", st->fifo_max_event_count);
+}
+
+static IIO_DEVICE_ATTR_RO(hwfifo_watermark_max, 0);
+
+const struct attribute *cros_ec_sensor_fifo_attributes[] = {
+       &iio_dev_attr_hwfifo_timeout.dev_attr.attr,
+       &iio_dev_attr_hwfifo_watermark_max.dev_attr.attr,
+       NULL,
+};
+EXPORT_SYMBOL_GPL(cros_ec_sensor_fifo_attributes);
+
+int cros_ec_sensors_push_data(struct iio_dev *indio_dev,
+                             s16 *data,
+                             s64 timestamp)
+{
+       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+       s16 *out;
+       s64 delta;
+       unsigned int i;
+
+       /*
+        * Ignore samples if the buffer is not enabled: this check is needed
+        * when the ODR is set but the buffer has not been enabled yet.
+        */
+       if (!iio_buffer_enabled(indio_dev))
+               return 0;
+
+       out = (s16 *)st->samples;
+       for_each_set_bit(i,
+                        indio_dev->active_scan_mask,
+                        indio_dev->masklength) {
+               *out = data[i];
+               out++;
+       }
+
+       if (iio_device_get_clock(indio_dev) != CLOCK_BOOTTIME)
+               delta = iio_get_time_ns(indio_dev) - cros_ec_get_time_ns();
+       else
+               delta = 0;
+
+       iio_push_to_buffers_with_timestamp(indio_dev, st->samples,
+                                          timestamp + delta);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensors_push_data);
+
+static void cros_ec_sensors_core_clean(void *arg)
+{
+       struct platform_device *pdev = (struct platform_device *)arg;
+       struct cros_ec_sensorhub *sensor_hub =
+               dev_get_drvdata(pdev->dev.parent);
+       struct iio_dev *indio_dev = platform_get_drvdata(pdev);
+       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
+       u8 sensor_num = st->param.info.sensor_num;
+
+       cros_ec_sensorhub_unregister_push_data(sensor_hub, sensor_num);
+}
+
+/**
+ * cros_ec_sensors_core_init() - basic initialization of the core structure
+ * @pdev:              platform device created for the sensors
+ * @indio_dev:         iio device structure of the device
+ * @physical_device:   true if the device refers to a physical device
+ * @trigger_capture:    function pointer to call when the buffer is triggered,
+ *    for backward compatibility.
+ * @push_data:          function to call when cros_ec_sensorhub receives
+ *    a sample for that sensor.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
 int cros_ec_sensors_core_init(struct platform_device *pdev,
                              struct iio_dev *indio_dev,
-                             bool physical_device)
+                             bool physical_device,
+                             cros_ec_sensors_capture_t trigger_capture,
+                             cros_ec_sensorhub_push_data_cb_t push_data)
 {
        struct device *dev = &pdev->dev;
        struct cros_ec_sensors_core_state *state = iio_priv(indio_dev);
@@ -92,6 +253,7 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
        struct cros_ec_dev *ec = sensor_hub->ec;
        struct cros_ec_sensor_platform *sensor_platform = dev_get_platdata(dev);
        u32 ver_mask;
+       int frequencies[ARRAY_SIZE(state->frequencies) / 2] = { 0 };
        int ret, i;
 
        platform_set_drvdata(pdev, indio_dev);
@@ -123,8 +285,6 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
        indio_dev->name = pdev->name;
 
        if (physical_device) {
-               indio_dev->modes = INDIO_DIRECT_MODE;
-
                state->param.cmd = MOTIONSENSE_CMD_INFO;
                state->param.info.sensor_num = sensor_platform->sensor_num;
                ret = cros_ec_motion_send_host_cmd(state, 0);
@@ -142,16 +302,63 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
                        state->calib[i].scale = MOTION_SENSE_DEFAULT_SCALE;
 
                /* 0 is a correct value used to stop the device */
-               state->frequencies[0] = 0;
                if (state->msg->version < 3) {
                        get_default_min_max_freq(state->resp->info.type,
-                                                &state->frequencies[1],
-                                                &state->frequencies[2]);
+                                                &frequencies[1],
+                                                &frequencies[2],
+                                                &state->fifo_max_event_count);
                } else {
-                       state->frequencies[1] =
-                           state->resp->info_3.min_frequency;
-                       state->frequencies[2] =
-                           state->resp->info_3.max_frequency;
+                       frequencies[1] = state->resp->info_3.min_frequency;
+                       frequencies[2] = state->resp->info_3.max_frequency;
+                       state->fifo_max_event_count =
+                           state->resp->info_3.fifo_max_event_count;
+               }
+               for (i = 0; i < ARRAY_SIZE(frequencies); i++) {
+                       state->frequencies[2 * i] = frequencies[i] / 1000;
+                       state->frequencies[2 * i + 1] =
+                               (frequencies[i] % 1000) * 1000;
+               }
+
+               if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) {
+                       /*
+                        * Create a software buffer, fed by the EC FIFO.
+                        * We cannot use a trigger here, as events are generated
+                        * as soon as sample_frequency is set.
+                        */
+                       struct iio_buffer *buffer;
+
+                       buffer = devm_iio_kfifo_allocate(dev);
+                       if (!buffer)
+                               return -ENOMEM;
+
+                       iio_device_attach_buffer(indio_dev, buffer);
+                       indio_dev->modes = INDIO_BUFFER_SOFTWARE;
+
+                       ret = cros_ec_sensorhub_register_push_data(
+                                       sensor_hub, sensor_platform->sensor_num,
+                                       indio_dev, push_data);
+                       if (ret)
+                               return ret;
+
+                       ret = devm_add_action_or_reset(
+                                       dev, cros_ec_sensors_core_clean, pdev);
+                       if (ret)
+                               return ret;
+
+                       /* Timestamp coming from FIFO are in ns since boot. */
+                       ret = iio_device_set_clock(indio_dev, CLOCK_BOOTTIME);
+                       if (ret)
+                               return ret;
+               } else {
+                       /*
+                        * The only way to get samples into the buffer is to set
+                        * a software trigger (sysfs trigger, hrtimer).
+                        */
+                       ret = devm_iio_triggered_buffer_setup(
+                                       dev, indio_dev, NULL, trigger_capture,
+                                       NULL);
+                       if (ret)
+                               return ret;
                }
        }
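The frequencies returned by the EC are in millihertz; the loop above splits each
value into an integer-hertz/microhertz pair so it can be reported with
IIO_VAL_INT_PLUS_MICRO. For example, a reported minimum of 12500 mHz becomes
{ 12, 500000 } and is shown in sysfs as "12.500000". A small sketch of the same
split:

	/* Sketch: millihertz -> the {Hz, uHz} pair stored in state->frequencies. */
	static void mhz_to_int_plus_micro(unsigned int freq_mhz, int *hz, int *uhz)
	{
		*hz = freq_mhz / 1000;			/* whole hertz */
		*uhz = (freq_mhz % 1000) * 1000;	/* remainder as microhertz */
	}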
 
@@ -159,6 +366,16 @@ int cros_ec_sensors_core_init(struct platform_device *pdev,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_core_init);
 
+/**
+ * cros_ec_motion_send_host_cmd() - send motion sense host command
+ * @state:             pointer to state information for device
+ * @opt_length:        optional length to reduce the response size, useful on the data
+ *             path. Otherwise, the maximal allowed response size is used
+ *
+ * When called, the sub-command is assumed to be set in param->cmd.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
 int cros_ec_motion_send_host_cmd(struct cros_ec_sensors_core_state *state,
                                 u16 opt_length)
 {
@@ -421,6 +638,14 @@ int cros_ec_sensors_read_lpc(struct iio_dev *indio_dev,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_read_lpc);
 
+/**
+ * cros_ec_sensors_read_cmd() - retrieve data using the EC command protocol
+ * @indio_dev: pointer to IIO device
+ * @scan_mask: bitmap of the sensor indices to scan
+ * @data:      location to store data
+ *
+ * Return: 0 on success, -errno on failure.
+ */
 int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev,
                             unsigned long scan_mask, s16 *data)
 {
@@ -445,6 +670,18 @@ int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_read_cmd);
 
+/**
+ * cros_ec_sensors_capture() - the trigger handler function
+ * @irq:       the interrupt number.
+ * @p:         a pointer to the poll function.
+ *
+ * On a trigger event occurring, if the pollfunc is attached then this
+ * handler is called as a threaded interrupt (and hence may sleep). It
+ * is responsible for grabbing data from the device and pushing it into
+ * the associated buffer.
+ *
+ * Return: IRQ_HANDLED
+ */
 irqreturn_t cros_ec_sensors_capture(int irq, void *p)
 {
        struct iio_poll_func *pf = p;
@@ -480,26 +717,24 @@ done:
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_capture);
 
+/**
+ * cros_ec_sensors_core_read() - function to request a value from the sensor
+ * @st:                pointer to state information for device
+ * @chan:      channel specification structure table
+ * @val:       will contain one element making up the returned value
+ * @val2:      will contain another element making up the returned value
+ * @mask:      specifies which values to be requested
+ *
+ * Return:     the type of value returned by the device
+ */
 int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
                          struct iio_chan_spec const *chan,
                          int *val, int *val2, long mask)
 {
-       int ret;
+       int ret, frequency;
 
        switch (mask) {
        case IIO_CHAN_INFO_SAMP_FREQ:
-               st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
-               st->param.ec_rate.data =
-                       EC_MOTION_SENSE_NO_VALUE;
-
-               ret = cros_ec_motion_send_host_cmd(st, 0);
-               if (ret)
-                       break;
-
-               *val = st->resp->ec_rate.ret;
-               ret = IIO_VAL_INT;
-               break;
-       case IIO_CHAN_INFO_FREQUENCY:
                st->param.cmd = MOTIONSENSE_CMD_SENSOR_ODR;
                st->param.sensor_odr.data =
                        EC_MOTION_SENSE_NO_VALUE;
@@ -508,8 +743,10 @@ int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
                if (ret)
                        break;
 
-               *val = st->resp->sensor_odr.ret;
-               ret = IIO_VAL_INT;
+               frequency = st->resp->sensor_odr.ret;
+               *val = frequency / 1000;
+               *val2 = (frequency % 1000) * 1000;
+               ret = IIO_VAL_INT_PLUS_MICRO;
                break;
        default:
                ret = -EINVAL;
@@ -520,6 +757,17 @@ int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_core_read);
 
+/**
+ * cros_ec_sensors_core_read_avail() - get available values
+ * @indio_dev:         pointer to the IIO device structure
+ * @chan:      channel specification structure table
+ * @vals:      list of available values
+ * @type:      type of data returned
+ * @length:    number of data returned in the array
+ * @mask:      specifies which values to be requested
+ *
+ * Return:     an error code, IIO_AVAIL_RANGE or IIO_AVAIL_LIST
+ */
 int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
                                    struct iio_chan_spec const *chan,
                                    const int **vals,
@@ -533,7 +781,7 @@ int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
        case IIO_CHAN_INFO_SAMP_FREQ:
                *length = ARRAY_SIZE(state->frequencies);
                *vals = (const int *)&state->frequencies;
-               *type = IIO_VAL_INT;
+               *type = IIO_VAL_INT_PLUS_MICRO;
                return IIO_AVAIL_LIST;
        }
 
@@ -541,31 +789,33 @@ int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_core_read_avail);
 
+/**
+ * cros_ec_sensors_core_write() - function to write a value to the sensor
+ * @st:                pointer to state information for device
+ * @chan:      channel specification structure table
+ * @val:       first part of value to write
+ * @val2:      second part of value to write
+ * @mask:      specifies which values to write
+ *
+ * Return:     the type of value returned by the device
+ */
 int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
                               struct iio_chan_spec const *chan,
                               int val, int val2, long mask)
 {
-       int ret;
+       int ret, frequency;
 
        switch (mask) {
-       case IIO_CHAN_INFO_FREQUENCY:
+       case IIO_CHAN_INFO_SAMP_FREQ:
+               frequency = val * 1000 + val2 / 1000;
                st->param.cmd = MOTIONSENSE_CMD_SENSOR_ODR;
-               st->param.sensor_odr.data = val;
+               st->param.sensor_odr.data = frequency;
 
                /* Always roundup, so caller gets at least what it asks for. */
                st->param.sensor_odr.roundup = 1;
 
                ret = cros_ec_motion_send_host_cmd(st, 0);
                break;
-       case IIO_CHAN_INFO_SAMP_FREQ:
-               st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
-               st->param.ec_rate.data = val;
-
-               ret = cros_ec_motion_send_host_cmd(st, 0);
-               if (ret)
-                       break;
-               st->curr_sampl_freq = val;
-               break;
        default:
                ret = -EINVAL;
                break;
@@ -574,52 +824,5 @@ int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
 }
 EXPORT_SYMBOL_GPL(cros_ec_sensors_core_write);
 
-static int __maybe_unused cros_ec_sensors_prepare(struct device *dev)
-{
-       struct iio_dev *indio_dev = dev_get_drvdata(dev);
-       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
-
-       if (st->curr_sampl_freq == 0)
-               return 0;
-
-       /*
-        * If the sensors are sampled at high frequency, we will not be able to
-        * sleep. Set sampling to a long period if necessary.
-        */
-       if (st->curr_sampl_freq < CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY) {
-               mutex_lock(&st->cmd_lock);
-               st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
-               st->param.ec_rate.data = CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY;
-               cros_ec_motion_send_host_cmd(st, 0);
-               mutex_unlock(&st->cmd_lock);
-       }
-       return 0;
-}
-
-static void __maybe_unused cros_ec_sensors_complete(struct device *dev)
-{
-       struct iio_dev *indio_dev = dev_get_drvdata(dev);
-       struct cros_ec_sensors_core_state *st = iio_priv(indio_dev);
-
-       if (st->curr_sampl_freq == 0)
-               return;
-
-       if (st->curr_sampl_freq < CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY) {
-               mutex_lock(&st->cmd_lock);
-               st->param.cmd = MOTIONSENSE_CMD_EC_RATE;
-               st->param.ec_rate.data = st->curr_sampl_freq;
-               cros_ec_motion_send_host_cmd(st, 0);
-               mutex_unlock(&st->cmd_lock);
-       }
-}
-
-const struct dev_pm_ops cros_ec_sensors_pm_ops = {
-#ifdef CONFIG_PM_SLEEP
-       .prepare = cros_ec_sensors_prepare,
-       .complete = cros_ec_sensors_complete
-#endif
-};
-EXPORT_SYMBOL_GPL(cros_ec_sensors_pm_ops);
-
 MODULE_DESCRIPTION("ChromeOS EC sensor hub core functions");
 MODULE_LICENSE("GPL v2");
index eac63c1..2352c42 100644 (file)
@@ -189,7 +189,12 @@ ssize_t iio_read_const_attr(struct device *dev,
 }
 EXPORT_SYMBOL(iio_read_const_attr);
 
-static int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
+/**
+ * iio_device_set_clock() - Set current timestamping clock for the device
+ * @indio_dev: IIO device structure containing the device
+ * @clock_id: timestamping clock posix identifier to set.
+ */
+int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
 {
        int ret;
        const struct iio_event_interface *ev_int = indio_dev->event_interface;
@@ -207,6 +212,7 @@ static int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id)
 
        return 0;
 }
+EXPORT_SYMBOL(iio_device_set_clock);
 
 /**
  * iio_get_time_ns() - utility function to get a time stamp for events etc
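With iio_device_set_clock() exported above, drivers that receive pre-timestamped
samples can align the IIO timestamp clock with the source of those timestamps,
as the cros-ec core does for its FIFO path. A minimal sketch of the call during
device setup, mirroring the hunk above (illustration only):

	/* FIFO timestamps are ns since boot, so report on the same clock. */
	ret = iio_device_set_clock(indio_dev, CLOCK_BOOTTIME);
	if (ret)
		return ret;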
index 74970f1..b27719c 100644 (file)
@@ -194,6 +194,16 @@ config GP2AP020A00F
          To compile this driver as a module, choose M here: the
          module will be called gp2ap020a00f.
 
+config IQS621_ALS
+       tristate "Azoteq IQS621/622 ambient light sensors"
+       depends on MFD_IQS62X || COMPILE_TEST
+       help
+         Say Y here if you want to build support for the Azoteq IQS621
+         and IQS622 ambient light sensors.
+
+         To compile this driver as a module, choose M here: the module
+         will be called iqs621-als.
+
 config SENSORS_ISL29018
        tristate "Intersil 29018 light and proximity sensor"
        depends on I2C
index 5c1ebaf..d1c8aa3 100644 (file)
@@ -23,6 +23,7 @@ obj-$(CONFIG_GP2AP002)                += gp2ap002.o
 obj-$(CONFIG_GP2AP020A00F)     += gp2ap020a00f.o
 obj-$(CONFIG_HID_SENSOR_ALS)   += hid-sensor-als.o
 obj-$(CONFIG_HID_SENSOR_PROX)  += hid-sensor-prox.o
+obj-$(CONFIG_IQS621_ALS)       += iqs621-als.o
 obj-$(CONFIG_SENSORS_ISL29018) += isl29018.o
 obj-$(CONFIG_SENSORS_ISL29028) += isl29028.o
 obj-$(CONFIG_ISL29125)         += isl29125.o
index 7a838e2..2198b50 100644 (file)
@@ -177,10 +177,14 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
        if (!indio_dev)
                return -ENOMEM;
 
-       ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+       ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+                                       cros_ec_sensors_capture,
+                                       cros_ec_sensors_push_data);
        if (ret)
                return ret;
 
+       iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);
+
        indio_dev->info = &cros_ec_light_prox_info;
        state = iio_priv(indio_dev);
        state->core.type = state->core.resp->info.type;
@@ -189,8 +193,7 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
 
        /* Common part */
        channel->info_mask_shared_by_all =
-               BIT(IIO_CHAN_INFO_SAMP_FREQ) |
-               BIT(IIO_CHAN_INFO_FREQUENCY);
+               BIT(IIO_CHAN_INFO_SAMP_FREQ);
        channel->info_mask_shared_by_all_available =
                BIT(IIO_CHAN_INFO_SAMP_FREQ);
        channel->scan_type.realbits = CROS_EC_SENSOR_BITS;
@@ -236,11 +239,6 @@ static int cros_ec_light_prox_probe(struct platform_device *pdev)
 
        state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
 
-       ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
-                                             cros_ec_sensors_capture, NULL);
-       if (ret)
-               return ret;
-
        return devm_iio_device_register(dev, indio_dev);
 }
 
@@ -258,7 +256,6 @@ MODULE_DEVICE_TABLE(platform, cros_ec_light_prox_ids);
 static struct platform_driver cros_ec_light_prox_platform_driver = {
        .driver = {
                .name   = "cros-ec-light-prox",
-               .pm     = &cros_ec_sensors_pm_ops,
        },
        .probe          = cros_ec_light_prox_probe,
        .id_table       = cros_ec_light_prox_ids,
diff --git a/drivers/iio/light/iqs621-als.c b/drivers/iio/light/iqs621-als.c
new file mode 100644 (file)
index 0000000..b2988a7
--- /dev/null
@@ -0,0 +1,617 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS621/622 Ambient Light Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iio/events.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#define IQS621_ALS_FLAGS_LIGHT                 BIT(7)
+#define IQS621_ALS_FLAGS_RANGE                 GENMASK(3, 0)
+
+#define IQS621_ALS_UI_OUT                      0x17
+
+#define IQS621_ALS_THRESH_DARK                 0x80
+#define IQS621_ALS_THRESH_LIGHT                        0x81
+
+#define IQS622_IR_RANGE                                0x15
+#define IQS622_IR_FLAGS                                0x16
+#define IQS622_IR_FLAGS_TOUCH                  BIT(1)
+#define IQS622_IR_FLAGS_PROX                   BIT(0)
+
+#define IQS622_IR_UI_OUT                       0x17
+
+#define IQS622_IR_THRESH_PROX                  0x91
+#define IQS622_IR_THRESH_TOUCH                 0x92
+
+struct iqs621_als_private {
+       struct iqs62x_core *iqs62x;
+       struct notifier_block notifier;
+       struct mutex lock;
+       bool light_en;
+       bool range_en;
+       bool prox_en;
+       u8 als_flags;
+       u8 ir_flags_mask;
+       u8 ir_flags;
+       u8 thresh_light;
+       u8 thresh_dark;
+       u8 thresh_prox;
+};
+
+static int iqs621_als_init(struct iqs621_als_private *iqs621_als)
+{
+       struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+       unsigned int event_mask = 0;
+       int ret;
+
+       switch (iqs621_als->ir_flags_mask) {
+       case IQS622_IR_FLAGS_TOUCH:
+               ret = regmap_write(iqs62x->regmap, IQS622_IR_THRESH_TOUCH,
+                                  iqs621_als->thresh_prox);
+               break;
+
+       case IQS622_IR_FLAGS_PROX:
+               ret = regmap_write(iqs62x->regmap, IQS622_IR_THRESH_PROX,
+                                  iqs621_als->thresh_prox);
+               break;
+
+       default:
+               ret = regmap_write(iqs62x->regmap, IQS621_ALS_THRESH_LIGHT,
+                                  iqs621_als->thresh_light);
+               if (ret)
+                       return ret;
+
+               ret = regmap_write(iqs62x->regmap, IQS621_ALS_THRESH_DARK,
+                                  iqs621_als->thresh_dark);
+       }
+
+       if (ret)
+               return ret;
+
+       if (iqs621_als->light_en || iqs621_als->range_en)
+               event_mask |= iqs62x->dev_desc->als_mask;
+
+       if (iqs621_als->prox_en)
+               event_mask |= iqs62x->dev_desc->ir_mask;
+
+       return regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+                                 event_mask, 0);
+}
+
+static int iqs621_als_notifier(struct notifier_block *notifier,
+                              unsigned long event_flags, void *context)
+{
+       struct iqs62x_event_data *event_data = context;
+       struct iqs621_als_private *iqs621_als;
+       struct iio_dev *indio_dev;
+       bool light_new, light_old;
+       bool prox_new, prox_old;
+       u8 range_new, range_old;
+       s64 timestamp;
+       int ret;
+
+       iqs621_als = container_of(notifier, struct iqs621_als_private,
+                                 notifier);
+       indio_dev = iio_priv_to_dev(iqs621_als);
+       timestamp = iio_get_time_ns(indio_dev);
+
+       mutex_lock(&iqs621_als->lock);
+
+       if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+               ret = iqs621_als_init(iqs621_als);
+               if (ret) {
+                       dev_err(indio_dev->dev.parent,
+                               "Failed to re-initialize device: %d\n", ret);
+                       ret = NOTIFY_BAD;
+               } else {
+                       ret = NOTIFY_OK;
+               }
+
+               goto err_mutex;
+       }
+
+       if (!iqs621_als->light_en && !iqs621_als->range_en &&
+           !iqs621_als->prox_en) {
+               ret = NOTIFY_DONE;
+               goto err_mutex;
+       }
+
+       /* IQS621 only */
+       light_new = event_data->als_flags & IQS621_ALS_FLAGS_LIGHT;
+       light_old = iqs621_als->als_flags & IQS621_ALS_FLAGS_LIGHT;
+
+       if (iqs621_als->light_en && light_new && !light_old)
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_LIGHT, 0,
+                                                   IIO_EV_TYPE_THRESH,
+                                                   IIO_EV_DIR_RISING),
+                              timestamp);
+       else if (iqs621_als->light_en && !light_new && light_old)
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_LIGHT, 0,
+                                                   IIO_EV_TYPE_THRESH,
+                                                   IIO_EV_DIR_FALLING),
+                              timestamp);
+
+       /* IQS621 and IQS622 */
+       range_new = event_data->als_flags & IQS621_ALS_FLAGS_RANGE;
+       range_old = iqs621_als->als_flags & IQS621_ALS_FLAGS_RANGE;
+
+       if (iqs621_als->range_en && (range_new > range_old))
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_INTENSITY, 0,
+                                                   IIO_EV_TYPE_CHANGE,
+                                                   IIO_EV_DIR_RISING),
+                              timestamp);
+       else if (iqs621_als->range_en && (range_new < range_old))
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_INTENSITY, 0,
+                                                   IIO_EV_TYPE_CHANGE,
+                                                   IIO_EV_DIR_FALLING),
+                              timestamp);
+
+       /* IQS622 only */
+       prox_new = event_data->ir_flags & iqs621_als->ir_flags_mask;
+       prox_old = iqs621_als->ir_flags & iqs621_als->ir_flags_mask;
+
+       if (iqs621_als->prox_en && prox_new && !prox_old)
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_PROXIMITY, 0,
+                                                   IIO_EV_TYPE_THRESH,
+                                                   IIO_EV_DIR_RISING),
+                              timestamp);
+       else if (iqs621_als->prox_en && !prox_new && prox_old)
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_PROXIMITY, 0,
+                                                   IIO_EV_TYPE_THRESH,
+                                                   IIO_EV_DIR_FALLING),
+                              timestamp);
+
+       iqs621_als->als_flags = event_data->als_flags;
+       iqs621_als->ir_flags = event_data->ir_flags;
+       ret = NOTIFY_OK;
+
+err_mutex:
+       mutex_unlock(&iqs621_als->lock);
+
+       return ret;
+}
+
+static void iqs621_als_notifier_unregister(void *context)
+{
+       struct iqs621_als_private *iqs621_als = context;
+       struct iio_dev *indio_dev = iio_priv_to_dev(iqs621_als);
+       int ret;
+
+       ret = blocking_notifier_chain_unregister(&iqs621_als->iqs62x->nh,
+                                                &iqs621_als->notifier);
+       if (ret)
+               dev_err(indio_dev->dev.parent,
+                       "Failed to unregister notifier: %d\n", ret);
+}
+
+static int iqs621_als_read_raw(struct iio_dev *indio_dev,
+                              struct iio_chan_spec const *chan,
+                              int *val, int *val2, long mask)
+{
+       struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+       struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+       int ret;
+       __le16 val_buf;
+
+       switch (chan->type) {
+       case IIO_INTENSITY:
+               ret = regmap_read(iqs62x->regmap, chan->address, val);
+               if (ret)
+                       return ret;
+
+               *val &= IQS621_ALS_FLAGS_RANGE;
+               return IIO_VAL_INT;
+
+       case IIO_PROXIMITY:
+       case IIO_LIGHT:
+               ret = regmap_raw_read(iqs62x->regmap, chan->address, &val_buf,
+                                     sizeof(val_buf));
+               if (ret)
+                       return ret;
+
+               *val = le16_to_cpu(val_buf);
+               return IIO_VAL_INT;
+
+       default:
+               return -EINVAL;
+       }
+}
+
+static int iqs621_als_read_event_config(struct iio_dev *indio_dev,
+                                       const struct iio_chan_spec *chan,
+                                       enum iio_event_type type,
+                                       enum iio_event_direction dir)
+{
+       struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+       int ret;
+
+       mutex_lock(&iqs621_als->lock);
+
+       switch (chan->type) {
+       case IIO_LIGHT:
+               ret = iqs621_als->light_en;
+               break;
+
+       case IIO_INTENSITY:
+               ret = iqs621_als->range_en;
+               break;
+
+       case IIO_PROXIMITY:
+               ret = iqs621_als->prox_en;
+               break;
+
+       default:
+               ret = -EINVAL;
+       }
+
+       mutex_unlock(&iqs621_als->lock);
+
+       return ret;
+}
+
+static int iqs621_als_write_event_config(struct iio_dev *indio_dev,
+                                        const struct iio_chan_spec *chan,
+                                        enum iio_event_type type,
+                                        enum iio_event_direction dir,
+                                        int state)
+{
+       struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+       struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+       unsigned int val;
+       int ret;
+
+       mutex_lock(&iqs621_als->lock);
+
+       ret = regmap_read(iqs62x->regmap, iqs62x->dev_desc->als_flags, &val);
+       if (ret)
+               goto err_mutex;
+       iqs621_als->als_flags = val;
+
+       switch (chan->type) {
+       case IIO_LIGHT:
+               ret = regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+                                        iqs62x->dev_desc->als_mask,
+                                        iqs621_als->range_en || state ? 0 :
+                                                                        0xFF);
+               if (!ret)
+                       iqs621_als->light_en = state;
+               break;
+
+       case IIO_INTENSITY:
+               ret = regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+                                        iqs62x->dev_desc->als_mask,
+                                        iqs621_als->light_en || state ? 0 :
+                                                                        0xFF);
+               if (!ret)
+                       iqs621_als->range_en = state;
+               break;
+
+       case IIO_PROXIMITY:
+               ret = regmap_read(iqs62x->regmap, IQS622_IR_FLAGS, &val);
+               if (ret)
+                       goto err_mutex;
+               iqs621_als->ir_flags = val;
+
+               ret = regmap_update_bits(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+                                        iqs62x->dev_desc->ir_mask,
+                                        state ? 0 : 0xFF);
+               if (!ret)
+                       iqs621_als->prox_en = state;
+               break;
+
+       default:
+               ret = -EINVAL;
+       }
+
+err_mutex:
+       mutex_unlock(&iqs621_als->lock);
+
+       return ret;
+}
+
+static int iqs621_als_read_event_value(struct iio_dev *indio_dev,
+                                      const struct iio_chan_spec *chan,
+                                      enum iio_event_type type,
+                                      enum iio_event_direction dir,
+                                      enum iio_event_info info,
+                                      int *val, int *val2)
+{
+       struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+       int ret = IIO_VAL_INT;
+
+       mutex_lock(&iqs621_als->lock);
+
+       switch (dir) {
+       case IIO_EV_DIR_RISING:
+               *val = iqs621_als->thresh_light * 16;
+               break;
+
+       case IIO_EV_DIR_FALLING:
+               *val = iqs621_als->thresh_dark * 4;
+               break;
+
+       case IIO_EV_DIR_EITHER:
+               if (iqs621_als->ir_flags_mask == IQS622_IR_FLAGS_TOUCH)
+                       *val = iqs621_als->thresh_prox * 4;
+               else
+                       *val = iqs621_als->thresh_prox;
+               break;
+
+       default:
+               ret = -EINVAL;
+       }
+
+       mutex_unlock(&iqs621_als->lock);
+
+       return ret;
+}
+
+static int iqs621_als_write_event_value(struct iio_dev *indio_dev,
+                                       const struct iio_chan_spec *chan,
+                                       enum iio_event_type type,
+                                       enum iio_event_direction dir,
+                                       enum iio_event_info info,
+                                       int val, int val2)
+{
+       struct iqs621_als_private *iqs621_als = iio_priv(indio_dev);
+       struct iqs62x_core *iqs62x = iqs621_als->iqs62x;
+       unsigned int thresh_reg, thresh_val;
+       u8 ir_flags_mask, *thresh_cache;
+       int ret = -EINVAL;
+
+       mutex_lock(&iqs621_als->lock);
+
+       switch (dir) {
+       case IIO_EV_DIR_RISING:
+               thresh_reg = IQS621_ALS_THRESH_LIGHT;
+               thresh_val = val / 16;
+
+               thresh_cache = &iqs621_als->thresh_light;
+               ir_flags_mask = 0;
+               break;
+
+       case IIO_EV_DIR_FALLING:
+               thresh_reg = IQS621_ALS_THRESH_DARK;
+               thresh_val = val / 4;
+
+               thresh_cache = &iqs621_als->thresh_dark;
+               ir_flags_mask = 0;
+               break;
+
+       case IIO_EV_DIR_EITHER:
+               /*
+                * The IQS622 supports two detection thresholds, both measured
+                * in the same arbitrary units reported by read_raw: proximity
+                * (0 through 255 in steps of 1), and touch (0 through 1020 in
+                * steps of 4).
+                *
+                * Based on the single detection threshold chosen by the user,
+                * select the hardware threshold that gives the best trade-off
+                * between range and resolution.
+                *
+                * By default, the close-range (but coarse) touch threshold is
+                * chosen during probe.
+                */
+               switch (val) {
+               case 0 ... 255:
+                       thresh_reg = IQS622_IR_THRESH_PROX;
+                       thresh_val = val;
+
+                       ir_flags_mask = IQS622_IR_FLAGS_PROX;
+                       break;
+
+               case 256 ... 1020:
+                       thresh_reg = IQS622_IR_THRESH_TOUCH;
+                       thresh_val = val / 4;
+
+                       ir_flags_mask = IQS622_IR_FLAGS_TOUCH;
+                       break;
+
+               default:
+                       goto err_mutex;
+               }
+
+               thresh_cache = &iqs621_als->thresh_prox;
+               break;
+
+       default:
+               goto err_mutex;
+       }
+
+       if (thresh_val > 0xFF)
+               goto err_mutex;
+
+       ret = regmap_write(iqs62x->regmap, thresh_reg, thresh_val);
+       if (ret)
+               goto err_mutex;
+
+       *thresh_cache = thresh_val;
+       iqs621_als->ir_flags_mask = ir_flags_mask;
+
+err_mutex:
+       mutex_unlock(&iqs621_als->lock);
+
+       return ret;
+}
+
+static const struct iio_info iqs621_als_info = {
+       .read_raw = &iqs621_als_read_raw,
+       .read_event_config = iqs621_als_read_event_config,
+       .write_event_config = iqs621_als_write_event_config,
+       .read_event_value = iqs621_als_read_event_value,
+       .write_event_value = iqs621_als_write_event_value,
+};
+
+static const struct iio_event_spec iqs621_als_range_events[] = {
+       {
+               .type = IIO_EV_TYPE_CHANGE,
+               .dir = IIO_EV_DIR_EITHER,
+               .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+       },
+};
+
+static const struct iio_event_spec iqs621_als_light_events[] = {
+       {
+               .type = IIO_EV_TYPE_THRESH,
+               .dir = IIO_EV_DIR_EITHER,
+               .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+       },
+       {
+               .type = IIO_EV_TYPE_THRESH,
+               .dir = IIO_EV_DIR_RISING,
+               .mask_separate = BIT(IIO_EV_INFO_VALUE),
+       },
+       {
+               .type = IIO_EV_TYPE_THRESH,
+               .dir = IIO_EV_DIR_FALLING,
+               .mask_separate = BIT(IIO_EV_INFO_VALUE),
+       },
+};
+
+static const struct iio_chan_spec iqs621_als_channels[] = {
+       {
+               .type = IIO_INTENSITY,
+               .address = IQS621_ALS_FLAGS,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+               .event_spec = iqs621_als_range_events,
+               .num_event_specs = ARRAY_SIZE(iqs621_als_range_events),
+       },
+       {
+               .type = IIO_LIGHT,
+               .address = IQS621_ALS_UI_OUT,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED),
+               .event_spec = iqs621_als_light_events,
+               .num_event_specs = ARRAY_SIZE(iqs621_als_light_events),
+       },
+};
+
+static const struct iio_event_spec iqs622_als_prox_events[] = {
+       {
+               .type = IIO_EV_TYPE_THRESH,
+               .dir = IIO_EV_DIR_EITHER,
+               .mask_separate = BIT(IIO_EV_INFO_ENABLE) |
+                                BIT(IIO_EV_INFO_VALUE),
+       },
+};
+
+static const struct iio_chan_spec iqs622_als_channels[] = {
+       {
+               .type = IIO_INTENSITY,
+               .channel2 = IIO_MOD_LIGHT_BOTH,
+               .address = IQS622_ALS_FLAGS,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+               .event_spec = iqs621_als_range_events,
+               .num_event_specs = ARRAY_SIZE(iqs621_als_range_events),
+               .modified = true,
+       },
+       {
+               .type = IIO_INTENSITY,
+               .channel2 = IIO_MOD_LIGHT_IR,
+               .address = IQS622_IR_RANGE,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+               .modified = true,
+       },
+       {
+               .type = IIO_PROXIMITY,
+               .address = IQS622_IR_UI_OUT,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
+               .event_spec = iqs622_als_prox_events,
+               .num_event_specs = ARRAY_SIZE(iqs622_als_prox_events),
+       },
+};
+
+static int iqs621_als_probe(struct platform_device *pdev)
+{
+       struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+       struct iqs621_als_private *iqs621_als;
+       struct iio_dev *indio_dev;
+       unsigned int val;
+       int ret;
+
+       indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*iqs621_als));
+       if (!indio_dev)
+               return -ENOMEM;
+
+       iqs621_als = iio_priv(indio_dev);
+       iqs621_als->iqs62x = iqs62x;
+
+       if (iqs62x->dev_desc->prod_num == IQS622_PROD_NUM) {
+               ret = regmap_read(iqs62x->regmap, IQS622_IR_THRESH_TOUCH,
+                                 &val);
+               if (ret)
+                       return ret;
+               iqs621_als->thresh_prox = val;
+               iqs621_als->ir_flags_mask = IQS622_IR_FLAGS_TOUCH;
+
+               indio_dev->channels = iqs622_als_channels;
+               indio_dev->num_channels = ARRAY_SIZE(iqs622_als_channels);
+       } else {
+               ret = regmap_read(iqs62x->regmap, IQS621_ALS_THRESH_LIGHT,
+                                 &val);
+               if (ret)
+                       return ret;
+               iqs621_als->thresh_light = val;
+
+               ret = regmap_read(iqs62x->regmap, IQS621_ALS_THRESH_DARK,
+                                 &val);
+               if (ret)
+                       return ret;
+               iqs621_als->thresh_dark = val;
+
+               indio_dev->channels = iqs621_als_channels;
+               indio_dev->num_channels = ARRAY_SIZE(iqs621_als_channels);
+       }
+
+       indio_dev->modes = INDIO_DIRECT_MODE;
+       indio_dev->dev.parent = &pdev->dev;
+       indio_dev->name = iqs62x->dev_desc->dev_name;
+       indio_dev->info = &iqs621_als_info;
+
+       mutex_init(&iqs621_als->lock);
+
+       iqs621_als->notifier.notifier_call = iqs621_als_notifier;
+       ret = blocking_notifier_chain_register(&iqs621_als->iqs62x->nh,
+                                              &iqs621_als->notifier);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to register notifier: %d\n", ret);
+               return ret;
+       }
+
+       ret = devm_add_action_or_reset(&pdev->dev,
+                                      iqs621_als_notifier_unregister,
+                                      iqs621_als);
+       if (ret)
+               return ret;
+
+       return devm_iio_device_register(&pdev->dev, indio_dev);
+}
+
+static struct platform_driver iqs621_als_platform_driver = {
+       .driver = {
+               .name = "iqs621-als",
+       },
+       .probe = iqs621_als_probe,
+};
+module_platform_driver(iqs621_als_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS621/622 Ambient Light Sensors");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs621-als");
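As the comment in iqs621_als_write_event_value() explains, the IQS622 proximity
event threshold is mapped onto one of two registers depending on the requested
value: 0 through 255 is written to the proximity threshold unchanged, while 256
through 1020 is divided by 4 and written to the touch threshold. Writing 600,
for example, selects the touch register and stores 150; reading the threshold
back then reports 150 * 4 = 600. A small sketch of that selection (the helper
name is illustrative only):

	/* Sketch: choose the IQS622 register and value for a requested threshold. */
	static int iqs622_pick_thresh(int val, unsigned int *reg_val, bool *use_touch)
	{
		if (val >= 0 && val <= 255) {
			*use_touch = false;	/* fine resolution, short range */
			*reg_val = val;
		} else if (val >= 256 && val <= 1020) {
			*use_touch = true;	/* coarse resolution, longer range */
			*reg_val = val / 4;
		} else {
			return -EINVAL;		/* out of range, as in the driver */
		}

		return 0;
	}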
diff --git a/drivers/iio/position/Kconfig b/drivers/iio/position/Kconfig
new file mode 100644 (file)
index 0000000..eda67f0
--- /dev/null
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Linear and angular position sensors
+#
+# When adding new entries keep the list in alphabetical order
+
+menu "Linear and angular position sensors"
+
+config IQS624_POS
+       tristate "Azoteq IQS624/625 angular position sensors"
+       depends on MFD_IQS62X || COMPILE_TEST
+       help
+         Say Y here if you want to build support for the Azoteq IQS624
+         and IQS625 angular position sensors.
+
+         To compile this driver as a module, choose M here: the module
+         will be called iqs624-pos.
+
+endmenu
diff --git a/drivers/iio/position/Makefile b/drivers/iio/position/Makefile
new file mode 100644 (file)
index 0000000..3cbe7a7
--- /dev/null
@@ -0,0 +1,7 @@
+#
+# Makefile for IIO linear and angular position sensors
+#
+
+# When adding new entries keep the list in alphabetical order
+
+obj-$(CONFIG_IQS624_POS)       += iqs624-pos.o
diff --git a/drivers/iio/position/iqs624-pos.c b/drivers/iio/position/iqs624-pos.c
new file mode 100644 (file)
index 0000000..77096c3
--- /dev/null
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS624/625 Angular Position Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iio/events.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#define IQS624_POS_DEG_OUT                     0x16
+
+#define IQS624_POS_SCALE1                      (314159 / 180)
+#define IQS624_POS_SCALE2                      100000
+
+struct iqs624_pos_private {
+       struct iqs62x_core *iqs62x;
+       struct notifier_block notifier;
+       struct mutex lock;
+       bool angle_en;
+       u16 angle;
+};
+
+static int iqs624_pos_angle_en(struct iqs62x_core *iqs62x, bool angle_en)
+{
+       unsigned int event_mask = IQS624_HALL_UI_WHL_EVENT;
+
+       /*
+        * The IQS625 reports angular position in the form of coarse intervals,
+        * so only interval change events are unmasked. Conversely, the IQS624
+        * reports angular position down to one degree of resolution, so wheel
+        * movement events are unmasked instead.
+        */
+       if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM)
+               event_mask = IQS624_HALL_UI_INT_EVENT;
+
+       return regmap_update_bits(iqs62x->regmap, IQS624_HALL_UI, event_mask,
+                                 angle_en ? 0 : 0xFF);
+}
+
+static int iqs624_pos_notifier(struct notifier_block *notifier,
+                              unsigned long event_flags, void *context)
+{
+       struct iqs62x_event_data *event_data = context;
+       struct iqs624_pos_private *iqs624_pos;
+       struct iqs62x_core *iqs62x;
+       struct iio_dev *indio_dev;
+       u16 angle = event_data->ui_data;
+       s64 timestamp;
+       int ret;
+
+       iqs624_pos = container_of(notifier, struct iqs624_pos_private,
+                                 notifier);
+       indio_dev = iio_priv_to_dev(iqs624_pos);
+       timestamp = iio_get_time_ns(indio_dev);
+
+       iqs62x = iqs624_pos->iqs62x;
+       if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM)
+               angle = event_data->interval;
+
+       mutex_lock(&iqs624_pos->lock);
+
+       if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+               ret = iqs624_pos_angle_en(iqs62x, iqs624_pos->angle_en);
+               if (ret) {
+                       dev_err(indio_dev->dev.parent,
+                               "Failed to re-initialize device: %d\n", ret);
+                       ret = NOTIFY_BAD;
+               } else {
+                       ret = NOTIFY_OK;
+               }
+       } else if (iqs624_pos->angle_en && (angle != iqs624_pos->angle)) {
+               iio_push_event(indio_dev,
+                              IIO_UNMOD_EVENT_CODE(IIO_ANGL, 0,
+                                                   IIO_EV_TYPE_CHANGE,
+                                                   IIO_EV_DIR_NONE),
+                              timestamp);
+
+               iqs624_pos->angle = angle;
+               ret = NOTIFY_OK;
+       } else {
+               ret = NOTIFY_DONE;
+       }
+
+       mutex_unlock(&iqs624_pos->lock);
+
+       return ret;
+}
+
+static void iqs624_pos_notifier_unregister(void *context)
+{
+       struct iqs624_pos_private *iqs624_pos = context;
+       struct iio_dev *indio_dev = iio_priv_to_dev(iqs624_pos);
+       int ret;
+
+       ret = blocking_notifier_chain_unregister(&iqs624_pos->iqs62x->nh,
+                                                &iqs624_pos->notifier);
+       if (ret)
+               dev_err(indio_dev->dev.parent,
+                       "Failed to unregister notifier: %d\n", ret);
+}
+
+static int iqs624_pos_angle_get(struct iqs62x_core *iqs62x, unsigned int *val)
+{
+       int ret;
+       __le16 val_buf;
+
+       if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM)
+               return regmap_read(iqs62x->regmap, iqs62x->dev_desc->interval,
+                                  val);
+
+       ret = regmap_raw_read(iqs62x->regmap, IQS624_POS_DEG_OUT, &val_buf,
+                             sizeof(val_buf));
+       if (ret)
+               return ret;
+
+       *val = le16_to_cpu(val_buf);
+
+       return 0;
+}
+
+static int iqs624_pos_read_raw(struct iio_dev *indio_dev,
+                              struct iio_chan_spec const *chan,
+                              int *val, int *val2, long mask)
+{
+       struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
+       struct iqs62x_core *iqs62x = iqs624_pos->iqs62x;
+       unsigned int scale = 1;
+       int ret;
+
+       switch (mask) {
+       case IIO_CHAN_INFO_RAW:
+               ret = iqs624_pos_angle_get(iqs62x, val);
+               if (ret)
+                       return ret;
+
+               return IIO_VAL_INT;
+
+       case IIO_CHAN_INFO_SCALE:
+               if (iqs62x->dev_desc->prod_num == IQS625_PROD_NUM) {
+                       ret = regmap_read(iqs62x->regmap, IQS624_INTERVAL_DIV,
+                                         &scale);
+                       if (ret)
+                               return ret;
+               }
+
+               *val = scale * IQS624_POS_SCALE1;
+               *val2 = IQS624_POS_SCALE2;
+               return IIO_VAL_FRACTIONAL;
+
+       default:
+               return -EINVAL;
+       }
+}
+
+static int iqs624_pos_read_event_config(struct iio_dev *indio_dev,
+                                       const struct iio_chan_spec *chan,
+                                       enum iio_event_type type,
+                                       enum iio_event_direction dir)
+{
+       struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
+       int ret;
+
+       mutex_lock(&iqs624_pos->lock);
+       ret = iqs624_pos->angle_en;
+       mutex_unlock(&iqs624_pos->lock);
+
+       return ret;
+}
+
+static int iqs624_pos_write_event_config(struct iio_dev *indio_dev,
+                                        const struct iio_chan_spec *chan,
+                                        enum iio_event_type type,
+                                        enum iio_event_direction dir,
+                                        int state)
+{
+       struct iqs624_pos_private *iqs624_pos = iio_priv(indio_dev);
+       struct iqs62x_core *iqs62x = iqs624_pos->iqs62x;
+       unsigned int val;
+       int ret;
+
+       mutex_lock(&iqs624_pos->lock);
+
+       ret = iqs624_pos_angle_get(iqs62x, &val);
+       if (ret)
+               goto err_mutex;
+
+       ret = iqs624_pos_angle_en(iqs62x, state);
+       if (ret)
+               goto err_mutex;
+
+       iqs624_pos->angle = val;
+       iqs624_pos->angle_en = state;
+
+err_mutex:
+       mutex_unlock(&iqs624_pos->lock);
+
+       return ret;
+}
+
+static const struct iio_info iqs624_pos_info = {
+       .read_raw = &iqs624_pos_read_raw,
+       .read_event_config = iqs624_pos_read_event_config,
+       .write_event_config = iqs624_pos_write_event_config,
+};
+
+static const struct iio_event_spec iqs624_pos_events[] = {
+       {
+               .type = IIO_EV_TYPE_CHANGE,
+               .dir = IIO_EV_DIR_NONE,
+               .mask_separate = BIT(IIO_EV_INFO_ENABLE),
+       },
+};
+
+static const struct iio_chan_spec iqs624_pos_channels[] = {
+       {
+               .type = IIO_ANGL,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+                                     BIT(IIO_CHAN_INFO_SCALE),
+               .event_spec = iqs624_pos_events,
+               .num_event_specs = ARRAY_SIZE(iqs624_pos_events),
+       },
+};
+
+static int iqs624_pos_probe(struct platform_device *pdev)
+{
+       struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+       struct iqs624_pos_private *iqs624_pos;
+       struct iio_dev *indio_dev;
+       int ret;
+
+       indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*iqs624_pos));
+       if (!indio_dev)
+               return -ENOMEM;
+
+       iqs624_pos = iio_priv(indio_dev);
+       iqs624_pos->iqs62x = iqs62x;
+
+       indio_dev->modes = INDIO_DIRECT_MODE;
+       indio_dev->dev.parent = &pdev->dev;
+       indio_dev->channels = iqs624_pos_channels;
+       indio_dev->num_channels = ARRAY_SIZE(iqs624_pos_channels);
+       indio_dev->name = iqs62x->dev_desc->dev_name;
+       indio_dev->info = &iqs624_pos_info;
+
+       mutex_init(&iqs624_pos->lock);
+
+       iqs624_pos->notifier.notifier_call = iqs624_pos_notifier;
+       ret = blocking_notifier_chain_register(&iqs624_pos->iqs62x->nh,
+                                              &iqs624_pos->notifier);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to register notifier: %d\n", ret);
+               return ret;
+       }
+
+       ret = devm_add_action_or_reset(&pdev->dev,
+                                      iqs624_pos_notifier_unregister,
+                                      iqs624_pos);
+       if (ret)
+               return ret;
+
+       return devm_iio_device_register(&pdev->dev, indio_dev);
+}
+
+static struct platform_driver iqs624_pos_platform_driver = {
+       .driver = {
+               .name = "iqs624-pos",
+       },
+       .probe = iqs624_pos_probe,
+};
+module_platform_driver(iqs624_pos_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS624/625 Angular Position Sensors");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs624-pos");
index b521beb..c079b89 100644 (file)
@@ -134,10 +134,14 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
        if (!indio_dev)
                return -ENOMEM;
 
-       ret = cros_ec_sensors_core_init(pdev, indio_dev, true);
+       ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+                                       cros_ec_sensors_capture,
+                                       cros_ec_sensors_push_data);
        if (ret)
                return ret;
 
+       iio_buffer_set_attrs(indio_dev->buffer, cros_ec_sensor_fifo_attributes);
+
        indio_dev->info = &cros_ec_baro_info;
        state = iio_priv(indio_dev);
        state->core.type = state->core.resp->info.type;
@@ -147,8 +151,7 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
        channel->info_mask_separate = BIT(IIO_CHAN_INFO_RAW);
        channel->info_mask_shared_by_all =
                BIT(IIO_CHAN_INFO_SCALE) |
-               BIT(IIO_CHAN_INFO_SAMP_FREQ) |
-               BIT(IIO_CHAN_INFO_FREQUENCY);
+               BIT(IIO_CHAN_INFO_SAMP_FREQ);
        channel->info_mask_shared_by_all_available =
                BIT(IIO_CHAN_INFO_SAMP_FREQ);
        channel->scan_type.realbits = CROS_EC_SENSOR_BITS;
@@ -182,11 +185,6 @@ static int cros_ec_baro_probe(struct platform_device *pdev)
 
        state->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
 
-       ret = devm_iio_triggered_buffer_setup(dev, indio_dev, NULL,
-                                             cros_ec_sensors_capture, NULL);
-       if (ret)
-               return ret;
-
        return devm_iio_device_register(dev, indio_dev);
 }
 
index e1ccb40..f1f2a14 100644 (file)
@@ -4,6 +4,16 @@
 #
 menu "Temperature sensors"
 
+config IQS620AT_TEMP
+       tristate "Azoteq IQS620AT temperature sensor"
+       depends on MFD_IQS62X || COMPILE_TEST
+       help
+         Say Y here if you want to build support for the Azoteq IQS620AT
+         temperature sensor.
+
+         To compile this driver as a module, choose M here: the module
+         will be called iqs620at-temp.
+
 config LTC2983
        tristate "Analog Devices Multi-Sensor Digital Temperature Measurement System"
        depends on SPI
index d6b850b..90c1131 100644 (file)
@@ -3,6 +3,7 @@
 # Makefile for industrial I/O temperature drivers
 #
 
+obj-$(CONFIG_IQS620AT_TEMP) += iqs620at-temp.o
 obj-$(CONFIG_LTC2983) += ltc2983.o
 obj-$(CONFIG_HID_SENSOR_TEMP) += hid-sensor-temperature.o
 obj-$(CONFIG_MAXIM_THERMOCOUPLE) += maxim_thermocouple.o
diff --git a/drivers/iio/temperature/iqs620at-temp.c b/drivers/iio/temperature/iqs620at-temp.c
new file mode 100644 (file)
index 0000000..3fd52b3
--- /dev/null
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS620AT Temperature Sensor
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#define IQS620_TEMP_UI_OUT                     0x1A
+
+#define IQS620_TEMP_SCALE                      1000
+#define IQS620_TEMP_OFFSET                     (-100)
+
+static int iqs620_temp_read_raw(struct iio_dev *indio_dev,
+                               struct iio_chan_spec const *chan,
+                               int *val, int *val2, long mask)
+{
+       struct iqs62x_core *iqs62x = iio_device_get_drvdata(indio_dev);
+       int ret;
+       __le16 val_buf;
+
+       switch (mask) {
+       case IIO_CHAN_INFO_RAW:
+               ret = regmap_raw_read(iqs62x->regmap, IQS620_TEMP_UI_OUT,
+                                     &val_buf, sizeof(val_buf));
+               if (ret)
+                       return ret;
+
+               *val = le16_to_cpu(val_buf);
+               return IIO_VAL_INT;
+
+       case IIO_CHAN_INFO_SCALE:
+               *val = IQS620_TEMP_SCALE;
+               return IIO_VAL_INT;
+
+       case IIO_CHAN_INFO_OFFSET:
+               *val = IQS620_TEMP_OFFSET;
+               return IIO_VAL_INT;
+
+       default:
+               return -EINVAL;
+       }
+}
+
+static const struct iio_info iqs620_temp_info = {
+       .read_raw = &iqs620_temp_read_raw,
+};
+
+static const struct iio_chan_spec iqs620_temp_channels[] = {
+       {
+               .type = IIO_TEMP,
+               .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
+                                     BIT(IIO_CHAN_INFO_SCALE) |
+                                     BIT(IIO_CHAN_INFO_OFFSET),
+       },
+};
+
+static int iqs620_temp_probe(struct platform_device *pdev)
+{
+       struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+       struct iio_dev *indio_dev;
+
+       indio_dev = devm_iio_device_alloc(&pdev->dev, 0);
+       if (!indio_dev)
+               return -ENOMEM;
+
+       iio_device_set_drvdata(indio_dev, iqs62x);
+
+       indio_dev->modes = INDIO_DIRECT_MODE;
+       indio_dev->dev.parent = &pdev->dev;
+       indio_dev->channels = iqs620_temp_channels;
+       indio_dev->num_channels = ARRAY_SIZE(iqs620_temp_channels);
+       indio_dev->name = iqs62x->dev_desc->dev_name;
+       indio_dev->info = &iqs620_temp_info;
+
+       return devm_iio_device_register(&pdev->dev, indio_dev);
+}
+
+static struct platform_driver iqs620_temp_platform_driver = {
+       .driver = {
+               .name = "iqs620at-temp",
+       },
+       .probe = iqs620_temp_probe,
+};
+module_platform_driver(iqs620_temp_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS620AT Temperature Sensor");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs620at-temp");
index 4706ff0..28de965 100644 (file)
@@ -663,6 +663,16 @@ config KEYBOARD_IPAQ_MICRO
          To compile this driver as a module, choose M here: the
          module will be called ipaq-micro-keys.
 
+config KEYBOARD_IQS62X
+       tristate "Azoteq IQS620A/621/622/624/625 keys and switches"
+       depends on MFD_IQS62X
+       help
+         Say Y here to enable key and switch support for the Azoteq IQS620A,
+         IQS621, IQS622, IQS624 and IQS625 multi-function sensors.
+
+         To compile this driver as a module, choose M here: the module will
+         be called iqs62x-keys.
+
 config KEYBOARD_OMAP
        tristate "TI OMAP keypad support"
        depends on ARCH_OMAP1
index f5b1752..1d689fd 100644 (file)
@@ -28,6 +28,7 @@ obj-$(CONFIG_KEYBOARD_TCA8418)                += tca8418_keypad.o
 obj-$(CONFIG_KEYBOARD_HIL)             += hil_kbd.o
 obj-$(CONFIG_KEYBOARD_HIL_OLD)         += hilkbd.o
 obj-$(CONFIG_KEYBOARD_IPAQ_MICRO)      += ipaq-micro-keys.o
+obj-$(CONFIG_KEYBOARD_IQS62X)          += iqs62x-keys.o
 obj-$(CONFIG_KEYBOARD_IMX)             += imx_keypad.o
 obj-$(CONFIG_KEYBOARD_IMX_SC_KEY)      += imx_sc_key.o
 obj-$(CONFIG_KEYBOARD_HP6XX)           += jornada680_kbd.o
diff --git a/drivers/input/keyboard/iqs62x-keys.c b/drivers/input/keyboard/iqs62x-keys.c
new file mode 100644 (file)
index 0000000..93446b2
--- /dev/null
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS620A/621/622/624/625 Keys and Switches
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#include <linux/device.h>
+#include <linux/input.h>
+#include <linux/kernel.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+
+enum {
+       IQS62X_SW_HALL_N,
+       IQS62X_SW_HALL_S,
+};
+
+static const char * const iqs62x_switch_names[] = {
+       [IQS62X_SW_HALL_N] = "hall-switch-north",
+       [IQS62X_SW_HALL_S] = "hall-switch-south",
+};
+
+struct iqs62x_switch_desc {
+       enum iqs62x_event_flag flag;
+       unsigned int code;
+       bool enabled;
+};
+
+struct iqs62x_keys_private {
+       struct iqs62x_core *iqs62x;
+       struct input_dev *input;
+       struct notifier_block notifier;
+       struct iqs62x_switch_desc switches[ARRAY_SIZE(iqs62x_switch_names)];
+       unsigned int keycode[IQS62X_NUM_KEYS];
+       unsigned int keycodemax;
+       u8 interval;
+};
+
+static int iqs62x_keys_parse_prop(struct platform_device *pdev,
+                                 struct iqs62x_keys_private *iqs62x_keys)
+{
+       struct fwnode_handle *child;
+       unsigned int val;
+       int ret, i;
+
+       ret = device_property_count_u32(&pdev->dev, "linux,keycodes");
+       if (ret > IQS62X_NUM_KEYS) {
+               dev_err(&pdev->dev, "Too many keycodes present\n");
+               return -EINVAL;
+       } else if (ret < 0) {
+               dev_err(&pdev->dev, "Failed to count keycodes: %d\n", ret);
+               return ret;
+       }
+       iqs62x_keys->keycodemax = ret;
+
+       ret = device_property_read_u32_array(&pdev->dev, "linux,keycodes",
+                                            iqs62x_keys->keycode,
+                                            iqs62x_keys->keycodemax);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to read keycodes: %d\n", ret);
+               return ret;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++) {
+               child = device_get_named_child_node(&pdev->dev,
+                                                   iqs62x_switch_names[i]);
+               if (!child)
+                       continue;
+
+               ret = fwnode_property_read_u32(child, "linux,code", &val);
+               if (ret) {
+                       dev_err(&pdev->dev, "Failed to read switch code: %d\n",
+                               ret);
+                       return ret;
+               }
+               iqs62x_keys->switches[i].code = val;
+               iqs62x_keys->switches[i].enabled = true;
+
+               if (fwnode_property_present(child, "azoteq,use-prox"))
+                       iqs62x_keys->switches[i].flag = (i == IQS62X_SW_HALL_N ?
+                                                        IQS62X_EVENT_HALL_N_P :
+                                                        IQS62X_EVENT_HALL_S_P);
+               else
+                       iqs62x_keys->switches[i].flag = (i == IQS62X_SW_HALL_N ?
+                                                        IQS62X_EVENT_HALL_N_T :
+                                                        IQS62X_EVENT_HALL_S_T);
+       }
+
+       return 0;
+}
+
+static int iqs62x_keys_init(struct iqs62x_keys_private *iqs62x_keys)
+{
+       struct iqs62x_core *iqs62x = iqs62x_keys->iqs62x;
+       enum iqs62x_event_flag flag;
+       unsigned int event_reg, val;
+       unsigned int event_mask = 0;
+       int ret, i;
+
+       switch (iqs62x->dev_desc->prod_num) {
+       case IQS620_PROD_NUM:
+       case IQS621_PROD_NUM:
+       case IQS622_PROD_NUM:
+               event_reg = IQS620_GLBL_EVENT_MASK;
+
+               /*
+                * Discrete button, hysteresis and SAR UI flags represent keys
+                * and are unmasked if mapped to a valid keycode.
+                */
+               for (i = 0; i < iqs62x_keys->keycodemax; i++) {
+                       if (iqs62x_keys->keycode[i] == KEY_RESERVED)
+                               continue;
+
+                       if (iqs62x_events[i].reg == IQS62X_EVENT_PROX)
+                               event_mask |= iqs62x->dev_desc->prox_mask;
+                       else if (iqs62x_events[i].reg == IQS62X_EVENT_HYST)
+                               event_mask |= (iqs62x->dev_desc->hyst_mask |
+                                              iqs62x->dev_desc->sar_mask);
+               }
+
+               ret = regmap_read(iqs62x->regmap, iqs62x->dev_desc->hall_flags,
+                                 &val);
+               if (ret)
+                       return ret;
+
+               /*
+                * Hall UI flags represent switches and are unmasked if their
+                * corresponding child nodes are present.
+                */
+               for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++) {
+                       if (!(iqs62x_keys->switches[i].enabled))
+                               continue;
+
+                       flag = iqs62x_keys->switches[i].flag;
+
+                       if (iqs62x_events[flag].reg != IQS62X_EVENT_HALL)
+                               continue;
+
+                       event_mask |= iqs62x->dev_desc->hall_mask;
+
+                       input_report_switch(iqs62x_keys->input,
+                                           iqs62x_keys->switches[i].code,
+                                           (val & iqs62x_events[flag].mask) ==
+                                           iqs62x_events[flag].val);
+               }
+
+               input_sync(iqs62x_keys->input);
+               break;
+
+       case IQS624_PROD_NUM:
+               event_reg = IQS624_HALL_UI;
+
+               /*
+                * Interval change events represent keys and are unmasked if
+                * either wheel movement flag is mapped to a valid keycode.
+                */
+               if (iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_UP] != KEY_RESERVED)
+                       event_mask |= IQS624_HALL_UI_INT_EVENT;
+
+               if (iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_DN] != KEY_RESERVED)
+                       event_mask |= IQS624_HALL_UI_INT_EVENT;
+
+               ret = regmap_read(iqs62x->regmap, iqs62x->dev_desc->interval,
+                                 &val);
+               if (ret)
+                       return ret;
+
+               iqs62x_keys->interval = val;
+               break;
+
+       default:
+               return 0;
+       }
+
+       return regmap_update_bits(iqs62x->regmap, event_reg, event_mask, 0);
+}
+
+static int iqs62x_keys_notifier(struct notifier_block *notifier,
+                               unsigned long event_flags, void *context)
+{
+       struct iqs62x_event_data *event_data = context;
+       struct iqs62x_keys_private *iqs62x_keys;
+       int ret, i;
+
+       iqs62x_keys = container_of(notifier, struct iqs62x_keys_private,
+                                  notifier);
+
+       if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+               ret = iqs62x_keys_init(iqs62x_keys);
+               if (ret) {
+                       dev_err(iqs62x_keys->input->dev.parent,
+                               "Failed to re-initialize device: %d\n", ret);
+                       return NOTIFY_BAD;
+               }
+
+               return NOTIFY_OK;
+       }
+
+       for (i = 0; i < iqs62x_keys->keycodemax; i++) {
+               if (iqs62x_events[i].reg == IQS62X_EVENT_WHEEL &&
+                   event_data->interval == iqs62x_keys->interval)
+                       continue;
+
+               input_report_key(iqs62x_keys->input, iqs62x_keys->keycode[i],
+                                event_flags & BIT(i));
+       }
+
+       for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++)
+               if (iqs62x_keys->switches[i].enabled)
+                       input_report_switch(iqs62x_keys->input,
+                                           iqs62x_keys->switches[i].code,
+                                           event_flags &
+                                           BIT(iqs62x_keys->switches[i].flag));
+
+       input_sync(iqs62x_keys->input);
+
+       if (event_data->interval == iqs62x_keys->interval)
+               return NOTIFY_OK;
+
+       /*
+        * Each frame contains at most one wheel event (up or down), in which
+        * case a complementary release cycle is emulated.
+        */
+       if (event_flags & BIT(IQS62X_EVENT_WHEEL_UP)) {
+               input_report_key(iqs62x_keys->input,
+                                iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_UP],
+                                0);
+               input_sync(iqs62x_keys->input);
+       } else if (event_flags & BIT(IQS62X_EVENT_WHEEL_DN)) {
+               input_report_key(iqs62x_keys->input,
+                                iqs62x_keys->keycode[IQS62X_EVENT_WHEEL_DN],
+                                0);
+               input_sync(iqs62x_keys->input);
+       }
+
+       iqs62x_keys->interval = event_data->interval;
+
+       return NOTIFY_OK;
+}
+
+static int iqs62x_keys_probe(struct platform_device *pdev)
+{
+       struct iqs62x_core *iqs62x = dev_get_drvdata(pdev->dev.parent);
+       struct iqs62x_keys_private *iqs62x_keys;
+       struct input_dev *input;
+       int ret, i;
+
+       iqs62x_keys = devm_kzalloc(&pdev->dev, sizeof(*iqs62x_keys),
+                                  GFP_KERNEL);
+       if (!iqs62x_keys)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, iqs62x_keys);
+
+       ret = iqs62x_keys_parse_prop(pdev, iqs62x_keys);
+       if (ret)
+               return ret;
+
+       input = devm_input_allocate_device(&pdev->dev);
+       if (!input)
+               return -ENOMEM;
+
+       input->keycodemax = iqs62x_keys->keycodemax;
+       input->keycode = iqs62x_keys->keycode;
+       input->keycodesize = sizeof(*iqs62x_keys->keycode);
+
+       input->name = iqs62x->dev_desc->dev_name;
+       input->id.bustype = BUS_I2C;
+
+       for (i = 0; i < iqs62x_keys->keycodemax; i++)
+               if (iqs62x_keys->keycode[i] != KEY_RESERVED)
+                       input_set_capability(input, EV_KEY,
+                                            iqs62x_keys->keycode[i]);
+
+       for (i = 0; i < ARRAY_SIZE(iqs62x_keys->switches); i++)
+               if (iqs62x_keys->switches[i].enabled)
+                       input_set_capability(input, EV_SW,
+                                            iqs62x_keys->switches[i].code);
+
+       iqs62x_keys->iqs62x = iqs62x;
+       iqs62x_keys->input = input;
+
+       ret = iqs62x_keys_init(iqs62x_keys);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to initialize device: %d\n", ret);
+               return ret;
+       }
+
+       ret = input_register_device(iqs62x_keys->input);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to register device: %d\n", ret);
+               return ret;
+       }
+
+       iqs62x_keys->notifier.notifier_call = iqs62x_keys_notifier;
+       ret = blocking_notifier_chain_register(&iqs62x_keys->iqs62x->nh,
+                                              &iqs62x_keys->notifier);
+       if (ret)
+               dev_err(&pdev->dev, "Failed to register notifier: %d\n", ret);
+
+       return ret;
+}
+
+static int iqs62x_keys_remove(struct platform_device *pdev)
+{
+       struct iqs62x_keys_private *iqs62x_keys = platform_get_drvdata(pdev);
+       int ret;
+
+       ret = blocking_notifier_chain_unregister(&iqs62x_keys->iqs62x->nh,
+                                                &iqs62x_keys->notifier);
+       if (ret)
+               dev_err(&pdev->dev, "Failed to unregister notifier: %d\n", ret);
+
+       return ret;
+}
+
+static struct platform_driver iqs62x_keys_platform_driver = {
+       .driver = {
+               .name = "iqs62x-keys",
+       },
+       .probe = iqs62x_keys_probe,
+       .remove = iqs62x_keys_remove,
+};
+module_platform_driver(iqs62x_keys_platform_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS620A/621/622/624/625 Keys and Switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iqs62x-keys");
index dc974c2..08e919d 100644 (file)
@@ -530,6 +530,17 @@ static const struct dmi_system_id __initconst i8042_dmi_nomux_table[] = {
                        DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo LaVie Z"),
                },
        },
+       {
+               /*
+                * Acer Aspire 5738z
+                * Touchpad stops working in mux mode when disabled and re-enabled
+                * with the touchpad enable/disable toggle hotkey
+                */
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5738"),
+               },
+       },
        { }
 };
 
index 4911799..14c577c 100644 (file)
@@ -1309,6 +1309,7 @@ static int elants_i2c_probe(struct i2c_client *client,
        input_set_abs_params(ts->input, ABS_MT_PRESSURE, 0, 255, 0, 0);
        input_abs_set_res(ts->input, ABS_MT_POSITION_X, ts->x_res);
        input_abs_set_res(ts->input, ABS_MT_POSITION_Y, ts->y_res);
+       input_abs_set_res(ts->input, ABS_MT_TOUCH_MAJOR, 1);
 
        error = input_register_device(ts->input);
        if (error) {
index 0403102..02c75ea 100644 (file)
 #include <linux/of.h>
 #include <asm/unaligned.h>
 
-struct goodix_ts_data;
-
-struct goodix_chip_data {
-       u16 config_addr;
-       int config_len;
-       int (*check_config)(struct goodix_ts_data *, const struct firmware *);
-};
-
-struct goodix_ts_data {
-       struct i2c_client *client;
-       struct input_dev *input_dev;
-       const struct goodix_chip_data *chip;
-       struct touchscreen_properties prop;
-       unsigned int max_touch_num;
-       unsigned int int_trigger_type;
-       struct regulator *avdd28;
-       struct regulator *vddio;
-       struct gpio_desc *gpiod_int;
-       struct gpio_desc *gpiod_rst;
-       u16 id;
-       u16 version;
-       const char *cfg_name;
-       struct completion firmware_loading_complete;
-       unsigned long irq_flags;
-       unsigned int contact_size;
-};
-
 #define GOODIX_GPIO_INT_NAME           "irq"
 #define GOODIX_GPIO_RST_NAME           "reset"
 
@@ -65,10 +38,13 @@ struct goodix_ts_data {
 #define GOODIX_CONTACT_SIZE            8
 #define GOODIX_MAX_CONTACT_SIZE                9
 #define GOODIX_MAX_CONTACTS            10
+#define GOODIX_MAX_KEYS                        7
 
-#define GOODIX_CONFIG_MAX_LENGTH       240
+#define GOODIX_CONFIG_MIN_LENGTH       186
 #define GOODIX_CONFIG_911_LENGTH       186
 #define GOODIX_CONFIG_967_LENGTH       228
+#define GOODIX_CONFIG_GT9X_LENGTH      240
+#define GOODIX_CONFIG_MAX_LENGTH       240
 
 /* Register defines */
 #define GOODIX_REG_COMMAND             0x8040
@@ -80,39 +56,118 @@ struct goodix_ts_data {
 #define GOODIX_REG_ID                  0x8140
 
 #define GOODIX_BUFFER_STATUS_READY     BIT(7)
+#define GOODIX_HAVE_KEY                        BIT(4)
 #define GOODIX_BUFFER_STATUS_TIMEOUT   20
 
 #define RESOLUTION_LOC         1
 #define MAX_CONTACTS_LOC       5
 #define TRIGGER_LOC            6
 
+/* Our special handling for GPIO accesses through ACPI is x86 specific */
+#if defined CONFIG_X86 && defined CONFIG_ACPI
+#define ACPI_GPIO_SUPPORT
+#endif
+
+struct goodix_ts_data;
+
+enum goodix_irq_pin_access_method {
+       IRQ_PIN_ACCESS_NONE,
+       IRQ_PIN_ACCESS_GPIO,
+       IRQ_PIN_ACCESS_ACPI_GPIO,
+       IRQ_PIN_ACCESS_ACPI_METHOD,
+};
+
+struct goodix_chip_data {
+       u16 config_addr;
+       int config_len;
+       int (*check_config)(struct goodix_ts_data *ts, const u8 *cfg, int len);
+       void (*calc_config_checksum)(struct goodix_ts_data *ts);
+};
+
+struct goodix_chip_id {
+       const char *id;
+       const struct goodix_chip_data *data;
+};
+
+#define GOODIX_ID_MAX_LEN      4
+
+struct goodix_ts_data {
+       struct i2c_client *client;
+       struct input_dev *input_dev;
+       const struct goodix_chip_data *chip;
+       struct touchscreen_properties prop;
+       unsigned int max_touch_num;
+       unsigned int int_trigger_type;
+       struct regulator *avdd28;
+       struct regulator *vddio;
+       struct gpio_desc *gpiod_int;
+       struct gpio_desc *gpiod_rst;
+       int gpio_count;
+       int gpio_int_idx;
+       char id[GOODIX_ID_MAX_LEN + 1];
+       u16 version;
+       const char *cfg_name;
+       bool reset_controller_at_probe;
+       bool load_cfg_from_disk;
+       struct completion firmware_loading_complete;
+       unsigned long irq_flags;
+       enum goodix_irq_pin_access_method irq_pin_access_method;
+       unsigned int contact_size;
+       u8 config[GOODIX_CONFIG_MAX_LENGTH];
+       unsigned short keymap[GOODIX_MAX_KEYS];
+};
+
 static int goodix_check_cfg_8(struct goodix_ts_data *ts,
-                       const struct firmware *cfg);
+                             const u8 *cfg, int len);
 static int goodix_check_cfg_16(struct goodix_ts_data *ts,
-                       const struct firmware *cfg);
+                              const u8 *cfg, int len);
+static void goodix_calc_cfg_checksum_8(struct goodix_ts_data *ts);
+static void goodix_calc_cfg_checksum_16(struct goodix_ts_data *ts);
 
 static const struct goodix_chip_data gt1x_chip_data = {
        .config_addr            = GOODIX_GT1X_REG_CONFIG_DATA,
-       .config_len             = GOODIX_CONFIG_MAX_LENGTH,
+       .config_len             = GOODIX_CONFIG_GT9X_LENGTH,
        .check_config           = goodix_check_cfg_16,
+       .calc_config_checksum   = goodix_calc_cfg_checksum_16,
 };
 
 static const struct goodix_chip_data gt911_chip_data = {
        .config_addr            = GOODIX_GT9X_REG_CONFIG_DATA,
        .config_len             = GOODIX_CONFIG_911_LENGTH,
        .check_config           = goodix_check_cfg_8,
+       .calc_config_checksum   = goodix_calc_cfg_checksum_8,
 };
 
 static const struct goodix_chip_data gt967_chip_data = {
        .config_addr            = GOODIX_GT9X_REG_CONFIG_DATA,
        .config_len             = GOODIX_CONFIG_967_LENGTH,
        .check_config           = goodix_check_cfg_8,
+       .calc_config_checksum   = goodix_calc_cfg_checksum_8,
 };
 
 static const struct goodix_chip_data gt9x_chip_data = {
        .config_addr            = GOODIX_GT9X_REG_CONFIG_DATA,
-       .config_len             = GOODIX_CONFIG_MAX_LENGTH,
+       .config_len             = GOODIX_CONFIG_GT9X_LENGTH,
        .check_config           = goodix_check_cfg_8,
+       .calc_config_checksum   = goodix_calc_cfg_checksum_8,
+};
+
+static const struct goodix_chip_id goodix_chip_ids[] = {
+       { .id = "1151", .data = &gt1x_chip_data },
+       { .id = "5663", .data = &gt1x_chip_data },
+       { .id = "5688", .data = &gt1x_chip_data },
+       { .id = "917S", .data = &gt1x_chip_data },
+
+       { .id = "911", .data = &gt911_chip_data },
+       { .id = "9271", .data = &gt911_chip_data },
+       { .id = "9110", .data = &gt911_chip_data },
+       { .id = "927", .data = &gt911_chip_data },
+       { .id = "928", .data = &gt911_chip_data },
+
+       { .id = "912", .data = &gt967_chip_data },
+       { .id = "9147", .data = &gt967_chip_data },
+       { .id = "967", .data = &gt967_chip_data },
+       { }
 };
 
 static const unsigned long goodix_irq_flags[] = {
@@ -168,6 +223,22 @@ static const struct dmi_system_id nine_bytes_report[] = {
        {}
 };
 
+/*
+ * Those tablets have their x coordinate inverted
+ */
+static const struct dmi_system_id inverted_x_screen[] = {
+#if defined(CONFIG_DMI) && defined(CONFIG_X86)
+       {
+               .ident = "Cube I15-TC",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Cube"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "I15-TC")
+               },
+       },
+#endif
+       {}
+};
+
 /**
  * goodix_i2c_read - read data from a register of the i2c slave device.
  *
@@ -235,28 +306,16 @@ static int goodix_i2c_write_u8(struct i2c_client *client, u16 reg, u8 value)
        return goodix_i2c_write(client, reg, &value, sizeof(value));
 }
 
-static const struct goodix_chip_data *goodix_get_chip_data(u16 id)
+static const struct goodix_chip_data *goodix_get_chip_data(const char *id)
 {
-       switch (id) {
-       case 1151:
-       case 5663:
-       case 5688:
-               return &gt1x_chip_data;
-
-       case 911:
-       case 9271:
-       case 9110:
-       case 927:
-       case 928:
-               return &gt911_chip_data;
-
-       case 912:
-       case 967:
-               return &gt967_chip_data;
+       unsigned int i;
 
-       default:
-               return &gt9x_chip_data;
+       for (i = 0; goodix_chip_ids[i].id; i++) {
+               if (!strcmp(goodix_chip_ids[i].id, id))
+                       return goodix_chip_ids[i].data;
        }
+
+       return &gt9x_chip_data;
 }
 
 static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
@@ -264,6 +323,13 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
        unsigned long max_timeout;
        int touch_num;
        int error;
+       u16 addr = GOODIX_READ_COOR_ADDR;
+       /*
+        * We are going to read a 1-byte header,
+        * ts->contact_size * max(1, touch_num) bytes of coordinates,
+        * and a 1-byte footer which contains the touch-key code.
+        */
+       const int header_contact_keycode_size = 1 + ts->contact_size + 1;
 
        /*
         * The 'buffer status' bit, which indicates that the data is valid, is
@@ -272,8 +338,8 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
         */
        max_timeout = jiffies + msecs_to_jiffies(GOODIX_BUFFER_STATUS_TIMEOUT);
        do {
-               error = goodix_i2c_read(ts->client, GOODIX_READ_COOR_ADDR,
-                                       data, ts->contact_size + 1);
+               error = goodix_i2c_read(ts->client, addr, data,
+                                       header_contact_keycode_size);
                if (error) {
                        dev_err(&ts->client->dev, "I2C transfer error: %d\n",
                                        error);
@@ -286,11 +352,10 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
                                return -EPROTO;
 
                        if (touch_num > 1) {
-                               data += 1 + ts->contact_size;
+                               addr += header_contact_keycode_size;
+                               data += header_contact_keycode_size;
                                error = goodix_i2c_read(ts->client,
-                                               GOODIX_READ_COOR_ADDR +
-                                                       1 + ts->contact_size,
-                                               data,
+                                               addr, data,
                                                ts->contact_size *
                                                        (touch_num - 1));
                                if (error)
@@ -307,7 +372,7 @@ static int goodix_ts_read_input_report(struct goodix_ts_data *ts, u8 *data)
         * The Goodix panel will send spurious interrupts after a
         * 'finger up' event, which will always cause a timeout.
         */
-       return 0;
+       return -ENOMSG;
 }
 
 static void goodix_ts_report_touch_8b(struct goodix_ts_data *ts, u8 *coor_data)
@@ -340,6 +405,25 @@ static void goodix_ts_report_touch_9b(struct goodix_ts_data *ts, u8 *coor_data)
        input_report_abs(ts->input_dev, ABS_MT_WIDTH_MAJOR, input_w);
 }
 
+static void goodix_ts_report_key(struct goodix_ts_data *ts, u8 *data)
+{
+       int touch_num;
+       u8 key_value;
+       int i;
+
+       if (data[0] & GOODIX_HAVE_KEY) {
+               touch_num = data[0] & 0x0f;
+               key_value = data[1 + ts->contact_size * touch_num];
+               for (i = 0; i < GOODIX_MAX_KEYS; i++)
+                       if (key_value & BIT(i))
+                               input_report_key(ts->input_dev,
+                                                ts->keymap[i], 1);
+       } else {
+               for (i = 0; i < GOODIX_MAX_KEYS; i++)
+                       input_report_key(ts->input_dev, ts->keymap[i], 0);
+       }
+}
+
 /**
  * goodix_process_events - Process incoming events
  *
@@ -350,7 +434,7 @@ static void goodix_ts_report_touch_9b(struct goodix_ts_data *ts, u8 *coor_data)
  */
 static void goodix_process_events(struct goodix_ts_data *ts)
 {
-       u8  point_data[1 + GOODIX_MAX_CONTACT_SIZE * GOODIX_MAX_CONTACTS];
+       u8  point_data[2 + GOODIX_MAX_CONTACT_SIZE * GOODIX_MAX_CONTACTS];
        int touch_num;
        int i;
 
@@ -358,11 +442,7 @@ static void goodix_process_events(struct goodix_ts_data *ts)
        if (touch_num < 0)
                return;
 
-       /*
-        * Bit 4 of the first byte reports the status of the capacitive
-        * Windows/Home button.
-        */
-       input_report_key(ts->input_dev, KEY_LEFTMETA, point_data[0] & BIT(4));
+       goodix_ts_report_key(ts, point_data);
 
        for (i = 0; i < touch_num; i++)
                if (ts->contact_size == 9)
@@ -406,22 +486,21 @@ static int goodix_request_irq(struct goodix_ts_data *ts)
                                         ts->irq_flags, ts->client->name, ts);
 }
 
-static int goodix_check_cfg_8(struct goodix_ts_data *ts,
-                       const struct firmware *cfg)
+static int goodix_check_cfg_8(struct goodix_ts_data *ts, const u8 *cfg, int len)
 {
-       int i, raw_cfg_len = cfg->size - 2;
+       int i, raw_cfg_len = len - 2;
        u8 check_sum = 0;
 
        for (i = 0; i < raw_cfg_len; i++)
-               check_sum += cfg->data[i];
+               check_sum += cfg[i];
        check_sum = (~check_sum) + 1;
-       if (check_sum != cfg->data[raw_cfg_len]) {
+       if (check_sum != cfg[raw_cfg_len]) {
                dev_err(&ts->client->dev,
                        "The checksum of the config fw is not correct");
                return -EINVAL;
        }
 
-       if (cfg->data[raw_cfg_len + 1] != 1) {
+       if (cfg[raw_cfg_len + 1] != 1) {
                dev_err(&ts->client->dev,
                        "Config fw must have Config_Fresh register set");
                return -EINVAL;
@@ -430,22 +509,35 @@ static int goodix_check_cfg_8(struct goodix_ts_data *ts,
        return 0;
 }
 
-static int goodix_check_cfg_16(struct goodix_ts_data *ts,
-                       const struct firmware *cfg)
+static void goodix_calc_cfg_checksum_8(struct goodix_ts_data *ts)
 {
-       int i, raw_cfg_len = cfg->size - 3;
+       int i, raw_cfg_len = ts->chip->config_len - 2;
+       u8 check_sum = 0;
+
+       for (i = 0; i < raw_cfg_len; i++)
+               check_sum += ts->config[i];
+       check_sum = (~check_sum) + 1;
+
+       ts->config[raw_cfg_len] = check_sum;
+       ts->config[raw_cfg_len + 1] = 1; /* Set "config_fresh" bit */
+}
+
+static int goodix_check_cfg_16(struct goodix_ts_data *ts, const u8 *cfg,
+                              int len)
+{
+       int i, raw_cfg_len = len - 3;
        u16 check_sum = 0;
 
        for (i = 0; i < raw_cfg_len; i += 2)
-               check_sum += get_unaligned_be16(&cfg->data[i]);
+               check_sum += get_unaligned_be16(&cfg[i]);
        check_sum = (~check_sum) + 1;
-       if (check_sum != get_unaligned_be16(&cfg->data[raw_cfg_len])) {
+       if (check_sum != get_unaligned_be16(&cfg[raw_cfg_len])) {
                dev_err(&ts->client->dev,
                        "The checksum of the config fw is not correct");
                return -EINVAL;
        }
 
-       if (cfg->data[raw_cfg_len + 2] != 1) {
+       if (cfg[raw_cfg_len + 2] != 1) {
                dev_err(&ts->client->dev,
                        "Config fw must have Config_Fresh register set");
                return -EINVAL;
@@ -454,22 +546,35 @@ static int goodix_check_cfg_16(struct goodix_ts_data *ts,
        return 0;
 }
 
+static void goodix_calc_cfg_checksum_16(struct goodix_ts_data *ts)
+{
+       int i, raw_cfg_len = ts->chip->config_len - 3;
+       u16 check_sum = 0;
+
+       for (i = 0; i < raw_cfg_len; i += 2)
+               check_sum += get_unaligned_be16(&ts->config[i]);
+       check_sum = (~check_sum) + 1;
+
+       put_unaligned_be16(check_sum, &ts->config[raw_cfg_len]);
+       ts->config[raw_cfg_len + 2] = 1; /* Set "config_fresh" bit */
+}
+
 /**
  * goodix_check_cfg - Checks if config fw is valid
  *
  * @ts: goodix_ts_data pointer
  * @cfg: firmware config data
  */
-static int goodix_check_cfg(struct goodix_ts_data *ts,
-                           const struct firmware *cfg)
+static int goodix_check_cfg(struct goodix_ts_data *ts, const u8 *cfg, int len)
 {
-       if (cfg->size > GOODIX_CONFIG_MAX_LENGTH) {
+       if (len < GOODIX_CONFIG_MIN_LENGTH ||
+           len > GOODIX_CONFIG_MAX_LENGTH) {
                dev_err(&ts->client->dev,
                        "The length of the config fw is not correct");
                return -EINVAL;
        }
 
-       return ts->chip->check_config(ts, cfg);
+       return ts->chip->check_config(ts, cfg, len);
 }
 
 /**
@@ -478,17 +583,15 @@ static int goodix_check_cfg(struct goodix_ts_data *ts,
  * @ts: goodix_ts_data pointer
  * @cfg: config firmware to write to device
  */
-static int goodix_send_cfg(struct goodix_ts_data *ts,
-                          const struct firmware *cfg)
+static int goodix_send_cfg(struct goodix_ts_data *ts, const u8 *cfg, int len)
 {
        int error;
 
-       error = goodix_check_cfg(ts, cfg);
+       error = goodix_check_cfg(ts, cfg, len);
        if (error)
                return error;
 
-       error = goodix_i2c_write(ts->client, ts->chip->config_addr, cfg->data,
-                                cfg->size);
+       error = goodix_i2c_write(ts->client, ts->chip->config_addr, cfg, len);
        if (error) {
                dev_err(&ts->client->dev, "Failed to write config data: %d",
                        error);
@@ -502,17 +605,93 @@ static int goodix_send_cfg(struct goodix_ts_data *ts,
        return 0;
 }
 
+#ifdef ACPI_GPIO_SUPPORT
+static int goodix_pin_acpi_direction_input(struct goodix_ts_data *ts)
+{
+       acpi_handle handle = ACPI_HANDLE(&ts->client->dev);
+       acpi_status status;
+
+       status = acpi_evaluate_object(handle, "INTI", NULL, NULL);
+       return ACPI_SUCCESS(status) ? 0 : -EIO;
+}
+
+static int goodix_pin_acpi_output_method(struct goodix_ts_data *ts, int value)
+{
+       acpi_handle handle = ACPI_HANDLE(&ts->client->dev);
+       acpi_status status;
+
+       status = acpi_execute_simple_method(handle, "INTO", value);
+       return ACPI_SUCCESS(status) ? 0 : -EIO;
+}
+#else
+static int goodix_pin_acpi_direction_input(struct goodix_ts_data *ts)
+{
+       dev_err(&ts->client->dev,
+               "%s called on device without ACPI support\n", __func__);
+       return -EINVAL;
+}
+
+static int goodix_pin_acpi_output_method(struct goodix_ts_data *ts, int value)
+{
+       dev_err(&ts->client->dev,
+               "%s called on device without ACPI support\n", __func__);
+       return -EINVAL;
+}
+#endif
+
+static int goodix_irq_direction_output(struct goodix_ts_data *ts, int value)
+{
+       switch (ts->irq_pin_access_method) {
+       case IRQ_PIN_ACCESS_NONE:
+               dev_err(&ts->client->dev,
+                       "%s called without an irq_pin_access_method set\n",
+                       __func__);
+               return -EINVAL;
+       case IRQ_PIN_ACCESS_GPIO:
+               return gpiod_direction_output(ts->gpiod_int, value);
+       case IRQ_PIN_ACCESS_ACPI_GPIO:
+               /*
+                * The IRQ pin triggers on a falling edge, so it gets marked
+                * as active-low, use output_raw to avoid the value inversion.
+                */
+               return gpiod_direction_output_raw(ts->gpiod_int, value);
+       case IRQ_PIN_ACCESS_ACPI_METHOD:
+               return goodix_pin_acpi_output_method(ts, value);
+       }
+
+       return -EINVAL; /* Never reached */
+}
+
+static int goodix_irq_direction_input(struct goodix_ts_data *ts)
+{
+       switch (ts->irq_pin_access_method) {
+       case IRQ_PIN_ACCESS_NONE:
+               dev_err(&ts->client->dev,
+                       "%s called without an irq_pin_access_method set\n",
+                       __func__);
+               return -EINVAL;
+       case IRQ_PIN_ACCESS_GPIO:
+               return gpiod_direction_input(ts->gpiod_int);
+       case IRQ_PIN_ACCESS_ACPI_GPIO:
+               return gpiod_direction_input(ts->gpiod_int);
+       case IRQ_PIN_ACCESS_ACPI_METHOD:
+               return goodix_pin_acpi_direction_input(ts);
+       }
+
+       return -EINVAL; /* Never reached */
+}
+
 static int goodix_int_sync(struct goodix_ts_data *ts)
 {
        int error;
 
-       error = gpiod_direction_output(ts->gpiod_int, 0);
+       error = goodix_irq_direction_output(ts, 0);
        if (error)
                return error;
 
        msleep(50);                             /* T5: 50ms */
 
-       error = gpiod_direction_input(ts->gpiod_int);
+       error = goodix_irq_direction_input(ts);
        if (error)
                return error;
 
@@ -536,7 +715,7 @@ static int goodix_reset(struct goodix_ts_data *ts)
        msleep(20);                             /* T2: > 10ms */
 
        /* HIGH: 0x28/0x29, LOW: 0xBA/0xBB */
-       error = gpiod_direction_output(ts->gpiod_int, ts->client->addr == 0x14);
+       error = goodix_irq_direction_output(ts, ts->client->addr == 0x14);
        if (error)
                return error;
 
@@ -560,6 +739,124 @@ static int goodix_reset(struct goodix_ts_data *ts)
        return 0;
 }
 
+#ifdef ACPI_GPIO_SUPPORT
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+static const struct x86_cpu_id baytrail_cpu_ids[] = {
+       { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, X86_FEATURE_ANY, },
+       {}
+};
+
+static inline bool is_byt(void)
+{
+       const struct x86_cpu_id *id = x86_match_cpu(baytrail_cpu_ids);
+
+       return !!id;
+}
+
+static const struct acpi_gpio_params first_gpio = { 0, 0, false };
+static const struct acpi_gpio_params second_gpio = { 1, 0, false };
+
+static const struct acpi_gpio_mapping acpi_goodix_int_first_gpios[] = {
+       { GOODIX_GPIO_INT_NAME "-gpios", &first_gpio, 1 },
+       { GOODIX_GPIO_RST_NAME "-gpios", &second_gpio, 1 },
+       { },
+};
+
+static const struct acpi_gpio_mapping acpi_goodix_int_last_gpios[] = {
+       { GOODIX_GPIO_RST_NAME "-gpios", &first_gpio, 1 },
+       { GOODIX_GPIO_INT_NAME "-gpios", &second_gpio, 1 },
+       { },
+};
+
+static const struct acpi_gpio_mapping acpi_goodix_reset_only_gpios[] = {
+       { GOODIX_GPIO_RST_NAME "-gpios", &first_gpio, 1 },
+       { },
+};
+
+static int goodix_resource(struct acpi_resource *ares, void *data)
+{
+       struct goodix_ts_data *ts = data;
+       struct device *dev = &ts->client->dev;
+       struct acpi_resource_gpio *gpio;
+
+       switch (ares->type) {
+       case ACPI_RESOURCE_TYPE_GPIO:
+               gpio = &ares->data.gpio;
+               if (gpio->connection_type == ACPI_RESOURCE_GPIO_TYPE_INT) {
+                       if (ts->gpio_int_idx == -1) {
+                               ts->gpio_int_idx = ts->gpio_count;
+                       } else {
+                               dev_err(dev, "More than one GpioInt resource, ignoring ACPI GPIO resources\n");
+                               ts->gpio_int_idx = -2;
+                       }
+               }
+               ts->gpio_count++;
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * This function gets called if we fail to get the irq GPIO directly
+ * because the ACPI tables lack GPIO-name to ACPI _CRS index mappings
+ * (no _DSD UUID daffd814-6eba-4d8c-8a91-bc9bbf4aa301 data).
+ * In that case we add our own mapping and then goodix_get_gpio_config()
+ * retries to get the GPIOs based on the added mapping.
+ */
+static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts)
+{
+       const struct acpi_gpio_mapping *gpio_mapping = NULL;
+       struct device *dev = &ts->client->dev;
+       LIST_HEAD(resources);
+       int ret;
+
+       ts->gpio_count = 0;
+       ts->gpio_int_idx = -1;
+       ret = acpi_dev_get_resources(ACPI_COMPANION(dev), &resources,
+                                    goodix_resource, ts);
+       if (ret < 0) {
+               dev_err(dev, "Error getting ACPI resources: %d\n", ret);
+               return ret;
+       }
+
+       acpi_dev_free_resource_list(&resources);
+
+       if (ts->gpio_count == 2 && ts->gpio_int_idx == 0) {
+               ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO;
+               gpio_mapping = acpi_goodix_int_first_gpios;
+       } else if (ts->gpio_count == 2 && ts->gpio_int_idx == 1) {
+               ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO;
+               gpio_mapping = acpi_goodix_int_last_gpios;
+       } else if (ts->gpio_count == 1 && ts->gpio_int_idx == -1 &&
+                  acpi_has_method(ACPI_HANDLE(dev), "INTI") &&
+                  acpi_has_method(ACPI_HANDLE(dev), "INTO")) {
+               dev_info(dev, "Using ACPI INTI and INTO methods for IRQ pin access\n");
+               ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_METHOD;
+               gpio_mapping = acpi_goodix_reset_only_gpios;
+       } else if (is_byt() && ts->gpio_count == 2 && ts->gpio_int_idx == -1) {
+               dev_info(dev, "No ACPI GpioInt resource, assuming that the GPIO order is reset, int\n");
+               ts->irq_pin_access_method = IRQ_PIN_ACCESS_ACPI_GPIO;
+               gpio_mapping = acpi_goodix_int_last_gpios;
+       } else {
+               dev_warn(dev, "Unexpected ACPI resources: gpio_count %d, gpio_int_idx %d\n",
+                        ts->gpio_count, ts->gpio_int_idx);
+               return -EINVAL;
+       }
+
+       return devm_acpi_dev_add_driver_gpios(dev, gpio_mapping);
+}
+#else
+static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts)
+{
+       return -EINVAL;
+}
+#endif /* CONFIG_X86 && CONFIG_ACPI */
+
 /**
  * goodix_get_gpio_config - Get GPIO config from ACPI/DT
  *
@@ -570,6 +867,7 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
        int error;
        struct device *dev;
        struct gpio_desc *gpiod;
+       bool added_acpi_mappings = false;
 
        if (!ts->client)
                return -EINVAL;
@@ -593,6 +891,7 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
                return error;
        }
 
+retry_get_irq_gpio:
        /* Get the interrupt GPIO pin number */
        gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_INT_NAME, GPIOD_IN);
        if (IS_ERR(gpiod)) {
@@ -602,6 +901,11 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
                                GOODIX_GPIO_INT_NAME, error);
                return error;
        }
+       if (!gpiod && has_acpi_companion(dev) && !added_acpi_mappings) {
+               added_acpi_mappings = true;
+               if (goodix_add_acpi_gpio_mappings(ts) == 0)
+                       goto retry_get_irq_gpio;
+       }
 
        ts->gpiod_int = gpiod;
 
@@ -617,6 +921,31 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
 
        ts->gpiod_rst = gpiod;
 
+       switch (ts->irq_pin_access_method) {
+       case IRQ_PIN_ACCESS_ACPI_GPIO:
+               /*
+                * We end up here if goodix_add_acpi_gpio_mappings() has
+                * called devm_acpi_dev_add_driver_gpios() because the ACPI
+                * tables did not contain name to index mappings.
+                * Check that we successfully got both GPIOs after we've
+                * added our own acpi_gpio_mapping; if we did not get both
+                * GPIOs, reset irq_pin_access_method to IRQ_PIN_ACCESS_NONE.
+                */
+               if (!ts->gpiod_int || !ts->gpiod_rst)
+                       ts->irq_pin_access_method = IRQ_PIN_ACCESS_NONE;
+               break;
+       case IRQ_PIN_ACCESS_ACPI_METHOD:
+               if (!ts->gpiod_rst)
+                       ts->irq_pin_access_method = IRQ_PIN_ACCESS_NONE;
+               break;
+       default:
+               if (ts->gpiod_int && ts->gpiod_rst) {
+                       ts->reset_controller_at_probe = true;
+                       ts->load_cfg_from_disk = true;
+                       ts->irq_pin_access_method = IRQ_PIN_ACCESS_GPIO;
+               }
+       }
+
        return 0;
 }
 
@@ -629,12 +958,11 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
  */
 static void goodix_read_config(struct goodix_ts_data *ts)
 {
-       u8 config[GOODIX_CONFIG_MAX_LENGTH];
        int x_max, y_max;
        int error;
 
        error = goodix_i2c_read(ts->client, ts->chip->config_addr,
-                               config, ts->chip->config_len);
+                               ts->config, ts->chip->config_len);
        if (error) {
                dev_warn(&ts->client->dev, "Error reading config: %d\n",
                         error);
@@ -643,15 +971,17 @@ static void goodix_read_config(struct goodix_ts_data *ts)
                return;
        }
 
-       ts->int_trigger_type = config[TRIGGER_LOC] & 0x03;
-       ts->max_touch_num = config[MAX_CONTACTS_LOC] & 0x0f;
+       ts->int_trigger_type = ts->config[TRIGGER_LOC] & 0x03;
+       ts->max_touch_num = ts->config[MAX_CONTACTS_LOC] & 0x0f;
 
-       x_max = get_unaligned_le16(&config[RESOLUTION_LOC]);
-       y_max = get_unaligned_le16(&config[RESOLUTION_LOC + 2]);
+       x_max = get_unaligned_le16(&ts->config[RESOLUTION_LOC]);
+       y_max = get_unaligned_le16(&ts->config[RESOLUTION_LOC + 2]);
        if (x_max && y_max) {
                input_abs_set_max(ts->input_dev, ABS_MT_POSITION_X, x_max - 1);
                input_abs_set_max(ts->input_dev, ABS_MT_POSITION_Y, y_max - 1);
        }
+
+       ts->chip->calc_config_checksum(ts);
 }
 
 /**
@@ -663,7 +993,7 @@ static int goodix_read_version(struct goodix_ts_data *ts)
 {
        int error;
        u8 buf[6];
-       char id_str[5];
+       char id_str[GOODIX_ID_MAX_LEN + 1];
 
        error = goodix_i2c_read(ts->client, GOODIX_REG_ID, buf, sizeof(buf));
        if (error) {
@@ -671,14 +1001,13 @@ static int goodix_read_version(struct goodix_ts_data *ts)
                return error;
        }
 
-       memcpy(id_str, buf, 4);
-       id_str[4] = 0;
-       if (kstrtou16(id_str, 10, &ts->id))
-               ts->id = 0x1001;
+       memcpy(id_str, buf, GOODIX_ID_MAX_LEN);
+       id_str[GOODIX_ID_MAX_LEN] = 0;
+       strscpy(ts->id, id_str, GOODIX_ID_MAX_LEN + 1);
 
        ts->version = get_unaligned_le16(&buf[4]);
 
-       dev_info(&ts->client->dev, "ID %d, version: %04x\n", ts->id,
+       dev_info(&ts->client->dev, "ID %s, version: %04x\n", ts->id,
                 ts->version);
 
        return 0;
@@ -722,6 +1051,7 @@ static int goodix_i2c_test(struct i2c_client *client)
 static int goodix_configure_dev(struct goodix_ts_data *ts)
 {
        int error;
+       int i;
 
        ts->int_trigger_type = GOODIX_INT_TRIGGER;
        ts->max_touch_num = GOODIX_MAX_CONTACTS;
@@ -736,11 +1066,23 @@ static int goodix_configure_dev(struct goodix_ts_data *ts)
        ts->input_dev->phys = "input/ts";
        ts->input_dev->id.bustype = BUS_I2C;
        ts->input_dev->id.vendor = 0x0416;
-       ts->input_dev->id.product = ts->id;
+       if (kstrtou16(ts->id, 10, &ts->input_dev->id.product))
+               ts->input_dev->id.product = 0x1001;
        ts->input_dev->id.version = ts->version;
 
+       ts->input_dev->keycode = ts->keymap;
+       ts->input_dev->keycodesize = sizeof(ts->keymap[0]);
+       ts->input_dev->keycodemax = GOODIX_MAX_KEYS;
+
        /* Capacitive Windows/Home button on some devices */
-       input_set_capability(ts->input_dev, EV_KEY, KEY_LEFTMETA);
+       for (i = 0; i < GOODIX_MAX_KEYS; ++i) {
+               if (i == 0)
+                       ts->keymap[i] = KEY_LEFTMETA;
+               else
+                       ts->keymap[i] = KEY_F1 + (i - 1);
+
+               input_set_capability(ts->input_dev, EV_KEY, ts->keymap[i]);
+       }
 
        input_set_capability(ts->input_dev, EV_ABS, ABS_MT_POSITION_X);
        input_set_capability(ts->input_dev, EV_ABS, ABS_MT_POSITION_Y);
@@ -780,6 +1122,12 @@ static int goodix_configure_dev(struct goodix_ts_data *ts)
                        "Non-standard 9-bytes report format quirk\n");
        }
 
+       if (dmi_check_system(inverted_x_screen)) {
+               ts->prop.invert_x = true;
+               dev_dbg(&ts->client->dev,
+                       "Applying 'inverted x screen' quirk\n");
+       }
+
        error = input_mt_init_slots(ts->input_dev, ts->max_touch_num,
                                    INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED);
        if (error) {
@@ -820,7 +1168,7 @@ static void goodix_config_cb(const struct firmware *cfg, void *ctx)
 
        if (cfg) {
                /* send device configuration to the firmware */
-               error = goodix_send_cfg(ts, cfg);
+               error = goodix_send_cfg(ts, cfg->data, cfg->size);
                if (error)
                        goto err_release_cfg;
        }
@@ -889,7 +1237,8 @@ static int goodix_ts_probe(struct i2c_client *client,
        if (error)
                return error;
 
-       if (ts->gpiod_int && ts->gpiod_rst) {
+reset:
+       if (ts->reset_controller_at_probe) {
                /* reset the controller */
                error = goodix_reset(ts);
                if (error) {
@@ -900,6 +1249,12 @@ static int goodix_ts_probe(struct i2c_client *client,
 
        error = goodix_i2c_test(client);
        if (error) {
+               if (!ts->reset_controller_at_probe &&
+                   ts->irq_pin_access_method != IRQ_PIN_ACCESS_NONE) {
+                       /* Retry after a controller reset */
+                       ts->reset_controller_at_probe = true;
+                       goto reset;
+               }
                dev_err(&client->dev, "I2C communication failure: %d\n", error);
                return error;
        }
@@ -912,10 +1267,10 @@ static int goodix_ts_probe(struct i2c_client *client,
 
        ts->chip = goodix_get_chip_data(ts->id);
 
-       if (ts->gpiod_int && ts->gpiod_rst) {
+       if (ts->load_cfg_from_disk) {
                /* update device config */
                ts->cfg_name = devm_kasprintf(&client->dev, GFP_KERNEL,
-                                             "goodix_%d_cfg.bin", ts->id);
+                                             "goodix_%s_cfg.bin", ts->id);
                if (!ts->cfg_name)
                        return -ENOMEM;
 
@@ -943,7 +1298,7 @@ static int goodix_ts_remove(struct i2c_client *client)
 {
        struct goodix_ts_data *ts = i2c_get_clientdata(client);
 
-       if (ts->gpiod_int && ts->gpiod_rst)
+       if (ts->load_cfg_from_disk)
                wait_for_completion(&ts->firmware_loading_complete);
 
        return 0;
@@ -955,19 +1310,20 @@ static int __maybe_unused goodix_suspend(struct device *dev)
        struct goodix_ts_data *ts = i2c_get_clientdata(client);
        int error;
 
+       if (ts->load_cfg_from_disk)
+               wait_for_completion(&ts->firmware_loading_complete);
+
        /* We need gpio pins to suspend/resume */
-       if (!ts->gpiod_int || !ts->gpiod_rst) {
+       if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_NONE) {
                disable_irq(client->irq);
                return 0;
        }
 
-       wait_for_completion(&ts->firmware_loading_complete);
-
        /* Free IRQ as IRQ pin is used as output in the suspend sequence */
        goodix_free_irq(ts);
 
        /* Output LOW on the INT pin for 5 ms */
-       error = gpiod_direction_output(ts->gpiod_int, 0);
+       error = goodix_irq_direction_output(ts, 0);
        if (error) {
                goodix_request_irq(ts);
                return error;
@@ -979,7 +1335,7 @@ static int __maybe_unused goodix_suspend(struct device *dev)
                                    GOODIX_CMD_SCREEN_OFF);
        if (error) {
                dev_err(&ts->client->dev, "Screen off command failed\n");
-               gpiod_direction_input(ts->gpiod_int);
+               goodix_irq_direction_input(ts);
                goodix_request_irq(ts);
                return -EAGAIN;
        }
@@ -997,9 +1353,10 @@ static int __maybe_unused goodix_resume(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
        struct goodix_ts_data *ts = i2c_get_clientdata(client);
+       u8 config_ver;
        int error;
 
-       if (!ts->gpiod_int || !ts->gpiod_rst) {
+       if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_NONE) {
                enable_irq(client->irq);
                return 0;
        }
@@ -1008,7 +1365,7 @@ static int __maybe_unused goodix_resume(struct device *dev)
         * Exit sleep mode by outputting HIGH level to INT pin
         * for 2ms~5ms.
         */
-       error = gpiod_direction_output(ts->gpiod_int, 1);
+       error = goodix_irq_direction_output(ts, 1);
        if (error)
                return error;
 
@@ -1018,6 +1375,27 @@ static int __maybe_unused goodix_resume(struct device *dev)
        if (error)
                return error;
 
+       error = goodix_i2c_read(ts->client, ts->chip->config_addr,
+                               &config_ver, 1);
+       if (error)
+               dev_warn(dev, "Error reading config version: %d, resetting controller\n",
+                        error);
+       else if (config_ver != ts->config[0])
+               dev_info(dev, "Config version mismatch %d != %d, resetting controller\n",
+                        config_ver, ts->config[0]);
+
+       if (error != 0 || config_ver != ts->config[0]) {
+               error = goodix_reset(ts);
+               if (error) {
+                       dev_err(dev, "Controller reset failed.\n");
+                       return error;
+               }
+
+               error = goodix_send_cfg(ts, ts->config, ts->chip->config_len);
+               if (error)
+                       return error;
+       }
+
        error = goodix_request_irq(ts);
        if (error)
                return error;
@@ -1050,6 +1428,8 @@ static const struct of_device_id goodix_of_match[] = {
        { .compatible = "goodix,gt911" },
        { .compatible = "goodix,gt9110" },
        { .compatible = "goodix,gt912" },
+       { .compatible = "goodix,gt9147" },
+       { .compatible = "goodix,gt917s" },
        { .compatible = "goodix,gt927" },
        { .compatible = "goodix,gt9271" },
        { .compatible = "goodix,gt928" },
index e16ec4c..97342e1 100644
@@ -66,7 +66,7 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
 {
        struct device *dev = input->dev.parent;
        struct input_absinfo *absinfo;
-       unsigned int axis;
+       unsigned int axis, axis_x, axis_y;
        unsigned int minimum, maximum, fuzz;
        bool data_present;
 
@@ -74,33 +74,34 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
        if (!input->absinfo)
                return;
 
-       axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
+       axis_x = multitouch ? ABS_MT_POSITION_X : ABS_X;
+       axis_y = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
+
        data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-x",
-                                               input_abs_get_min(input, axis),
+                                               input_abs_get_min(input, axis_x),
                                                &minimum) |
                       touchscreen_get_prop_u32(dev, "touchscreen-size-x",
                                                input_abs_get_max(input,
-                                                                 axis) + 1,
+                                                                 axis_x) + 1,
                                                &maximum) |
                       touchscreen_get_prop_u32(dev, "touchscreen-fuzz-x",
-                                               input_abs_get_fuzz(input, axis),
+                                               input_abs_get_fuzz(input, axis_x),
                                                &fuzz);
        if (data_present)
-               touchscreen_set_params(input, axis, minimum, maximum - 1, fuzz);
+               touchscreen_set_params(input, axis_x, minimum, maximum - 1, fuzz);
 
-       axis = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
        data_present = touchscreen_get_prop_u32(dev, "touchscreen-min-y",
-                                               input_abs_get_min(input, axis),
+                                               input_abs_get_min(input, axis_y),
                                                &minimum) |
                       touchscreen_get_prop_u32(dev, "touchscreen-size-y",
                                                input_abs_get_max(input,
-                                                                 axis) + 1,
+                                                                 axis_y) + 1,
                                                &maximum) |
                       touchscreen_get_prop_u32(dev, "touchscreen-fuzz-y",
-                                               input_abs_get_fuzz(input, axis),
+                                               input_abs_get_fuzz(input, axis_y),
                                                &fuzz);
        if (data_present)
-               touchscreen_set_params(input, axis, minimum, maximum - 1, fuzz);
+               touchscreen_set_params(input, axis_y, minimum, maximum - 1, fuzz);
 
        axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
        data_present = touchscreen_get_prop_u32(dev,
@@ -117,15 +118,13 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
        if (!prop)
                return;
 
-       axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
-
-       prop->max_x = input_abs_get_max(input, axis);
-       prop->max_y = input_abs_get_max(input, axis + 1);
+       prop->max_x = input_abs_get_max(input, axis_x);
+       prop->max_y = input_abs_get_max(input, axis_y);
 
        prop->invert_x =
                device_property_read_bool(dev, "touchscreen-inverted-x");
        if (prop->invert_x) {
-               absinfo = &input->absinfo[axis];
+               absinfo = &input->absinfo[axis_x];
                absinfo->maximum -= absinfo->minimum;
                absinfo->minimum = 0;
        }
@@ -133,7 +132,7 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
        prop->invert_y =
                device_property_read_bool(dev, "touchscreen-inverted-y");
        if (prop->invert_y) {
-               absinfo = &input->absinfo[axis + 1];
+               absinfo = &input->absinfo[axis_y];
                absinfo->maximum -= absinfo->minimum;
                absinfo->minimum = 0;
        }
@@ -141,7 +140,7 @@ void touchscreen_parse_properties(struct input_dev *input, bool multitouch,
        prop->swap_x_y =
                device_property_read_bool(dev, "touchscreen-swapped-x-y");
        if (prop->swap_x_y)
-               swap(input->absinfo[axis], input->absinfo[axis + 1]);
+               swap(input->absinfo[axis_x], input->absinfo[axis_y]);
 }
 EXPORT_SYMBOL(touchscreen_parse_properties);
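
For reference, a minimal userspace sketch (not the helper itself) of how a driver consumes the parsed invert/swap properties when reporting a point; applying the inversion before the axis swap is an assumption here.

#include <stdio.h>

struct ts_props {
        unsigned int max_x, max_y;
        int invert_x, invert_y, swap_x_y;
};

static void apply_props(const struct ts_props *p, unsigned int *x, unsigned int *y)
{
        unsigned int tmp;

        if (p->invert_x)
                *x = p->max_x - *x;
        if (p->invert_y)
                *y = p->max_y - *y;
        if (p->swap_x_y) {
                tmp = *x;
                *x = *y;
                *y = tmp;
        }
}

int main(void)
{
        /* e.g. touchscreen-size-x = 1280, touchscreen-inverted-x, touchscreen-swapped-x-y */
        struct ts_props p = { .max_x = 1279, .max_y = 799, .invert_x = 1, .swap_x_y = 1 };
        unsigned int x = 100, y = 200;

        apply_props(&p, &x, &y);
        printf("reported: %u,%u\n", x, y);      /* 200,1179 with these flags */
        return 0;
}
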
 
index d2fade9..58b4a4d 100644
@@ -188,6 +188,7 @@ config INTEL_IOMMU
        select NEED_DMA_MAP_STATE
        select DMAR_TABLE
        select SWIOTLB
+       select IOASID
        help
          DMA remapping (DMAR) devices support enables independent address
          translations for Direct Memory Access (DMA) from devices.
@@ -273,7 +274,7 @@ config IRQ_REMAP
 # OMAP IOMMU support
 config OMAP_IOMMU
        bool "OMAP IOMMU Support"
-       depends on ARM && MMU
+       depends on ARM && MMU || (COMPILE_TEST && (ARM || ARM64 || IA64 || SPARC))
        depends on ARCH_OMAP2PLUS || COMPILE_TEST
        select IOMMU_API
        ---help---
@@ -291,7 +292,7 @@ config OMAP_IOMMU_DEBUG
 
 config ROCKCHIP_IOMMU
        bool "Rockchip IOMMU Support"
-       depends on ARM || ARM64
+       depends on ARM || ARM64 || (COMPILE_TEST && (ARM64 || IA64 || SPARC))
        depends on ARCH_ROCKCHIP || COMPILE_TEST
        select IOMMU_API
        select ARM_DMA_USE_IOMMU
@@ -325,7 +326,7 @@ config TEGRA_IOMMU_SMMU
 
 config EXYNOS_IOMMU
        bool "Exynos IOMMU Support"
-       depends on ARCH_EXYNOS && MMU
+       depends on ARCH_EXYNOS && MMU || (COMPILE_TEST && (ARM || ARM64 || IA64 || SPARC))
        depends on !CPU_BIG_ENDIAN # revisit driver if we can enable big-endian ptes
        select IOMMU_API
        select ARM_DMA_USE_IOMMU
@@ -361,7 +362,7 @@ config IPMMU_VMSA
 
 config SPAPR_TCE_IOMMU
        bool "sPAPR TCE IOMMU Support"
-       depends on PPC_POWERNV || PPC_PSERIES
+       depends on PPC_POWERNV || PPC_PSERIES || (PPC && COMPILE_TEST)
        select IOMMU_API
        help
          Enables bits of IOMMU API required by VFIO. The iommu_ops
@@ -370,7 +371,7 @@ config SPAPR_TCE_IOMMU
 # ARM IOMMU support
 config ARM_SMMU
        tristate "ARM Ltd. System MMU (SMMU) Support"
-       depends on (ARM64 || ARM) && MMU
+       depends on (ARM64 || ARM || (COMPILE_TEST && !GENERIC_ATOMIC64)) && MMU
        select IOMMU_API
        select IOMMU_IO_PGTABLE_LPAE
        select ARM_DMA_USE_IOMMU if ARM
@@ -440,7 +441,7 @@ config S390_IOMMU
 
 config S390_CCW_IOMMU
        bool "S390 CCW IOMMU Support"
-       depends on S390 && CCW
+       depends on S390 && CCW || COMPILE_TEST
        select IOMMU_API
        help
          Enables bits of IOMMU API required by VFIO. The iommu_ops
@@ -448,7 +449,7 @@ config S390_CCW_IOMMU
 
 config S390_AP_IOMMU
        bool "S390 AP IOMMU Support"
-       depends on S390 && ZCRYPT
+       depends on S390 && ZCRYPT || COMPILE_TEST
        select IOMMU_API
        help
          Enables bits of IOMMU API required by VFIO. The iommu_ops
@@ -456,7 +457,7 @@ config S390_AP_IOMMU
 
 config MTK_IOMMU
        bool "MTK IOMMU Support"
-       depends on ARM || ARM64
+       depends on ARM || ARM64 || COMPILE_TEST
        depends on ARCH_MEDIATEK || COMPILE_TEST
        select ARM_DMA_USE_IOMMU
        select IOMMU_API
@@ -506,8 +507,8 @@ config HYPERV_IOMMU
          guests to run with x2APIC mode enabled.
 
 config VIRTIO_IOMMU
-       bool "Virtio IOMMU driver"
-       depends on VIRTIO=y
+       tristate "Virtio IOMMU driver"
+       depends on VIRTIO
        depends on ARM64
        select IOMMU_API
        select INTERVAL_TREE
index f8d01d6..ca8c452 100644
 
 #define DTE_GCR3_VAL_A(x)      (((x) >> 12) & 0x00007ULL)
 #define DTE_GCR3_VAL_B(x)      (((x) >> 15) & 0x0ffffULL)
-#define DTE_GCR3_VAL_C(x)      (((x) >> 31) & 0xfffffULL)
+#define DTE_GCR3_VAL_C(x)      (((x) >> 31) & 0x1fffffULL)
 
 #define DTE_GCR3_INDEX_A       0
 #define DTE_GCR3_INDEX_B       1
index aa3ac2a..8250873 100644
@@ -69,6 +69,9 @@
 #define IDR1_SSIDSIZE                  GENMASK(10, 6)
 #define IDR1_SIDSIZE                   GENMASK(5, 0)
 
+#define ARM_SMMU_IDR3                  0xc
+#define IDR3_RIL                       (1 << 10)
+
 #define ARM_SMMU_IDR5                  0x14
 #define IDR5_STALL_MAX                 GENMASK(31, 16)
 #define IDR5_GRAN64K                   (1 << 6)
 #define CMDQ_CFGI_1_LEAF               (1UL << 0)
 #define CMDQ_CFGI_1_RANGE              GENMASK_ULL(4, 0)
 
+#define CMDQ_TLBI_0_NUM                        GENMASK_ULL(16, 12)
+#define CMDQ_TLBI_RANGE_NUM_MAX                31
+#define CMDQ_TLBI_0_SCALE              GENMASK_ULL(24, 20)
 #define CMDQ_TLBI_0_VMID               GENMASK_ULL(47, 32)
 #define CMDQ_TLBI_0_ASID               GENMASK_ULL(63, 48)
 #define CMDQ_TLBI_1_LEAF               (1UL << 0)
+#define CMDQ_TLBI_1_TTL                        GENMASK_ULL(9, 8)
+#define CMDQ_TLBI_1_TG                 GENMASK_ULL(11, 10)
 #define CMDQ_TLBI_1_VA_MASK            GENMASK_ULL(63, 12)
 #define CMDQ_TLBI_1_IPA_MASK           GENMASK_ULL(51, 12)
 
@@ -473,9 +481,13 @@ struct arm_smmu_cmdq_ent {
                #define CMDQ_OP_TLBI_S2_IPA     0x2a
                #define CMDQ_OP_TLBI_NSNH_ALL   0x30
                struct {
+                       u8                      num;
+                       u8                      scale;
                        u16                     asid;
                        u16                     vmid;
                        bool                    leaf;
+                       u8                      ttl;
+                       u8                      tg;
                        u64                     addr;
                } tlbi;
 
@@ -548,6 +560,11 @@ struct arm_smmu_cmdq {
        atomic_t                        lock;
 };
 
+struct arm_smmu_cmdq_batch {
+       u64                             cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+       int                             num;
+};
+
 struct arm_smmu_evtq {
        struct arm_smmu_queue           q;
        u32                             max_stalls;
@@ -627,6 +644,7 @@ struct arm_smmu_device {
 #define ARM_SMMU_FEAT_HYP              (1 << 12)
 #define ARM_SMMU_FEAT_STALL_FORCE      (1 << 13)
 #define ARM_SMMU_FEAT_VAX              (1 << 14)
+#define ARM_SMMU_FEAT_RANGE_INV                (1 << 15)
        u32                             features;
 
 #define ARM_SMMU_OPT_SKIP_PREFETCH     (1 << 0)
@@ -895,14 +913,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
                cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
                break;
        case CMDQ_OP_TLBI_NH_VA:
+               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
                cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
                cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
                cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
                cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
                break;
        case CMDQ_OP_TLBI_S2_IPA:
+               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
+               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
                cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
                cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
+               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
+               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
                cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
                break;
        case CMDQ_OP_TLBI_NH_ASID:
@@ -1482,6 +1508,24 @@ static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
        return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
 }
 
+static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
+                                   struct arm_smmu_cmdq_batch *cmds,
+                                   struct arm_smmu_cmdq_ent *cmd)
+{
+       if (cmds->num == CMDQ_BATCH_ENTRIES) {
+               arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+               cmds->num = 0;
+       }
+       arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
+       cmds->num++;
+}
+
+static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
+                                     struct arm_smmu_cmdq_batch *cmds)
+{
+       return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+}
+
 /* Context descriptor manipulation functions */
 static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
                             int ssid, bool leaf)
@@ -1489,6 +1533,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
        size_t i;
        unsigned long flags;
        struct arm_smmu_master *master;
+       struct arm_smmu_cmdq_batch cmds = {};
        struct arm_smmu_device *smmu = smmu_domain->smmu;
        struct arm_smmu_cmdq_ent cmd = {
                .opcode = CMDQ_OP_CFGI_CD,
@@ -1502,12 +1547,12 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain,
        list_for_each_entry(master, &smmu_domain->devices, domain_head) {
                for (i = 0; i < master->num_sids; i++) {
                        cmd.cfgi.sid = master->sids[i];
-                       arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+                       arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
                }
        }
        spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
-       arm_smmu_cmdq_issue_sync(smmu);
+       arm_smmu_cmdq_batch_submit(smmu, &cmds);
 }
 
 static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
@@ -1531,6 +1576,7 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst,
        u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) |
                  CTXDESC_L1_DESC_V;
 
+       /* See comment in arm_smmu_write_ctx_desc() */
        WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
@@ -1726,7 +1772,8 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc)
        val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span);
        val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK;
 
-       *dst = cpu_to_le64(val);
+       /* See comment in arm_smmu_write_ctx_desc() */
+       WRITE_ONCE(*dst, cpu_to_le64(val));
 }
 
 static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid)
@@ -2132,17 +2179,16 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size,
        cmd->atc.size   = log2_span;
 }
 
-static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
-                                  struct arm_smmu_cmdq_ent *cmd)
+static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
 {
        int i;
+       struct arm_smmu_cmdq_ent cmd;
 
-       if (!master->ats_enabled)
-               return 0;
+       arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
 
        for (i = 0; i < master->num_sids; i++) {
-               cmd->atc.sid = master->sids[i];
-               arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
+               cmd.atc.sid = master->sids[i];
+               arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
        }
 
        return arm_smmu_cmdq_issue_sync(master->smmu);
@@ -2151,10 +2197,11 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
 static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
                                   int ssid, unsigned long iova, size_t size)
 {
-       int ret = 0;
+       int i;
        unsigned long flags;
        struct arm_smmu_cmdq_ent cmd;
        struct arm_smmu_master *master;
+       struct arm_smmu_cmdq_batch cmds = {};
 
        if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
                return 0;
@@ -2179,11 +2226,18 @@ static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
        arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
 
        spin_lock_irqsave(&smmu_domain->devices_lock, flags);
-       list_for_each_entry(master, &smmu_domain->devices, domain_head)
-               ret |= arm_smmu_atc_inv_master(master, &cmd);
+       list_for_each_entry(master, &smmu_domain->devices, domain_head) {
+               if (!master->ats_enabled)
+                       continue;
+
+               for (i = 0; i < master->num_sids; i++) {
+                       cmd.atc.sid = master->sids[i];
+                       arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
+               }
+       }
        spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
-       return ret ? -ETIMEDOUT : 0;
+       return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
 }
 
 /* IO_PGTABLE API */
@@ -2218,10 +2272,10 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
                                   size_t granule, bool leaf,
                                   struct arm_smmu_domain *smmu_domain)
 {
-       u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
        struct arm_smmu_device *smmu = smmu_domain->smmu;
-       unsigned long start = iova, end = iova + size;
-       int i = 0;
+       unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
+       size_t inv_range = granule;
+       struct arm_smmu_cmdq_batch cmds = {};
        struct arm_smmu_cmdq_ent cmd = {
                .tlbi = {
                        .leaf   = leaf,
@@ -2239,19 +2293,50 @@ static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
                cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
        }
 
+       if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+               /* Get the leaf page size */
+               tg = __ffs(smmu_domain->domain.pgsize_bitmap);
+
+               /* Convert page size of 12,14,16 (log2) to 1,2,3 */
+               cmd.tlbi.tg = (tg - 10) / 2;
+
+               /* Determine what level the granule is at */
+               cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
+
+               num_pages = size >> tg;
+       }
+
        while (iova < end) {
-               if (i == CMDQ_BATCH_ENTRIES) {
-                       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, false);
-                       i = 0;
+               if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
+                       /*
+                        * On each iteration of the loop, the range is 5 bits
+                        * worth of the aligned size remaining.
+                        * The range in pages is:
+                        *
+                        * range = (num_pages & (0x1f << __ffs(num_pages)))
+                        */
+                       unsigned long scale, num;
+
+                       /* Determine the power of 2 multiple number of pages */
+                       scale = __ffs(num_pages);
+                       cmd.tlbi.scale = scale;
+
+                       /* Determine how many chunks of 2^scale size we have */
+                       num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
+                       cmd.tlbi.num = num - 1;
+
+                       /* range is num * 2^scale * pgsize */
+                       inv_range = num << (scale + tg);
+
+                       /* Clear out the lower order bits for the next iteration */
+                       num_pages -= num << scale;
                }
 
                cmd.tlbi.addr = iova;
-               arm_smmu_cmdq_build_cmd(&cmds[i * CMDQ_ENT_DWORDS], &cmd);
-               iova += granule;
-               i++;
+               arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+               iova += inv_range;
        }
-
-       arm_smmu_cmdq_issue_cmdlist(smmu, cmds, i, true);
+       arm_smmu_cmdq_batch_submit(smmu, &cmds);
 
        /*
         * Unfortunately, this can't be leaf-only since we may have
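
A standalone model (not driver code) of how the loop above splits an invalidation into range commands: each command covers num * 2^scale leaf pages, with num capped at CMDQ_TLBI_RANGE_NUM_MAX and taken from the low bits of the remaining page count. __ffs() is replaced by __builtin_ctzl() and the sample size is arbitrary.

#include <stdio.h>

int main(void)
{
        unsigned long pgshift = 12;             /* tg: 4K leaf pages */
        unsigned long size = 0x21000;           /* 33 pages, as an example */
        unsigned long num_pages = size >> pgshift;
        unsigned long iova = 0x80000000;

        while (num_pages) {
                unsigned long scale = __builtin_ctzl(num_pages);  /* stand-in for __ffs() */
                unsigned long num = (num_pages >> scale) & 0x1f;  /* CMDQ_TLBI_RANGE_NUM_MAX */
                unsigned long range = num << (scale + pgshift);

                printf("TLBI iova=%#lx scale=%lu NUM field=%lu -> %#lx bytes\n",
                       iova, scale, num - 1, range);
                iova += range;
                num_pages -= num << scale;      /* clear the bits this command covered */
        }
        return 0;
}
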
@@ -2611,7 +2696,6 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master)
 
 static void arm_smmu_disable_ats(struct arm_smmu_master *master)
 {
-       struct arm_smmu_cmdq_ent cmd;
        struct arm_smmu_domain *smmu_domain = master->domain;
 
        if (!master->ats_enabled)
@@ -2623,11 +2707,57 @@ static void arm_smmu_disable_ats(struct arm_smmu_master *master)
         * ATC invalidation via the SMMU.
         */
        wmb();
-       arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
-       arm_smmu_atc_inv_master(master, &cmd);
+       arm_smmu_atc_inv_master(master);
        atomic_dec(&smmu_domain->nr_ats_masters);
 }
 
+static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
+{
+       int ret;
+       int features;
+       int num_pasids;
+       struct pci_dev *pdev;
+
+       if (!dev_is_pci(master->dev))
+               return -ENODEV;
+
+       pdev = to_pci_dev(master->dev);
+
+       features = pci_pasid_features(pdev);
+       if (features < 0)
+               return features;
+
+       num_pasids = pci_max_pasids(pdev);
+       if (num_pasids <= 0)
+               return num_pasids;
+
+       ret = pci_enable_pasid(pdev, features);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to enable PASID\n");
+               return ret;
+       }
+
+       master->ssid_bits = min_t(u8, ilog2(num_pasids),
+                                 master->smmu->ssid_bits);
+       return 0;
+}
+
+static void arm_smmu_disable_pasid(struct arm_smmu_master *master)
+{
+       struct pci_dev *pdev;
+
+       if (!dev_is_pci(master->dev))
+               return;
+
+       pdev = to_pci_dev(master->dev);
+
+       if (!pdev->pasid_enabled)
+               return;
+
+       master->ssid_bits = 0;
+       pci_disable_pasid(pdev);
+}
+
 static void arm_smmu_detach_dev(struct arm_smmu_master *master)
 {
        unsigned long flags;
@@ -2659,7 +2789,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
        if (!fwspec)
                return -ENOENT;
 
-       master = fwspec->iommu_priv;
+       master = dev_iommu_priv_get(dev);
        smmu = master->smmu;
 
        arm_smmu_detach_dev(master);
@@ -2795,7 +2925,7 @@ static int arm_smmu_add_device(struct device *dev)
        if (!fwspec || fwspec->ops != &arm_smmu_ops)
                return -ENODEV;
 
-       if (WARN_ON_ONCE(fwspec->iommu_priv))
+       if (WARN_ON_ONCE(dev_iommu_priv_get(dev)))
                return -EBUSY;
 
        smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode);
@@ -2810,7 +2940,7 @@ static int arm_smmu_add_device(struct device *dev)
        master->smmu = smmu;
        master->sids = fwspec->ids;
        master->num_sids = fwspec->num_ids;
-       fwspec->iommu_priv = master;
+       dev_iommu_priv_set(dev, master);
 
        /* Check the SIDs are in range of the SMMU and our stream table */
        for (i = 0; i < master->num_sids; i++) {
@@ -2831,13 +2961,23 @@ static int arm_smmu_add_device(struct device *dev)
 
        master->ssid_bits = min(smmu->ssid_bits, fwspec->num_pasid_bits);
 
+       /*
+        * Note that PASID must be enabled before, and disabled after ATS:
+        * PCI Express Base 4.0r1.0 - 10.5.1.3 ATS Control Register
+        *
+        *   Behavior is undefined if this bit is Set and the value of the PASID
+        *   Enable, Execute Requested Enable, or Privileged Mode Requested bits
+        *   are changed.
+        */
+       arm_smmu_enable_pasid(master);
+
        if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB))
                master->ssid_bits = min_t(u8, master->ssid_bits,
                                          CTXDESC_LINEAR_CDMAX);
 
        ret = iommu_device_link(&smmu->iommu, dev);
        if (ret)
-               goto err_free_master;
+               goto err_disable_pasid;
 
        group = iommu_group_get_for_dev(dev);
        if (IS_ERR(group)) {
@@ -2850,9 +2990,11 @@ static int arm_smmu_add_device(struct device *dev)
 
 err_unlink:
        iommu_device_unlink(&smmu->iommu, dev);
+err_disable_pasid:
+       arm_smmu_disable_pasid(master);
 err_free_master:
        kfree(master);
-       fwspec->iommu_priv = NULL;
+       dev_iommu_priv_set(dev, NULL);
        return ret;
 }
 
@@ -2865,11 +3007,12 @@ static void arm_smmu_remove_device(struct device *dev)
        if (!fwspec || fwspec->ops != &arm_smmu_ops)
                return;
 
-       master = fwspec->iommu_priv;
+       master = dev_iommu_priv_get(dev);
        smmu = master->smmu;
        arm_smmu_detach_dev(master);
        iommu_group_remove_device(dev);
        iommu_device_unlink(&smmu->iommu, dev);
+       arm_smmu_disable_pasid(master);
        kfree(master);
        iommu_fwspec_free(dev);
 }
@@ -3700,6 +3843,11 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
        if (smmu->sid_bits <= STRTAB_SPLIT)
                smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
 
+       /* IDR3 */
+       reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
+       if (FIELD_GET(IDR3_RIL, reg))
+               smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
+
        /* IDR5 */
        reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
 
index 16c4b87..a6a5796 100644
@@ -98,12 +98,10 @@ struct arm_smmu_master_cfg {
        s16                             smendx[];
 };
 #define INVALID_SMENDX                 -1
-#define __fwspec_cfg(fw) ((struct arm_smmu_master_cfg *)fw->iommu_priv)
-#define fwspec_smmu(fw)  (__fwspec_cfg(fw)->smmu)
-#define fwspec_smendx(fw, i) \
-       (i >= fw->num_ids ? INVALID_SMENDX : __fwspec_cfg(fw)->smendx[i])
-#define for_each_cfg_sme(fw, i, idx) \
-       for (i = 0; idx = fwspec_smendx(fw, i), i < fw->num_ids; ++i)
+#define cfg_smendx(cfg, fw, i) \
+       (i >= fw->num_ids ? INVALID_SMENDX : cfg->smendx[i])
+#define for_each_cfg_sme(cfg, fw, i, idx) \
+       for (i = 0; idx = cfg_smendx(cfg, fw, i), i < fw->num_ids; ++i)
 
 static bool using_legacy_binding, using_generic_binding;
 
@@ -1061,7 +1059,7 @@ static bool arm_smmu_free_sme(struct arm_smmu_device *smmu, int idx)
 static int arm_smmu_master_alloc_smes(struct device *dev)
 {
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
+       struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
        struct arm_smmu_device *smmu = cfg->smmu;
        struct arm_smmu_smr *smrs = smmu->smrs;
        struct iommu_group *group;
@@ -1069,7 +1067,7 @@ static int arm_smmu_master_alloc_smes(struct device *dev)
 
        mutex_lock(&smmu->stream_map_mutex);
        /* Figure out a viable stream map entry allocation */
-       for_each_cfg_sme(fwspec, i, idx) {
+       for_each_cfg_sme(cfg, fwspec, i, idx) {
                u16 sid = FIELD_GET(ARM_SMMU_SMR_ID, fwspec->ids[i]);
                u16 mask = FIELD_GET(ARM_SMMU_SMR_MASK, fwspec->ids[i]);
 
@@ -1100,7 +1098,7 @@ static int arm_smmu_master_alloc_smes(struct device *dev)
        iommu_group_put(group);
 
        /* It worked! Now, poke the actual hardware */
-       for_each_cfg_sme(fwspec, i, idx) {
+       for_each_cfg_sme(cfg, fwspec, i, idx) {
                arm_smmu_write_sme(smmu, idx);
                smmu->s2crs[idx].group = group;
        }
@@ -1117,14 +1115,14 @@ out_err:
        return ret;
 }
 
-static void arm_smmu_master_free_smes(struct iommu_fwspec *fwspec)
+static void arm_smmu_master_free_smes(struct arm_smmu_master_cfg *cfg,
+                                     struct iommu_fwspec *fwspec)
 {
-       struct arm_smmu_device *smmu = fwspec_smmu(fwspec);
-       struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
+       struct arm_smmu_device *smmu = cfg->smmu;
        int i, idx;
 
        mutex_lock(&smmu->stream_map_mutex);
-       for_each_cfg_sme(fwspec, i, idx) {
+       for_each_cfg_sme(cfg, fwspec, i, idx) {
                if (arm_smmu_free_sme(smmu, idx))
                        arm_smmu_write_sme(smmu, idx);
                cfg->smendx[i] = INVALID_SMENDX;
@@ -1133,6 +1131,7 @@ static void arm_smmu_master_free_smes(struct iommu_fwspec *fwspec)
 }
 
 static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
+                                     struct arm_smmu_master_cfg *cfg,
                                      struct iommu_fwspec *fwspec)
 {
        struct arm_smmu_device *smmu = smmu_domain->smmu;
@@ -1146,7 +1145,7 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
        else
                type = S2CR_TYPE_TRANS;
 
-       for_each_cfg_sme(fwspec, i, idx) {
+       for_each_cfg_sme(cfg, fwspec, i, idx) {
                if (type == s2cr[idx].type && cbndx == s2cr[idx].cbndx)
                        continue;
 
@@ -1160,10 +1159,11 @@ static int arm_smmu_domain_add_master(struct arm_smmu_domain *smmu_domain,
 
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
-       int ret;
+       struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+       struct arm_smmu_master_cfg *cfg;
        struct arm_smmu_device *smmu;
-       struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+       int ret;
 
        if (!fwspec || fwspec->ops != &arm_smmu_ops) {
                dev_err(dev, "cannot attach to SMMU, is it on the same bus?\n");
@@ -1177,10 +1177,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
         * domains, just say no (but more politely than by dereferencing NULL).
         * This should be at least a WARN_ON once that's sorted.
         */
-       if (!fwspec->iommu_priv)
+       cfg = dev_iommu_priv_get(dev);
+       if (!cfg)
                return -ENODEV;
 
-       smmu = fwspec_smmu(fwspec);
+       smmu = cfg->smmu;
 
        ret = arm_smmu_rpm_get(smmu);
        if (ret < 0)
@@ -1204,7 +1205,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
        }
 
        /* Looks ok, so add the device to the domain */
-       ret = arm_smmu_domain_add_master(smmu_domain, fwspec);
+       ret = arm_smmu_domain_add_master(smmu_domain, cfg, fwspec);
 
        /*
         * Setup an autosuspend delay to avoid bouncing runpm state.
@@ -1383,7 +1384,7 @@ struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode)
 
 static int arm_smmu_add_device(struct device *dev)
 {
-       struct arm_smmu_device *smmu;
+       struct arm_smmu_device *smmu = NULL;
        struct arm_smmu_master_cfg *cfg;
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        int i, ret;
@@ -1429,7 +1430,7 @@ static int arm_smmu_add_device(struct device *dev)
                goto out_free;
 
        cfg->smmu = smmu;
-       fwspec->iommu_priv = cfg;
+       dev_iommu_priv_set(dev, cfg);
        while (i--)
                cfg->smendx[i] = INVALID_SMENDX;
 
@@ -1467,7 +1468,7 @@ static void arm_smmu_remove_device(struct device *dev)
        if (!fwspec || fwspec->ops != &arm_smmu_ops)
                return;
 
-       cfg  = fwspec->iommu_priv;
+       cfg  = dev_iommu_priv_get(dev);
        smmu = cfg->smmu;
 
        ret = arm_smmu_rpm_get(smmu);
@@ -1475,23 +1476,25 @@ static void arm_smmu_remove_device(struct device *dev)
                return;
 
        iommu_device_unlink(&smmu->iommu, dev);
-       arm_smmu_master_free_smes(fwspec);
+       arm_smmu_master_free_smes(cfg, fwspec);
 
        arm_smmu_rpm_put(smmu);
 
+       dev_iommu_priv_set(dev, NULL);
        iommu_group_remove_device(dev);
-       kfree(fwspec->iommu_priv);
+       kfree(cfg);
        iommu_fwspec_free(dev);
 }
 
 static struct iommu_group *arm_smmu_device_group(struct device *dev)
 {
+       struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct arm_smmu_device *smmu = fwspec_smmu(fwspec);
+       struct arm_smmu_device *smmu = cfg->smmu;
        struct iommu_group *group = NULL;
        int i, idx;
 
-       for_each_cfg_sme(fwspec, i, idx) {
+       for_each_cfg_sme(cfg, fwspec, i, idx) {
                if (group && smmu->s2crs[idx].group &&
                    group != smmu->s2crs[idx].group)
                        return ERR_PTR(-EINVAL);
index 4be5494..ef0a524 100644
@@ -4501,7 +4501,8 @@ static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
        struct dmar_atsr_unit *atsru;
        struct acpi_dmar_atsr *tmp;
 
-       list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
+       list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
+                               dmar_rcu_check()) {
                tmp = (struct acpi_dmar_atsr *)atsru->hdr;
                if (atsr->segment != tmp->segment)
                        continue;
index d7f2a53..2998418 100644
@@ -531,7 +531,7 @@ struct page_req_dsc {
        u64 priv_data[2];
 };
 
-#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
+#define PRQ_RING_MASK  ((0x1000 << PRQ_ORDER) - 0x20)
 
 static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
 {
@@ -611,14 +611,15 @@ static irqreturn_t prq_event_thread(int irq, void *d)
                 * any faults on kernel addresses. */
                if (!svm->mm)
                        goto bad_req;
-               /* If the mm is already defunct, don't handle faults. */
-               if (!mmget_not_zero(svm->mm))
-                       goto bad_req;
 
                /* If address is not canonical, return invalid response */
                if (!is_canonical_address(address))
                        goto bad_req;
 
+               /* If the mm is already defunct, don't handle faults. */
+               if (!mmget_not_zero(svm->mm))
+                       goto bad_req;
+
                down_read(&svm->mm->mmap_sem);
                vma = find_extend_vma(svm->mm, address);
                if (!vma || address < vma->vm_start)
index 3e35284..2b47141 100644
@@ -152,9 +152,9 @@ void iommu_device_unregister(struct iommu_device *iommu)
 }
 EXPORT_SYMBOL_GPL(iommu_device_unregister);
 
-static struct iommu_param *iommu_get_dev_param(struct device *dev)
+static struct dev_iommu *dev_iommu_get(struct device *dev)
 {
-       struct iommu_param *param = dev->iommu_param;
+       struct dev_iommu *param = dev->iommu;
 
        if (param)
                return param;
@@ -164,14 +164,14 @@ static struct iommu_param *iommu_get_dev_param(struct device *dev)
                return NULL;
 
        mutex_init(&param->lock);
-       dev->iommu_param = param;
+       dev->iommu = param;
        return param;
 }
 
-static void iommu_free_dev_param(struct device *dev)
+static void dev_iommu_free(struct device *dev)
 {
-       kfree(dev->iommu_param);
-       dev->iommu_param = NULL;
+       kfree(dev->iommu);
+       dev->iommu = NULL;
 }
 
 int iommu_probe_device(struct device *dev)
@@ -183,7 +183,7 @@ int iommu_probe_device(struct device *dev)
        if (!ops)
                return -EINVAL;
 
-       if (!iommu_get_dev_param(dev))
+       if (!dev_iommu_get(dev))
                return -ENOMEM;
 
        if (!try_module_get(ops->owner)) {
@@ -200,7 +200,7 @@ int iommu_probe_device(struct device *dev)
 err_module_put:
        module_put(ops->owner);
 err_free_dev_param:
-       iommu_free_dev_param(dev);
+       dev_iommu_free(dev);
        return ret;
 }
 
@@ -211,9 +211,9 @@ void iommu_release_device(struct device *dev)
        if (dev->iommu_group)
                ops->remove_device(dev);
 
-       if (dev->iommu_param) {
+       if (dev->iommu) {
                module_put(ops->owner);
-               iommu_free_dev_param(dev);
+               dev_iommu_free(dev);
        }
 }
 
@@ -972,7 +972,7 @@ int iommu_register_device_fault_handler(struct device *dev,
                                        iommu_dev_fault_handler_t handler,
                                        void *data)
 {
-       struct iommu_param *param = dev->iommu_param;
+       struct dev_iommu *param = dev->iommu;
        int ret = 0;
 
        if (!param)
@@ -1015,7 +1015,7 @@ EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
  */
 int iommu_unregister_device_fault_handler(struct device *dev)
 {
-       struct iommu_param *param = dev->iommu_param;
+       struct dev_iommu *param = dev->iommu;
        int ret = 0;
 
        if (!param)
@@ -1055,7 +1055,7 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
  */
 int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
 {
-       struct iommu_param *param = dev->iommu_param;
+       struct dev_iommu *param = dev->iommu;
        struct iommu_fault_event *evt_pending = NULL;
        struct iommu_fault_param *fparam;
        int ret = 0;
@@ -1104,7 +1104,7 @@ int iommu_page_response(struct device *dev,
        int ret = -EINVAL;
        struct iommu_fault_event *evt;
        struct iommu_fault_page_request *prm;
-       struct iommu_param *param = dev->iommu_param;
+       struct dev_iommu *param = dev->iommu;
        struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 
        if (!domain || !domain->ops->page_response)
@@ -2405,7 +2405,11 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
        if (fwspec)
                return ops == fwspec->ops ? 0 : -EINVAL;
 
-       fwspec = kzalloc(sizeof(*fwspec), GFP_KERNEL);
+       if (!dev_iommu_get(dev))
+               return -ENOMEM;
+
+       /* Preallocate for the overwhelmingly common case of 1 ID */
+       fwspec = kzalloc(struct_size(fwspec, ids, 1), GFP_KERNEL);
        if (!fwspec)
                return -ENOMEM;
 
@@ -2432,15 +2436,15 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_free);
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
 {
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       size_t size;
-       int i;
+       int i, new_num;
 
        if (!fwspec)
                return -EINVAL;
 
-       size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]);
-       if (size > sizeof(*fwspec)) {
-               fwspec = krealloc(fwspec, size, GFP_KERNEL);
+       new_num = fwspec->num_ids + num_ids;
+       if (new_num > 1) {
+               fwspec = krealloc(fwspec, struct_size(fwspec, ids, new_num),
+                                 GFP_KERNEL);
                if (!fwspec)
                        return -ENOMEM;
 
@@ -2450,7 +2454,7 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
        for (i = 0; i < num_ids; i++)
                fwspec->ids[fwspec->num_ids + i] = ids[i];
 
-       fwspec->num_ids += num_ids;
+       fwspec->num_ids = new_num;
        return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
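
A userspace sketch (not kernel code) of the flexible-array growth pattern used above: room for a single ID is preallocated in iommu_fwspec_init(), so iommu_fwspec_add_ids() only has to reallocate once the total goes past one. The simplified struct_size() here omits the kernel version's overflow checking.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct fwspec_sketch {
        unsigned int num_ids;
        uint32_t ids[];                         /* flexible array member */
};

#define struct_size(p, member, n) \
        (sizeof(*(p)) + sizeof((p)->member[0]) * (n))

int main(void)
{
        struct fwspec_sketch *fw = calloc(1, struct_size(fw, ids, 1));
        uint32_t more[] = { 0x10, 0x11, 0x12 };
        unsigned int new_num;

        if (!fw)
                return 1;
        new_num = fw->num_ids + 3;
        if (new_num > 1)                        /* only grow past the preallocated slot */
                fw = realloc(fw, struct_size(fw, ids, new_num));
        if (!fw)
                return 1;
        memcpy(&fw->ids[fw->num_ids], more, sizeof(more));
        fw->num_ids = new_num;
        printf("%u ids, last = %#x\n", fw->num_ids, fw->ids[2]);
        free(fw);
        return 0;
}
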
index ecb3f94..310cf09 100644
@@ -89,9 +89,7 @@ static struct ipmmu_vmsa_domain *to_vmsa_domain(struct iommu_domain *dom)
 
 static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev)
 {
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-       return fwspec ? fwspec->iommu_priv : NULL;
+       return dev_iommu_priv_get(dev);
 }
 
 #define TLB_LOOP_TIMEOUT               100     /* 100us */
@@ -727,14 +725,13 @@ static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain,
 static int ipmmu_init_platform_device(struct device *dev,
                                      struct of_phandle_args *args)
 {
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        struct platform_device *ipmmu_pdev;
 
        ipmmu_pdev = of_find_device_by_node(args->np);
        if (!ipmmu_pdev)
                return -ENODEV;
 
-       fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev);
+       dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev));
 
        return 0;
 }
index 95945f4..5f4d6df 100644
@@ -358,8 +358,8 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
 static int mtk_iommu_attach_device(struct iommu_domain *domain,
                                   struct device *dev)
 {
+       struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
        struct mtk_iommu_domain *dom = to_mtk_domain(domain);
-       struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
 
        if (!data)
                return -ENODEV;
@@ -378,7 +378,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain,
 static void mtk_iommu_detach_device(struct iommu_domain *domain,
                                    struct device *dev)
 {
-       struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
+       struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
 
        if (!data)
                return;
@@ -450,7 +450,7 @@ static int mtk_iommu_add_device(struct device *dev)
        if (!fwspec || fwspec->ops != &mtk_iommu_ops)
                return -ENODEV; /* Not a iommu client device */
 
-       data = fwspec->iommu_priv;
+       data = dev_iommu_priv_get(dev);
        iommu_device_link(&data->iommu, dev);
 
        group = iommu_group_get_for_dev(dev);
@@ -469,7 +469,7 @@ static void mtk_iommu_remove_device(struct device *dev)
        if (!fwspec || fwspec->ops != &mtk_iommu_ops)
                return;
 
-       data = fwspec->iommu_priv;
+       data = dev_iommu_priv_get(dev);
        iommu_device_unlink(&data->iommu, dev);
 
        iommu_group_remove_device(dev);
@@ -496,7 +496,6 @@ static struct iommu_group *mtk_iommu_device_group(struct device *dev)
 
 static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        struct platform_device *m4updev;
 
        if (args->args_count != 1) {
@@ -505,13 +504,13 @@ static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
                return -EINVAL;
        }
 
-       if (!fwspec->iommu_priv) {
+       if (!dev_iommu_priv_get(dev)) {
                /* Get the m4u device */
                m4updev = of_find_device_by_node(args->np);
                if (WARN_ON(!m4updev))
                        return -EINVAL;
 
-               fwspec->iommu_priv = platform_get_drvdata(m4updev);
+               dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
        }
 
        return iommu_fwspec_add_ids(dev, args->args, 1);
index e93b94e..a31be05 100644
@@ -263,8 +263,8 @@ static void mtk_iommu_domain_free(struct iommu_domain *domain)
 static int mtk_iommu_attach_device(struct iommu_domain *domain,
                                   struct device *dev)
 {
+       struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
        struct mtk_iommu_domain *dom = to_mtk_domain(domain);
-       struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
        int ret;
 
        if (!data)
@@ -286,7 +286,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain,
 static void mtk_iommu_detach_device(struct iommu_domain *domain,
                                    struct device *dev)
 {
-       struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
+       struct mtk_iommu_data *data = dev_iommu_priv_get(dev);
 
        if (!data)
                return;
@@ -387,20 +387,20 @@ static int mtk_iommu_create_mapping(struct device *dev,
                return -EINVAL;
        }
 
-       if (!fwspec->iommu_priv) {
+       if (!dev_iommu_priv_get(dev)) {
                /* Get the m4u device */
                m4updev = of_find_device_by_node(args->np);
                if (WARN_ON(!m4updev))
                        return -EINVAL;
 
-               fwspec->iommu_priv = platform_get_drvdata(m4updev);
+               dev_iommu_priv_set(dev, platform_get_drvdata(m4updev));
        }
 
        ret = iommu_fwspec_add_ids(dev, args->args, 1);
        if (ret)
                return ret;
 
-       data = fwspec->iommu_priv;
+       data = dev_iommu_priv_get(dev);
        m4udev = data->dev;
        mtk_mapping = m4udev->archdata.iommu;
        if (!mtk_mapping) {
@@ -459,7 +459,7 @@ static int mtk_iommu_add_device(struct device *dev)
        if (err)
                return err;
 
-       data = fwspec->iommu_priv;
+       data = dev_iommu_priv_get(dev);
        mtk_mapping = data->dev->archdata.iommu;
        err = arm_iommu_attach_device(dev, mtk_mapping);
        if (err) {
@@ -478,7 +478,7 @@ static void mtk_iommu_remove_device(struct device *dev)
        if (!fwspec || fwspec->ops != &mtk_iommu_ops)
                return;
 
-       data = fwspec->iommu_priv;
+       data = dev_iommu_priv_get(dev);
        iommu_device_unlink(&data->iommu, dev);
 
        iommu_group_remove_device(dev);
index be551cc..887fefc 100644
@@ -167,7 +167,7 @@ static int omap2_iommu_enable(struct omap_iommu *obj)
 {
        u32 l, pa;
 
-       if (!obj->iopgd || !IS_ALIGNED((u32)obj->iopgd,  SZ_16K))
+       if (!obj->iopgd || !IS_ALIGNED((unsigned long)obj->iopgd,  SZ_16K))
                return -EINVAL;
 
        pa = virt_to_phys(obj->iopgd);
@@ -434,7 +434,7 @@ static void flush_iotlb_page(struct omap_iommu *obj, u32 da)
                bytes = iopgsz_to_bytes(cr.cam & 3);
 
                if ((start <= da) && (da < start + bytes)) {
-                       dev_dbg(obj->dev, "%s: %08x<=%08x(%x)\n",
+                       dev_dbg(obj->dev, "%s: %08x<=%08x(%zx)\n",
                                __func__, start, da, bytes);
                        iotlb_load_cr(obj, &cr);
                        iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
@@ -1352,11 +1352,11 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
 
        omap_pgsz = bytes_to_iopgsz(bytes);
        if (omap_pgsz < 0) {
-               dev_err(dev, "invalid size to map: %d\n", bytes);
+               dev_err(dev, "invalid size to map: %zu\n", bytes);
                return -EINVAL;
        }
 
-       dev_dbg(dev, "mapping da 0x%lx to pa %pa size 0x%x\n", da, &pa, bytes);
+       dev_dbg(dev, "mapping da 0x%lx to pa %pa size 0x%zx\n", da, &pa, bytes);
 
        iotlb_init_entry(&e, da, pa, omap_pgsz);
 
@@ -1393,7 +1393,7 @@ static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
        size_t bytes = 0;
        int i;
 
-       dev_dbg(dev, "unmapping da 0x%lx size %u\n", da, size);
+       dev_dbg(dev, "unmapping da 0x%lx size %zu\n", da, size);
 
        iommu = omap_domain->iommus;
        for (i = 0; i < omap_domain->num_iommus; i++, iommu++) {
index 1a4adb5..51d7400 100644
@@ -63,7 +63,8 @@
  *
  * va to pa translation
  */
-static inline phys_addr_t omap_iommu_translate(u32 d, u32 va, u32 mask)
+static inline phys_addr_t omap_iommu_translate(unsigned long d, dma_addr_t va,
+                                              dma_addr_t mask)
 {
        return (d & mask) | (va & (~mask));
 }
index 4328da0..0e2a964 100644
@@ -48,7 +48,7 @@ struct qcom_iommu_dev {
        void __iomem            *local_base;
        u32                      sec_id;
        u8                       num_ctxs;
-       struct qcom_iommu_ctx   *ctxs[0];   /* indexed by asid-1 */
+       struct qcom_iommu_ctx   *ctxs[];   /* indexed by asid-1 */
 };
 
 struct qcom_iommu_ctx {
@@ -74,16 +74,19 @@ static struct qcom_iommu_domain *to_qcom_iommu_domain(struct iommu_domain *dom)
 
 static const struct iommu_ops qcom_iommu_ops;
 
-static struct qcom_iommu_dev * to_iommu(struct iommu_fwspec *fwspec)
+static struct qcom_iommu_dev * to_iommu(struct device *dev)
 {
+       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
        if (!fwspec || fwspec->ops != &qcom_iommu_ops)
                return NULL;
-       return fwspec->iommu_priv;
+
+       return dev_iommu_priv_get(dev);
 }
 
-static struct qcom_iommu_ctx * to_ctx(struct iommu_fwspec *fwspec, unsigned asid)
+static struct qcom_iommu_ctx * to_ctx(struct device *dev, unsigned asid)
 {
-       struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
+       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
        if (!qcom_iommu)
                return NULL;
        return qcom_iommu->ctxs[asid - 1];
@@ -115,11 +118,14 @@ iommu_readq(struct qcom_iommu_ctx *ctx, unsigned reg)
 
 static void qcom_iommu_tlb_sync(void *cookie)
 {
-       struct iommu_fwspec *fwspec = cookie;
+       struct iommu_fwspec *fwspec;
+       struct device *dev = cookie;
        unsigned i;
 
+       fwspec = dev_iommu_fwspec_get(dev);
+
        for (i = 0; i < fwspec->num_ids; i++) {
-               struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+               struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
                unsigned int val, ret;
 
                iommu_writel(ctx, ARM_SMMU_CB_TLBSYNC, 0);
@@ -133,11 +139,14 @@ static void qcom_iommu_tlb_sync(void *cookie)
 
 static void qcom_iommu_tlb_inv_context(void *cookie)
 {
-       struct iommu_fwspec *fwspec = cookie;
+       struct device *dev = cookie;
+       struct iommu_fwspec *fwspec;
        unsigned i;
 
+       fwspec = dev_iommu_fwspec_get(dev);
+
        for (i = 0; i < fwspec->num_ids; i++) {
-               struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+               struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
                iommu_writel(ctx, ARM_SMMU_CB_S1_TLBIASID, ctx->asid);
        }
 
@@ -147,13 +156,16 @@ static void qcom_iommu_tlb_inv_context(void *cookie)
 static void qcom_iommu_tlb_inv_range_nosync(unsigned long iova, size_t size,
                                            size_t granule, bool leaf, void *cookie)
 {
-       struct iommu_fwspec *fwspec = cookie;
+       struct device *dev = cookie;
+       struct iommu_fwspec *fwspec;
        unsigned i, reg;
 
        reg = leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
 
+       fwspec = dev_iommu_fwspec_get(dev);
+
        for (i = 0; i < fwspec->num_ids; i++) {
-               struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+               struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
                size_t s = size;
 
                iova = (iova >> 12) << 12;
@@ -222,9 +234,10 @@ static irqreturn_t qcom_iommu_fault(int irq, void *dev)
 
 static int qcom_iommu_init_domain(struct iommu_domain *domain,
                                  struct qcom_iommu_dev *qcom_iommu,
-                                 struct iommu_fwspec *fwspec)
+                                 struct device *dev)
 {
        struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        struct io_pgtable_ops *pgtbl_ops;
        struct io_pgtable_cfg pgtbl_cfg;
        int i, ret = 0;
@@ -243,7 +256,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
        };
 
        qcom_domain->iommu = qcom_iommu;
-       pgtbl_ops = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &pgtbl_cfg, fwspec);
+       pgtbl_ops = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &pgtbl_cfg, dev);
        if (!pgtbl_ops) {
                dev_err(qcom_iommu->dev, "failed to allocate pagetable ops\n");
                ret = -ENOMEM;
@@ -256,7 +269,7 @@ static int qcom_iommu_init_domain(struct iommu_domain *domain,
        domain->geometry.force_aperture = true;
 
        for (i = 0; i < fwspec->num_ids; i++) {
-               struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+               struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
 
                if (!ctx->secure_init) {
                        ret = qcom_scm_restore_sec_cfg(qcom_iommu->sec_id, ctx->asid);
@@ -363,8 +376,7 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain)
 
 static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
+       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
        struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
        int ret;
 
@@ -375,7 +387,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
 
        /* Ensure that the domain is finalized */
        pm_runtime_get_sync(qcom_iommu->dev);
-       ret = qcom_iommu_init_domain(domain, qcom_iommu, fwspec);
+       ret = qcom_iommu_init_domain(domain, qcom_iommu, dev);
        pm_runtime_put_sync(qcom_iommu->dev);
        if (ret < 0)
                return ret;
@@ -397,9 +409,9 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev
 
 static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *dev)
 {
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
        struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
+       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
        unsigned i;
 
        if (WARN_ON(!qcom_domain->iommu))
@@ -407,7 +419,7 @@ static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *de
 
        pm_runtime_get_sync(qcom_iommu->dev);
        for (i = 0; i < fwspec->num_ids; i++) {
-               struct qcom_iommu_ctx *ctx = to_ctx(fwspec, fwspec->ids[i]);
+               struct qcom_iommu_ctx *ctx = to_ctx(dev, fwspec->ids[i]);
 
                /* Disable the context bank: */
                iommu_writel(ctx, ARM_SMMU_CB_SCTLR, 0);
@@ -514,7 +526,7 @@ static bool qcom_iommu_capable(enum iommu_cap cap)
 
 static int qcom_iommu_add_device(struct device *dev)
 {
-       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
+       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
        struct iommu_group *group;
        struct device_link *link;
 
@@ -545,7 +557,7 @@ static int qcom_iommu_add_device(struct device *dev)
 
 static void qcom_iommu_remove_device(struct device *dev)
 {
-       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
+       struct qcom_iommu_dev *qcom_iommu = to_iommu(dev);
 
        if (!qcom_iommu)
                return;
@@ -557,7 +569,6 @@ static void qcom_iommu_remove_device(struct device *dev)
 
 static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        struct qcom_iommu_dev *qcom_iommu;
        struct platform_device *iommu_pdev;
        unsigned asid = args->args[0];
@@ -583,14 +594,14 @@ static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
            WARN_ON(asid > qcom_iommu->num_ctxs))
                return -EINVAL;
 
-       if (!fwspec->iommu_priv) {
-               fwspec->iommu_priv = qcom_iommu;
+       if (!dev_iommu_priv_get(dev)) {
+               dev_iommu_priv_set(dev, qcom_iommu);
        } else {
                /* make sure devices iommus dt node isn't referring to
                 * multiple different iommu devices.  Multiple context
                 * banks are ok, but multiple devices are not:
                 */
-               if (WARN_ON(qcom_iommu != fwspec->iommu_priv))
+               if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev)))
                        return -EINVAL;
        }
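
For readers following the conversion above: the qcom-iommu hunks move the driver's per-device pointer off fwspec->iommu_priv and onto the generic dev_iommu_priv_get()/dev_iommu_priv_set() helpers, and pass the struct device itself as the io-pgtable/TLB cookie. A minimal sketch of the helper pair; only the two dev_iommu_priv_* helpers are real API, everything else is a made-up illustration:

/* Illustrative only -- "my_ctx" and both functions are hypothetical names. */
#include <linux/device.h>
#include <linux/iommu.h>
#include <linux/slab.h>

struct my_ctx {
	int asid;
};

static int my_add_device(struct device *dev)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return -ENOMEM;

	dev_iommu_priv_set(dev, ctx);		/* stash per-device data */
	return 0;
}

static void my_remove_device(struct device *dev)
{
	struct my_ctx *ctx = dev_iommu_priv_get(dev);	/* retrieve it */

	dev_iommu_priv_set(dev, NULL);
	kfree(ctx);
}
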
 
index 3fb7ba7..db6559e 100644 (file)
@@ -247,7 +247,7 @@ static int gart_iommu_add_device(struct device *dev)
 {
        struct iommu_group *group;
 
-       if (!dev->iommu_fwspec)
+       if (!dev_iommu_fwspec_get(dev))
                return -ENODEV;
 
        group = iommu_group_get_for_dev(dev);
index cce329d..d5cac4f 100644 (file)
@@ -466,7 +466,7 @@ static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
        struct virtio_iommu_req_probe *probe;
        struct virtio_iommu_probe_property *prop;
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct viommu_endpoint *vdev = fwspec->iommu_priv;
+       struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
 
        if (!fwspec->num_ids)
                return -EINVAL;
@@ -607,24 +607,36 @@ static struct iommu_domain *viommu_domain_alloc(unsigned type)
        return &vdomain->domain;
 }
 
-static int viommu_domain_finalise(struct viommu_dev *viommu,
+static int viommu_domain_finalise(struct viommu_endpoint *vdev,
                                  struct iommu_domain *domain)
 {
        int ret;
+       unsigned long viommu_page_size;
+       struct viommu_dev *viommu = vdev->viommu;
        struct viommu_domain *vdomain = to_viommu_domain(domain);
 
-       vdomain->viommu         = viommu;
-       vdomain->map_flags      = viommu->map_flags;
+       viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
+       if (viommu_page_size > PAGE_SIZE) {
+               dev_err(vdev->dev,
+                       "granule 0x%lx larger than system page size 0x%lx\n",
+                       viommu_page_size, PAGE_SIZE);
+               return -EINVAL;
+       }
+
+       ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
+                             viommu->last_domain, GFP_KERNEL);
+       if (ret < 0)
+               return ret;
+
+       vdomain->id             = (unsigned int)ret;
 
        domain->pgsize_bitmap   = viommu->pgsize_bitmap;
        domain->geometry        = viommu->geometry;
 
-       ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain,
-                             viommu->last_domain, GFP_KERNEL);
-       if (ret >= 0)
-               vdomain->id = (unsigned int)ret;
+       vdomain->map_flags      = viommu->map_flags;
+       vdomain->viommu         = viommu;
 
-       return ret > 0 ? 0 : ret;
+       return 0;
 }
 
 static void viommu_domain_free(struct iommu_domain *domain)
@@ -648,7 +660,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
        int ret = 0;
        struct virtio_iommu_req_attach req;
        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct viommu_endpoint *vdev = fwspec->iommu_priv;
+       struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
        struct viommu_domain *vdomain = to_viommu_domain(domain);
 
        mutex_lock(&vdomain->mutex);
@@ -657,7 +669,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev)
                 * Properly initialize the domain now that we know which viommu
                 * owns it.
                 */
-               ret = viommu_domain_finalise(vdev->viommu, domain);
+               ret = viommu_domain_finalise(vdev, domain);
        } else if (vdomain->viommu != vdev->viommu) {
                dev_err(dev, "cannot attach to foreign vIOMMU\n");
                ret = -EXDEV;
@@ -807,8 +819,7 @@ static void viommu_iotlb_sync(struct iommu_domain *domain,
 static void viommu_get_resv_regions(struct device *dev, struct list_head *head)
 {
        struct iommu_resv_region *entry, *new_entry, *msi = NULL;
-       struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-       struct viommu_endpoint *vdev = fwspec->iommu_priv;
+       struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
        int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
 
        list_for_each_entry(entry, &vdev->resv_regions, list) {
@@ -876,7 +887,7 @@ static int viommu_add_device(struct device *dev)
        vdev->dev = dev;
        vdev->viommu = viommu;
        INIT_LIST_HEAD(&vdev->resv_regions);
-       fwspec->iommu_priv = vdev;
+       dev_iommu_priv_set(dev, vdev);
 
        if (viommu->probe_size) {
                /* Get additional information for this endpoint */
@@ -920,7 +931,7 @@ static void viommu_remove_device(struct device *dev)
        if (!fwspec || fwspec->ops != &viommu_ops)
                return;
 
-       vdev = fwspec->iommu_priv;
+       vdev = dev_iommu_priv_get(dev);
 
        iommu_group_remove_device(dev);
        iommu_device_unlink(&vdev->viommu->iommu, dev);
@@ -1082,7 +1093,6 @@ static int viommu_probe(struct virtio_device *vdev)
 
 #ifdef CONFIG_PCI
        if (pci_bus_type.iommu_ops != &viommu_ops) {
-               pci_request_acs();
                ret = bus_set_iommu(&pci_bus_type, &viommu_ops);
                if (ret)
                        goto err_unregister;
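
The virtio-iommu hunk above also reorders viommu_domain_finalise() so the granule check and domain-ID allocation happen before any domain state is published. The granule check boils down to taking the lowest set bit of the advertised page-size bitmap; a self-contained sketch, with the wrapper function name and return convention as assumptions:

/* Sketch of the granule-vs-host-page-size check; values are examples. */
#include <linux/bitops.h>
#include <linux/errno.h>

static int check_min_granule(unsigned long pgsize_bitmap,
			     unsigned long host_page_size)
{
	unsigned long granule = 1UL << __ffs(pgsize_bitmap);

	/* a device whose smallest page exceeds the host's cannot be mapped */
	return granule > host_page_size ? -EINVAL : 0;
}
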
index d82f1de..c664d84 100644 (file)
@@ -846,6 +846,17 @@ config LEDS_TPS6105X
          It is a single boost converter primarily for white LEDs and
          audio amplifiers.
 
+config LEDS_IP30
+       tristate "LED support for SGI Octane machines"
+       depends on LEDS_CLASS
+       depends on SGI_MFD_IOC3
+       help
+         This option enables support for the Red and White LEDs of
+         SGI Octane machines.
+
+         To compile this driver as a module, choose M here: the module
+         will be called leds-ip30.
+
 comment "LED Triggers"
 source "drivers/leds/trigger/Kconfig"
 
index d7e1107..45235d5 100644 (file)
@@ -6,91 +6,92 @@ obj-$(CONFIG_LEDS_CLASS)              += led-class.o
 obj-$(CONFIG_LEDS_CLASS_FLASH)         += led-class-flash.o
 obj-$(CONFIG_LEDS_TRIGGERS)            += led-triggers.o
 
-# LED Platform Drivers
+# LED Platform Drivers (keep this sorted, M-| sort)
 obj-$(CONFIG_LEDS_88PM860X)            += leds-88pm860x.o
 obj-$(CONFIG_LEDS_AAT1290)             += leds-aat1290.o
+obj-$(CONFIG_LEDS_ADP5520)             += leds-adp5520.o
+obj-$(CONFIG_LEDS_AN30259A)            += leds-an30259a.o
 obj-$(CONFIG_LEDS_APU)                 += leds-apu.o
 obj-$(CONFIG_LEDS_AS3645A)             += leds-as3645a.o
-obj-$(CONFIG_LEDS_AN30259A)            += leds-an30259a.o
+obj-$(CONFIG_LEDS_ASIC3)               += leds-asic3.o
 obj-$(CONFIG_LEDS_BCM6328)             += leds-bcm6328.o
 obj-$(CONFIG_LEDS_BCM6358)             += leds-bcm6358.o
 obj-$(CONFIG_LEDS_BD2802)              += leds-bd2802.o
+obj-$(CONFIG_LEDS_BLINKM)              += leds-blinkm.o
+obj-$(CONFIG_LEDS_CLEVO_MAIL)          += leds-clevo-mail.o
+obj-$(CONFIG_LEDS_COBALT_QUBE)         += leds-cobalt-qube.o
+obj-$(CONFIG_LEDS_COBALT_RAQ)          += leds-cobalt-raq.o
 obj-$(CONFIG_LEDS_CPCAP)               += leds-cpcap.o
-obj-$(CONFIG_LEDS_LOCOMO)              += leds-locomo.o
+obj-$(CONFIG_LEDS_DA903X)              += leds-da903x.o
+obj-$(CONFIG_LEDS_DA9052)              += leds-da9052.o
+obj-$(CONFIG_LEDS_FSG)                 += leds-fsg.o
+obj-$(CONFIG_LEDS_GPIO)                        += leds-gpio.o
+obj-$(CONFIG_LEDS_GPIO_REGISTER)       += leds-gpio-register.o
+obj-$(CONFIG_LEDS_HP6XX)               += leds-hp6xx.o
+obj-$(CONFIG_LEDS_INTEL_SS4200)                += leds-ss4200.o
+obj-$(CONFIG_LEDS_IP30)                        += leds-ip30.o
+obj-$(CONFIG_LEDS_IPAQ_MICRO)          += leds-ipaq-micro.o
+obj-$(CONFIG_LEDS_IS31FL319X)          += leds-is31fl319x.o
+obj-$(CONFIG_LEDS_IS31FL32XX)          += leds-is31fl32xx.o
+obj-$(CONFIG_LEDS_KTD2692)             += leds-ktd2692.o
 obj-$(CONFIG_LEDS_LM3530)              += leds-lm3530.o
 obj-$(CONFIG_LEDS_LM3532)              += leds-lm3532.o
 obj-$(CONFIG_LEDS_LM3533)              += leds-lm3533.o
+obj-$(CONFIG_LEDS_LM355x)              += leds-lm355x.o
+obj-$(CONFIG_LEDS_LM3601X)             += leds-lm3601x.o
+obj-$(CONFIG_LEDS_LM36274)             += leds-lm36274.o
 obj-$(CONFIG_LEDS_LM3642)              += leds-lm3642.o
-obj-$(CONFIG_LEDS_MIKROTIK_RB532)      += leds-rb532.o
-obj-$(CONFIG_LEDS_S3C24XX)             += leds-s3c24xx.o
-obj-$(CONFIG_LEDS_NET48XX)             += leds-net48xx.o
-obj-$(CONFIG_LEDS_WRAP)                        += leds-wrap.o
-obj-$(CONFIG_LEDS_COBALT_QUBE)         += leds-cobalt-qube.o
-obj-$(CONFIG_LEDS_COBALT_RAQ)          += leds-cobalt-raq.o
-obj-$(CONFIG_LEDS_SUNFIRE)             += leds-sunfire.o
-obj-$(CONFIG_LEDS_PCA9532)             += leds-pca9532.o
-obj-$(CONFIG_LEDS_GPIO_REGISTER)       += leds-gpio-register.o
-obj-$(CONFIG_LEDS_GPIO)                        += leds-gpio.o
+obj-$(CONFIG_LEDS_LM3692X)             += leds-lm3692x.o
+obj-$(CONFIG_LEDS_LM3697)              += leds-lm3697.o
+obj-$(CONFIG_LEDS_LOCOMO)              += leds-locomo.o
 obj-$(CONFIG_LEDS_LP3944)              += leds-lp3944.o
 obj-$(CONFIG_LEDS_LP3952)              += leds-lp3952.o
-obj-$(CONFIG_LEDS_LP55XX_COMMON)       += leds-lp55xx-common.o
 obj-$(CONFIG_LEDS_LP5521)              += leds-lp5521.o
 obj-$(CONFIG_LEDS_LP5523)              += leds-lp5523.o
 obj-$(CONFIG_LEDS_LP5562)              += leds-lp5562.o
+obj-$(CONFIG_LEDS_LP55XX_COMMON)       += leds-lp55xx-common.o
 obj-$(CONFIG_LEDS_LP8501)              += leds-lp8501.o
 obj-$(CONFIG_LEDS_LP8788)              += leds-lp8788.o
 obj-$(CONFIG_LEDS_LP8860)              += leds-lp8860.o
-obj-$(CONFIG_LEDS_TCA6507)             += leds-tca6507.o
-obj-$(CONFIG_LEDS_TLC591XX)            += leds-tlc591xx.o
-obj-$(CONFIG_LEDS_CLEVO_MAIL)          += leds-clevo-mail.o
-obj-$(CONFIG_LEDS_IPAQ_MICRO)          += leds-ipaq-micro.o
-obj-$(CONFIG_LEDS_HP6XX)               += leds-hp6xx.o
-obj-$(CONFIG_LEDS_OT200)               += leds-ot200.o
-obj-$(CONFIG_LEDS_FSG)                 += leds-fsg.o
-obj-$(CONFIG_LEDS_PCA955X)             += leds-pca955x.o
-obj-$(CONFIG_LEDS_PCA963X)             += leds-pca963x.o
-obj-$(CONFIG_LEDS_DA903X)              += leds-da903x.o
-obj-$(CONFIG_LEDS_DA9052)              += leds-da9052.o
-obj-$(CONFIG_LEDS_WM831X_STATUS)       += leds-wm831x-status.o
-obj-$(CONFIG_LEDS_WM8350)              += leds-wm8350.o
-obj-$(CONFIG_LEDS_PWM)                 += leds-pwm.o
-obj-$(CONFIG_LEDS_REGULATOR)           += leds-regulator.o
-obj-$(CONFIG_LEDS_INTEL_SS4200)                += leds-ss4200.o
 obj-$(CONFIG_LEDS_LT3593)              += leds-lt3593.o
-obj-$(CONFIG_LEDS_ADP5520)             += leds-adp5520.o
-obj-$(CONFIG_LEDS_MC13783)             += leds-mc13783.o
-obj-$(CONFIG_LEDS_NS2)                 += leds-ns2.o
-obj-$(CONFIG_LEDS_NETXBIG)             += leds-netxbig.o
-obj-$(CONFIG_LEDS_ASIC3)               += leds-asic3.o
 obj-$(CONFIG_LEDS_MAX77650)            += leds-max77650.o
 obj-$(CONFIG_LEDS_MAX77693)            += leds-max77693.o
 obj-$(CONFIG_LEDS_MAX8997)             += leds-max8997.o
-obj-$(CONFIG_LEDS_LM355x)              += leds-lm355x.o
-obj-$(CONFIG_LEDS_BLINKM)              += leds-blinkm.o
-obj-$(CONFIG_LEDS_SYSCON)              += leds-syscon.o
+obj-$(CONFIG_LEDS_MC13783)             += leds-mc13783.o
 obj-$(CONFIG_LEDS_MENF21BMC)           += leds-menf21bmc.o
-obj-$(CONFIG_LEDS_KTD2692)             += leds-ktd2692.o
-obj-$(CONFIG_LEDS_POWERNV)             += leds-powernv.o
-obj-$(CONFIG_LEDS_IS31FL319X)          += leds-is31fl319x.o
-obj-$(CONFIG_LEDS_IS31FL32XX)          += leds-is31fl32xx.o
-obj-$(CONFIG_LEDS_PM8058)              += leds-pm8058.o
+obj-$(CONFIG_LEDS_MIKROTIK_RB532)      += leds-rb532.o
 obj-$(CONFIG_LEDS_MLXCPLD)             += leds-mlxcpld.o
 obj-$(CONFIG_LEDS_MLXREG)              += leds-mlxreg.o
-obj-$(CONFIG_LEDS_NIC78BX)             += leds-nic78bx.o
-obj-$(CONFIG_LEDS_SPI_BYTE)            += leds-spi-byte.o
 obj-$(CONFIG_LEDS_MT6323)              += leds-mt6323.o
-obj-$(CONFIG_LEDS_LM3692X)             += leds-lm3692x.o
+obj-$(CONFIG_LEDS_NET48XX)             += leds-net48xx.o
+obj-$(CONFIG_LEDS_NETXBIG)             += leds-netxbig.o
+obj-$(CONFIG_LEDS_NIC78BX)             += leds-nic78bx.o
+obj-$(CONFIG_LEDS_NS2)                 += leds-ns2.o
+obj-$(CONFIG_LEDS_OT200)               += leds-ot200.o
+obj-$(CONFIG_LEDS_PCA9532)             += leds-pca9532.o
+obj-$(CONFIG_LEDS_PCA955X)             += leds-pca955x.o
+obj-$(CONFIG_LEDS_PCA963X)             += leds-pca963x.o
+obj-$(CONFIG_LEDS_PM8058)              += leds-pm8058.o
+obj-$(CONFIG_LEDS_POWERNV)             += leds-powernv.o
+obj-$(CONFIG_LEDS_PWM)                 += leds-pwm.o
+obj-$(CONFIG_LEDS_REGULATOR)           += leds-regulator.o
+obj-$(CONFIG_LEDS_S3C24XX)             += leds-s3c24xx.o
 obj-$(CONFIG_LEDS_SC27XX_BLTC)         += leds-sc27xx-bltc.o
-obj-$(CONFIG_LEDS_LM3601X)             += leds-lm3601x.o
+obj-$(CONFIG_LEDS_SUNFIRE)             += leds-sunfire.o
+obj-$(CONFIG_LEDS_SYSCON)              += leds-syscon.o
+obj-$(CONFIG_LEDS_TCA6507)             += leds-tca6507.o
 obj-$(CONFIG_LEDS_TI_LMU_COMMON)       += leds-ti-lmu-common.o
-obj-$(CONFIG_LEDS_LM3697)              += leds-lm3697.o
-obj-$(CONFIG_LEDS_LM36274)             += leds-lm36274.o
+obj-$(CONFIG_LEDS_TLC591XX)            += leds-tlc591xx.o
 obj-$(CONFIG_LEDS_TPS6105X)            += leds-tps6105x.o
+obj-$(CONFIG_LEDS_WM831X_STATUS)       += leds-wm831x-status.o
+obj-$(CONFIG_LEDS_WM8350)              += leds-wm8350.o
+obj-$(CONFIG_LEDS_WRAP)                        += leds-wrap.o
 
 # LED SPI Drivers
 obj-$(CONFIG_LEDS_CR0014114)           += leds-cr0014114.o
 obj-$(CONFIG_LEDS_DAC124S085)          += leds-dac124s085.o
 obj-$(CONFIG_LEDS_EL15203000)          += leds-el15203000.o
+obj-$(CONFIG_LEDS_SPI_BYTE)            += leds-spi-byte.o
 
 # LED Userspace Drivers
 obj-$(CONFIG_LEDS_USER)                        += uleds.o
index 1fc40e8..3363a65 100644 (file)
@@ -376,7 +376,7 @@ int led_classdev_register_ext(struct device *parent,
 
        if (ret)
                dev_warn(parent, "Led %s renamed to %s due to name collision",
-                               led_cdev->name, dev_name(led_cdev->dev));
+                               proposed_name, dev_name(led_cdev->dev));
 
        if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) {
                ret = led_add_brightness_hw_changed(led_cdev);
index bd61a82..8bbaef5 100644 (file)
@@ -660,7 +660,6 @@ static int bd2802_probe(struct i2c_client *client,
                        const struct i2c_device_id *id)
 {
        struct bd2802_led *led;
-       struct bd2802_led_platform_data *pdata;
        int ret, i;
 
        led = devm_kzalloc(&client->dev, sizeof(struct bd2802_led), GFP_KERNEL);
@@ -668,7 +667,6 @@ static int bd2802_probe(struct i2c_client *client,
                return -ENOMEM;
 
        led->client = client;
-       pdata = led->pdata = dev_get_platdata(&client->dev);
        i2c_set_clientdata(client, led);
 
        /*
diff --git a/drivers/leds/leds-ip30.c b/drivers/leds/leds-ip30.c
new file mode 100644 (file)
index 0000000..d4ec736
--- /dev/null
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * LED Driver for SGI Octane machines
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/leds.h>
+
+#define IP30_LED_SYSTEM        0
+#define IP30_LED_FAULT 1
+
+struct ip30_led {
+       struct led_classdev cdev;
+       u32 __iomem *reg;
+};
+
+static void ip30led_set(struct led_classdev *led_cdev,
+                       enum led_brightness value)
+{
+       struct ip30_led *led = container_of(led_cdev, struct ip30_led, cdev);
+
+       writel(value, led->reg);
+}
+
+static int ip30led_create(struct platform_device *pdev, int num)
+{
+       struct resource *res;
+       struct ip30_led *data;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, num);
+       if (!res)
+               return -EBUSY;
+
+       data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       data->reg = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(data->reg))
+               return PTR_ERR(data->reg);
+
+
+       switch (num) {
+       case IP30_LED_SYSTEM:
+               data->cdev.name = "white:power";
+               break;
+       case IP30_LED_FAULT:
+               data->cdev.name = "red:fault";
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       data->cdev.brightness = readl(data->reg);
+       data->cdev.max_brightness = 1;
+       data->cdev.brightness_set = ip30led_set;
+
+       return devm_led_classdev_register(&pdev->dev, &data->cdev);
+}
+
+static int ip30led_probe(struct platform_device *pdev)
+{
+       int ret;
+
+       ret = ip30led_create(pdev, IP30_LED_SYSTEM);
+       if (ret < 0)
+               return ret;
+
+       return ip30led_create(pdev, IP30_LED_FAULT);
+}
+
+static struct platform_driver ip30led_driver = {
+       .probe          = ip30led_probe,
+       .driver         = {
+               .name           = "ip30-leds",
+       },
+};
+
+module_platform_driver(ip30led_driver);
+
+MODULE_AUTHOR("Thomas Bogendoerfer <tbogendoerfer@suse.de>");
+MODULE_DESCRIPTION("SGI Octane LED driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:ip30-leds");
index 6f29b89..cd768f9 100644 (file)
@@ -44,7 +44,7 @@ struct is31fl32xx_priv {
        const struct is31fl32xx_chipdef *cdef;
        struct i2c_client *client;
        unsigned int num_leds;
-       struct is31fl32xx_led_data leds[0];
+       struct is31fl32xx_led_data leds[];
 };
 
 /**
index 188a57d..aa9bf8c 100644 (file)
@@ -140,7 +140,7 @@ struct lm3532_led {
        int ctrl_brt_pointer;
        int num_leds;
        int full_scale_current;
-       int enabled:1;
+       unsigned int enabled:1;
        u32 led_strings[LM3532_MAX_CONTROL_BANKS];
        char label[LED_MAX_NAME_SIZE];
 };
index b71711a..872d26f 100644 (file)
@@ -246,7 +246,7 @@ static int lm3697_probe_dt(struct lm3697 *priv)
 
                led->num_leds = fwnode_property_count_u32(child, "led-sources");
                if (led->num_leds > LM3697_MAX_LED_STRINGS) {
-                       dev_err(&priv->client->dev, "To many LED strings defined\n");
+                       dev_err(&priv->client->dev, "Too many LED strings defined\n");
                        continue;
                }
 
index 7c500df..538ca57 100644 (file)
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/leds.h>
 #include <linux/module.h>
-#include <linux/platform_data/leds-kirkwood-ns2.h>
 #include <linux/of.h>
-#include <linux/of_gpio.h>
 #include "leds.h"
 
+enum ns2_led_modes {
+       NS_V2_LED_OFF,
+       NS_V2_LED_ON,
+       NS_V2_LED_SATA,
+};
+
+struct ns2_led_modval {
+       enum ns2_led_modes      mode;
+       int                     cmd_level;
+       int                     slow_level;
+};
+
+struct ns2_led {
+       const char      *name;
+       const char      *default_trigger;
+       struct gpio_desc *cmd;
+       struct gpio_desc *slow;
+       int             num_modes;
+       struct ns2_led_modval *modval;
+};
+
+struct ns2_led_platform_data {
+       int             num_leds;
+       struct ns2_led  *leds;
+};
+
 /*
  * The Network Space v2 dual-GPIO LED is wired to a CPLD. Three different LED
  * modes are available: off, on and SATA activity blinking. The LED modes are
@@ -29,8 +53,8 @@
 
 struct ns2_led_data {
        struct led_classdev     cdev;
-       unsigned int            cmd;
-       unsigned int            slow;
+       struct gpio_desc        *cmd;
+       struct gpio_desc        *slow;
        bool                    can_sleep;
        unsigned char           sata; /* True when SATA mode active. */
        rwlock_t                rw_lock; /* Lock GPIOs. */
@@ -46,8 +70,8 @@ static int ns2_led_get_mode(struct ns2_led_data *led_dat,
        int cmd_level;
        int slow_level;
 
-       cmd_level = gpio_get_value_cansleep(led_dat->cmd);
-       slow_level = gpio_get_value_cansleep(led_dat->slow);
+       cmd_level = gpiod_get_value_cansleep(led_dat->cmd);
+       slow_level = gpiod_get_value_cansleep(led_dat->slow);
 
        for (i = 0; i < led_dat->num_modes; i++) {
                if (cmd_level == led_dat->modval[i].cmd_level &&
@@ -80,15 +104,15 @@ static void ns2_led_set_mode(struct ns2_led_data *led_dat,
        write_lock_irqsave(&led_dat->rw_lock, flags);
 
        if (!led_dat->can_sleep) {
-               gpio_set_value(led_dat->cmd,
-                              led_dat->modval[i].cmd_level);
-               gpio_set_value(led_dat->slow,
-                              led_dat->modval[i].slow_level);
+               gpiod_set_value(led_dat->cmd,
+                               led_dat->modval[i].cmd_level);
+               gpiod_set_value(led_dat->slow,
+                               led_dat->modval[i].slow_level);
                goto exit_unlock;
        }
 
-       gpio_set_value_cansleep(led_dat->cmd, led_dat->modval[i].cmd_level);
-       gpio_set_value_cansleep(led_dat->slow, led_dat->modval[i].slow_level);
+       gpiod_set_value_cansleep(led_dat->cmd, led_dat->modval[i].cmd_level);
+       gpiod_set_value_cansleep(led_dat->slow, led_dat->modval[i].slow_level);
 
 exit_unlock:
        write_unlock_irqrestore(&led_dat->rw_lock, flags);
@@ -176,26 +200,6 @@ create_ns2_led(struct platform_device *pdev, struct ns2_led_data *led_dat,
        int ret;
        enum ns2_led_modes mode;
 
-       ret = devm_gpio_request_one(&pdev->dev, template->cmd,
-                       gpio_get_value_cansleep(template->cmd) ?
-                       GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW,
-                       template->name);
-       if (ret) {
-               dev_err(&pdev->dev, "%s: failed to setup command GPIO\n",
-                       template->name);
-               return ret;
-       }
-
-       ret = devm_gpio_request_one(&pdev->dev, template->slow,
-                       gpio_get_value_cansleep(template->slow) ?
-                       GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW,
-                       template->name);
-       if (ret) {
-               dev_err(&pdev->dev, "%s: failed to setup slow GPIO\n",
-                       template->name);
-               return ret;
-       }
-
        rwlock_init(&led_dat->rw_lock);
 
        led_dat->cdev.name = template->name;
@@ -205,8 +209,8 @@ create_ns2_led(struct platform_device *pdev, struct ns2_led_data *led_dat,
        led_dat->cdev.groups = ns2_led_groups;
        led_dat->cmd = template->cmd;
        led_dat->slow = template->slow;
-       led_dat->can_sleep = gpio_cansleep(led_dat->cmd) |
-                               gpio_cansleep(led_dat->slow);
+       led_dat->can_sleep = gpiod_cansleep(led_dat->cmd) |
+                               gpiod_cansleep(led_dat->slow);
        if (led_dat->can_sleep)
                led_dat->cdev.brightness_set_blocking = ns2_led_set_blocking;
        else
@@ -261,17 +265,26 @@ ns2_leds_get_of_pdata(struct device *dev, struct ns2_led_platform_data *pdata)
                const char *string;
                int i, num_modes;
                struct ns2_led_modval *modval;
+               struct gpio_desc *gd;
 
-               ret = of_get_named_gpio(child, "cmd-gpio", 0);
-               if (ret < 0)
-                       goto err_node_put;
-               led->cmd = ret;
-               ret = of_get_named_gpio(child, "slow-gpio", 0);
-               if (ret < 0)
-                       goto err_node_put;
-               led->slow = ret;
                ret = of_property_read_string(child, "label", &string);
                led->name = (ret == 0) ? string : child->name;
+
+               gd = gpiod_get_from_of_node(child, "cmd-gpio", 0,
+                                           GPIOD_ASIS, led->name);
+               if (IS_ERR(gd)) {
+                       ret = PTR_ERR(gd);
+                       goto err_node_put;
+               }
+               led->cmd = gd;
+               gd = gpiod_get_from_of_node(child, "slow-gpio", 0,
+                                           GPIOD_ASIS, led->name);
+               if (IS_ERR(gd)) {
+                       ret = PTR_ERR(gd);
+                       goto err_node_put;
+               }
+               led->slow = gd;
+
                ret = of_property_read_string(child, "linux,default-trigger",
                                              &string);
                if (ret == 0)
index 8b6965a..6c8a724 100644 (file)
 #include <linux/leds.h>
 #include <linux/err.h>
 #include <linux/pwm.h>
-#include <linux/leds_pwm.h>
 #include <linux/slab.h>
 
+struct led_pwm {
+       const char      *name;
+       const char      *default_trigger;
+       u8              active_low;
+       unsigned int    max_brightness;
+};
+
+struct led_pwm_platform_data {
+       int             num_leds;
+       struct led_pwm  *leds;
+};
+
 struct led_pwm_data {
        struct led_classdev     cdev;
        struct pwm_device       *pwm;
+       struct pwm_state        pwmstate;
        unsigned int            active_low;
-       unsigned int            period;
-       int                     duty;
 };
 
 struct led_pwm_priv {
        int num_leds;
-       struct led_pwm_data leds[0];
+       struct led_pwm_data leds[];
 };
 
-static void __led_pwm_set(struct led_pwm_data *led_dat)
-{
-       int new_duty = led_dat->duty;
-
-       pwm_config(led_dat->pwm, new_duty, led_dat->period);
-
-       if (new_duty == 0)
-               pwm_disable(led_dat->pwm);
-       else
-               pwm_enable(led_dat->pwm);
-}
-
 static int led_pwm_set(struct led_classdev *led_cdev,
                       enum led_brightness brightness)
 {
        struct led_pwm_data *led_dat =
                container_of(led_cdev, struct led_pwm_data, cdev);
        unsigned int max = led_dat->cdev.max_brightness;
-       unsigned long long duty =  led_dat->period;
+       unsigned long long duty = led_dat->pwmstate.period;
 
        duty *= brightness;
        do_div(duty, max);
 
        if (led_dat->active_low)
-               duty = led_dat->period - duty;
-
-       led_dat->duty = duty;
-
-       __led_pwm_set(led_dat);
+               duty = led_dat->pwmstate.period - duty;
 
-       return 0;
+       led_dat->pwmstate.duty_cycle = duty;
+       led_dat->pwmstate.enabled = duty > 0;
+       return pwm_apply_state(led_dat->pwm, &led_dat->pwmstate);
 }
 
 static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
                       struct led_pwm *led, struct fwnode_handle *fwnode)
 {
        struct led_pwm_data *led_data = &priv->leds[priv->num_leds];
-       struct pwm_args pargs;
        int ret;
 
        led_data->active_low = led->active_low;
@@ -93,17 +88,7 @@ static int led_pwm_add(struct device *dev, struct led_pwm_priv *priv,
 
        led_data->cdev.brightness_set_blocking = led_pwm_set;
 
-       /*
-        * FIXME: pwm_apply_args() should be removed when switching to the
-        * atomic PWM API.
-        */
-       pwm_apply_args(led_data->pwm);
-
-       pwm_get_args(led_data->pwm, &pargs);
-
-       led_data->period = pargs.period;
-       if (!led_data->period && (led->pwm_period_ns > 0))
-               led_data->period = led->pwm_period_ns;
+       pwm_init_state(led_data->pwm, &led_data->pwmstate);
 
        ret = devm_led_classdev_register(dev, &led_data->cdev);
        if (ret == 0) {
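
The leds-pwm rewrite above drops the legacy pwm_config()/pwm_enable() pair in favour of the atomic PWM API: pwm_init_state() seeds a pwm_state from the device's reference arguments, the caller edits it, and pwm_apply_state() commits it in one call. A minimal sketch; the helper name and the 50% duty value are only examples:

/* Atomic PWM usage sketch; "set_half_brightness" is a hypothetical helper. */
#include <linux/pwm.h>

static int set_half_brightness(struct pwm_device *pwm)
{
	struct pwm_state state;

	pwm_init_state(pwm, &state);		/* period/polarity from args */
	state.duty_cycle = state.period / 2;	/* example: 50% duty */
	state.enabled = state.duty_cycle > 0;

	return pwm_apply_state(pwm, &state);
}
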
index 8d07fdf..e1db434 100644 (file)
@@ -201,10 +201,27 @@ static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
        return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+                                     size_t nr_pages)
+{
+       int ret;
+       struct linear_c *lc = ti->private;
+       struct block_device *bdev = lc->dev->bdev;
+       struct dax_device *dax_dev = lc->dev->dax_dev;
+       sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+       dev_sector = linear_map_sector(ti, sector);
+       ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
+       if (ret)
+               return ret;
+       return dax_zero_page_range(dax_dev, pgoff, nr_pages);
+}
+
 #else
 #define linear_dax_direct_access NULL
 #define linear_dax_copy_from_iter NULL
 #define linear_dax_copy_to_iter NULL
+#define linear_dax_zero_page_range NULL
 #endif
 
 static struct target_type linear_target = {
@@ -226,6 +243,7 @@ static struct target_type linear_target = {
        .direct_access = linear_dax_direct_access,
        .dax_copy_from_iter = linear_dax_copy_from_iter,
        .dax_copy_to_iter = linear_dax_copy_to_iter,
+       .dax_zero_page_range = linear_dax_zero_page_range,
 };
 
 int __init dm_linear_init(void)
index 99721c7..8ea20b5 100644 (file)
@@ -994,10 +994,26 @@ static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
        return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
 
+static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+                                         size_t nr_pages)
+{
+       int ret;
+       struct log_writes_c *lc = ti->private;
+       sector_t sector = pgoff * PAGE_SECTORS;
+
+       ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT,
+                            &pgoff);
+       if (ret)
+               return ret;
+       return dax_zero_page_range(lc->dev->dax_dev, pgoff,
+                                  nr_pages);

+}
+
 #else
 #define log_writes_dax_direct_access NULL
 #define log_writes_dax_copy_from_iter NULL
 #define log_writes_dax_copy_to_iter NULL
+#define log_writes_dax_zero_page_range NULL
 #endif
 
 static struct target_type log_writes_target = {
@@ -1016,6 +1032,7 @@ static struct target_type log_writes_target = {
        .direct_access = log_writes_dax_direct_access,
        .dax_copy_from_iter = log_writes_dax_copy_from_iter,
        .dax_copy_to_iter = log_writes_dax_copy_to_iter,
+       .dax_zero_page_range = log_writes_dax_zero_page_range,
 };
 
 static int __init dm_log_writes_init(void)
index 63bbcc2..fa813c0 100644 (file)
@@ -360,10 +360,32 @@ static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
        return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
+                                     size_t nr_pages)
+{
+       int ret;
+       sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+       struct stripe_c *sc = ti->private;
+       struct dax_device *dax_dev;
+       struct block_device *bdev;
+       uint32_t stripe;
+
+       stripe_map_sector(sc, sector, &stripe, &dev_sector);
+       dev_sector += sc->stripe[stripe].physical_start;
+       dax_dev = sc->stripe[stripe].dev->dax_dev;
+       bdev = sc->stripe[stripe].dev->bdev;
+
+       ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
+       if (ret)
+               return ret;
+       return dax_zero_page_range(dax_dev, pgoff, nr_pages);
+}
+
 #else
 #define stripe_dax_direct_access NULL
 #define stripe_dax_copy_from_iter NULL
 #define stripe_dax_copy_to_iter NULL
+#define stripe_dax_zero_page_range NULL
 #endif
 
 /*
@@ -486,6 +508,7 @@ static struct target_type stripe_target = {
        .direct_access = stripe_dax_direct_access,
        .dax_copy_from_iter = stripe_dax_copy_from_iter,
        .dax_copy_to_iter = stripe_dax_copy_to_iter,
+       .dax_zero_page_range = stripe_dax_zero_page_range,
 };
 
 int __init dm_stripe_init(void)
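
Each of the dm targets above implements the new ->dax_zero_page_range() hook the same way: map the target-relative page offset to a sector on the backing device, translate that back into a page offset on that device with bdev_dax_pgoff(), and forward to dax_zero_page_range(), whose count argument is in pages. A condensed sketch for a hypothetical single-device target; the ti->private layout is an assumption:

/* Hypothetical one-device target; only the two dax helpers are real API. */
#include <linux/dax.h>
#include <linux/device-mapper.h>

struct my_target {
	struct dm_dev *dev;
};

static int my_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
				  size_t nr_pages)
{
	struct my_target *mt = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int ret;

	ret = bdev_dax_pgoff(mt->dev->bdev, sector, nr_pages << PAGE_SHIFT,
			     &pgoff);
	if (ret)
		return ret;

	/* note: the count argument is a number of pages, not bytes */
	return dax_zero_page_range(mt->dev->dax_dev, pgoff, nr_pages);
}
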
index 21c0207..db9e461 100644 (file)
@@ -1199,6 +1199,35 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
        return ret;
 }
 
+static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                                 size_t nr_pages)
+{
+       struct mapped_device *md = dax_get_private(dax_dev);
+       sector_t sector = pgoff * PAGE_SECTORS;
+       struct dm_target *ti;
+       int ret = -EIO;
+       int srcu_idx;
+
+       ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+       if (!ti)
+               goto out;
+       if (WARN_ON(!ti->type->dax_zero_page_range)) {
+               /*
+                * ->zero_page_range() is mandatory dax operation. If we are
+                * ->zero_page_range() is a mandatory dax operation. If we
+                * are here, something is wrong.
+               dm_put_live_table(md, srcu_idx);
+               goto out;
+       }
+       ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
+
+ out:
+       dm_put_live_table(md, srcu_idx);
+
+       return ret;
+}
+
 /*
  * A target may call dm_accept_partial_bio only from the map routine.  It is
  * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
@@ -1969,7 +1998,7 @@ static struct mapped_device *alloc_dev(int minor)
        if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
                md->dax_dev = alloc_dax(md, md->disk->disk_name,
                                        &dm_dax_ops, 0);
-               if (!md->dax_dev)
+               if (IS_ERR(md->dax_dev))
                        goto bad;
        }
 
@@ -3200,6 +3229,7 @@ static const struct dax_operations dm_dax_ops = {
        .dax_supported = dm_dax_supported,
        .copy_from_iter = dm_dax_copy_from_iter,
        .copy_to_iter = dm_dax_copy_to_iter,
+       .zero_page_range = dm_dax_zero_page_range,
 };
 
 /*
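
One behavioural detail in the dm.c hunk above: alloc_dax() now reports failure through ERR_PTR() rather than returning NULL, so alloc_dev() switches from a NULL check to IS_ERR(). A caller-side sketch; everything except alloc_dax(), IS_ERR() and PTR_ERR() is a placeholder:

/* Placeholder caller; shows only the error-handling convention. */
#include <linux/dax.h>
#include <linux/err.h>

static int my_setup_dax(void *drv_priv, const char *host,
			const struct dax_operations *ops)
{
	struct dax_device *dax_dev = alloc_dax(drv_priv, host, ops, 0);

	if (IS_ERR(dax_dev))
		return PTR_ERR(dax_dev);	/* propagate the real errno */

	/* ... store dax_dev for later use ... */
	return 0;
}
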
index 2b20329..0a59249 100644 (file)
@@ -642,6 +642,19 @@ config MFD_IPAQ_MICRO
          AT90LS8535 microcontroller flashed with a special iPAQ
          firmware using the custom protocol implemented in this driver.
 
+config MFD_IQS62X
+       tristate "Azoteq IQS620A/621/622/624/625 core support"
+       depends on I2C
+       select MFD_CORE
+       select REGMAP_I2C
+       help
+         Say Y here if you want to build core support for the Azoteq IQS620A,
+         IQS621, IQS622, IQS624 and IQS625 multi-function sensors. Additional
+         options must be selected to enable device-specific functions.
+
+         To compile this driver as a module, choose M here: the module will
+         be called iqs62x.
+
 config MFD_JANZ_CMODIO
        tristate "Janz CMOD-IO PCI MODULbus Carrier Board"
        select MFD_CORE
@@ -893,6 +906,7 @@ config MFD_CPCAP
        tristate "Support for Motorola CPCAP"
        depends on SPI
        depends on OF || COMPILE_TEST
+       select MFD_CORE
        select REGMAP_SPI
        select REGMAP_IRQ
        help
@@ -1058,6 +1072,7 @@ config MFD_RN5T618
        depends on OF
        select MFD_CORE
        select REGMAP_I2C
+       select REGMAP_IRQ
        help
          Say yes here to add support for the Ricoh RN5T567,
          RN5T618, RC5T619 PMIC.
@@ -1201,7 +1216,7 @@ config AB8500_CORE
          chip. This connects to U8500 either on the SSP/SPI bus (deprecated
          since hardware version v1.0) or the I2C bus via PRCMU. It also adds
          the irq_chip parts for handling the Mixed Signal chip events.
-         This chip embeds various other multimedia funtionalities as well.
+         This chip embeds various other multimedia functionalities as well.
 
 config AB8500_DEBUG
        bool "Enable debug info via debugfs"
@@ -1851,7 +1866,7 @@ config MFD_WM8994
          has on board GPIO and regulator functionality which is
          supported via the relevant subsystems.  This driver provides
          core support for the WM8994, in order to use the actual
-         functionaltiy of the device other drivers must be enabled.
+         functionality of the device other drivers must be enabled.
 
 config MFD_WM97xx
        tristate "Wolfson Microelectronics WM97xx"
@@ -1864,7 +1879,7 @@ config MFD_WM97xx
          designed for smartphone applications.  As well as audio functionality
          it has on board GPIO and a touchscreen functionality which is
          supported via the relevant subsystems.  This driver provides core
-         support for the WM97xx, in order to use the actual functionaltiy of
+         support for the WM97xx, in order to use the actual functionality of
          the device other drivers must be enabled.
 
 config MFD_STW481X
@@ -1957,7 +1972,7 @@ config MFD_STPMIC1
          Support for ST Microelectronics STPMIC1 PMIC. STPMIC1 has power on
          key, watchdog and regulator functionalities which are supported via
          the relevant subsystems. This driver provides core support for the
-         STPMIC1. In order to use the actual functionaltiy of the device other
+         STPMIC1. In order to use the actual functionality of the device other
          drivers must be enabled.
 
          To compile this driver as a module, choose M here: the
index b83f172..f935d10 100644 (file)
@@ -226,6 +226,7 @@ obj-$(CONFIG_MFD_AS3711)    += as3711.o
 obj-$(CONFIG_MFD_AS3722)       += as3722.o
 obj-$(CONFIG_MFD_STW481X)      += stw481x.o
 obj-$(CONFIG_MFD_IPAQ_MICRO)   += ipaq-micro.o
+obj-$(CONFIG_MFD_IQS62X)       += iqs62x.o
 obj-$(CONFIG_MFD_MENF21BMC)    += menf21bmc.o
 obj-$(CONFIG_MFD_HI6421_PMIC)  += hi6421-pmic-core.o
 obj-$(CONFIG_MFD_HI655X_PMIC)   += hi655x-pmic.o
index 78ee4b2..a17cf75 100644 (file)
@@ -221,7 +221,7 @@ static ssize_t aat2870_dump_reg(struct aat2870_data *aat2870, char *buf)
 
        count += sprintf(buf, "aat2870 registers\n");
        for (addr = 0; addr < AAT2870_REG_NUM; addr++) {
-               count += sprintf(buf + count, "0x%02x: ", addr);
+               count += snprintf(buf + count, PAGE_SIZE - count, "0x%02x: ", addr);
                if (count >= PAGE_SIZE - 1)
                        break;
 
index 39e6116..32c2b91 100644 (file)
@@ -211,7 +211,7 @@ static int ec_device_probe(struct platform_device *pdev)
         * explicitly added on platforms that don't have the PD notifier ACPI
         * device entry defined.
         */
-       if (IS_ENABLED(CONFIG_OF)) {
+       if (IS_ENABLED(CONFIG_OF) && ec->ec_dev->dev->of_node) {
                if (cros_ec_check_features(ec, EC_FEATURE_USB_PD)) {
                        retval = mfd_add_hotplug_devices(ec->dev,
                                        cros_usbpd_notify_cells,
index 419c735..fc30726 100644 (file)
@@ -21,6 +21,9 @@
 #define        DA9062_REG_EVENT_B_OFFSET       1
 #define        DA9062_REG_EVENT_C_OFFSET       2
 
+#define        DA9062_IRQ_LOW  0
+#define        DA9062_IRQ_HIGH 1
+
 static struct regmap_irq da9061_irqs[] = {
        /* EVENT A */
        [DA9061_IRQ_ONKEY] = {
@@ -369,6 +372,33 @@ static int da9062_get_device_type(struct da9062 *chip)
        return ret;
 }
 
+static u32 da9062_configure_irq_type(struct da9062 *chip, int irq, u32 *trigger)
+{
+       u32 irq_type = 0;
+       struct irq_data *irq_data = irq_get_irq_data(irq);
+
+       if (!irq_data) {
+               dev_err(chip->dev, "Invalid IRQ: %d\n", irq);
+               return -EINVAL;
+       }
+       *trigger = irqd_get_trigger_type(irq_data);
+
+       switch (*trigger) {
+       case IRQ_TYPE_LEVEL_HIGH:
+               irq_type = DA9062_IRQ_HIGH;
+               break;
+       case IRQ_TYPE_LEVEL_LOW:
+               irq_type = DA9062_IRQ_LOW;
+               break;
+       default:
+               dev_warn(chip->dev, "Unsupported IRQ type: %d\n", *trigger);
+               return -EINVAL;
+       }
+       return regmap_update_bits(chip->regmap, DA9062AA_CONFIG_A,
+                       DA9062AA_IRQ_TYPE_MASK,
+                       irq_type << DA9062AA_IRQ_TYPE_SHIFT);
+}
+
 static const struct regmap_range da9061_aa_readable_ranges[] = {
        regmap_reg_range(DA9062AA_PAGE_CON, DA9062AA_STATUS_B),
        regmap_reg_range(DA9062AA_STATUS_D, DA9062AA_EVENT_C),
@@ -388,6 +418,7 @@ static const struct regmap_range da9061_aa_readable_ranges[] = {
        regmap_reg_range(DA9062AA_VBUCK1_A, DA9062AA_VBUCK4_A),
        regmap_reg_range(DA9062AA_VBUCK3_A, DA9062AA_VBUCK3_A),
        regmap_reg_range(DA9062AA_VLDO1_A, DA9062AA_VLDO4_A),
+       regmap_reg_range(DA9062AA_CONFIG_A, DA9062AA_CONFIG_A),
        regmap_reg_range(DA9062AA_VBUCK1_B, DA9062AA_VBUCK4_B),
        regmap_reg_range(DA9062AA_VBUCK3_B, DA9062AA_VBUCK3_B),
        regmap_reg_range(DA9062AA_VLDO1_B, DA9062AA_VLDO4_B),
@@ -417,6 +448,7 @@ static const struct regmap_range da9061_aa_writeable_ranges[] = {
        regmap_reg_range(DA9062AA_VBUCK1_A, DA9062AA_VBUCK4_A),
        regmap_reg_range(DA9062AA_VBUCK3_A, DA9062AA_VBUCK3_A),
        regmap_reg_range(DA9062AA_VLDO1_A, DA9062AA_VLDO4_A),
+       regmap_reg_range(DA9062AA_CONFIG_A, DA9062AA_CONFIG_A),
        regmap_reg_range(DA9062AA_VBUCK1_B, DA9062AA_VBUCK4_B),
        regmap_reg_range(DA9062AA_VBUCK3_B, DA9062AA_VBUCK3_B),
        regmap_reg_range(DA9062AA_VLDO1_B, DA9062AA_VLDO4_B),
@@ -596,6 +628,7 @@ static int da9062_i2c_probe(struct i2c_client *i2c,
        const struct regmap_irq_chip *irq_chip;
        const struct regmap_config *config;
        int cell_num;
+       u32 trigger_type = 0;
        int ret;
 
        chip = devm_kzalloc(&i2c->dev, sizeof(*chip), GFP_KERNEL);
@@ -654,10 +687,15 @@ static int da9062_i2c_probe(struct i2c_client *i2c,
        if (ret)
                return ret;
 
+       ret = da9062_configure_irq_type(chip, i2c->irq, &trigger_type);
+       if (ret < 0) {
+               dev_err(chip->dev, "Failed to configure IRQ type\n");
+               return ret;
+       }
+
        ret = regmap_add_irq_chip(chip->regmap, i2c->irq,
-                       IRQF_TRIGGER_LOW | IRQF_ONESHOT | IRQF_SHARED,
-                       -1, irq_chip,
-                       &chip->regmap_irq);
+                       trigger_type | IRQF_SHARED | IRQF_ONESHOT,
+                       -1, irq_chip, &chip->regmap_irq);
        if (ret) {
                dev_err(chip->dev, "Failed to request IRQ %d: %d\n",
                        i2c->irq, ret);
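
The da9062 change above programs the PMIC's IRQ polarity from whatever trigger type firmware or device tree requested, instead of hard-coding IRQF_TRIGGER_LOW. The lookup side uses generic irq core helpers; a minimal sketch in which only irq_get_irq_data() and irqd_get_trigger_type() are real API and the wrapper is hypothetical:

/* Hypothetical wrapper around the generic trigger-type lookup. */
#include <linux/irq.h>
#include <linux/errno.h>

static int read_trigger_type(int irq, u32 *trigger)
{
	struct irq_data *d = irq_get_irq_data(irq);

	if (!d)
		return -EINVAL;

	*trigger = irqd_get_trigger_type(d);	/* e.g. IRQ_TYPE_LEVEL_LOW */
	return 0;
}
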
index 7841c11..39276fa 100644 (file)
@@ -90,6 +90,11 @@ struct dln2_mod_rx_slots {
        spinlock_t lock;
 };
 
+enum dln2_endpoint {
+       DLN2_EP_OUT     = 0,
+       DLN2_EP_IN      = 1,
+};
+
 struct dln2_dev {
        struct usb_device *usb_dev;
        struct usb_interface *interface;
@@ -640,35 +645,56 @@ static int dln2_start_rx_urbs(struct dln2_dev *dln2, gfp_t gfp)
        return 0;
 }
 
+enum {
+       DLN2_ACPI_MATCH_GPIO    = 0,
+       DLN2_ACPI_MATCH_I2C     = 1,
+       DLN2_ACPI_MATCH_SPI     = 2,
+};
+
 static struct dln2_platform_data dln2_pdata_gpio = {
        .handle = DLN2_HANDLE_GPIO,
 };
 
+static struct mfd_cell_acpi_match dln2_acpi_match_gpio = {
+       .adr = DLN2_ACPI_MATCH_GPIO,
+};
+
 /* Only one I2C port seems to be supported on current hardware */
 static struct dln2_platform_data dln2_pdata_i2c = {
        .handle = DLN2_HANDLE_I2C,
        .port = 0,
 };
 
+static struct mfd_cell_acpi_match dln2_acpi_match_i2c = {
+       .adr = DLN2_ACPI_MATCH_I2C,
+};
+
 /* Only one SPI port supported */
 static struct dln2_platform_data dln2_pdata_spi = {
        .handle = DLN2_HANDLE_SPI,
        .port = 0,
 };
 
+static struct mfd_cell_acpi_match dln2_acpi_match_spi = {
+       .adr = DLN2_ACPI_MATCH_SPI,
+};
+
 static const struct mfd_cell dln2_devs[] = {
        {
                .name = "dln2-gpio",
+               .acpi_match = &dln2_acpi_match_gpio,
                .platform_data = &dln2_pdata_gpio,
                .pdata_size = sizeof(struct dln2_platform_data),
        },
        {
                .name = "dln2-i2c",
+               .acpi_match = &dln2_acpi_match_i2c,
                .platform_data = &dln2_pdata_i2c,
                .pdata_size = sizeof(struct dln2_platform_data),
        },
        {
                .name = "dln2-spi",
+               .acpi_match = &dln2_acpi_match_spi,
                .platform_data = &dln2_pdata_spi,
                .pdata_size = sizeof(struct dln2_platform_data),
        },
@@ -733,10 +759,10 @@ static int dln2_probe(struct usb_interface *interface,
            hostif->desc.bNumEndpoints < 2)
                return -ENODEV;
 
-       epin = &hostif->endpoint[0].desc;
-       epout = &hostif->endpoint[1].desc;
+       epout = &hostif->endpoint[DLN2_EP_OUT].desc;
        if (!usb_endpoint_is_bulk_out(epout))
                return -ENODEV;
+       epin = &hostif->endpoint[DLN2_EP_IN].desc;
        if (!usb_endpoint_is_bulk_in(epin))
                return -ENODEV;
 
index c40a6c7..7fc0c5d 100644 (file)
@@ -139,6 +139,11 @@ static const struct intel_lpss_platform_info cnl_i2c_info = {
        .properties = spt_i2c_properties,
 };
 
+static const struct intel_lpss_platform_info ehl_i2c_info = {
+       .clk_rate = 100000000,
+       .properties = bxt_i2c_properties,
+};
+
 static const struct pci_device_id intel_lpss_pci_ids[] = {
        /* CML-LP */
        { PCI_VDEVICE(INTEL, 0x02a8), (kernel_ulong_t)&spt_uart_info },
@@ -231,15 +236,15 @@ static const struct pci_device_id intel_lpss_pci_ids[] = {
        { PCI_VDEVICE(INTEL, 0x4b2a), (kernel_ulong_t)&bxt_info },
        { PCI_VDEVICE(INTEL, 0x4b2b), (kernel_ulong_t)&bxt_info },
        { PCI_VDEVICE(INTEL, 0x4b37), (kernel_ulong_t)&bxt_info },
-       { PCI_VDEVICE(INTEL, 0x4b44), (kernel_ulong_t)&bxt_i2c_info },
-       { PCI_VDEVICE(INTEL, 0x4b45), (kernel_ulong_t)&bxt_i2c_info },
-       { PCI_VDEVICE(INTEL, 0x4b4b), (kernel_ulong_t)&bxt_i2c_info },
-       { PCI_VDEVICE(INTEL, 0x4b4c), (kernel_ulong_t)&bxt_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b44), (kernel_ulong_t)&ehl_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b45), (kernel_ulong_t)&ehl_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b4b), (kernel_ulong_t)&ehl_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b4c), (kernel_ulong_t)&ehl_i2c_info },
        { PCI_VDEVICE(INTEL, 0x4b4d), (kernel_ulong_t)&bxt_uart_info },
-       { PCI_VDEVICE(INTEL, 0x4b78), (kernel_ulong_t)&bxt_i2c_info },
-       { PCI_VDEVICE(INTEL, 0x4b79), (kernel_ulong_t)&bxt_i2c_info },
-       { PCI_VDEVICE(INTEL, 0x4b7a), (kernel_ulong_t)&bxt_i2c_info },
-       { PCI_VDEVICE(INTEL, 0x4b7b), (kernel_ulong_t)&bxt_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b78), (kernel_ulong_t)&ehl_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b79), (kernel_ulong_t)&ehl_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b7a), (kernel_ulong_t)&ehl_i2c_info },
+       { PCI_VDEVICE(INTEL, 0x4b7b), (kernel_ulong_t)&ehl_i2c_info },
        /* JSL */
        { PCI_VDEVICE(INTEL, 0x4da8), (kernel_ulong_t)&spt_uart_info },
        { PCI_VDEVICE(INTEL, 0x4da9), (kernel_ulong_t)&spt_uart_info },
@@ -347,6 +352,16 @@ static const struct pci_device_id intel_lpss_pci_ids[] = {
        { PCI_VDEVICE(INTEL, 0xa36a), (kernel_ulong_t)&cnl_i2c_info },
        { PCI_VDEVICE(INTEL, 0xa36b), (kernel_ulong_t)&cnl_i2c_info },
        { PCI_VDEVICE(INTEL, 0xa37b), (kernel_ulong_t)&spt_info },
+       /* CML-V */
+       { PCI_VDEVICE(INTEL, 0xa3a7), (kernel_ulong_t)&spt_uart_info },
+       { PCI_VDEVICE(INTEL, 0xa3a8), (kernel_ulong_t)&spt_uart_info },
+       { PCI_VDEVICE(INTEL, 0xa3a9), (kernel_ulong_t)&spt_info },
+       { PCI_VDEVICE(INTEL, 0xa3aa), (kernel_ulong_t)&spt_info },
+       { PCI_VDEVICE(INTEL, 0xa3e0), (kernel_ulong_t)&spt_i2c_info },
+       { PCI_VDEVICE(INTEL, 0xa3e1), (kernel_ulong_t)&spt_i2c_info },
+       { PCI_VDEVICE(INTEL, 0xa3e2), (kernel_ulong_t)&spt_i2c_info },
+       { PCI_VDEVICE(INTEL, 0xa3e3), (kernel_ulong_t)&spt_i2c_info },
+       { PCI_VDEVICE(INTEL, 0xa3e6), (kernel_ulong_t)&spt_uart_info },
        { }
 };
 MODULE_DEVICE_TABLE(pci, intel_lpss_pci_ids);
diff --git a/drivers/mfd/iqs62x.c b/drivers/mfd/iqs62x.c
new file mode 100644 (file)
index 0000000..af764bc
--- /dev/null
@@ -0,0 +1,1063 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Azoteq IQS620A/621/622/624/625 Multi-Function Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ *
+ * These devices rely on application-specific register settings and calibration
+ * data developed in and exported from a suite of GUIs offered by the vendor. A
+ * separate tool converts the GUIs' ASCII-based output into a standard firmware
+ * file parsed by the driver.
+ *
+ * Link to datasheets and GUIs: https://www.azoteq.com/
+ *
+ * Link to conversion tool: https://github.com/jlabundy/iqs62x-h2bin.git
+ */
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/firmware.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/iqs62x.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/of_device.h>
+#include <linux/property.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+#define IQS62X_PROD_NUM                                0x00
+
+#define IQS62X_SYS_FLAGS                       0x10
+#define IQS62X_SYS_FLAGS_IN_ATI                        BIT(2)
+
+#define IQS620_HALL_FLAGS                      0x16
+#define IQS621_HALL_FLAGS                      0x19
+#define IQS622_HALL_FLAGS                      IQS621_HALL_FLAGS
+
+#define IQS624_INTERVAL_NUM                    0x18
+#define IQS625_INTERVAL_NUM                    0x12
+
+#define IQS622_PROX_SETTINGS_4                 0x48
+#define IQS620_PROX_SETTINGS_4                 0x50
+#define IQS620_PROX_SETTINGS_4_SAR_EN          BIT(7)
+
+#define IQS621_ALS_CAL_DIV_LUX                 0x82
+#define IQS621_ALS_CAL_DIV_IR                  0x83
+
+#define IQS620_TEMP_CAL_MULT                   0xC2
+#define IQS620_TEMP_CAL_DIV                    0xC3
+#define IQS620_TEMP_CAL_OFFS                   0xC4
+
+#define IQS62X_SYS_SETTINGS                    0xD0
+#define IQS62X_SYS_SETTINGS_SOFT_RESET         BIT(7)
+#define IQS62X_SYS_SETTINGS_ACK_RESET          BIT(6)
+#define IQS62X_SYS_SETTINGS_EVENT_MODE         BIT(5)
+#define IQS62X_SYS_SETTINGS_CLK_DIV            BIT(4)
+#define IQS62X_SYS_SETTINGS_REDO_ATI           BIT(1)
+
+#define IQS62X_PWR_SETTINGS                    0xD2
+#define IQS62X_PWR_SETTINGS_DIS_AUTO           BIT(5)
+#define IQS62X_PWR_SETTINGS_PWR_MODE_MASK      (BIT(4) | BIT(3))
+#define IQS62X_PWR_SETTINGS_PWR_MODE_HALT      (BIT(4) | BIT(3))
+#define IQS62X_PWR_SETTINGS_PWR_MODE_NORM      0
+
+#define IQS62X_OTP_CMD                         0xF0
+#define IQS62X_OTP_CMD_FG3                     0x13
+#define IQS62X_OTP_DATA                                0xF1
+#define IQS62X_MAX_REG                         0xFF
+
+#define IQS62X_HALL_CAL_MASK                   GENMASK(3, 0)
+
+#define IQS62X_FW_REC_TYPE_INFO                        0
+#define IQS62X_FW_REC_TYPE_PROD                        1
+#define IQS62X_FW_REC_TYPE_HALL                        2
+#define IQS62X_FW_REC_TYPE_MASK                        3
+#define IQS62X_FW_REC_TYPE_DATA                        4
+
+#define IQS62X_ATI_POLL_SLEEP_US               10000
+#define IQS62X_ATI_POLL_TIMEOUT_US             500000
+#define IQS62X_ATI_STABLE_DELAY_MS             150
+
+struct iqs62x_fw_rec {
+       u8 type;
+       u8 addr;
+       u8 len;
+       u8 data;
+} __packed;
+
+struct iqs62x_fw_blk {
+       struct list_head list;
+       u8 addr;
+       u8 mask;
+       u8 len;
+       u8 data[];
+};
+
+struct iqs62x_info {
+       u8 prod_num;
+       u8 sw_num;
+       u8 hw_num;
+} __packed;
+
+static int iqs62x_dev_init(struct iqs62x_core *iqs62x)
+{
+       struct iqs62x_fw_blk *fw_blk;
+       unsigned int val;
+       int ret;
+       u8 clk_div = 1;
+
+       list_for_each_entry(fw_blk, &iqs62x->fw_blk_head, list) {
+               if (fw_blk->mask)
+                       ret = regmap_update_bits(iqs62x->regmap, fw_blk->addr,
+                                                fw_blk->mask, *fw_blk->data);
+               else
+                       ret = regmap_raw_write(iqs62x->regmap, fw_blk->addr,
+                                              fw_blk->data, fw_blk->len);
+               if (ret)
+                       return ret;
+       }
+
+       switch (iqs62x->dev_desc->prod_num) {
+       case IQS620_PROD_NUM:
+       case IQS622_PROD_NUM:
+               ret = regmap_read(iqs62x->regmap,
+                                 iqs62x->dev_desc->prox_settings, &val);
+               if (ret)
+                       return ret;
+
+               if (val & IQS620_PROX_SETTINGS_4_SAR_EN)
+                       iqs62x->ui_sel = IQS62X_UI_SAR1;
+
+               /* fall through */
+
+       case IQS621_PROD_NUM:
+               ret = regmap_write(iqs62x->regmap, IQS620_GLBL_EVENT_MASK,
+                                  IQS620_GLBL_EVENT_MASK_PMU |
+                                  iqs62x->dev_desc->prox_mask |
+                                  iqs62x->dev_desc->sar_mask |
+                                  iqs62x->dev_desc->hall_mask |
+                                  iqs62x->dev_desc->hyst_mask |
+                                  iqs62x->dev_desc->temp_mask |
+                                  iqs62x->dev_desc->als_mask |
+                                  iqs62x->dev_desc->ir_mask);
+               if (ret)
+                       return ret;
+               break;
+
+       default:
+               ret = regmap_write(iqs62x->regmap, IQS624_HALL_UI,
+                                  IQS624_HALL_UI_WHL_EVENT |
+                                  IQS624_HALL_UI_INT_EVENT |
+                                  IQS624_HALL_UI_AUTO_CAL);
+               if (ret)
+                       return ret;
+
+               /*
+                * The IQS625 default interval divider is below the minimum
+                * permissible value, and the datasheet mandates that it is
+                * corrected during initialization (unless an updated value
+                * has already been provided by firmware).
+                *
+                * To protect against an unacceptably low user-entered value
+                * stored in the firmware, the same check is extended to the
+                * IQS624 as well.
+                */
+               ret = regmap_read(iqs62x->regmap, IQS624_INTERVAL_DIV, &val);
+               if (ret)
+                       return ret;
+
+               if (val >= iqs62x->dev_desc->interval_div)
+                       break;
+
+               ret = regmap_write(iqs62x->regmap, IQS624_INTERVAL_DIV,
+                                  iqs62x->dev_desc->interval_div);
+               if (ret)
+                       return ret;
+       }
+
+       ret = regmap_read(iqs62x->regmap, IQS62X_SYS_SETTINGS, &val);
+       if (ret)
+               return ret;
+
+       if (val & IQS62X_SYS_SETTINGS_CLK_DIV)
+               clk_div = iqs62x->dev_desc->clk_div;
+
+       ret = regmap_write(iqs62x->regmap, IQS62X_SYS_SETTINGS, val |
+                          IQS62X_SYS_SETTINGS_ACK_RESET |
+                          IQS62X_SYS_SETTINGS_EVENT_MODE |
+                          IQS62X_SYS_SETTINGS_REDO_ATI);
+       if (ret)
+               return ret;
+
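+       /*
+        * Wait for ATI to finish before reporting the device as initialized;
+        * the poll timeout and settling delay are scaled by the clock divider
+        * whenever the device runs from the reduced clock rate.
+        */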
+       ret = regmap_read_poll_timeout(iqs62x->regmap, IQS62X_SYS_FLAGS, val,
+                                      !(val & IQS62X_SYS_FLAGS_IN_ATI),
+                                      IQS62X_ATI_POLL_SLEEP_US,
+                                      IQS62X_ATI_POLL_TIMEOUT_US * clk_div);
+       if (ret)
+               return ret;
+
+       msleep(IQS62X_ATI_STABLE_DELAY_MS * clk_div);
+
+       return 0;
+}
+
+static int iqs62x_firmware_parse(struct iqs62x_core *iqs62x,
+                                const struct firmware *fw)
+{
+       struct i2c_client *client = iqs62x->client;
+       struct iqs62x_fw_rec *fw_rec;
+       struct iqs62x_fw_blk *fw_blk;
+       unsigned int val;
+       size_t pos = 0;
+       int ret = 0;
+       u8 mask, len, *data;
+       u8 hall_cal_index = 0;
+
+       while (pos < fw->size) {
+               if (pos + sizeof(*fw_rec) > fw->size) {
+                       ret = -EINVAL;
+                       break;
+               }
+               fw_rec = (struct iqs62x_fw_rec *)(fw->data + pos);
+               pos += sizeof(*fw_rec);
+
+               if (pos + fw_rec->len - 1 > fw->size) {
+                       ret = -EINVAL;
+                       break;
+               }
+               pos += fw_rec->len - 1;
+
+               switch (fw_rec->type) {
+               case IQS62X_FW_REC_TYPE_INFO:
+                       continue;
+
+               case IQS62X_FW_REC_TYPE_PROD:
+                       if (fw_rec->data == iqs62x->dev_desc->prod_num)
+                               continue;
+
+                       dev_err(&client->dev,
+                               "Incompatible product number: 0x%02X\n",
+                               fw_rec->data);
+                       ret = -EINVAL;
+                       break;
+
+               case IQS62X_FW_REC_TYPE_HALL:
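+                       /*
+                        * Hall-effect records carry one calibration byte per
+                        * possible OTP index; read the 1-based index once via
+                        * the FG3 OTP command and keep only the byte that it
+                        * selects.
+                        */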
+                       if (!hall_cal_index) {
+                               ret = regmap_write(iqs62x->regmap,
+                                                  IQS62X_OTP_CMD,
+                                                  IQS62X_OTP_CMD_FG3);
+                               if (ret)
+                                       break;
+
+                               ret = regmap_read(iqs62x->regmap,
+                                                 IQS62X_OTP_DATA, &val);
+                               if (ret)
+                                       break;
+
+                               hall_cal_index = val & IQS62X_HALL_CAL_MASK;
+                               if (!hall_cal_index) {
+                                       dev_err(&client->dev,
+                                               "Uncalibrated device\n");
+                                       ret = -ENODATA;
+                                       break;
+                               }
+                       }
+
+                       if (hall_cal_index > fw_rec->len) {
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       mask = 0;
+                       data = &fw_rec->data + hall_cal_index - 1;
+                       len = sizeof(*data);
+                       break;
+
+               case IQS62X_FW_REC_TYPE_MASK:
+                       if (fw_rec->len < (sizeof(mask) + sizeof(*data))) {
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       mask = fw_rec->data;
+                       data = &fw_rec->data + sizeof(mask);
+                       len = sizeof(*data);
+                       break;
+
+               case IQS62X_FW_REC_TYPE_DATA:
+                       mask = 0;
+                       data = &fw_rec->data;
+                       len = fw_rec->len;
+                       break;
+
+               default:
+                       dev_err(&client->dev,
+                               "Unrecognized record type: 0x%02X\n",
+                               fw_rec->type);
+                       ret = -EINVAL;
+               }
+
+               if (ret)
+                       break;
+
+               fw_blk = devm_kzalloc(&client->dev,
+                                     struct_size(fw_blk, data, len),
+                                     GFP_KERNEL);
+               if (!fw_blk) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               fw_blk->addr = fw_rec->addr;
+               fw_blk->mask = mask;
+               fw_blk->len = len;
+               memcpy(fw_blk->data, data, len);
+
+               list_add(&fw_blk->list, &iqs62x->fw_blk_head);
+       }
+
+       release_firmware(fw);
+
+       return ret;
+}
+
+const struct iqs62x_event_desc iqs62x_events[IQS62X_NUM_EVENTS] = {
+       [IQS62X_EVENT_PROX_CH0_T] = {
+               .reg    = IQS62X_EVENT_PROX,
+               .mask   = BIT(4),
+               .val    = BIT(4),
+       },
+       [IQS62X_EVENT_PROX_CH0_P] = {
+               .reg    = IQS62X_EVENT_PROX,
+               .mask   = BIT(0),
+               .val    = BIT(0),
+       },
+       [IQS62X_EVENT_PROX_CH1_T] = {
+               .reg    = IQS62X_EVENT_PROX,
+               .mask   = BIT(5),
+               .val    = BIT(5),
+       },
+       [IQS62X_EVENT_PROX_CH1_P] = {
+               .reg    = IQS62X_EVENT_PROX,
+               .mask   = BIT(1),
+               .val    = BIT(1),
+       },
+       [IQS62X_EVENT_PROX_CH2_T] = {
+               .reg    = IQS62X_EVENT_PROX,
+               .mask   = BIT(6),
+               .val    = BIT(6),
+       },
+       [IQS62X_EVENT_PROX_CH2_P] = {
+               .reg    = IQS62X_EVENT_PROX,
+               .mask   = BIT(2),
+               .val    = BIT(2),
+       },
+       [IQS62X_EVENT_HYST_POS_T] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(6) | BIT(7),
+               .val    = BIT(6),
+       },
+       [IQS62X_EVENT_HYST_POS_P] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(5) | BIT(7),
+               .val    = BIT(5),
+       },
+       [IQS62X_EVENT_HYST_NEG_T] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(6) | BIT(7),
+               .val    = BIT(6) | BIT(7),
+       },
+       [IQS62X_EVENT_HYST_NEG_P] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(5) | BIT(7),
+               .val    = BIT(5) | BIT(7),
+       },
+       [IQS62X_EVENT_SAR1_ACT] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(4),
+               .val    = BIT(4),
+       },
+       [IQS62X_EVENT_SAR1_QRD] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(2),
+               .val    = BIT(2),
+       },
+       [IQS62X_EVENT_SAR1_MOVE] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(1),
+               .val    = BIT(1),
+       },
+       [IQS62X_EVENT_SAR1_HALT] = {
+               .reg    = IQS62X_EVENT_HYST,
+               .mask   = BIT(0),
+               .val    = BIT(0),
+       },
+       [IQS62X_EVENT_WHEEL_UP] = {
+               .reg    = IQS62X_EVENT_WHEEL,
+               .mask   = BIT(7) | BIT(6),
+               .val    = BIT(7),
+       },
+       [IQS62X_EVENT_WHEEL_DN] = {
+               .reg    = IQS62X_EVENT_WHEEL,
+               .mask   = BIT(7) | BIT(6),
+               .val    = BIT(7) | BIT(6),
+       },
+       [IQS62X_EVENT_HALL_N_T] = {
+               .reg    = IQS62X_EVENT_HALL,
+               .mask   = BIT(2) | BIT(0),
+               .val    = BIT(2),
+       },
+       [IQS62X_EVENT_HALL_N_P] = {
+               .reg    = IQS62X_EVENT_HALL,
+               .mask   = BIT(1) | BIT(0),
+               .val    = BIT(1),
+       },
+       [IQS62X_EVENT_HALL_S_T] = {
+               .reg    = IQS62X_EVENT_HALL,
+               .mask   = BIT(2) | BIT(0),
+               .val    = BIT(2) | BIT(0),
+       },
+       [IQS62X_EVENT_HALL_S_P] = {
+               .reg    = IQS62X_EVENT_HALL,
+               .mask   = BIT(1) | BIT(0),
+               .val    = BIT(1) | BIT(0),
+       },
+       [IQS62X_EVENT_SYS_RESET] = {
+               .reg    = IQS62X_EVENT_SYS,
+               .mask   = BIT(7),
+               .val    = BIT(7),
+       },
+};
+EXPORT_SYMBOL_GPL(iqs62x_events);
+
+static irqreturn_t iqs62x_irq(int irq, void *context)
+{
+       struct iqs62x_core *iqs62x = context;
+       struct i2c_client *client = iqs62x->client;
+       struct iqs62x_event_data event_data;
+       struct iqs62x_event_desc event_desc;
+       enum iqs62x_event_reg event_reg;
+       unsigned long event_flags = 0;
+       int ret, i, j;
+       u8 event_map[IQS62X_EVENT_SIZE];
+
+       /*
+        * The device asserts the RDY output to signal the beginning of a
+        * communication window, which is closed by an I2C stop condition.
+        * As such, all interrupt status is captured in a single read and
+        * broadcast to any interested sub-device drivers.
+        */
+       ret = regmap_raw_read(iqs62x->regmap, IQS62X_SYS_FLAGS, event_map,
+                             sizeof(event_map));
+       if (ret) {
+               dev_err(&client->dev, "Failed to read device status: %d\n",
+                       ret);
+               return IRQ_NONE;
+       }
+
+       for (i = 0; i < sizeof(event_map); i++) {
+               event_reg = iqs62x->dev_desc->event_regs[iqs62x->ui_sel][i];
+
+               switch (event_reg) {
+               case IQS62X_EVENT_UI_LO:
+                       event_data.ui_data = get_unaligned_le16(&event_map[i]);
+
+                       /* fall through */
+
+               case IQS62X_EVENT_UI_HI:
+               case IQS62X_EVENT_NONE:
+                       continue;
+
+               case IQS62X_EVENT_ALS:
+                       event_data.als_flags = event_map[i];
+                       continue;
+
+               case IQS62X_EVENT_IR:
+                       event_data.ir_flags = event_map[i];
+                       continue;
+
+               case IQS62X_EVENT_INTER:
+                       event_data.interval = event_map[i];
+                       continue;
+
+               case IQS62X_EVENT_HYST:
+                       event_map[i] <<= iqs62x->dev_desc->hyst_shift;
+
+                       /* fall through */
+
+               case IQS62X_EVENT_WHEEL:
+               case IQS62X_EVENT_HALL:
+               case IQS62X_EVENT_PROX:
+               case IQS62X_EVENT_SYS:
+                       break;
+               }
+
+               for (j = 0; j < IQS62X_NUM_EVENTS; j++) {
+                       event_desc = iqs62x_events[j];
+
+                       if (event_desc.reg != event_reg)
+                               continue;
+
+                       if ((event_map[i] & event_desc.mask) == event_desc.val)
+                               event_flags |= BIT(j);
+               }
+       }
+
+       /*
+        * The device resets itself in response to the I2C master stalling
+        * communication past a fixed timeout. In this case, all registers
+        * are restored and any interested sub-device drivers are notified.
+        */
+       if (event_flags & BIT(IQS62X_EVENT_SYS_RESET)) {
+               dev_err(&client->dev, "Unexpected device reset\n");
+
+               ret = iqs62x_dev_init(iqs62x);
+               if (ret) {
+                       dev_err(&client->dev,
+                               "Failed to re-initialize device: %d\n", ret);
+                       return IRQ_NONE;
+               }
+       }
+
+       ret = blocking_notifier_call_chain(&iqs62x->nh, event_flags,
+                                          &event_data);
+       if (ret & NOTIFY_STOP_MASK)
+               return IRQ_NONE;
+
+       /*
+        * Once the communication window is closed, a small delay is added to
+        * ensure the device's RDY output has been deasserted by the time the
+        * interrupt handler returns.
+        */
+       usleep_range(50, 100);
+
+       return IRQ_HANDLED;
+}
+
+static void iqs62x_firmware_load(const struct firmware *fw, void *context)
+{
+       struct iqs62x_core *iqs62x = context;
+       struct i2c_client *client = iqs62x->client;
+       int ret;
+
+       if (fw) {
+               ret = iqs62x_firmware_parse(iqs62x, fw);
+               if (ret) {
+                       dev_err(&client->dev, "Failed to parse firmware: %d\n",
+                               ret);
+                       goto err_out;
+               }
+       }
+
+       ret = iqs62x_dev_init(iqs62x);
+       if (ret) {
+               dev_err(&client->dev, "Failed to initialize device: %d\n", ret);
+               goto err_out;
+       }
+
+       ret = devm_request_threaded_irq(&client->dev, client->irq,
+                                       NULL, iqs62x_irq, IRQF_ONESHOT,
+                                       client->name, iqs62x);
+       if (ret) {
+               dev_err(&client->dev, "Failed to request IRQ: %d\n", ret);
+               goto err_out;
+       }
+
+       ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_NONE,
+                                  iqs62x->dev_desc->sub_devs,
+                                  iqs62x->dev_desc->num_sub_devs,
+                                  NULL, 0, NULL);
+       if (ret)
+               dev_err(&client->dev, "Failed to add sub-devices: %d\n", ret);
+
+err_out:
+       complete_all(&iqs62x->fw_done);
+}
+
+static const struct mfd_cell iqs620at_sub_devs[] = {
+       {
+               .name = "iqs62x-keys",
+               .of_compatible = "azoteq,iqs620a-keys",
+       },
+       {
+               .name = "iqs620a-pwm",
+               .of_compatible = "azoteq,iqs620a-pwm",
+       },
+       { .name = "iqs620at-temp", },
+};
+
+static const struct mfd_cell iqs620a_sub_devs[] = {
+       {
+               .name = "iqs62x-keys",
+               .of_compatible = "azoteq,iqs620a-keys",
+       },
+       {
+               .name = "iqs620a-pwm",
+               .of_compatible = "azoteq,iqs620a-pwm",
+       },
+};
+
+static const struct mfd_cell iqs621_sub_devs[] = {
+       {
+               .name = "iqs62x-keys",
+               .of_compatible = "azoteq,iqs621-keys",
+       },
+       { .name = "iqs621-als", },
+};
+
+static const struct mfd_cell iqs622_sub_devs[] = {
+       {
+               .name = "iqs62x-keys",
+               .of_compatible = "azoteq,iqs622-keys",
+       },
+       { .name = "iqs621-als", },
+};
+
+static const struct mfd_cell iqs624_sub_devs[] = {
+       {
+               .name = "iqs62x-keys",
+               .of_compatible = "azoteq,iqs624-keys",
+       },
+       { .name = "iqs624-pos", },
+};
+
+static const struct mfd_cell iqs625_sub_devs[] = {
+       {
+               .name = "iqs62x-keys",
+               .of_compatible = "azoteq,iqs625-keys",
+       },
+       { .name = "iqs624-pos", },
+};
+
+static const u8 iqs620at_cal_regs[] = {
+       IQS620_TEMP_CAL_MULT,
+       IQS620_TEMP_CAL_DIV,
+       IQS620_TEMP_CAL_OFFS,
+};
+
+static const u8 iqs621_cal_regs[] = {
+       IQS621_ALS_CAL_DIV_LUX,
+       IQS621_ALS_CAL_DIV_IR,
+};
+
+static const enum iqs62x_event_reg iqs620a_event_regs[][IQS62X_EVENT_SIZE] = {
+       [IQS62X_UI_PROX] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_PROX,      /* 0x12 */
+               IQS62X_EVENT_HYST,      /* 0x13 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_HALL,      /* 0x16 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+       },
+       [IQS62X_UI_SAR1] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_HYST,      /* 0x13 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_HALL,      /* 0x16 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+       },
+};
+
+static const enum iqs62x_event_reg iqs621_event_regs[][IQS62X_EVENT_SIZE] = {
+       [IQS62X_UI_PROX] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_PROX,      /* 0x12 */
+               IQS62X_EVENT_HYST,      /* 0x13 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_ALS,       /* 0x16 */
+               IQS62X_EVENT_UI_LO,     /* 0x17 */
+               IQS62X_EVENT_UI_HI,     /* 0x18 */
+               IQS62X_EVENT_HALL,      /* 0x19 */
+       },
+};
+
+static const enum iqs62x_event_reg iqs622_event_regs[][IQS62X_EVENT_SIZE] = {
+       [IQS62X_UI_PROX] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_PROX,      /* 0x12 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_ALS,       /* 0x14 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_IR,        /* 0x16 */
+               IQS62X_EVENT_UI_LO,     /* 0x17 */
+               IQS62X_EVENT_UI_HI,     /* 0x18 */
+               IQS62X_EVENT_HALL,      /* 0x19 */
+       },
+       [IQS62X_UI_SAR1] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_HYST,      /* 0x13 */
+               IQS62X_EVENT_ALS,       /* 0x14 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_IR,        /* 0x16 */
+               IQS62X_EVENT_UI_LO,     /* 0x17 */
+               IQS62X_EVENT_UI_HI,     /* 0x18 */
+               IQS62X_EVENT_HALL,      /* 0x19 */
+       },
+};
+
+static const enum iqs62x_event_reg iqs624_event_regs[][IQS62X_EVENT_SIZE] = {
+       [IQS62X_UI_PROX] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_PROX,      /* 0x12 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_WHEEL,     /* 0x14 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_UI_LO,     /* 0x16 */
+               IQS62X_EVENT_UI_HI,     /* 0x17 */
+               IQS62X_EVENT_INTER,     /* 0x18 */
+               IQS62X_EVENT_NONE,
+       },
+};
+
+static const enum iqs62x_event_reg iqs625_event_regs[][IQS62X_EVENT_SIZE] = {
+       [IQS62X_UI_PROX] = {
+               IQS62X_EVENT_SYS,       /* 0x10 */
+               IQS62X_EVENT_PROX,      /* 0x11 */
+               IQS62X_EVENT_INTER,     /* 0x12 */
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+               IQS62X_EVENT_NONE,
+       },
+};
+
+static const struct iqs62x_dev_desc iqs62x_devs[] = {
+       {
+               .dev_name       = "iqs620at",
+               .sub_devs       = iqs620at_sub_devs,
+               .num_sub_devs   = ARRAY_SIZE(iqs620at_sub_devs),
+
+               .prod_num       = IQS620_PROD_NUM,
+               .sw_num         = 0x08,
+               .cal_regs       = iqs620at_cal_regs,
+               .num_cal_regs   = ARRAY_SIZE(iqs620at_cal_regs),
+
+               .prox_mask      = BIT(0),
+               .sar_mask       = BIT(1) | BIT(7),
+               .hall_mask      = BIT(2),
+               .hyst_mask      = BIT(3),
+               .temp_mask      = BIT(4),
+
+               .prox_settings  = IQS620_PROX_SETTINGS_4,
+               .hall_flags     = IQS620_HALL_FLAGS,
+
+               .clk_div        = 4,
+               .fw_name        = "iqs620a.bin",
+               .event_regs     = &iqs620a_event_regs[IQS62X_UI_PROX],
+       },
+       {
+               .dev_name       = "iqs620a",
+               .sub_devs       = iqs620a_sub_devs,
+               .num_sub_devs   = ARRAY_SIZE(iqs620a_sub_devs),
+
+               .prod_num       = IQS620_PROD_NUM,
+               .sw_num         = 0x08,
+
+               .prox_mask      = BIT(0),
+               .sar_mask       = BIT(1) | BIT(7),
+               .hall_mask      = BIT(2),
+               .hyst_mask      = BIT(3),
+               .temp_mask      = BIT(4),
+
+               .prox_settings  = IQS620_PROX_SETTINGS_4,
+               .hall_flags     = IQS620_HALL_FLAGS,
+
+               .clk_div        = 4,
+               .fw_name        = "iqs620a.bin",
+               .event_regs     = &iqs620a_event_regs[IQS62X_UI_PROX],
+       },
+       {
+               .dev_name       = "iqs621",
+               .sub_devs       = iqs621_sub_devs,
+               .num_sub_devs   = ARRAY_SIZE(iqs621_sub_devs),
+
+               .prod_num       = IQS621_PROD_NUM,
+               .sw_num         = 0x09,
+               .cal_regs       = iqs621_cal_regs,
+               .num_cal_regs   = ARRAY_SIZE(iqs621_cal_regs),
+
+               .prox_mask      = BIT(0),
+               .hall_mask      = BIT(1),
+               .als_mask       = BIT(2),
+               .hyst_mask      = BIT(3),
+               .temp_mask      = BIT(4),
+
+               .als_flags      = IQS621_ALS_FLAGS,
+               .hall_flags     = IQS621_HALL_FLAGS,
+               .hyst_shift     = 5,
+
+               .clk_div        = 2,
+               .fw_name        = "iqs621.bin",
+               .event_regs     = &iqs621_event_regs[IQS62X_UI_PROX],
+       },
+       {
+               .dev_name       = "iqs622",
+               .sub_devs       = iqs622_sub_devs,
+               .num_sub_devs   = ARRAY_SIZE(iqs622_sub_devs),
+
+               .prod_num       = IQS622_PROD_NUM,
+               .sw_num         = 0x06,
+
+               .prox_mask      = BIT(0),
+               .sar_mask       = BIT(1),
+               .hall_mask      = BIT(2),
+               .als_mask       = BIT(3),
+               .ir_mask        = BIT(4),
+
+               .prox_settings  = IQS622_PROX_SETTINGS_4,
+               .als_flags      = IQS622_ALS_FLAGS,
+               .hall_flags     = IQS622_HALL_FLAGS,
+
+               .clk_div        = 2,
+               .fw_name        = "iqs622.bin",
+               .event_regs     = &iqs622_event_regs[IQS62X_UI_PROX],
+       },
+       {
+               .dev_name       = "iqs624",
+               .sub_devs       = iqs624_sub_devs,
+               .num_sub_devs   = ARRAY_SIZE(iqs624_sub_devs),
+
+               .prod_num       = IQS624_PROD_NUM,
+               .sw_num         = 0x0B,
+
+               .interval       = IQS624_INTERVAL_NUM,
+               .interval_div   = 3,
+
+               .clk_div        = 2,
+               .fw_name        = "iqs624.bin",
+               .event_regs     = &iqs624_event_regs[IQS62X_UI_PROX],
+       },
+       {
+               .dev_name       = "iqs625",
+               .sub_devs       = iqs625_sub_devs,
+               .num_sub_devs   = ARRAY_SIZE(iqs625_sub_devs),
+
+               .prod_num       = IQS625_PROD_NUM,
+               .sw_num         = 0x0B,
+
+               .interval       = IQS625_INTERVAL_NUM,
+               .interval_div   = 10,
+
+               .clk_div        = 2,
+               .fw_name        = "iqs625.bin",
+               .event_regs     = &iqs625_event_regs[IQS62X_UI_PROX],
+       },
+};
+
+static const struct regmap_config iqs62x_map_config = {
+       .reg_bits = 8,
+       .val_bits = 8,
+       .max_register = IQS62X_MAX_REG,
+};
+
+static int iqs62x_probe(struct i2c_client *client)
+{
+       struct iqs62x_core *iqs62x;
+       struct iqs62x_info info;
+       unsigned int val;
+       int ret, i, j;
+       u8 sw_num = 0;
+       const char *fw_name = NULL;
+
+       iqs62x = devm_kzalloc(&client->dev, sizeof(*iqs62x), GFP_KERNEL);
+       if (!iqs62x)
+               return -ENOMEM;
+
+       i2c_set_clientdata(client, iqs62x);
+       iqs62x->client = client;
+
+       BLOCKING_INIT_NOTIFIER_HEAD(&iqs62x->nh);
+       INIT_LIST_HEAD(&iqs62x->fw_blk_head);
+       init_completion(&iqs62x->fw_done);
+
+       iqs62x->regmap = devm_regmap_init_i2c(client, &iqs62x_map_config);
+       if (IS_ERR(iqs62x->regmap)) {
+               ret = PTR_ERR(iqs62x->regmap);
+               dev_err(&client->dev, "Failed to initialize register map: %d\n",
+                       ret);
+               return ret;
+       }
+
+       ret = regmap_raw_read(iqs62x->regmap, IQS62X_PROD_NUM, &info,
+                             sizeof(info));
+       if (ret)
+               return ret;
+
+       /*
+        * The following sequence validates the device's product and software
+        * numbers. It then determines if the device is factory-calibrated by
+        * checking for nonzero values in the device's designated calibration
+        * registers (if applicable). Depending on the device, the absence of
+        * calibration data indicates a reduced feature set or invalid device.
+        *
+        * For devices offered in both calibrated and uncalibrated versions, the
+        * calibrated version (e.g. IQS620AT) appears first in the iqs62x_devs
+        * array. The uncalibrated version (e.g. IQS620A) appears next and has
+        * the same product and software numbers, but no calibration registers
+        * are specified.
+        */
+       for (i = 0; i < ARRAY_SIZE(iqs62x_devs); i++) {
+               if (info.prod_num != iqs62x_devs[i].prod_num)
+                       continue;
+
+               iqs62x->dev_desc = &iqs62x_devs[i];
+
+               if (info.sw_num < iqs62x->dev_desc->sw_num)
+                       continue;
+
+               sw_num = info.sw_num;
+
+               /*
+                * Read each of the device's designated calibration registers,
+                * if any, and exit from the inner loop early if any are equal
+                * to zero (indicating the device is uncalibrated). This could
+                * be acceptable depending on the device (e.g. IQS620A instead
+                * of IQS620AT).
+                */
+               for (j = 0; j < iqs62x->dev_desc->num_cal_regs; j++) {
+                       ret = regmap_read(iqs62x->regmap,
+                                         iqs62x->dev_desc->cal_regs[j], &val);
+                       if (ret)
+                               return ret;
+
+                       if (!val)
+                               break;
+               }
+
+               /*
+                * If the number of nonzero values read from the device equals
+                * the number of designated calibration registers (which could
+                * be zero), exit from the outer loop early to signal that the
+                * device's product and software numbers match a known device,
+                * and the device is calibrated (if applicable).
+                */
+               if (j == iqs62x->dev_desc->num_cal_regs)
+                       break;
+       }
+
+       if (!iqs62x->dev_desc) {
+               dev_err(&client->dev, "Unrecognized product number: 0x%02X\n",
+                       info.prod_num);
+               return -EINVAL;
+       }
+
+       if (!sw_num) {
+               dev_err(&client->dev, "Unrecognized software number: 0x%02X\n",
+                       info.sw_num);
+               return -EINVAL;
+       }
+
+       if (i == ARRAY_SIZE(iqs62x_devs)) {
+               dev_err(&client->dev, "Uncalibrated device\n");
+               return -ENODATA;
+       }
+
+       device_property_read_string(&client->dev, "firmware-name", &fw_name);
+
+       ret = request_firmware_nowait(THIS_MODULE, FW_ACTION_HOTPLUG,
+                                     fw_name ? : iqs62x->dev_desc->fw_name,
+                                     &client->dev, GFP_KERNEL, iqs62x,
+                                     iqs62x_firmware_load);
+       if (ret)
+               dev_err(&client->dev, "Failed to request firmware: %d\n", ret);
+
+       return ret;
+}
+
+static int iqs62x_remove(struct i2c_client *client)
+{
+       struct iqs62x_core *iqs62x = i2c_get_clientdata(client);
+
+       wait_for_completion(&iqs62x->fw_done);
+
+       return 0;
+}
+
+static int __maybe_unused iqs62x_suspend(struct device *dev)
+{
+       struct iqs62x_core *iqs62x = dev_get_drvdata(dev);
+       int ret;
+
+       wait_for_completion(&iqs62x->fw_done);
+
+       /*
+        * As per the datasheet, automatic mode switching must be disabled
+        * before the device is placed in or taken out of halt mode.
+        */
+       ret = regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+                                IQS62X_PWR_SETTINGS_DIS_AUTO, 0xFF);
+       if (ret)
+               return ret;
+
+       return regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+                                 IQS62X_PWR_SETTINGS_PWR_MODE_MASK,
+                                 IQS62X_PWR_SETTINGS_PWR_MODE_HALT);
+}
+
+static int __maybe_unused iqs62x_resume(struct device *dev)
+{
+       struct iqs62x_core *iqs62x = dev_get_drvdata(dev);
+       int ret;
+
+       ret = regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+                                IQS62X_PWR_SETTINGS_PWR_MODE_MASK,
+                                IQS62X_PWR_SETTINGS_PWR_MODE_NORM);
+       if (ret)
+               return ret;
+
+       return regmap_update_bits(iqs62x->regmap, IQS62X_PWR_SETTINGS,
+                                 IQS62X_PWR_SETTINGS_DIS_AUTO, 0);
+}
+
+static SIMPLE_DEV_PM_OPS(iqs62x_pm, iqs62x_suspend, iqs62x_resume);
+
+static const struct of_device_id iqs62x_of_match[] = {
+       { .compatible = "azoteq,iqs620a" },
+       { .compatible = "azoteq,iqs621" },
+       { .compatible = "azoteq,iqs622" },
+       { .compatible = "azoteq,iqs624" },
+       { .compatible = "azoteq,iqs625" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, iqs62x_of_match);
+
+static struct i2c_driver iqs62x_i2c_driver = {
+       .driver = {
+               .name = "iqs62x",
+               .of_match_table = iqs62x_of_match,
+               .pm = &iqs62x_pm,
+       },
+       .probe_new = iqs62x_probe,
+       .remove = iqs62x_remove,
+};
+module_i2c_driver(iqs62x_i2c_driver);
+
+MODULE_AUTHOR("Jeff LaBundy <jeff@labundy.com>");
+MODULE_DESCRIPTION("Azoteq IQS620A/621/622/624/625 Multi-Function Sensors");
+MODULE_LICENSE("GPL");
index 4798d9f..1f4f01b 100644 (file)
@@ -840,7 +840,7 @@ MODULE_DEVICE_TABLE(of, usbhs_omap_dt_ids);
 
 static struct platform_driver usbhs_omap_driver = {
        .driver = {
-               .name           = (char *)usbhs_driver_name,
+               .name           = usbhs_driver_name,
                .pm             = &usbhsomap_dev_pm_ops,
                .of_match_table = usbhs_omap_dt_ids,
        },
index 265f5e3..4b7f73c 100644 (file)
@@ -99,7 +99,7 @@
 struct usbtll_omap {
        void __iomem    *base;
        int             nch;            /* num. of channels */
-       struct clk      *ch_clk[0];     /* must be the last member */
+       struct clk      *ch_clk[];      /* must be the last member */
 };
 
 /*-------------------------------------------------------------------------*/
@@ -304,7 +304,7 @@ MODULE_DEVICE_TABLE(of, usbtll_omap_dt_ids);
 
 static struct platform_driver usbtll_omap_driver = {
        .driver = {
-               .name           = (char *)usbtll_driver_name,
+               .name           = usbtll_driver_name,
                .of_match_table = usbtll_omap_dt_ids,
        },
        .probe          = usbtll_omap_probe,
index 2913332..acd172d 100644 (file)
@@ -76,7 +76,7 @@ struct pm_irq_chip {
        unsigned int            num_masters;
        const struct pm_irq_data *pm_irq_data;
        /* MUST BE AT THE END OF THIS STRUCT */
-       u8                      config[0];
+       u8                      config[];
 };
 
 static int pm8xxx_read_block_irq(struct pm_irq_chip *chip, unsigned int bp,
index a69a674..d109b9f 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/regmap.h>
-#include <linux/syscore_ops.h>
 
 struct rk808_reg_data {
        int addr;
@@ -186,7 +185,6 @@ static const struct rk808_reg_data rk805_pre_init_reg[] = {
        {RK805_BUCK4_CONFIG_REG, RK805_BUCK3_4_ILMAX_MASK,
                                 RK805_BUCK4_ILMAX_3500MA},
        {RK805_BUCK4_CONFIG_REG, BUCK_ILMIN_MASK, BUCK_ILMIN_400MA},
-       {RK805_GPIO_IO_POL_REG, SLP_SD_MSK, SLEEP_FUN},
        {RK805_THERMAL_REG, TEMP_HOTDIE_MSK, TEMP115C},
 };
 
@@ -449,88 +447,60 @@ static const struct regmap_irq_chip rk818_irq_chip = {
 
 static struct i2c_client *rk808_i2c_client;
 
-static void rk805_device_shutdown(void)
+static void rk808_pm_power_off(void)
 {
        int ret;
+       unsigned int reg, bit;
        struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
 
-       if (!rk808)
-               return;
-
-       ret = regmap_update_bits(rk808->regmap,
-                                RK805_DEV_CTRL_REG,
-                                DEV_OFF, DEV_OFF);
-       if (ret)
-               dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
-}
-
-static void rk805_device_shutdown_prepare(void)
-{
-       int ret;
-       struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
-
-       if (!rk808)
-               return;
-
-       ret = regmap_update_bits(rk808->regmap,
-                                RK805_GPIO_IO_POL_REG,
-                                SLP_SD_MSK, SHUTDOWN_FUN);
-       if (ret)
-               dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
-}
-
-static void rk808_device_shutdown(void)
-{
-       int ret;
-       struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
-
-       if (!rk808)
-               return;
-
-       ret = regmap_update_bits(rk808->regmap,
-                                RK808_DEVCTRL_REG,
-                                DEV_OFF_RST, DEV_OFF_RST);
-       if (ret)
-               dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
-}
-
-static void rk818_device_shutdown(void)
-{
-       int ret;
-       struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
-
-       if (!rk808)
+       switch (rk808->variant) {
+       case RK805_ID:
+               reg = RK805_DEV_CTRL_REG;
+               bit = DEV_OFF;
+               break;
+       case RK808_ID:
+               reg = RK808_DEVCTRL_REG;
+               bit = DEV_OFF_RST;
+               break;
+       case RK818_ID:
+               reg = RK818_DEVCTRL_REG;
+               bit = DEV_OFF;
+               break;
+       default:
                return;
-
-       ret = regmap_update_bits(rk808->regmap,
-                                RK818_DEVCTRL_REG,
-                                DEV_OFF, DEV_OFF);
+       }
+       ret = regmap_update_bits(rk808->regmap, reg, bit, bit);
        if (ret)
                dev_err(&rk808_i2c_client->dev, "Failed to shutdown device!\n");
 }
 
-static void rk8xx_syscore_shutdown(void)
+static void rk8xx_shutdown(struct i2c_client *client)
 {
-       struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+       struct rk808 *rk808 = i2c_get_clientdata(client);
        int ret;
 
-       if (system_state == SYSTEM_POWER_OFF &&
-           (rk808->variant == RK809_ID || rk808->variant == RK817_ID)) {
+       switch (rk808->variant) {
+       case RK805_ID:
+               ret = regmap_update_bits(rk808->regmap,
+                                        RK805_GPIO_IO_POL_REG,
+                                        SLP_SD_MSK,
+                                        SHUTDOWN_FUN);
+               break;
+       case RK809_ID:
+       case RK817_ID:
                ret = regmap_update_bits(rk808->regmap,
                                         RK817_SYS_CFG(3),
                                         RK817_SLPPIN_FUNC_MSK,
                                         SLPPIN_DN_FUN);
-               if (ret) {
-                       dev_warn(&rk808_i2c_client->dev,
-                                "Cannot switch to power down function\n");
-               }
+               break;
+       default:
+               return;
        }
+       if (ret)
+               dev_warn(&client->dev,
+                        "Cannot switch to power down function\n");
 }
 
-static struct syscore_ops rk808_syscore_ops = {
-       .shutdown = rk8xx_syscore_shutdown,
-};
-
 static const struct of_device_id rk808_of_match[] = {
        { .compatible = "rockchip,rk805" },
        { .compatible = "rockchip,rk808" },
@@ -550,7 +520,7 @@ static int rk808_probe(struct i2c_client *client,
        const struct mfd_cell *cells;
        int nr_pre_init_regs;
        int nr_cells;
-       int pm_off = 0, msb, lsb;
+       int msb, lsb;
        unsigned char pmic_id_msb, pmic_id_lsb;
        int ret;
        int i;
@@ -594,8 +564,6 @@ static int rk808_probe(struct i2c_client *client,
                nr_pre_init_regs = ARRAY_SIZE(rk805_pre_init_reg);
                cells = rk805s;
                nr_cells = ARRAY_SIZE(rk805s);
-               rk808->pm_pwroff_fn = rk805_device_shutdown;
-               rk808->pm_pwroff_prep_fn = rk805_device_shutdown_prepare;
                break;
        case RK808_ID:
                rk808->regmap_cfg = &rk808_regmap_config;
@@ -604,7 +572,6 @@ static int rk808_probe(struct i2c_client *client,
                nr_pre_init_regs = ARRAY_SIZE(rk808_pre_init_reg);
                cells = rk808s;
                nr_cells = ARRAY_SIZE(rk808s);
-               rk808->pm_pwroff_fn = rk808_device_shutdown;
                break;
        case RK818_ID:
                rk808->regmap_cfg = &rk818_regmap_config;
@@ -613,7 +580,6 @@ static int rk808_probe(struct i2c_client *client,
                nr_pre_init_regs = ARRAY_SIZE(rk818_pre_init_reg);
                cells = rk818s;
                nr_cells = ARRAY_SIZE(rk818s);
-               rk808->pm_pwroff_fn = rk818_device_shutdown;
                break;
        case RK809_ID:
        case RK817_ID:
@@ -623,7 +589,6 @@ static int rk808_probe(struct i2c_client *client,
                nr_pre_init_regs = ARRAY_SIZE(rk817_pre_init_reg);
                cells = rk817s;
                nr_cells = ARRAY_SIZE(rk817s);
-               register_syscore_ops(&rk808_syscore_ops);
                break;
        default:
                dev_err(&client->dev, "Unsupported RK8XX ID %lu\n",
@@ -674,17 +639,9 @@ static int rk808_probe(struct i2c_client *client,
                goto err_irq;
        }
 
-       pm_off = of_property_read_bool(np,
-                               "rockchip,system-power-controller");
-       if (pm_off && !pm_power_off) {
+       if (of_property_read_bool(np, "rockchip,system-power-controller")) {
                rk808_i2c_client = client;
-               pm_power_off = rk808->pm_pwroff_fn;
-       }
-
-       if (pm_off && !pm_power_off_prepare) {
-               if (!rk808_i2c_client)
-                       rk808_i2c_client = client;
-               pm_power_off_prepare = rk808->pm_pwroff_prep_fn;
+               pm_power_off = rk808_pm_power_off;
        }
 
        return 0;
@@ -704,25 +661,24 @@ static int rk808_remove(struct i2c_client *client)
         * pm_power_off may point to a function from another module.
         * Check if the pointer is set by us and only then overwrite it.
         */
-       if (rk808->pm_pwroff_fn && pm_power_off == rk808->pm_pwroff_fn)
+       if (pm_power_off == rk808_pm_power_off)
                pm_power_off = NULL;
 
-       /**
-        * As above, check if the pointer is set by us before overwrite.
-        */
-       if (rk808->pm_pwroff_prep_fn &&
-           pm_power_off_prepare == rk808->pm_pwroff_prep_fn)
-               pm_power_off_prepare = NULL;
-
        return 0;
 }
 
 static int __maybe_unused rk8xx_suspend(struct device *dev)
 {
-       struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+       struct rk808 *rk808 = i2c_get_clientdata(to_i2c_client(dev));
        int ret = 0;
 
        switch (rk808->variant) {
+       case RK805_ID:
+               ret = regmap_update_bits(rk808->regmap,
+                                        RK805_GPIO_IO_POL_REG,
+                                        SLP_SD_MSK,
+                                        SLEEP_FUN);
+               break;
        case RK809_ID:
        case RK817_ID:
                ret = regmap_update_bits(rk808->regmap,
@@ -739,7 +695,7 @@ static int __maybe_unused rk8xx_suspend(struct device *dev)
 
 static int __maybe_unused rk8xx_resume(struct device *dev)
 {
-       struct rk808 *rk808 = i2c_get_clientdata(rk808_i2c_client);
+       struct rk808 *rk808 = i2c_get_clientdata(to_i2c_client(dev));
        int ret = 0;
 
        switch (rk808->variant) {
@@ -766,6 +722,7 @@ static struct i2c_driver rk808_i2c_driver = {
        },
        .probe    = rk808_probe,
        .remove   = rk808_remove,
+       .shutdown = rk8xx_shutdown,
 };
 
 module_i2c_driver(rk808_i2c_driver);
index ead2e79..232de50 100644 (file)
@@ -8,10 +8,13 @@
 
 #include <linux/delay.h>
 #include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/rn5t618.h>
 #include <linux/module.h>
 #include <linux/of_device.h>
+#include <linux/platform_device.h>
 #include <linux/reboot.h>
 #include <linux/regmap.h>
 
@@ -20,6 +23,13 @@ static const struct mfd_cell rn5t618_cells[] = {
        { .name = "rn5t618-wdt" },
 };
 
+static const struct mfd_cell rc5t619_cells[] = {
+       { .name = "rn5t618-adc" },
+       { .name = "rn5t618-regulator" },
+       { .name = "rc5t619-rtc" },
+       { .name = "rn5t618-wdt" },
+};
+
 static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
 {
        switch (reg) {
@@ -32,6 +42,8 @@ static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
        case RN5T618_IR_GPF:
        case RN5T618_MON_IOIN:
        case RN5T618_INTMON:
+       case RN5T618_RTC_CTRL1 ... RN5T618_RTC_CTRL2:
+       case RN5T618_RTC_SECONDS ... RN5T618_RTC_YEAR:
                return true;
        default:
                return false;
@@ -46,9 +58,56 @@ static const struct regmap_config rn5t618_regmap_config = {
        .cache_type     = REGCACHE_RBTREE,
 };
 
+static const struct regmap_irq rc5t619_irqs[] = {
+       REGMAP_IRQ_REG(RN5T618_IRQ_SYS, 0, BIT(0)),
+       REGMAP_IRQ_REG(RN5T618_IRQ_DCDC, 0, BIT(1)),
+       REGMAP_IRQ_REG(RN5T618_IRQ_RTC, 0, BIT(2)),
+       REGMAP_IRQ_REG(RN5T618_IRQ_ADC, 0, BIT(3)),
+       REGMAP_IRQ_REG(RN5T618_IRQ_GPIO, 0, BIT(4)),
+       REGMAP_IRQ_REG(RN5T618_IRQ_CHG, 0, BIT(6)),
+};
+
+static const struct regmap_irq_chip rc5t619_irq_chip = {
+       .name = "rc5t619",
+       .irqs = rc5t619_irqs,
+       .num_irqs = ARRAY_SIZE(rc5t619_irqs),
+       .num_regs = 1,
+       .status_base = RN5T618_INTMON,
+       .mask_base = RN5T618_INTEN,
+       .mask_invert = true,
+};
+
 static struct rn5t618 *rn5t618_pm_power_off;
 static struct notifier_block rn5t618_restart_handler;
 
+static int rn5t618_irq_init(struct rn5t618 *rn5t618)
+{
+       const struct regmap_irq_chip *irq_chip = NULL;
+       int ret;
+
+       if (!rn5t618->irq)
+               return 0;
+
+       switch (rn5t618->variant) {
+       case RC5T619:
+               irq_chip = &rc5t619_irq_chip;
+               break;
+       default:
+               dev_err(rn5t618->dev, "Currently no IRQ support for variant %d\n",
+                       (int)rn5t618->variant);
+               return -ENOENT;
+       }
+
+       ret = devm_regmap_add_irq_chip(rn5t618->dev, rn5t618->regmap,
+                                      rn5t618->irq,
+                                      IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
+                                      0, irq_chip, &rn5t618->irq_data);
+       if (ret)
+               dev_err(rn5t618->dev, "Failed to register IRQ chip\n");
+
+       return ret;
+}
+
 static void rn5t618_trigger_poweroff_sequence(bool repower)
 {
        /* disable automatic repower-on */
@@ -87,8 +146,7 @@ static const struct of_device_id rn5t618_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, rn5t618_of_match);
 
-static int rn5t618_i2c_probe(struct i2c_client *i2c,
-                            const struct i2c_device_id *id)
+static int rn5t618_i2c_probe(struct i2c_client *i2c)
 {
        const struct of_device_id *of_id;
        struct rn5t618 *priv;
@@ -106,6 +164,8 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
 
        i2c_set_clientdata(i2c, priv);
        priv->variant = (long)of_id->data;
+       priv->irq = i2c->irq;
+       priv->dev = &i2c->dev;
 
        priv->regmap = devm_regmap_init_i2c(i2c, &rn5t618_regmap_config);
        if (IS_ERR(priv->regmap)) {
@@ -114,8 +174,16 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
                return ret;
        }
 
-       ret = devm_mfd_add_devices(&i2c->dev, -1, rn5t618_cells,
-                                  ARRAY_SIZE(rn5t618_cells), NULL, 0, NULL);
+       if (priv->variant == RC5T619)
+               ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE,
+                                          rc5t619_cells,
+                                          ARRAY_SIZE(rc5t619_cells),
+                                          NULL, 0, NULL);
+       else
+               ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_NONE,
+                                          rn5t618_cells,
+                                          ARRAY_SIZE(rn5t618_cells),
+                                          NULL, 0, NULL);
        if (ret) {
                dev_err(&i2c->dev, "failed to add sub-devices: %d\n", ret);
                return ret;
@@ -138,7 +206,7 @@ static int rn5t618_i2c_probe(struct i2c_client *i2c,
                return ret;
        }
 
-       return 0;
+       return rn5t618_irq_init(priv);
 }
 
 static int rn5t618_i2c_remove(struct i2c_client *i2c)
@@ -155,19 +223,38 @@ static int rn5t618_i2c_remove(struct i2c_client *i2c)
        return 0;
 }
 
-static const struct i2c_device_id rn5t618_i2c_id[] = {
-       { }
-};
-MODULE_DEVICE_TABLE(i2c, rn5t618_i2c_id);
+static int __maybe_unused rn5t618_i2c_suspend(struct device *dev)
+{
+       struct rn5t618 *priv = dev_get_drvdata(dev);
+
+       if (priv->irq)
+               disable_irq(priv->irq);
+
+       return 0;
+}
+
+static int __maybe_unused rn5t618_i2c_resume(struct device *dev)
+{
+       struct rn5t618 *priv = dev_get_drvdata(dev);
+
+       if (priv->irq)
+               enable_irq(priv->irq);
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(rn5t618_i2c_dev_pm_ops,
+                       rn5t618_i2c_suspend,
+                       rn5t618_i2c_resume);
 
 static struct i2c_driver rn5t618_i2c_driver = {
        .driver = {
                .name = "rn5t618",
                .of_match_table = of_match_ptr(rn5t618_of_match),
+               .pm = &rn5t618_i2c_dev_pm_ops,
        },
-       .probe = rn5t618_i2c_probe,
+       .probe_new = rn5t618_i2c_probe,
        .remove = rn5t618_i2c_remove,
-       .id_table = rn5t618_i2c_id,
 };
 
 module_i2c_driver(rn5t618_i2c_driver);
index c0529a1..ebdf2f1 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/of_device.h>
 #include <linux/regmap.h>
 #include <linux/spi/spi.h>
+#include <uapi/linux/usb/charger.h>
 
 #define SPRD_PMIC_INT_MASK_STATUS      0x0
 #define SPRD_PMIC_INT_RAW_STATUS       0x4
 
 #define SPRD_SC2731_IRQ_BASE           0x140
 #define SPRD_SC2731_IRQ_NUMS           16
+#define SPRD_SC2731_CHG_DET            0xedc
+
+/* PMIC charger detection definition */
+#define SPRD_PMIC_CHG_DET_DELAY_US     200000
+#define SPRD_PMIC_CHG_DET_TIMEOUT      2000000
+#define SPRD_PMIC_CHG_DET_DONE         BIT(11)
+#define SPRD_PMIC_SDP_TYPE             BIT(7)
+#define SPRD_PMIC_DCP_TYPE             BIT(6)
+#define SPRD_PMIC_CDP_TYPE             BIT(5)
+#define SPRD_PMIC_CHG_TYPE_MASK                GENMASK(7, 5)
 
 struct sprd_pmic {
        struct regmap *regmap;
@@ -24,12 +35,14 @@ struct sprd_pmic {
        struct regmap_irq *irqs;
        struct regmap_irq_chip irq_chip;
        struct regmap_irq_chip_data *irq_data;
+       const struct sprd_pmic_data *pdata;
        int irq;
 };
 
 struct sprd_pmic_data {
        u32 irq_base;
        u32 num_irqs;
+       u32 charger_det;
 };
 
 /*
@@ -40,8 +53,46 @@ struct sprd_pmic_data {
 static const struct sprd_pmic_data sc2731_data = {
        .irq_base = SPRD_SC2731_IRQ_BASE,
        .num_irqs = SPRD_SC2731_IRQ_NUMS,
+       .charger_det = SPRD_SC2731_CHG_DET,
 };
 
+enum usb_charger_type sprd_pmic_detect_charger_type(struct device *dev)
+{
+       struct spi_device *spi = to_spi_device(dev);
+       struct sprd_pmic *ddata = spi_get_drvdata(spi);
+       const struct sprd_pmic_data *pdata = ddata->pdata;
+       enum usb_charger_type type;
+       u32 val;
+       int ret;
+
+       ret = regmap_read_poll_timeout(ddata->regmap, pdata->charger_det, val,
+                                      (val & SPRD_PMIC_CHG_DET_DONE),
+                                      SPRD_PMIC_CHG_DET_DELAY_US,
+                                      SPRD_PMIC_CHG_DET_TIMEOUT);
+       if (ret) {
+               dev_err(&spi->dev, "failed to detect charger type\n");
+               return UNKNOWN_TYPE;
+       }
+
+       switch (val & SPRD_PMIC_CHG_TYPE_MASK) {
+       case SPRD_PMIC_CDP_TYPE:
+               type = CDP_TYPE;
+               break;
+       case SPRD_PMIC_DCP_TYPE:
+               type = DCP_TYPE;
+               break;
+       case SPRD_PMIC_SDP_TYPE:
+               type = SDP_TYPE;
+               break;
+       default:
+               type = UNKNOWN_TYPE;
+               break;
+       }
+
+       return type;
+}
+EXPORT_SYMBOL_GPL(sprd_pmic_detect_charger_type);
+
 static const struct mfd_cell sprd_pmic_devs[] = {
        {
                .name = "sc27xx-wdt",
@@ -181,6 +232,7 @@ static int sprd_pmic_probe(struct spi_device *spi)
        spi_set_drvdata(spi, ddata);
        ddata->dev = &spi->dev;
        ddata->irq = spi->irq;
+       ddata->pdata = pdata;
 
        ddata->irq_chip.name = dev_name(&spi->dev);
        ddata->irq_chip.status_base =
index cc92bc3..886459e 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/task_stack.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_X86_32
 #include <asm/desc.h>
@@ -175,6 +176,80 @@ void lkdtm_HUNG_TASK(void)
        schedule();
 }
 
+volatile unsigned int huge = INT_MAX - 2;
+volatile unsigned int ignored;
+
+void lkdtm_OVERFLOW_SIGNED(void)
+{
+       int value;
+
+       value = huge;
+       pr_info("Normal signed addition ...\n");
+       value += 1;
+       ignored = value;
+
+       pr_info("Overflowing signed addition ...\n");
+       value += 4;
+       ignored = value;
+}
+
+
+void lkdtm_OVERFLOW_UNSIGNED(void)
+{
+       unsigned int value;
+
+       value = huge;
+       pr_info("Normal unsigned addition ...\n");
+       value += 1;
+       ignored = value;
+
+       pr_info("Overflowing unsigned addition ...\n");
+       value += 4;
+       ignored = value;
+}
+
+/* Intentionally using old-style flex array definition of 1 byte. */
+struct array_bounds_flex_array {
+       int one;
+       int two;
+       char data[1];
+};
+
+struct array_bounds {
+       int one;
+       int two;
+       char data[8];
+       int three;
+};
+
+void lkdtm_ARRAY_BOUNDS(void)
+{
+       struct array_bounds_flex_array *not_checked;
+       struct array_bounds *checked;
+       volatile int i;
+
+       not_checked = kmalloc(sizeof(*not_checked) * 2, GFP_KERNEL);
+       checked = kmalloc(sizeof(*checked) * 2, GFP_KERNEL);
+
+       pr_info("Array access within bounds ...\n");
+       /* For both, touch all bytes in the actual member size. */
+       for (i = 0; i < sizeof(checked->data); i++)
+               checked->data[i] = 'A';
+       /*
+        * For the uninstrumented flex array member, also touch 1 byte
+        * beyond to verify it is correctly uninstrumented.
+        */
+       for (i = 0; i < sizeof(not_checked->data) + 1; i++)
+               not_checked->data[i] = 'A';
+
+       pr_info("Array access beyond bounds ...\n");
+       for (i = 0; i < sizeof(checked->data) + 1; i++)
+               checked->data[i] = 'B';
+
+       kfree(not_checked);
+       kfree(checked);
+}
+
 void lkdtm_CORRUPT_LIST_ADD(void)
 {
        /*
index 5ce4ac8..a5e344d 100644 (file)
@@ -130,6 +130,9 @@ static const struct crashtype crashtypes[] = {
        CRASHTYPE(HARDLOCKUP),
        CRASHTYPE(SPINLOCKUP),
        CRASHTYPE(HUNG_TASK),
+       CRASHTYPE(OVERFLOW_SIGNED),
+       CRASHTYPE(OVERFLOW_UNSIGNED),
+       CRASHTYPE(ARRAY_BOUNDS),
        CRASHTYPE(EXEC_DATA),
        CRASHTYPE(EXEC_STACK),
        CRASHTYPE(EXEC_KMALLOC),
index 8d13d01..601a215 100644 (file)
@@ -22,6 +22,9 @@ void lkdtm_SOFTLOCKUP(void);
 void lkdtm_HARDLOCKUP(void);
 void lkdtm_SPINLOCKUP(void);
 void lkdtm_HUNG_TASK(void);
+void lkdtm_OVERFLOW_SIGNED(void);
+void lkdtm_OVERFLOW_UNSIGNED(void);
+void lkdtm_ARRAY_BOUNDS(void);
 void lkdtm_CORRUPT_LIST_ADD(void);
 void lkdtm_CORRUPT_LIST_DEL(void);
 void lkdtm_CORRUPT_USER_DS(void);
index b6841ba..8f201d0 100644 (file)
@@ -133,8 +133,4 @@ config VOP
          OS and tools for MIC to use with this driver are available from
          <http://software.intel.com/en-us/mic-developer>.
 
-if VOP
-source "drivers/vhost/Kconfig.vringh"
-endif
-
 endmenu
index e74e2bb..9db0570 100644 (file)
@@ -58,8 +58,4 @@ config CAIF_VIRTIO
        ---help---
          The CAIF driver for CAIF over Virtio.
 
-if CAIF_VIRTIO
-source "drivers/vhost/Kconfig.vringh"
-endif
-
 endif # CAIF_DRIVERS
index a8b5159..09087c3 100644 (file)
@@ -1042,8 +1042,10 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
                        return -EFAULT;
        }
 
-       if (!desc || (desc->out_num + desc->in_num == 0) ||
-                       !test_bit(cmd, &cmd_mask))
+       if (!desc ||
+           (desc->out_num + desc->in_num == 0) ||
+           cmd > ND_CMD_CALL ||
+           !test_bit(cmd, &cmd_mask))
                return -ENOTTY;
 
        /* fail write commands (when read-only) */
index 64776ed..7d4ddc4 100644 (file)
@@ -99,7 +99,7 @@ static int nvdimm_probe(struct device *dev)
        if (ndd->ns_current >= 0) {
                rc = nd_label_reserve_dpa(ndd);
                if (rc == 0)
-                       nvdimm_set_aliasing(dev);
+                       nvdimm_set_labeling(dev);
        }
        nvdimm_bus_unlock(dev);
 
index 94ea6db..b7b77e8 100644 (file)
@@ -32,7 +32,7 @@ int nvdimm_check_config_data(struct device *dev)
 
        if (!nvdimm->cmd_mask ||
            !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) {
-               if (test_bit(NDD_ALIASING, &nvdimm->flags))
+               if (test_bit(NDD_LABELING, &nvdimm->flags))
                        return -ENXIO;
                else
                        return -ENOTTY;
@@ -173,11 +173,11 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
        return rc;
 }
 
-void nvdimm_set_aliasing(struct device *dev)
+void nvdimm_set_labeling(struct device *dev)
 {
        struct nvdimm *nvdimm = to_nvdimm(dev);
 
-       set_bit(NDD_ALIASING, &nvdimm->flags);
+       set_bit(NDD_LABELING, &nvdimm->flags);
 }
 
 void nvdimm_set_locked(struct device *dev)
@@ -312,8 +312,9 @@ static ssize_t flags_show(struct device *dev,
 {
        struct nvdimm *nvdimm = to_nvdimm(dev);
 
-       return sprintf(buf, "%s%s\n",
+       return sprintf(buf, "%s%s%s\n",
                        test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
+                       test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "",
                        test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
 }
 static DEVICE_ATTR_RO(flags);
@@ -562,6 +563,21 @@ int nvdimm_security_freeze(struct nvdimm *nvdimm)
        return rc;
 }
 
+static unsigned long dpa_align(struct nd_region *nd_region)
+{
+       struct device *dev = &nd_region->dev;
+
+       if (dev_WARN_ONCE(dev, !is_nvdimm_bus_locked(dev),
+                               "bus lock required for capacity provision\n"))
+               return 0;
+       if (dev_WARN_ONCE(dev, !nd_region->ndr_mappings || nd_region->align
+                               % nd_region->ndr_mappings,
+                               "invalid region align %#lx mappings: %d\n",
+                               nd_region->align, nd_region->ndr_mappings))
+               return 0;
+       return nd_region->align / nd_region->ndr_mappings;
+}
+
 int alias_dpa_busy(struct device *dev, void *data)
 {
        resource_size_t map_end, blk_start, new;
@@ -570,6 +586,7 @@ int alias_dpa_busy(struct device *dev, void *data)
        struct nd_region *nd_region;
        struct nvdimm_drvdata *ndd;
        struct resource *res;
+       unsigned long align;
        int i;
 
        if (!is_memory(dev))
@@ -607,13 +624,21 @@ int alias_dpa_busy(struct device *dev, void *data)
         * Find the free dpa from the end of the last pmem allocation to
         * the end of the interleave-set mapping.
         */
+       align = dpa_align(nd_region);
+       if (!align)
+               return 0;
+
        for_each_dpa_resource(ndd, res) {
+               resource_size_t start, end;
+
                if (strncmp(res->name, "pmem", 4) != 0)
                        continue;
-               if ((res->start >= blk_start && res->start < map_end)
-                               || (res->end >= blk_start
-                                       && res->end <= map_end)) {
-                       new = max(blk_start, min(map_end + 1, res->end + 1));
+
+               start = ALIGN_DOWN(res->start, align);
+               end = ALIGN(res->end + 1, align) - 1;
+               if ((start >= blk_start && start < map_end)
+                               || (end >= blk_start && end <= map_end)) {
+                       new = max(blk_start, min(map_end, end) + 1);
                        if (new != blk_start) {
                                blk_start = new;
                                goto retry;
@@ -653,6 +678,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
                .res = NULL,
        };
        struct resource *res;
+       unsigned long align;
 
        if (!ndd)
                return 0;
@@ -660,10 +686,20 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
        device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
 
        /* now account for busy blk allocations in unaliased dpa */
+       align = dpa_align(nd_region);
+       if (!align)
+               return 0;
        for_each_dpa_resource(ndd, res) {
+               resource_size_t start, end, size;
+
                if (strncmp(res->name, "blk", 3) != 0)
                        continue;
-               info.available -= resource_size(res);
+               start = ALIGN_DOWN(res->start, align);
+               end = ALIGN(res->end + 1, align) - 1;
+               size = end - start + 1;
+               if (size >= info.available)
+                       return 0;
+               info.available -= size;
        }
 
        return info.available;
@@ -682,19 +718,31 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
        struct nvdimm_bus *nvdimm_bus;
        resource_size_t max = 0;
        struct resource *res;
+       unsigned long align;
 
        /* if a dimm is disabled the available capacity is zero */
        if (!ndd)
                return 0;
 
+       align = dpa_align(nd_region);
+       if (!align)
+               return 0;
+
        nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
        if (__reserve_free_pmem(&nd_region->dev, nd_mapping->nvdimm))
                return 0;
        for_each_dpa_resource(ndd, res) {
+               resource_size_t start, end;
+
                if (strcmp(res->name, "pmem-reserve") != 0)
                        continue;
-               if (resource_size(res) > max)
-                       max = resource_size(res);
+               /* trim free space relative to current alignment setting */
+               start = ALIGN(res->start, align);
+               end = ALIGN_DOWN(res->end + 1, align) - 1;
+               if (end < start)
+                       continue;
+               if (end - start + 1 > max)
+                       max = end - start + 1;
        }
        release_free_pmem(nvdimm_bus, nd_mapping);
        return max;
@@ -722,24 +770,33 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
        struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
        struct resource *res;
        const char *reason;
+       unsigned long align;
 
        if (!ndd)
                return 0;
 
+       align = dpa_align(nd_region);
+       if (!align)
+               return 0;
+
        map_start = nd_mapping->start;
        map_end = map_start + nd_mapping->size - 1;
        blk_start = max(map_start, map_end + 1 - *overlap);
        for_each_dpa_resource(ndd, res) {
-               if (res->start >= map_start && res->start < map_end) {
+               resource_size_t start, end;
+
+               start = ALIGN_DOWN(res->start, align);
+               end = ALIGN(res->end + 1, align) - 1;
+               if (start >= map_start && start < map_end) {
                        if (strncmp(res->name, "blk", 3) == 0)
                                blk_start = min(blk_start,
-                                               max(map_start, res->start));
-                       else if (res->end > map_end) {
+                                               max(map_start, start));
+                       else if (end > map_end) {
                                reason = "misaligned to iset";
                                goto err;
                        } else
-                               busy += resource_size(res);
-               } else if (res->end >= map_start && res->end <= map_end) {
+                               busy += end - start + 1;
+               } else if (end >= map_start && end <= map_end) {
                        if (strncmp(res->name, "blk", 3) == 0) {
                                /*
                                 * If a BLK allocation overlaps the start of
@@ -748,8 +805,8 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
                                 */
                                blk_start = map_start;
                        } else
-                               busy += resource_size(res);
-               } else if (map_start > res->start && map_start < res->end) {
+                               busy += end - start + 1;
+               } else if (map_start > start && map_start < end) {
                        /* total eclipse of the mapping */
                        busy += nd_mapping->size;
                        blk_start = map_start;
@@ -759,7 +816,7 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
        *overlap = map_end + 1 - blk_start;
        available = blk_start - map_start;
        if (busy < available)
-               return available - busy;
+               return ALIGN_DOWN(available - busy, align);
        return 0;
 
  err:
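
A small userspace sketch (made-up numbers) of the rounding the hunks above apply: each busy resource is expanded outwards to the per-DIMM alignment returned by dpa_align() before it is subtracted from, or compared against, the available capacity.

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long long align = 2ULL << 20;			/* assumed 2M per-DIMM alignment */
	unsigned long long res_start = 0x300000ULL + 0x1000;	/* unaligned busy range */
	unsigned long long res_end   = 0x500000ULL - 0x2000 - 1;

	unsigned long long start = ALIGN_DOWN(res_start, align);
	unsigned long long end   = ALIGN(res_end + 1, align) - 1;

	/* ~1.99M of raw usage is accounted as a full 4M aligned window */
	printf("busy: %#llx bytes\n", end - start + 1);
	return 0;
}
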
index e02f60a..4cd18be 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/libnvdimm.h>
 #include <linux/module.h>
+#include <linux/numa.h>
 
 static int e820_pmem_remove(struct platform_device *pdev)
 {
@@ -16,27 +17,16 @@ static int e820_pmem_remove(struct platform_device *pdev)
        return 0;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int e820_range_to_nid(resource_size_t addr)
-{
-       return memory_add_physaddr_to_nid(addr);
-}
-#else
-static int e820_range_to_nid(resource_size_t addr)
-{
-       return NUMA_NO_NODE;
-}
-#endif
-
 static int e820_register_one(struct resource *res, void *data)
 {
        struct nd_region_desc ndr_desc;
        struct nvdimm_bus *nvdimm_bus = data;
+       int nid = phys_to_target_node(res->start);
 
        memset(&ndr_desc, 0, sizeof(ndr_desc));
        ndr_desc.res = res;
-       ndr_desc.numa_node = e820_range_to_nid(res->start);
-       ndr_desc.target_node = ndr_desc.numa_node;
+       ndr_desc.numa_node = numa_map_to_online_node(nid);
+       ndr_desc.target_node = nid;
        set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
        if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
                return -ENXIO;
index 4c7b775..956b6d1 100644 (file)
@@ -62,7 +62,7 @@ struct nd_namespace_index {
        __le16 major;
        __le16 minor;
        __le64 checksum;
-       u8 free[0];
+       u8 free[];
 };
 
 /**
index 032dc61..ae155e8 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/nd.h>
 #include "nd-core.h"
 #include "pmem.h"
+#include "pfn.h"
 #include "nd.h"
 
 static void namespace_io_release(struct device *dev)
@@ -541,6 +542,11 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd,
 {
        bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
        bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+       unsigned long align;
+
+       align = nd_region->align / nd_region->ndr_mappings;
+       valid->start = ALIGN(valid->start, align);
+       valid->end = ALIGN_DOWN(valid->end + 1, align) - 1;
 
        if (valid->start >= valid->end)
                goto invalid;
@@ -980,10 +986,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
                return -ENXIO;
        }
 
-       div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, &remainder);
+       div_u64_rem(val, nd_region->align, &remainder);
        if (remainder) {
                dev_dbg(dev, "%llu is not %ldK aligned\n", val,
-                               (PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K);
+                               nd_region->align / SZ_1K);
                return -EINVAL;
        }
 
@@ -1739,6 +1745,22 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
                return ERR_PTR(-ENODEV);
        }
 
+       /*
+        * Note, alignment validation for fsdax and devdax mode
+        * namespaces happens in nd_pfn_validate() where infoblock
+        * padding parameters can be applied.
+        */
+       if (pmem_should_map_pages(dev)) {
+               struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+               struct resource *res = &nsio->res;
+
+               if (!IS_ALIGNED(res->start | (res->end + 1),
+                                       memremap_compat_align())) {
+                       dev_err(&ndns->dev, "%pr misaligned, unable to map\n", res);
+                       return ERR_PTR(-EOPNOTSUPP);
+               }
+       }
+
        if (is_namespace_pmem(&ndns->dev)) {
                struct nd_namespace_pmem *nspm;
 
@@ -2521,7 +2543,7 @@ static int init_active_labels(struct nd_region *nd_region)
                if (!ndd) {
                        if (test_bit(NDD_LOCKED, &nvdimm->flags))
                                /* fail, label data may be unreadable */;
-                       else if (test_bit(NDD_ALIASING, &nvdimm->flags))
+                       else if (test_bit(NDD_LABELING, &nvdimm->flags))
                                /* fail, labels needed to disambiguate dpa */;
                        else
                                return 0;
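
A quick illustration (assumed 16M compat alignment, arbitrary addresses) of the single IS_ALIGNED() test used above: OR-ing the start and end+1 preserves any low bit set in either value, so one check rejects a namespace whose either edge is misaligned.

#include <stdio.h>

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

int main(void)
{
	unsigned long long align = 16ULL << 20;
	unsigned long long start = 0x240000000ULL;	/* 16M aligned */
	unsigned long long end1  = 0x250001000ULL;	/* end + 1, misaligned */

	printf("%d\n", IS_ALIGNED(start | end1, align));	/* 0: mapping refused */
	return 0;
}
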
index c9f6a5b..85dbb2a 100644 (file)
@@ -39,7 +39,7 @@ struct nd_region_data {
        int ns_count;
        int ns_active;
        unsigned int hints_shift;
-       void __iomem *flush_wpq[0];
+       void __iomem *flush_wpq[];
 };
 
 static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd,
@@ -146,6 +146,7 @@ struct nd_region {
        struct device *btt_seed;
        struct device *pfn_seed;
        struct device *dax_seed;
+       unsigned long align;
        u16 ndr_mappings;
        u64 ndr_size;
        u64 ndr_start;
@@ -156,7 +157,7 @@ struct nd_region {
        struct nd_interleave_set *nd_set;
        struct nd_percpu_lane __percpu *lane;
        int (*flush)(struct nd_region *nd_region, struct bio *bio);
-       struct nd_mapping mapping[0];
+       struct nd_mapping mapping[];
 };
 
 struct nd_blk_region {
@@ -252,7 +253,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
                void *buf, size_t len);
 long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
                unsigned int len);
-void nvdimm_set_aliasing(struct device *dev);
+void nvdimm_set_labeling(struct device *dev);
 void nvdimm_set_locked(struct device *dev);
 void nvdimm_clear_locked(struct device *dev);
 int nvdimm_security_setup_events(struct device *dev);
index 8224d14..6826a27 100644 (file)
@@ -62,8 +62,10 @@ static int of_pmem_region_probe(struct platform_device *pdev)
 
                if (is_volatile)
                        region = nvdimm_volatile_region_create(bus, &ndr_desc);
-               else
+               else {
+                       set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
                        region = nvdimm_pmem_region_create(bus, &ndr_desc);
+               }
 
                if (!region)
                        dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n",
index acb1951..37cb1b8 100644 (file)
@@ -24,6 +24,18 @@ struct nd_pfn_sb {
        __le64 npfns;
        __le32 mode;
        /* minor-version-1 additions for section alignment */
+       /**
+        * @start_pad: Deprecated attribute to pad start-misaligned namespaces
+        *
+        * start_pad is deprecated because the original definition did
+        * not comprehend that dataoff is relative to the base address
+        * of the namespace, not the start_pad-adjusted base. The result
+        * is that the dax path is broken, but the block-I/O path is
+        * not. The kernel will no longer create namespaces using start
+        * padding, but it still supports block-I/O for legacy
+        * configurations, mainly to allow a backup, reconfigure-the-
+        * namespace, and restore flow that repairs dax operation.
        __le32 start_pad;
        __le32 end_trunc;
        /* minor-version-2 record the base alignment of the mapping */
index b94f7a7..34db557 100644 (file)
@@ -446,6 +446,7 @@ static bool nd_supported_alignment(unsigned long align)
 int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 {
        u64 checksum, offset;
+       struct resource *res;
        enum nd_pfn_mode mode;
        struct nd_namespace_io *nsio;
        unsigned long align, start_pad;
@@ -561,14 +562,14 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
                        dev_dbg(&nd_pfn->dev, "align: %lx:%lx mode: %d:%d\n",
                                        nd_pfn->align, align, nd_pfn->mode,
                                        mode);
-                       return -EINVAL;
+                       return -EOPNOTSUPP;
                }
        }
 
        if (align > nvdimm_namespace_capacity(ndns)) {
                dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
                                align, nvdimm_namespace_capacity(ndns));
-               return -EINVAL;
+               return -EOPNOTSUPP;
        }
 
        /*
@@ -578,18 +579,31 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
         * established.
         */
        nsio = to_nd_namespace_io(&ndns->dev);
-       if (offset >= resource_size(&nsio->res)) {
+       res = &nsio->res;
+       if (offset >= resource_size(res)) {
                dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
                                dev_name(&ndns->dev));
-               return -EBUSY;
+               return -EOPNOTSUPP;
        }
 
-       if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
+       if ((align && !IS_ALIGNED(res->start + offset + start_pad, align))
                        || !IS_ALIGNED(offset, PAGE_SIZE)) {
                dev_err(&nd_pfn->dev,
                                "bad offset: %#llx dax disabled align: %#lx\n",
                                offset, align);
-               return -ENXIO;
+               return -EOPNOTSUPP;
+       }
+
+       if (!IS_ALIGNED(res->start + le32_to_cpu(pfn_sb->start_pad),
+                               memremap_compat_align())) {
+               dev_err(&nd_pfn->dev, "resource start misaligned\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (!IS_ALIGNED(res->end + 1 - le32_to_cpu(pfn_sb->end_trunc),
+                               memremap_compat_align())) {
+               dev_err(&nd_pfn->dev, "resource end misaligned\n");
+               return -EOPNOTSUPP;
        }
 
        return 0;
@@ -750,7 +764,19 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
        start = nsio->res.start;
        size = resource_size(&nsio->res);
        npfns = PHYS_PFN(size - SZ_8K);
-       align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT));
+       align = max(nd_pfn->align, memremap_compat_align());
+
+       /*
+        * When @start is misaligned, fail namespace creation. See
+        * the 'struct nd_pfn_sb' commentary on why ->start_pad is not
+        * an option.
+        */
+       if (!IS_ALIGNED(start, memremap_compat_align())) {
+               dev_err(&nd_pfn->dev, "%s: start %pa misaligned to %#lx\n",
+                               dev_name(&ndns->dev), &start,
+                               memremap_compat_align());
+               return -EINVAL;
+       }
        end_trunc = start + size - ALIGN_DOWN(start + size, align);
        if (nd_pfn->mode == PFN_MODE_PMEM) {
                /*
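
A worked example (made-up sizes) of the end_trunc computation above: with the namespace start already aligned, whatever hangs past the last aligned boundary at the end is truncated instead of padding the start.

#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long start = 0x100000000ULL;		/* 4G, 16M aligned */
	unsigned long long size  = (100ULL << 20) + 0x3000;	/* 100M + 12K */
	unsigned long long align = 16ULL << 20;

	unsigned long long end_trunc =
		start + size - ALIGN_DOWN(start + size, align);

	printf("%#llx\n", end_trunc);	/* 0x403000: 4M + 12K trimmed from the tail */
	return 0;
}
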
index 4ffc6f7..2df6994 100644 (file)
@@ -136,9 +136,25 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
        return BLK_STS_OK;
 }
 
-static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-                       unsigned int len, unsigned int off, unsigned int op,
-                       sector_t sector)
+static blk_status_t pmem_do_read(struct pmem_device *pmem,
+                       struct page *page, unsigned int page_off,
+                       sector_t sector, unsigned int len)
+{
+       blk_status_t rc;
+       phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+       void *pmem_addr = pmem->virt_addr + pmem_off;
+
+       if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+               return BLK_STS_IOERR;
+
+       rc = read_pmem(page, page_off, pmem_addr, len);
+       flush_dcache_page(page);
+       return rc;
+}
+
+static blk_status_t pmem_do_write(struct pmem_device *pmem,
+                       struct page *page, unsigned int page_off,
+                       sector_t sector, unsigned int len)
 {
        blk_status_t rc = BLK_STS_OK;
        bool bad_pmem = false;
@@ -148,34 +164,25 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                bad_pmem = true;
 
-       if (!op_is_write(op)) {
-               if (unlikely(bad_pmem))
-                       rc = BLK_STS_IOERR;
-               else {
-                       rc = read_pmem(page, off, pmem_addr, len);
-                       flush_dcache_page(page);
-               }
-       } else {
-               /*
-                * Note that we write the data both before and after
-                * clearing poison.  The write before clear poison
-                * handles situations where the latest written data is
-                * preserved and the clear poison operation simply marks
-                * the address range as valid without changing the data.
-                * In this case application software can assume that an
-                * interrupted write will either return the new good
-                * data or an error.
-                *
-                * However, if pmem_clear_poison() leaves the data in an
-                * indeterminate state we need to perform the write
-                * after clear poison.
-                */
-               flush_dcache_page(page);
-               write_pmem(pmem_addr, page, off, len);
-               if (unlikely(bad_pmem)) {
-                       rc = pmem_clear_poison(pmem, pmem_off, len);
-                       write_pmem(pmem_addr, page, off, len);
-               }
+       /*
+        * Note that we write the data both before and after
+        * clearing poison.  The write before clear poison
+        * handles situations where the latest written data is
+        * preserved and the clear poison operation simply marks
+        * the address range as valid without changing the data.
+        * In this case application software can assume that an
+        * interrupted write will either return the new good
+        * data or an error.
+        *
+        * However, if pmem_clear_poison() leaves the data in an
+        * indeterminate state we need to perform the write
+        * after clear poison.
+        */
+       flush_dcache_page(page);
+       write_pmem(pmem_addr, page, page_off, len);
+       if (unlikely(bad_pmem)) {
+               rc = pmem_clear_poison(pmem, pmem_off, len);
+               write_pmem(pmem_addr, page, page_off, len);
        }
 
        return rc;
@@ -197,8 +204,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 
        do_acct = nd_iostat_start(bio, &start);
        bio_for_each_segment(bvec, bio, iter) {
-               rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-                               bvec.bv_offset, bio_op(bio), iter.bi_sector);
+               if (op_is_write(bio_op(bio)))
+                       rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
+                               iter.bi_sector, bvec.bv_len);
+               else
+                       rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
+                               iter.bi_sector, bvec.bv_len);
                if (rc) {
                        bio->bi_status = rc;
                        break;
@@ -223,9 +234,12 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        blk_status_t rc;
 
-       rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
-                         0, op, sector);
-
+       if (op_is_write(op))
+               rc = pmem_do_write(pmem, page, 0, sector,
+                                  hpage_nr_pages(page) * PAGE_SIZE);
+       else
+               rc = pmem_do_read(pmem, page, 0, sector,
+                                  hpage_nr_pages(page) * PAGE_SIZE);
        /*
         * The ->rw_page interface is subtle and tricky.  The core
         * retries on any error, so we can only invoke page_endio() in
@@ -268,6 +282,16 @@ static const struct block_device_operations pmem_fops = {
        .revalidate_disk =      nvdimm_revalidate_disk,
 };
 
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                                   size_t nr_pages)
+{
+       struct pmem_device *pmem = dax_get_private(dax_dev);
+
+       return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0,
+                                  PFN_PHYS(pgoff) >> SECTOR_SHIFT,
+                                  PAGE_SIZE));
+}
+
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
                pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -299,6 +323,7 @@ static const struct dax_operations pmem_dax_ops = {
        .dax_supported = generic_fsdax_supported,
        .copy_from_iter = pmem_copy_from_iter,
        .copy_to_iter = pmem_copy_to_iter,
+       .zero_page_range = pmem_dax_zero_page_range,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
@@ -461,9 +486,9 @@ static int pmem_attach_disk(struct device *dev,
        if (is_nvdimm_sync(nd_region))
                flags = DAXDEV_F_SYNC;
        dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
-       if (!dax_dev) {
+       if (IS_ERR(dax_dev)) {
                put_disk(disk);
-               return -ENOMEM;
+               return PTR_ERR(dax_dev);
        }
        dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
        pmem->dax_dev = dax_dev;
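
The split read/write helpers above both start from the same offset math; a trivial sketch with illustrative values (not taken from a real device):

#include <stdio.h>

int main(void)
{
	unsigned long long data_offset = 2ULL << 20;	/* pmem->data_offset: data starts after metadata */
	unsigned long long sector = 4096;		/* 512-byte sectors */

	unsigned long long pmem_off = sector * 512 + data_offset;

	printf("%#llx\n", pmem_off);	/* 0x400000 */
	return 0;
}
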
index a19e535..ccbb5b4 100644 (file)
@@ -195,16 +195,16 @@ EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
 int nd_region_to_nstype(struct nd_region *nd_region)
 {
        if (is_memory(&nd_region->dev)) {
-               u16 i, alias;
+               u16 i, label;
 
-               for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
+               for (i = 0, label = 0; i < nd_region->ndr_mappings; i++) {
                        struct nd_mapping *nd_mapping = &nd_region->mapping[i];
                        struct nvdimm *nvdimm = nd_mapping->nvdimm;
 
-                       if (test_bit(NDD_ALIASING, &nvdimm->flags))
-                               alias++;
+                       if (test_bit(NDD_LABELING, &nvdimm->flags))
+                               label++;
                }
-               if (alias)
+               if (label)
                        return ND_DEVICE_NAMESPACE_PMEM;
                else
                        return ND_DEVICE_NAMESPACE_IO;
@@ -216,21 +216,25 @@ int nd_region_to_nstype(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL(nd_region_to_nstype);
 
-static ssize_t size_show(struct device *dev,
-               struct device_attribute *attr, char *buf)
+static unsigned long long region_size(struct nd_region *nd_region)
 {
-       struct nd_region *nd_region = to_nd_region(dev);
-       unsigned long long size = 0;
-
-       if (is_memory(dev)) {
-               size = nd_region->ndr_size;
+       if (is_memory(&nd_region->dev)) {
+               return nd_region->ndr_size;
        } else if (nd_region->ndr_mappings == 1) {
                struct nd_mapping *nd_mapping = &nd_region->mapping[0];
 
-               size = nd_mapping->size;
+               return nd_mapping->size;
        }
 
-       return sprintf(buf, "%llu\n", size);
+       return 0;
+}
+
+static ssize_t size_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nd_region *nd_region = to_nd_region(dev);
+
+       return sprintf(buf, "%llu\n", region_size(nd_region));
 }
 static DEVICE_ATTR_RO(size);
 
@@ -529,6 +533,54 @@ static ssize_t read_only_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(read_only);
 
+static ssize_t align_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nd_region *nd_region = to_nd_region(dev);
+
+       return sprintf(buf, "%#lx\n", nd_region->align);
+}
+
+static ssize_t align_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t len)
+{
+       struct nd_region *nd_region = to_nd_region(dev);
+       unsigned long val, dpa;
+       u32 remainder;
+       int rc;
+
+       rc = kstrtoul(buf, 0, &val);
+       if (rc)
+               return rc;
+
+       if (!nd_region->ndr_mappings)
+               return -ENXIO;
+
+       /*
+        * Ensure space-align is evenly divisible by the region
+        * interleave-width because the kernel typically has no facility
+        * to determine which DIMM(s), i.e. which dimm-physical-addresses,
+        * would contribute to the tail capacity in system-physical-address
+        * space for the namespace.
+        */
+       dpa = div_u64_rem(val, nd_region->ndr_mappings, &remainder);
+       if (!is_power_of_2(dpa) || dpa < PAGE_SIZE
+                       || val > region_size(nd_region) || remainder)
+               return -EINVAL;
+
+       /*
+        * Given that space allocation consults this value multiple
+        * times ensure it does not change for the duration of the
+        * allocation.
+        */
+       nvdimm_bus_lock(dev);
+       nd_region->align = val;
+       nvdimm_bus_unlock(dev);
+
+       return len;
+}
+static DEVICE_ATTR_RW(align);
+
 static ssize_t region_badblocks_show(struct device *dev,
                struct device_attribute *attr, char *buf)
 {
@@ -571,6 +623,7 @@ static DEVICE_ATTR_RO(persistence_domain);
 
 static struct attribute *nd_region_attributes[] = {
        &dev_attr_size.attr,
+       &dev_attr_align.attr,
        &dev_attr_nstype.attr,
        &dev_attr_mappings.attr,
        &dev_attr_btt_seed.attr,
@@ -626,6 +679,19 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
                return a->mode;
        }
 
+       if (a == &dev_attr_align.attr) {
+               int i;
+
+               for (i = 0; i < nd_region->ndr_mappings; i++) {
+                       struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+                       struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+                       if (test_bit(NDD_LABELING, &nvdimm->flags))
+                               return a->mode;
+               }
+               return 0;
+       }
+
        if (a != &dev_attr_set_cookie.attr
                        && a != &dev_attr_available_size.attr)
                return a->mode;
@@ -935,6 +1001,41 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
 }
 EXPORT_SYMBOL(nd_region_release_lane);
 
+/*
+ * PowerPC requires this alignment for memremap_pages(). All other archs
+ * should be ok with SUBSECTION_SIZE (see memremap_compat_align()).
+ */
+#define MEMREMAP_COMPAT_ALIGN_MAX SZ_16M
+
+static unsigned long default_align(struct nd_region *nd_region)
+{
+       unsigned long align;
+       int i, mappings;
+       u32 remainder;
+
+       if (is_nd_blk(&nd_region->dev))
+               align = PAGE_SIZE;
+       else
+               align = MEMREMAP_COMPAT_ALIGN_MAX;
+
+       for (i = 0; i < nd_region->ndr_mappings; i++) {
+               struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+               struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+               if (test_bit(NDD_ALIASING, &nvdimm->flags)) {
+                       align = MEMREMAP_COMPAT_ALIGN_MAX;
+                       break;
+               }
+       }
+
+       mappings = max_t(u16, 1, nd_region->ndr_mappings);
+       div_u64_rem(align, mappings, &remainder);
+       if (remainder)
+               align *= mappings;
+
+       return align;
+}
+
 static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
                struct nd_region_desc *ndr_desc,
                const struct device_type *dev_type, const char *caller)
@@ -1039,6 +1140,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
        dev->of_node = ndr_desc->of_node;
        nd_region->ndr_size = resource_size(ndr_desc->res);
        nd_region->ndr_start = ndr_desc->res->start;
+       nd_region->align = default_align(nd_region);
        if (ndr_desc->flush)
                nd_region->flush = ndr_desc->flush;
        else
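
A userspace sketch (hypothetical values) of the validation performed by align_store() above: the requested region alignment must divide evenly across the interleave set, and the per-DIMM share must be a power of two of at least PAGE_SIZE while the total stays within the region size.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

static bool is_power_of_2(unsigned long long n)
{
	return n && !(n & (n - 1));
}

static bool align_ok(unsigned long long val, unsigned int mappings,
		     unsigned long long region_size)
{
	if (val % mappings)
		return false;
	return is_power_of_2(val / mappings) &&
	       val / mappings >= PAGE_SIZE &&
	       val <= region_size;
}

int main(void)
{
	/* 16M over a 4-way interleave: 4M per DIMM, accepted */
	printf("%d\n", align_ok(16ULL << 20, 4, 1ULL << 30));
	/* 16M over a 3-way interleave: does not divide evenly, rejected */
	printf("%d\n", align_ok(16ULL << 20, 3, 1ULL << 30));
	return 0;
}
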
index 4f907e3..91c1bd6 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/compat.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/hdreg.h>
@@ -1252,6 +1253,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
        queue_work(nvme_wq, &ctrl->async_event_work);
 }
 
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+       if (in_compat_syscall())
+               ptrval = (compat_uptr_t)ptrval;
+       return (void __user *)ptrval;
+}
+
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
        struct nvme_user_io io;
@@ -1275,7 +1288,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
        length = (io.nblocks + 1) << ns->lba_shift;
        meta_len = (io.nblocks + 1) * ns->ms;
-       metadata = (void __user *)(uintptr_t)io.metadata;
+       metadata = nvme_to_user_ptr(io.metadata);
 
        if (ns->ext) {
                length += meta_len;
@@ -1298,7 +1311,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        c.rw.appmask = cpu_to_le16(io.appmask);
 
        return nvme_submit_user_cmd(ns->queue, &c,
-                       (void __user *)(uintptr_t)io.addr, length,
+                       nvme_to_user_ptr(io.addr), length,
                        metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
 }
 
@@ -1418,9 +1431,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
        effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
        status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
-                       (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
-                       (void __user *)(uintptr_t)cmd.metadata,
-                       cmd.metadata_len, 0, &result, timeout);
+                       nvme_to_user_ptr(cmd.addr), cmd.data_len,
+                       nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+                       0, &result, timeout);
        nvme_passthru_end(ctrl, effects);
 
        if (status >= 0) {
@@ -1465,8 +1478,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
        effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
        status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
-                       (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
-                       (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
+                       nvme_to_user_ptr(cmd.addr), cmd.data_len,
+                       nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
                        0, &cmd.result, timeout);
        nvme_passthru_end(ctrl, effects);
 
@@ -1884,6 +1897,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
        if (ns->head->disk) {
                nvme_update_disk_info(ns->head->disk, ns, id);
                blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+               if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+                       struct backing_dev_info *info =
+                               ns->head->disk->queue->backing_dev_info;
+
+                       info->capabilities |= BDI_CAP_STABLE_WRITES;
+               }
+
                revalidate_disk(ns->head->disk);
        }
 #endif
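
An illustrative sketch of the truncation nvme_to_user_ptr() performs above: on the compat path a 32-bit task may leave junk in the upper half of the 64-bit addr/metadata fields, so only the low 32 bits are taken as the user pointer (values below are invented).

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t addr_from_user = 0xdeadbeef00200000ULL;	/* upper bits are garbage */
	uint32_t compat_ptr = (uint32_t)addr_from_user;		/* what the 32-bit task meant */

	printf("%#x\n", compat_ptr);	/* 0x200000 */
	return 0;
}
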
index a8bf2fb..7dfc4a2 100644 (file)
@@ -342,8 +342,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
            !template->ls_req || !template->fcp_io ||
            !template->ls_abort || !template->fcp_abort ||
            !template->max_hw_queues || !template->max_sgl_segments ||
-           !template->max_dif_sgl_segments || !template->dma_boundary ||
-           !template->module) {
+           !template->max_dif_sgl_segments || !template->dma_boundary) {
                ret = -EINVAL;
                goto out_reghost_failed;
        }
@@ -2016,7 +2015,6 @@ nvme_fc_ctrl_free(struct kref *ref)
 {
        struct nvme_fc_ctrl *ctrl =
                container_of(ref, struct nvme_fc_ctrl, ref);
-       struct nvme_fc_lport *lport = ctrl->lport;
        unsigned long flags;
 
        if (ctrl->ctrl.tagset) {
@@ -2043,7 +2041,6 @@ nvme_fc_ctrl_free(struct kref *ref)
        if (ctrl->ctrl.opts)
                nvmf_free_options(ctrl->ctrl.opts);
        kfree(ctrl);
-       module_put(lport->ops->module);
 }
 
 static void
@@ -3074,15 +3071,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
                goto out_fail;
        }
 
-       if (!try_module_get(lport->ops->module)) {
-               ret = -EUNATCH;
-               goto out_free_ctrl;
-       }
-
        idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL);
        if (idx < 0) {
                ret = -ENOSPC;
-               goto out_mod_put;
+               goto out_free_ctrl;
        }
 
        ctrl->ctrl.opts = opts;
@@ -3232,8 +3224,6 @@ out_free_queues:
 out_free_ida:
        put_device(ctrl->dev);
        ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
-out_mod_put:
-       module_put(lport->ops->module);
 out_free_ctrl:
        kfree(ctrl);
 out_fail:
index 61bf875..54603bd 100644 (file)
@@ -510,7 +510,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
        if (!nr_nsids)
                return 0;
 
-       down_write(&ctrl->namespaces_rwsem);
+       down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                unsigned nsid = le32_to_cpu(desc->nsids[n]);
 
@@ -521,7 +521,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
                if (++n == nr_nsids)
                        break;
        }
-       up_write(&ctrl->namespaces_rwsem);
+       up_read(&ctrl->namespaces_rwsem);
        return 0;
 }
 
index 76dbb55..cac8a93 100644 (file)
@@ -1342,7 +1342,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
        int ret;
 
        sge->addr   = qe->dma;
-       sge->length = sizeof(struct nvme_command),
+       sge->length = sizeof(struct nvme_command);
        sge->lkey   = queue->device->pd->local_dma_lkey;
 
        wr.next       = NULL;
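
The one-character fix above replaces a stray comma operator with a semicolon. A standalone illustration of why the comma form is worth fixing even though it happened to behave the same here:

#include <stdio.h>

int main(void)
{
	int a = 0, b = 0;

	if (0)
		a = 1,		/* comma: "b = 2" is swallowed by the if */
		b = 2;

	printf("%d %d\n", a, b);	/* prints "0 0", not "0 2" */
	return 0;
}
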
index 0ef14f0..c15a921 100644 (file)
@@ -174,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 {
        struct request *rq;
-       unsigned int bytes;
 
        if (unlikely(nvme_tcp_async_req(req)))
                return false; /* async events don't have a request */
 
        rq = blk_mq_rq_from_pdu(req);
-       bytes = blk_rq_payload_bytes(rq);
 
-       return rq_data_dir(rq) == WRITE && bytes &&
-               bytes <= nvme_tcp_inline_data_size(req->queue);
+       return rq_data_dir(rq) == WRITE && req->data_len &&
+               req->data_len <= nvme_tcp_inline_data_size(req->queue);
 }
 
 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -1075,7 +1073,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
                if (result > 0)
                        pending = true;
                else if (unlikely(result < 0))
-                       break;
+                       return;
 
                if (!pending)
                        return;
@@ -2164,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
 
        c->common.flags |= NVME_CMD_SGL_METABUF;
 
-       if (rq_data_dir(rq) == WRITE && req->data_len &&
+       if (!blk_rq_nr_phys_segments(rq))
+               nvme_tcp_set_sg_null(c);
+       else if (rq_data_dir(rq) == WRITE &&
            req->data_len <= nvme_tcp_inline_data_size(queue))
                nvme_tcp_set_sg_inline(queue, c, req->data_len);
        else
@@ -2191,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
        req->data_sent = 0;
        req->pdu_len = 0;
        req->pdu_sent = 0;
-       req->data_len = blk_rq_payload_bytes(rq);
+       req->data_len = blk_rq_nr_phys_segments(rq) ?
+                               blk_rq_payload_bytes(rq) : 0;
        req->curr_bio = rq->bio;
 
        if (rq_data_dir(rq) == WRITE &&
@@ -2298,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
        struct nvme_tcp_queue *queue = hctx->driver_data;
        struct sock *sk = queue->sock->sk;
 
+       if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
+               return 0;
+
        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
                sk_busy_loop(sk, true);
        nvme_tcp_try_recv(queue);
index 7aa1078..58cabd7 100644 (file)
@@ -1098,12 +1098,19 @@ static struct configfs_attribute *nvmet_referral_attrs[] = {
        NULL,
 };
 
-static void nvmet_referral_release(struct config_item *item)
+static void nvmet_referral_notify(struct config_group *group,
+               struct config_item *item)
 {
        struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
        struct nvmet_port *port = to_nvmet_port(item);
 
        nvmet_referral_disable(parent, port);
+}
+
+static void nvmet_referral_release(struct config_item *item)
+{
+       struct nvmet_port *port = to_nvmet_port(item);
+
        kfree(port);
 }
 
@@ -1134,6 +1141,7 @@ static struct config_group *nvmet_referral_make(
 
 static struct configfs_group_operations nvmet_referral_group_ops = {
        .make_group             = nvmet_referral_make,
+       .disconnect_notify      = nvmet_referral_notify,
 };
 
 static const struct config_item_type nvmet_referrals_type = {
index a0db637..a8ceb77 100644 (file)
@@ -684,7 +684,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
        disconnect = atomic_xchg(&queue->connected, 0);
 
        spin_lock_irqsave(&queue->qlock, flags);
-       /* about outstanding io's */
+       /* abort outstanding io's */
        for (i = 0; i < queue->sqsize; fod++, i++) {
                if (fod->active) {
                        spin_lock(&fod->flock);
index 1c50af6..f69ce66 100644 (file)
@@ -198,10 +198,13 @@ struct fcloop_lport_priv {
 };
 
 struct fcloop_rport {
-       struct nvme_fc_remote_port *remoteport;
-       struct nvmet_fc_target_port *targetport;
-       struct fcloop_nport *nport;
-       struct fcloop_lport *lport;
+       struct nvme_fc_remote_port      *remoteport;
+       struct nvmet_fc_target_port     *targetport;
+       struct fcloop_nport             *nport;
+       struct fcloop_lport             *lport;
+       spinlock_t                      lock;
+       struct list_head                ls_list;
+       struct work_struct              ls_work;
 };
 
 struct fcloop_tport {
@@ -224,11 +227,10 @@ struct fcloop_nport {
 };
 
 struct fcloop_lsreq {
-       struct fcloop_tport             *tport;
        struct nvmefc_ls_req            *lsreq;
-       struct work_struct              work;
        struct nvmefc_tgt_ls_req        tgt_ls_req;
        int                             status;
+       struct list_head                ls_list; /* fcloop_rport->ls_list */
 };
 
 struct fcloop_rscn {
@@ -292,21 +294,32 @@ fcloop_delete_queue(struct nvme_fc_local_port *localport,
 {
 }
 
-
-/*
- * Transmit of LS RSP done (e.g. buffers all set). call back up
- * initiator "done" flows.
- */
 static void
-fcloop_tgt_lsrqst_done_work(struct work_struct *work)
+fcloop_rport_lsrqst_work(struct work_struct *work)
 {
-       struct fcloop_lsreq *tls_req =
-               container_of(work, struct fcloop_lsreq, work);
-       struct fcloop_tport *tport = tls_req->tport;
-       struct nvmefc_ls_req *lsreq = tls_req->lsreq;
+       struct fcloop_rport *rport =
+               container_of(work, struct fcloop_rport, ls_work);
+       struct fcloop_lsreq *tls_req;
 
-       if (!tport || tport->remoteport)
-               lsreq->done(lsreq, tls_req->status);
+       spin_lock(&rport->lock);
+       for (;;) {
+               tls_req = list_first_entry_or_null(&rport->ls_list,
+                               struct fcloop_lsreq, ls_list);
+               if (!tls_req)
+                       break;
+
+               list_del(&tls_req->ls_list);
+               spin_unlock(&rport->lock);
+
+               tls_req->lsreq->done(tls_req->lsreq, tls_req->status);
+               /*
+                * callee may free memory containing tls_req.
+                * do not reference lsreq after this.
+                */
+
+               spin_lock(&rport->lock);
+       }
+       spin_unlock(&rport->lock);
 }
 
 static int
@@ -319,17 +332,18 @@ fcloop_ls_req(struct nvme_fc_local_port *localport,
        int ret = 0;
 
        tls_req->lsreq = lsreq;
-       INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work);
+       INIT_LIST_HEAD(&tls_req->ls_list);
 
        if (!rport->targetport) {
                tls_req->status = -ECONNREFUSED;
-               tls_req->tport = NULL;
-               schedule_work(&tls_req->work);
+               spin_lock(&rport->lock);
+               list_add_tail(&rport->ls_list, &tls_req->ls_list);
+               spin_unlock(&rport->lock);
+               schedule_work(&rport->ls_work);
                return ret;
        }
 
        tls_req->status = 0;
-       tls_req->tport = rport->targetport->private;
        ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req,
                                 lsreq->rqstaddr, lsreq->rqstlen);
 
@@ -337,18 +351,28 @@ fcloop_ls_req(struct nvme_fc_local_port *localport,
 }
 
 static int
-fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
+fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *targetport,
                        struct nvmefc_tgt_ls_req *tgt_lsreq)
 {
        struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq);
        struct nvmefc_ls_req *lsreq = tls_req->lsreq;
+       struct fcloop_tport *tport = targetport->private;
+       struct nvme_fc_remote_port *remoteport = tport->remoteport;
+       struct fcloop_rport *rport;
 
        memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf,
                ((lsreq->rsplen < tgt_lsreq->rsplen) ?
                                lsreq->rsplen : tgt_lsreq->rsplen));
+
        tgt_lsreq->done(tgt_lsreq);
 
-       schedule_work(&tls_req->work);
+       if (remoteport) {
+               rport = remoteport->private;
+               spin_lock(&rport->lock);
+               list_add_tail(&rport->ls_list, &tls_req->ls_list);
+               spin_unlock(&rport->lock);
+               schedule_work(&rport->ls_work);
+       }
 
        return 0;
 }
@@ -834,6 +858,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
 {
        struct fcloop_rport *rport = remoteport->private;
 
+       flush_work(&rport->ls_work);
        fcloop_nport_put(rport->nport);
 }
 
@@ -850,7 +875,6 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
 #define FCLOOP_DMABOUND_4G             0xFFFFFFFF
 
 static struct nvme_fc_port_template fctemplate = {
-       .module                 = THIS_MODULE,
        .localport_delete       = fcloop_localport_delete,
        .remoteport_delete      = fcloop_remoteport_delete,
        .create_queue           = fcloop_create_queue,
@@ -1136,6 +1160,9 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
        rport->nport = nport;
        rport->lport = nport->lport;
        nport->rport = rport;
+       spin_lock_init(&rport->lock);
+       INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work);
+       INIT_LIST_HEAD(&rport->ls_list);
 
        return count;
 }
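
A generic userspace sketch (not fcloop's types) of the drain pattern used by fcloop_rport_lsrqst_work() above: pop one entry while holding the lock, drop the lock, then invoke the completion, and never touch the entry again because the callback may free it.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
	void (*done)(struct item *);
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pending;

static void drain_pending(void)
{
	for (;;) {
		struct item *it;

		pthread_mutex_lock(&lock);
		it = pending;
		if (it)
			pending = it->next;
		pthread_mutex_unlock(&lock);

		if (!it)
			break;
		/* callback may free 'it'; never dereference it afterwards */
		it->done(it);
	}
}

static void free_item(struct item *it)
{
	printf("completed %p\n", (void *)it);
	free(it);
}

int main(void)
{
	struct item *it = calloc(1, sizeof(*it));

	it->done = free_item;
	pthread_mutex_lock(&lock);
	it->next = pending;
	pending = it;
	pthread_mutex_unlock(&lock);

	drain_pending();
	return 0;
}
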
index c90c068..fd47de0 100644 (file)
@@ -78,6 +78,7 @@ enum nvmet_rdma_queue_state {
 
 struct nvmet_rdma_queue {
        struct rdma_cm_id       *cm_id;
+       struct ib_qp            *qp;
        struct nvmet_port       *port;
        struct ib_cq            *cq;
        atomic_t                sq_wr_avail;
@@ -105,6 +106,13 @@ struct nvmet_rdma_queue {
        struct list_head        queue_list;
 };
 
+struct nvmet_rdma_port {
+       struct nvmet_port       *nport;
+       struct sockaddr_storage addr;
+       struct rdma_cm_id       *cm_id;
+       struct delayed_work     repair_work;
+};
+
 struct nvmet_rdma_device {
        struct ib_device        *device;
        struct ib_pd            *pd;
@@ -461,7 +469,7 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
        if (ndev->srq)
                ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
        else
-               ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
+               ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);
 
        if (unlikely(ret))
                pr_err("post_recv cmd failed\n");
@@ -500,7 +508,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
        atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
 
        if (rsp->n_rdma) {
-               rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+               rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
                                queue->cm_id->port_num, rsp->req.sg,
                                rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        }
@@ -584,7 +592,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 
        WARN_ON(rsp->n_rdma <= 0);
        atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
-       rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+       rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
                        queue->cm_id->port_num, rsp->req.sg,
                        rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        rsp->n_rdma = 0;
@@ -739,7 +747,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
        }
 
        if (nvmet_rdma_need_data_in(rsp)) {
-               if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
+               if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
                                queue->cm_id->port_num, &rsp->read_cqe, NULL))
                        nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
        } else {
@@ -911,7 +919,8 @@ static void nvmet_rdma_free_dev(struct kref *ref)
 static struct nvmet_rdma_device *
 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 {
-       struct nvmet_port *port = cm_id->context;
+       struct nvmet_rdma_port *port = cm_id->context;
+       struct nvmet_port *nport = port->nport;
        struct nvmet_rdma_device *ndev;
        int inline_page_count;
        int inline_sge_count;
@@ -928,17 +937,17 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
        if (!ndev)
                goto out_err;
 
-       inline_page_count = num_pages(port->inline_data_size);
+       inline_page_count = num_pages(nport->inline_data_size);
        inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
                                cm_id->device->attrs.max_recv_sge) - 1;
        if (inline_page_count > inline_sge_count) {
                pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
-                       port->inline_data_size, cm_id->device->name,
+                       nport->inline_data_size, cm_id->device->name,
                        inline_sge_count * PAGE_SIZE);
-               port->inline_data_size = inline_sge_count * PAGE_SIZE;
+               nport->inline_data_size = inline_sge_count * PAGE_SIZE;
                inline_page_count = inline_sge_count;
        }
-       ndev->inline_data_size = port->inline_data_size;
+       ndev->inline_data_size = nport->inline_data_size;
        ndev->inline_page_count = inline_page_count;
        ndev->device = cm_id->device;
        kref_init(&ndev->ref);
@@ -1024,6 +1033,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
                pr_err("failed to create_qp ret= %d\n", ret);
                goto err_destroy_cq;
        }
+       queue->qp = queue->cm_id->qp;
 
        atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
 
@@ -1052,11 +1062,10 @@ err_destroy_cq:
 
 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
 {
-       struct ib_qp *qp = queue->cm_id->qp;
-
-       ib_drain_qp(qp);
-       rdma_destroy_id(queue->cm_id);
-       ib_destroy_qp(qp);
+       ib_drain_qp(queue->qp);
+       if (queue->cm_id)
+               rdma_destroy_id(queue->cm_id);
+       ib_destroy_qp(queue->qp);
        ib_free_cq(queue->cq);
 }
 
@@ -1266,6 +1275,7 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
 {
+       struct nvmet_rdma_port *port = cm_id->context;
        struct nvmet_rdma_device *ndev;
        struct nvmet_rdma_queue *queue;
        int ret = -EINVAL;
@@ -1281,7 +1291,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                ret = -ENOMEM;
                goto put_device;
        }
-       queue->port = cm_id->context;
+       queue->port = port->nport;
 
        if (queue->host_qid == 0) {
                /* Let inflight controller teardown complete */
@@ -1290,9 +1300,12 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 
        ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
        if (ret) {
-               schedule_work(&queue->release_work);
-               /* Destroying rdma_cm id is not needed here */
-               return 0;
+               /*
+                * Don't destroy the cm_id in free path, as we implicitly
+                * destroy the cm_id here with non-zero ret code.
+                */
+               queue->cm_id = NULL;
+               goto free_queue;
        }
 
        mutex_lock(&nvmet_rdma_queue_mutex);
@@ -1301,6 +1314,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
 
        return 0;
 
+free_queue:
+       nvmet_rdma_free_queue(queue);
 put_device:
        kref_put(&ndev->ref, nvmet_rdma_free_dev);
 
@@ -1406,7 +1421,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue)
 {
-       struct nvmet_port *port;
+       struct nvmet_rdma_port *port;
 
        if (queue) {
                /*
@@ -1425,7 +1440,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
         * cm_id destroy. use atomic xchg to make sure
         * we don't compete with remove_port.
         */
-       if (xchg(&port->priv, NULL) != cm_id)
+       if (xchg(&port->cm_id, NULL) != cm_id)
                return 0;
 
        /*
@@ -1456,6 +1471,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
                nvmet_rdma_queue_established(queue);
                break;
        case RDMA_CM_EVENT_ADDR_CHANGE:
+               if (!queue) {
+                       struct nvmet_rdma_port *port = cm_id->context;
+
+                       schedule_delayed_work(&port->repair_work, 0);
+                       break;
+               }
+               /* FALLTHROUGH */
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                nvmet_rdma_queue_disconnect(queue);
@@ -1498,42 +1520,19 @@ restart:
        mutex_unlock(&nvmet_rdma_queue_mutex);
 }
 
-static int nvmet_rdma_add_port(struct nvmet_port *port)
+static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
 {
-       struct rdma_cm_id *cm_id;
-       struct sockaddr_storage addr = { };
-       __kernel_sa_family_t af;
-       int ret;
+       struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL);
 
-       switch (port->disc_addr.adrfam) {
-       case NVMF_ADDR_FAMILY_IP4:
-               af = AF_INET;
-               break;
-       case NVMF_ADDR_FAMILY_IP6:
-               af = AF_INET6;
-               break;
-       default:
-               pr_err("address family %d not supported\n",
-                               port->disc_addr.adrfam);
-               return -EINVAL;
-       }
-
-       if (port->inline_data_size < 0) {
-               port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
-       } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
-               pr_warn("inline_data_size %u is too large, reducing to %u\n",
-                       port->inline_data_size,
-                       NVMET_RDMA_MAX_INLINE_DATA_SIZE);
-               port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
-       }
+       if (cm_id)
+               rdma_destroy_id(cm_id);
+}
 
-       ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
-                       port->disc_addr.trsvcid, &addr);
-       if (ret) {
-               pr_err("malformed ip/port passed: %s:%s\n",
-                       port->disc_addr.traddr, port->disc_addr.trsvcid);
-               return ret;
-       }
+static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
+{
+       struct sockaddr *addr = (struct sockaddr *)&port->addr;
+       struct rdma_cm_id *cm_id;
+       int ret;
 
        cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
                        RDMA_PS_TCP, IB_QPT_RC);
@@ -1552,23 +1551,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
                goto out_destroy_id;
        }
 
-       ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
+       ret = rdma_bind_addr(cm_id, addr);
        if (ret) {
-               pr_err("binding CM ID to %pISpcs failed (%d)\n",
-                       (struct sockaddr *)&addr, ret);
+               pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret);
                goto out_destroy_id;
        }
 
        ret = rdma_listen(cm_id, 128);
        if (ret) {
-               pr_err("listening to %pISpcs failed (%d)\n",
-                       (struct sockaddr *)&addr, ret);
+               pr_err("listening to %pISpcs failed (%d)\n", addr, ret);
                goto out_destroy_id;
        }
 
-       pr_info("enabling port %d (%pISpcs)\n",
-               le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
-       port->priv = cm_id;
+       port->cm_id = cm_id;
        return 0;
 
 out_destroy_id:
@@ -1576,18 +1571,92 @@ out_destroy_id:
        return ret;
 }
 
-static void nvmet_rdma_remove_port(struct nvmet_port *port)
+static void nvmet_rdma_repair_port_work(struct work_struct *w)
 {
-       struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
+       struct nvmet_rdma_port *port = container_of(to_delayed_work(w),
+                       struct nvmet_rdma_port, repair_work);
+       int ret;
 
-       if (cm_id)
-               rdma_destroy_id(cm_id);
+       nvmet_rdma_disable_port(port);
+       ret = nvmet_rdma_enable_port(port);
+       if (ret)
+               schedule_delayed_work(&port->repair_work, 5 * HZ);
+}
+
+static int nvmet_rdma_add_port(struct nvmet_port *nport)
+{
+       struct nvmet_rdma_port *port;
+       __kernel_sa_family_t af;
+       int ret;
+
+       port = kzalloc(sizeof(*port), GFP_KERNEL);
+       if (!port)
+               return -ENOMEM;
+
+       nport->priv = port;
+       port->nport = nport;
+       INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work);
+
+       switch (nport->disc_addr.adrfam) {
+       case NVMF_ADDR_FAMILY_IP4:
+               af = AF_INET;
+               break;
+       case NVMF_ADDR_FAMILY_IP6:
+               af = AF_INET6;
+               break;
+       default:
+               pr_err("address family %d not supported\n",
+                       nport->disc_addr.adrfam);
+               ret = -EINVAL;
+               goto out_free_port;
+       }
+
+       if (nport->inline_data_size < 0) {
+               nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
+       } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
+               pr_warn("inline_data_size %u is too large, reducing to %u\n",
+                       nport->inline_data_size,
+                       NVMET_RDMA_MAX_INLINE_DATA_SIZE);
+               nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
+       }
+
+       ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
+                       nport->disc_addr.trsvcid, &port->addr);
+       if (ret) {
+               pr_err("malformed ip/port passed: %s:%s\n",
+                       nport->disc_addr.traddr, nport->disc_addr.trsvcid);
+               goto out_free_port;
+       }
+
+       ret = nvmet_rdma_enable_port(port);
+       if (ret)
+               goto out_free_port;
+
+       pr_info("enabling port %d (%pISpcs)\n",
+               le16_to_cpu(nport->disc_addr.portid),
+               (struct sockaddr *)&port->addr);
+
+       return 0;
+
+out_free_port:
+       kfree(port);
+       return ret;
+}
+
+static void nvmet_rdma_remove_port(struct nvmet_port *nport)
+{
+       struct nvmet_rdma_port *port = nport->priv;
+
+       cancel_delayed_work_sync(&port->repair_work);
+       nvmet_rdma_disable_port(port);
+       kfree(port);
 }
 
 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
-               struct nvmet_port *port, char *traddr)
+               struct nvmet_port *nport, char *traddr)
 {
-       struct rdma_cm_id *cm_id = port->priv;
+       struct nvmet_rdma_port *port = nport->priv;
+       struct rdma_cm_id *cm_id = port->cm_id;
 
        if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
                struct nvmet_rdma_rsp *rsp =
@@ -1597,7 +1666,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
 
                sprintf(traddr, "%pISc", addr);
        } else {
-               memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
+               memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
        }
 }
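For reference while reading these hunks, a sketch of the per-port private structure they operate on (its actual definition is added earlier in the patch and is not shown here; the field names below are inferred from the usage above):

	struct nvmet_rdma_port {
		struct nvmet_port	*nport;
		struct sockaddr_storage	addr;
		struct rdma_cm_id	*cm_id;
		struct delayed_work	repair_work;
	};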
 
index 3ef0bb2..390e92f 100644 (file)
@@ -366,6 +366,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(pci_enable_pasid);
 
 /**
  * pci_disable_pasid - Disable the PASID capability
@@ -390,6 +391,7 @@ void pci_disable_pasid(struct pci_dev *pdev)
 
        pdev->pasid_enabled = 0;
 }
+EXPORT_SYMBOL_GPL(pci_disable_pasid);
 
 /**
  * pci_restore_pasid_state - Restore PASID capabilities
@@ -441,6 +443,7 @@ int pci_pasid_features(struct pci_dev *pdev)
 
        return supported;
 }
+EXPORT_SYMBOL_GPL(pci_pasid_features);
 
 #define PASID_NUMBER_SHIFT     8
 #define PASID_NUMBER_MASK      (0x1f << PASID_NUMBER_SHIFT)
@@ -469,4 +472,5 @@ int pci_max_pasids(struct pci_dev *pdev)
 
        return (1 << supported);
 }
+EXPORT_SYMBOL_GPL(pci_max_pasids);
 #endif /* CONFIG_PCI_PASID */
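These new exports let modular drivers (for instance an IOMMU or accelerator driver built as a module) use the PASID helpers. A hypothetical caller sketch, for illustration only; my_enable_pasid() is not part of the patch and error handling is abbreviated:

	static int my_enable_pasid(struct pci_dev *pdev)
	{
		int features = pci_pasid_features(pdev);
		int ret;

		if (features < 0)
			return features;	/* no PASID capability */

		ret = pci_enable_pasid(pdev, features);
		if (ret)
			return ret;

		dev_info(&pdev->dev, "PASID enabled, up to %d PASIDs\n",
			 pci_max_pasids(pdev));
		return 0;
	}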
index 5f57282..03ea512 100644 (file)
@@ -7,7 +7,7 @@ config MFD_CROS_EC
        tristate "Platform support for Chrome hardware (transitional)"
        select CHROME_PLATFORMS
        select CROS_EC
-       select CONFIG_MFD_CROS_EC_DEV
+       select MFD_CROS_EC_DEV
        depends on X86 || ARM || ARM64 || COMPILE_TEST
        help
          This is a transitional Kconfig option and will be removed after
@@ -214,6 +214,17 @@ config CROS_EC_SYSFS
          To compile this driver as a module, choose M here: the
          module will be called cros_ec_sysfs.
 
+config CROS_EC_TYPEC
+       tristate "ChromeOS EC Type-C Connector Control"
+       depends on MFD_CROS_EC_DEV && TYPEC
+       default MFD_CROS_EC_DEV
+       help
+         If you say Y here, you get support for accessing Type C connector
+         information from the Chrome OS EC.
+
+         To compile this driver as a module, choose M here: the module will be
+         called cros_ec_typec.
+
 config CROS_USBPD_LOGGER
        tristate "Logging driver for USB PD charger"
        depends on CHARGER_CROS_USBPD
@@ -226,6 +237,20 @@ config CROS_USBPD_LOGGER
          To compile this driver as a module, choose M here: the
          module will be called cros_usbpd_logger.
 
+config CROS_USBPD_NOTIFY
+       tristate "ChromeOS Type-C power delivery event notifier"
+       depends on MFD_CROS_EC_DEV
+       default MFD_CROS_EC_DEV
+       help
+         If you say Y here, you get support for Type-C PD event notifications
+         from the ChromeOS EC. On ACPI platforms this driver will bind to the
+         GOOG0003 ACPI device, and on platforms which don't have this device it
+         will get initialized on ECs which support the feature
+         EC_FEATURE_USB_PD.
+
+         To compile this driver as a module, choose M here: the
+         module will be called cros_usbpd_notify.
+
 source "drivers/platform/chrome/wilco_ec/Kconfig"
 
 endif # CHROMEOS_PLATFORMS
index aacd592..41baccb 100644 (file)
@@ -12,6 +12,7 @@ obj-$(CONFIG_CROS_EC_ISHTP)           += cros_ec_ishtp.o
 obj-$(CONFIG_CROS_EC_RPMSG)            += cros_ec_rpmsg.o
 obj-$(CONFIG_CROS_EC_SPI)              += cros_ec_spi.o
 cros_ec_lpcs-objs                      := cros_ec_lpc.o cros_ec_lpc_mec.o
+obj-$(CONFIG_CROS_EC_TYPEC)            += cros_ec_typec.o
 obj-$(CONFIG_CROS_EC_LPC)              += cros_ec_lpcs.o
 obj-$(CONFIG_CROS_EC_PROTO)            += cros_ec_proto.o cros_ec_trace.o
 obj-$(CONFIG_CROS_KBD_LED_BACKLIGHT)   += cros_kbd_led_backlight.o
@@ -19,8 +20,10 @@ obj-$(CONFIG_CROS_EC_CHARDEV)                += cros_ec_chardev.o
 obj-$(CONFIG_CROS_EC_LIGHTBAR)         += cros_ec_lightbar.o
 obj-$(CONFIG_CROS_EC_VBC)              += cros_ec_vbc.o
 obj-$(CONFIG_CROS_EC_DEBUGFS)          += cros_ec_debugfs.o
-obj-$(CONFIG_CROS_EC_SENSORHUB)                += cros_ec_sensorhub.o
+cros-ec-sensorhub-objs                 := cros_ec_sensorhub.o cros_ec_sensorhub_ring.o
+obj-$(CONFIG_CROS_EC_SENSORHUB)                += cros-ec-sensorhub.o
 obj-$(CONFIG_CROS_EC_SYSFS)            += cros_ec_sysfs.o
 obj-$(CONFIG_CROS_USBPD_LOGGER)                += cros_usbpd_logger.o
+obj-$(CONFIG_CROS_USBPD_NOTIFY)                += cros_usbpd_notify.o
 
 obj-$(CONFIG_WILCO_EC)                 += wilco_ec/
index 4f3651f..472a03d 100644 (file)
@@ -103,7 +103,7 @@ chromes_laptop_instantiate_i2c_device(struct i2c_adapter *adapter,
                        pr_debug("%d-%02x is probed at %02x\n",
                                 adapter->nr, info->addr, dummy->addr);
                        i2c_unregister_device(dummy);
-                       client = i2c_new_device(adapter, info);
+                       client = i2c_new_client_device(adapter, info);
                }
        }
 
index 6fc8f2c..3104680 100644 (file)
@@ -120,7 +120,7 @@ static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event)
 
        buf.msg.command = EC_CMD_HOST_SLEEP_EVENT;
 
-       ret = cros_ec_cmd_xfer(ec_dev, &buf.msg);
+       ret = cros_ec_cmd_xfer_status(ec_dev, &buf.msg);
 
        /* For now, report failure to transition to S0ix with a warning. */
        if (ret >= 0 && ec_dev->host_sleep_v1 &&
@@ -138,6 +138,24 @@ static int cros_ec_sleep_event(struct cros_ec_device *ec_dev, u8 sleep_event)
        return ret;
 }
 
+static int cros_ec_ready_event(struct notifier_block *nb,
+                              unsigned long queued_during_suspend,
+                              void *_notify)
+{
+       struct cros_ec_device *ec_dev = container_of(nb, struct cros_ec_device,
+                                                    notifier_ready);
+       u32 host_event = cros_ec_get_host_event(ec_dev);
+
+       if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_INTERFACE_READY)) {
+               mutex_lock(&ec_dev->lock);
+               cros_ec_query_all(ec_dev);
+               mutex_unlock(&ec_dev->lock);
+               return NOTIFY_OK;
+       }
+
+       return NOTIFY_DONE;
+}
+
 /**
  * cros_ec_register() - Register a new ChromeOS EC, using the provided info.
  * @ec_dev: Device to register.
@@ -237,6 +255,18 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
                dev_dbg(ec_dev->dev, "Error %d clearing sleep event to ec",
                        err);
 
+       if (ec_dev->mkbp_event_supported) {
+               /*
+                * Register the notifier for EC_HOST_EVENT_INTERFACE_READY
+                * event.
+                */
+               ec_dev->notifier_ready.notifier_call = cros_ec_ready_event;
+               err = blocking_notifier_chain_register(&ec_dev->event_notifier,
+                                                     &ec_dev->notifier_ready);
+               if (err)
+                       return err;
+       }
+
        dev_info(dev, "Chrome EC device registered\n");
 
        return 0;
index c65e70b..e0bce86 100644 (file)
@@ -48,7 +48,7 @@ struct ec_event {
        struct list_head node;
        size_t size;
        u8 event_type;
-       u8 data[0];
+       u8 data[];
 };
 
 static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen)
@@ -301,7 +301,7 @@ static long cros_ec_chardev_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg)
        }
 
        s_cmd->command += ec->cmd_offset;
-       ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, s_cmd);
        /* Only copy data to userland if data was received. */
        if (ret < 0)
                goto exit;
index b4c110c..b59180b 100644 (file)
@@ -116,7 +116,7 @@ static int get_lightbar_version(struct cros_ec_dev *ec,
 
        param = (struct ec_params_lightbar *)msg->data;
        param->cmd = LIGHTBAR_CMD_VERSION;
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
        if (ret < 0) {
                ret = 0;
                goto exit;
@@ -193,15 +193,10 @@ static ssize_t brightness_store(struct device *dev,
        if (ret)
                goto exit;
 
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
        if (ret < 0)
                goto exit;
 
-       if (msg->result != EC_RES_SUCCESS) {
-               ret = -EINVAL;
-               goto exit;
-       }
-
        ret = count;
 exit:
        kfree(msg);
@@ -258,13 +253,10 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
                                        goto exit;
                        }
 
-                       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+                       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
                        if (ret < 0)
                                goto exit;
 
-                       if (msg->result != EC_RES_SUCCESS)
-                               goto exit;
-
                        i = 0;
                        ok = 1;
                }
@@ -305,14 +297,13 @@ static ssize_t sequence_show(struct device *dev,
        if (ret)
                goto exit;
 
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
-       if (ret < 0)
-               goto exit;
-
-       if (msg->result != EC_RES_SUCCESS) {
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+       if (ret == -EPROTO) {
                ret = scnprintf(buf, PAGE_SIZE,
                                "ERROR: EC returned %d\n", msg->result);
                goto exit;
+       } else if (ret < 0) {
+               goto exit;
        }
 
        resp = (struct ec_response_lightbar *)msg->data;
@@ -344,13 +335,10 @@ static int lb_send_empty_cmd(struct cros_ec_dev *ec, uint8_t cmd)
        if (ret)
                goto error;
 
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
        if (ret < 0)
                goto error;
-       if (msg->result != EC_RES_SUCCESS) {
-               ret = -EINVAL;
-               goto error;
-       }
+
        ret = 0;
 error:
        kfree(msg);
@@ -377,13 +365,10 @@ static int lb_manual_suspend_ctrl(struct cros_ec_dev *ec, uint8_t enable)
        if (ret)
                goto error;
 
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
        if (ret < 0)
                goto error;
-       if (msg->result != EC_RES_SUCCESS) {
-               ret = -EINVAL;
-               goto error;
-       }
+
        ret = 0;
 error:
        kfree(msg);
@@ -425,15 +410,10 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
        if (ret)
                goto exit;
 
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
        if (ret < 0)
                goto exit;
 
-       if (msg->result != EC_RES_SUCCESS) {
-               ret = -EINVAL;
-               goto exit;
-       }
-
        ret = count;
 exit:
        kfree(msg);
@@ -487,13 +467,9 @@ static ssize_t program_store(struct device *dev, struct device_attribute *attr,
         */
        msg->outsize = count + extra_bytes;
 
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
        if (ret < 0)
                goto exit;
-       if (msg->result != EC_RES_SUCCESS) {
-               ret = -EINVAL;
-               goto exit;
-       }
 
        ret = count;
 exit:
index 3cfa643..3e745e0 100644 (file)
@@ -553,7 +553,10 @@ EXPORT_SYMBOL(cros_ec_cmd_xfer);
  * replied with success status. It's not necessary to check msg->result when
  * using this function.
  *
- * Return: The number of bytes transferred on success or negative error code.
+ * Return:
+ * >=0 - The number of bytes transferred
+ * -ENOTSUPP - Operation not supported
+ * -EPROTO - Protocol error
  */
 int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev,
                            struct cros_ec_command *msg)
@@ -563,6 +566,10 @@ int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev,
        ret = cros_ec_cmd_xfer(ec_dev, msg);
        if (ret < 0) {
                dev_err(ec_dev->dev, "Command xfer error (err:%d)\n", ret);
+       } else if (msg->result == EC_RES_INVALID_VERSION) {
+               dev_dbg(ec_dev->dev, "Command invalid version (err:%d)\n",
+                       msg->result);
+               return -ENOTSUPP;
        } else if (msg->result != EC_RES_SUCCESS) {
                dev_dbg(ec_dev->dev, "Command result (err: %d)\n", msg->result);
                return -EPROTO;
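With this change cros_ec_cmd_xfer_status() folds the EC status into its return value, so callers no longer need to inspect msg->result. A minimal caller sketch; my_query_ec() is hypothetical and only illustrates the documented contract:

	static int my_query_ec(struct cros_ec_device *ec_dev,
			       struct cros_ec_command *msg)
	{
		int ret = cros_ec_cmd_xfer_status(ec_dev, msg);

		if (ret == -ENOTSUPP)
			return 0;	/* EC does not support this command version */
		if (ret < 0)
			return ret;	/* transport error or EC failure (-EPROTO) */

		return ret;		/* number of valid response bytes in msg->data */
	}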
index dbc3f55..7e8629e 100644 (file)
@@ -44,6 +44,8 @@ struct cros_ec_rpmsg {
        struct completion xfer_ack;
        struct work_struct host_event_work;
        struct rpmsg_endpoint *ept;
+       bool has_pending_host_event;
+       bool probe_done;
 };
 
 /**
@@ -177,7 +179,14 @@ static int cros_ec_rpmsg_callback(struct rpmsg_device *rpdev, void *data,
                memcpy(ec_dev->din, resp->data, len);
                complete(&ec_rpmsg->xfer_ack);
        } else if (resp->type == HOST_EVENT_MARK) {
-               schedule_work(&ec_rpmsg->host_event_work);
+               /*
+                * If the host event is sent before cros_ec_register is
+                * finished, queue the host event.
+                */
+               if (ec_rpmsg->probe_done)
+                       schedule_work(&ec_rpmsg->host_event_work);
+               else
+                       ec_rpmsg->has_pending_host_event = true;
        } else {
                dev_warn(ec_dev->dev, "rpmsg received invalid type = %d",
                         resp->type);
@@ -240,6 +249,11 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev)
                return ret;
        }
 
+       ec_rpmsg->probe_done = true;
+
+       if (ec_rpmsg->has_pending_host_event)
+               schedule_work(&ec_rpmsg->host_event_work);
+
        return 0;
 }
 
index 79fefd3..b7f2c00 100644 (file)
@@ -50,10 +50,8 @@ static int cros_ec_sensorhub_register(struct device *dev,
                                      struct cros_ec_sensorhub *sensorhub)
 {
        int sensor_type[MOTIONSENSE_TYPE_MAX] = { 0 };
+       struct cros_ec_command *msg = sensorhub->msg;
        struct cros_ec_dev *ec = sensorhub->ec;
-       struct ec_params_motion_sense *params;
-       struct ec_response_motion_sense *resp;
-       struct cros_ec_command *msg;
        int ret, i, sensor_num;
        char *name;
 
@@ -65,27 +63,19 @@ static int cros_ec_sensorhub_register(struct device *dev,
                return sensor_num;
        }
 
+       sensorhub->sensor_num = sensor_num;
        if (sensor_num == 0) {
                dev_err(dev, "Zero sensors reported.\n");
                return -EINVAL;
        }
 
-       /* Prepare a message to send INFO command to each sensor. */
-       msg = kzalloc(sizeof(*msg) + max(sizeof(*params), sizeof(*resp)),
-                     GFP_KERNEL);
-       if (!msg)
-               return -ENOMEM;
-
        msg->version = 1;
-       msg->command = EC_CMD_MOTION_SENSE_CMD + ec->cmd_offset;
-       msg->outsize = sizeof(*params);
-       msg->insize = sizeof(*resp);
-       params = (struct ec_params_motion_sense *)msg->data;
-       resp = (struct ec_response_motion_sense *)msg->data;
+       msg->insize = sizeof(struct ec_response_motion_sense);
+       msg->outsize = sizeof(struct ec_params_motion_sense);
 
        for (i = 0; i < sensor_num; i++) {
-               params->cmd = MOTIONSENSE_CMD_INFO;
-               params->info.sensor_num = i;
+               sensorhub->params->cmd = MOTIONSENSE_CMD_INFO;
+               sensorhub->params->info.sensor_num = i;
 
                ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
                if (ret < 0) {
@@ -94,7 +84,7 @@ static int cros_ec_sensorhub_register(struct device *dev,
                        continue;
                }
 
-               switch (resp->info.type) {
+               switch (sensorhub->resp->info.type) {
                case MOTIONSENSE_TYPE_ACCEL:
                        name = "cros-ec-accel";
                        break;
@@ -117,15 +107,16 @@ static int cros_ec_sensorhub_register(struct device *dev,
                        name = "cros-ec-activity";
                        break;
                default:
-                       dev_warn(dev, "unknown type %d\n", resp->info.type);
+                       dev_warn(dev, "unknown type %d\n",
+                                sensorhub->resp->info.type);
                        continue;
                }
 
                ret = cros_ec_sensorhub_allocate_sensor(dev, name, i);
                if (ret)
-                       goto error;
+                       return ret;
 
-               sensor_type[resp->info.type]++;
+               sensor_type[sensorhub->resp->info.type]++;
        }
 
        if (sensor_type[MOTIONSENSE_TYPE_ACCEL] >= 2)
@@ -137,29 +128,41 @@ static int cros_ec_sensorhub_register(struct device *dev,
                                                        "cros-ec-lid-angle",
                                                        0);
                if (ret)
-                       goto error;
+                       return ret;
        }
 
-       kfree(msg);
        return 0;
-
-error:
-       kfree(msg);
-       return ret;
 }
 
 static int cros_ec_sensorhub_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
+       struct cros_ec_dev *ec = dev_get_drvdata(dev->parent);
        struct cros_ec_sensorhub *data;
+       struct cros_ec_command *msg;
        int ret;
        int i;
 
+       msg = devm_kzalloc(dev, sizeof(struct cros_ec_command) +
+                          max((u16)sizeof(struct ec_params_motion_sense),
+                              ec->ec_dev->max_response), GFP_KERNEL);
+       if (!msg)
+               return -ENOMEM;
+
+       msg->command = EC_CMD_MOTION_SENSE_CMD + ec->cmd_offset;
+
        data = devm_kzalloc(dev, sizeof(struct cros_ec_sensorhub), GFP_KERNEL);
        if (!data)
                return -ENOMEM;
 
-       data->ec = dev_get_drvdata(dev->parent);
+       mutex_init(&data->cmd_lock);
+
+       data->dev = dev;
+       data->ec = ec;
+       data->msg = msg;
+       data->params = (struct ec_params_motion_sense *)msg->data;
+       data->resp = (struct ec_response_motion_sense *)msg->data;
+
        dev_set_drvdata(dev, data);
 
        /* Check whether this EC is a sensor hub. */
@@ -172,7 +175,8 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev)
                 * If the device has sensors but does not claim to
                 * be a sensor hub, we are in legacy mode.
                 */
-               for (i = 0; i < 2; i++) {
+               data->sensor_num = 2;
+               for (i = 0; i < data->sensor_num; i++) {
                        ret = cros_ec_sensorhub_allocate_sensor(dev,
                                                "cros-ec-accel-legacy", i);
                        if (ret)
@@ -180,12 +184,63 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev)
                }
        }
 
+       /*
+        * If the EC does not have a FIFO, the sensors will query their data
+        * themselves via sysfs or a software trigger.
+        */
+       if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) {
+               ret = cros_ec_sensorhub_ring_add(data);
+               if (ret)
+                       return ret;
+                * The msg and its data are not under the control of the ring
+                * The msg and its data is not under the control of the ring
+                * handler.
+                */
+               return devm_add_action_or_reset(dev,
+                                               cros_ec_sensorhub_ring_remove,
+                                               data);
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * When the EC is suspending, we must stop sending interrupts:
+ * we may use the same interrupt line for waking up the device.
+ * Tell the EC to stop sending non-interrupt events on the iio ring.
+ */
+static int cros_ec_sensorhub_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct cros_ec_sensorhub *sensorhub = platform_get_drvdata(pdev);
+       struct cros_ec_dev *ec = sensorhub->ec;
+
+       if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO))
+               return cros_ec_sensorhub_ring_fifo_enable(sensorhub, false);
        return 0;
 }
 
+static int cros_ec_sensorhub_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct cros_ec_sensorhub *sensorhub = platform_get_drvdata(pdev);
+       struct cros_ec_dev *ec = sensorhub->ec;
+
+       if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO))
+               return cros_ec_sensorhub_ring_fifo_enable(sensorhub, true);
+       return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(cros_ec_sensorhub_pm_ops,
+               cros_ec_sensorhub_suspend,
+               cros_ec_sensorhub_resume);
+
 static struct platform_driver cros_ec_sensorhub_driver = {
        .driver = {
                .name = DRV_NAME,
+               .pm = &cros_ec_sensorhub_pm_ops,
        },
        .probe = cros_ec_sensorhub_probe,
 };
diff --git a/drivers/platform/chrome/cros_ec_sensorhub_ring.c b/drivers/platform/chrome/cros_ec_sensorhub_ring.c
new file mode 100644 (file)
index 0000000..230e6cf
--- /dev/null
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for Chrome OS EC Sensor hub FIFO.
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/iio/iio.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_data/cros_ec_commands.h>
+#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_ec_sensorhub.h>
+#include <linux/platform_device.h>
+#include <linux/sort.h>
+#include <linux/slab.h>
+
+/* Precision of fixed point for the m values from the filter */
+#define M_PRECISION BIT(23)
+
+/* Only activate the filter once we have at least this many elements. */
+#define TS_HISTORY_THRESHOLD 8
+
+/*
+ * If we don't have any history entries for this long, empty the filter to
+ * make sure there are no big discontinuities.
+ */
+#define TS_HISTORY_BORED_US 500000
+
+/* To measure by how much the filter is overshooting, if it happens. */
+#define FUTURE_TS_ANALYTICS_COUNT_MAX 100
+
+static inline int
+cros_sensorhub_send_sample(struct cros_ec_sensorhub *sensorhub,
+                          struct cros_ec_sensors_ring_sample *sample)
+{
+       cros_ec_sensorhub_push_data_cb_t cb;
+       int id = sample->sensor_id;
+       struct iio_dev *indio_dev;
+
+       if (id > sensorhub->sensor_num)
+               return -EINVAL;
+
+       cb = sensorhub->push_data[id].push_data_cb;
+       if (!cb)
+               return 0;
+
+       indio_dev = sensorhub->push_data[id].indio_dev;
+
+       if (sample->flag & MOTIONSENSE_SENSOR_FLAG_FLUSH)
+               return 0;
+
+       return cb(indio_dev, sample->vector, sample->timestamp);
+}
+
+/**
+ * cros_ec_sensorhub_register_push_data() - register the callback to the hub.
+ *
+ * @sensorhub : Sensor Hub object
+ * @sensor_num : The sensor the caller is interested in.
+ * @indio_dev : The iio device to use when a sample arrives.
+ * @cb : The callback to call when a sample arrives.
+ *
+ * The callback cb will be used by cros_ec_sensorhub_ring to distribute events
+ * from the EC.
+ *
+ * Return: 0 when callback is registered.
+ *         -EINVAL if the sensor number is invalid or the slot is already used.
+ */
+int cros_ec_sensorhub_register_push_data(struct cros_ec_sensorhub *sensorhub,
+                                        u8 sensor_num,
+                                        struct iio_dev *indio_dev,
+                                        cros_ec_sensorhub_push_data_cb_t cb)
+{
+       if (sensor_num >= sensorhub->sensor_num)
+               return -EINVAL;
+       if (sensorhub->push_data[sensor_num].indio_dev)
+               return -EINVAL;
+
+       sensorhub->push_data[sensor_num].indio_dev = indio_dev;
+       sensorhub->push_data[sensor_num].push_data_cb = cb;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensorhub_register_push_data);
+
+void cros_ec_sensorhub_unregister_push_data(struct cros_ec_sensorhub *sensorhub,
+                                           u8 sensor_num)
+{
+       sensorhub->push_data[sensor_num].indio_dev = NULL;
+       sensorhub->push_data[sensor_num].push_data_cb = NULL;
+}
+EXPORT_SYMBOL_GPL(cros_ec_sensorhub_unregister_push_data);
+
+/**
+ * cros_ec_sensorhub_ring_fifo_enable() - Enable or disable interrupt generation
+ *                                       for FIFO events.
+ * @sensorhub: Sensor Hub object
+ * @on: true when events are requested.
+ *
+ * To be called before sleeping or when no one is listening.
+ * Return: 0 on success, or an error when we cannot communicate with the EC.
+ *
+ */
+int cros_ec_sensorhub_ring_fifo_enable(struct cros_ec_sensorhub *sensorhub,
+                                      bool on)
+{
+       int ret, i;
+
+       mutex_lock(&sensorhub->cmd_lock);
+       if (sensorhub->tight_timestamps)
+               for (i = 0; i < sensorhub->sensor_num; i++)
+                       sensorhub->batch_state[i].last_len = 0;
+
+       sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INT_ENABLE;
+       sensorhub->params->fifo_int_enable.enable = on;
+
+       sensorhub->msg->outsize = sizeof(struct ec_params_motion_sense);
+       sensorhub->msg->insize = sizeof(struct ec_response_motion_sense);
+
+       ret = cros_ec_cmd_xfer_status(sensorhub->ec->ec_dev, sensorhub->msg);
+       mutex_unlock(&sensorhub->cmd_lock);
+
+       /* We expect to receive a payload of 4 bytes, ignore. */
+       if (ret > 0)
+               ret = 0;
+
+       return ret;
+}
+
+static int cros_ec_sensor_ring_median_cmp(const void *pv1, const void *pv2)
+{
+       s64 v1 = *(s64 *)pv1;
+       s64 v2 = *(s64 *)pv2;
+
+       if (v1 > v2)
+               return 1;
+       else if (v1 < v2)
+               return -1;
+       else
+               return 0;
+}
+
+/*
+ * cros_ec_sensor_ring_median: Gets median of an array of numbers
+ *
+ * For now it's implemented using an inefficient (worse than O(n)) sort, then returns
+ * the middle element. A more optimal method would be something like
+ * quickselect, but given that n = 64 we can probably live with it in the
+ * name of clarity.
+ *
+ * Warning: the input array gets modified (sorted)!
+ */
+static s64 cros_ec_sensor_ring_median(s64 *array, size_t length)
+{
+       sort(array, length, sizeof(s64), cros_ec_sensor_ring_median_cmp, NULL);
+       return array[length / 2];
+}
+
+/*
+ * IRQ Timestamp Filtering
+ *
+ * Lower down in cros_ec_sensor_ring_process_event(), for each sensor event
+ * we have to calculate its timestamp in the AP timebase. There are 3 time
+ * points:
+ *   a - EC timebase, sensor event
+ *   b - EC timebase, IRQ
+ *   c - AP timebase, IRQ
+ *   a' - what we want: sensor event in AP timebase
+ *
+ * While a and b are recorded at accurate times (due to the EC real time
+ * nature); c is pretty untrustworthy, even though it's recorded the
+ * first thing in ec_irq_handler(). There is a very good chance we'll get
+ * added latency due to:
+ *   other irqs
+ *   ddrfreq
+ *   cpuidle
+ *
+ * Normally a' = c - b + a, but if we do that naive math any jitter in c
+ * will get coupled in a', which we don't want. We want a function
+ * a' = cros_ec_sensor_ring_ts_filter(a) which will filter out outliers in c.
+ *
+ * Think of a graph of AP time(b) on the y axis vs EC time(c) on the x axis.
+ * The slope of the line won't be exactly 1, there will be some clock drift
+ * between the 2 chips for various reasons (mechanical stress, temperature,
+ * voltage). We need to extrapolate values for a future x, without trusting
+ * recent y values too much.
+ *
+ * We use a median filter for the slope, then another median filter for the
+ * y-intercept to calculate this function:
+ *   dx[n] = x[n-1] - x[n]
+ *   dy[n] = y[n-1] - y[n]
+ *   m[n] = dy[n] / dx[n]
+ *   median_m = median(m[n-k:n])
+ *   error[i] = y[n-i] - median_m * x[n-i]
+ *   median_error = median(error[:k])
+ *   predicted_y = median_m * x + median_error
+ *
+ * Implementation differences from above:
+ * - Redefined y to be actually c - b, this gives us a lot more precision
+ * to do the math. (c-b)/b variations are more obvious than c/b variations.
+ * - Since we don't have floating point, any operations involving slope are
+ * done using fixed point math (*M_PRECISION)
+ * - Since x and y grow with time, we keep zeroing the graph (relative to
+ * the last sample), this way math involving *x[n-i] will not overflow
+ * - EC timestamps are kept in us, it improves the slope calculation precision
+ */
+
+/**
+ * cros_ec_sensor_ring_ts_filter_update() - Update filter history.
+ *
+ * @state: Filter information.
+ * @b: IRQ timestamp, EC timebase (us)
+ * @c: IRQ timestamp, AP timebase (ns)
+ *
+ * Given a new IRQ timestamp pair (EC and AP timebases), add it to the filter
+ * history.
+ */
+static void
+cros_ec_sensor_ring_ts_filter_update(struct cros_ec_sensors_ts_filter_state
+                                    *state,
+                                    s64 b, s64 c)
+{
+       s64 x, y;
+       s64 dx, dy;
+       s64 m; /* stored as *M_PRECISION */
+       s64 *m_history_copy = state->temp_buf;
+       s64 *error = state->temp_buf;
+       int i;
+
+       /* we trust b the most, that'll be our independent variable */
+       x = b;
+       /* y is the offset between AP and EC times, in ns */
+       y = c - b * 1000;
+
+       dx = (state->x_history[0] + state->x_offset) - x;
+       if (dx == 0)
+               return; /* we already have this irq in the history */
+       dy = (state->y_history[0] + state->y_offset) - y;
+       m = div64_s64(dy * M_PRECISION, dx);
+
+       /* Empty filter if we haven't seen any action in a while. */
+       if (-dx > TS_HISTORY_BORED_US)
+               state->history_len = 0;
+
+       /* Move everything over, also update offsets to keep absolute coords. */
+       for (i = state->history_len - 1; i >= 1; i--) {
+               state->x_history[i] = state->x_history[i - 1] + dx;
+               state->y_history[i] = state->y_history[i - 1] + dy;
+
+               state->m_history[i] = state->m_history[i - 1];
+               /*
+                * Also use the same loop to copy m_history for future
+                * median extraction.
+                */
+               m_history_copy[i] = state->m_history[i - 1];
+       }
+
+       /* Store the x and y, but remember offset is actually last sample. */
+       state->x_offset = x;
+       state->y_offset = y;
+       state->x_history[0] = 0;
+       state->y_history[0] = 0;
+
+       state->m_history[0] = m;
+       m_history_copy[0] = m;
+
+       if (state->history_len < CROS_EC_SENSORHUB_TS_HISTORY_SIZE)
+               state->history_len++;
+
+       /* Precalculate things for the filter. */
+       if (state->history_len > TS_HISTORY_THRESHOLD) {
+               state->median_m =
+                   cros_ec_sensor_ring_median(m_history_copy,
+                                              state->history_len - 1);
+
+               /*
+                * Calculate y-intercepts as if m_median is the slope and
+                * points in the history are on the line. median_error will
+                * still be in the offset coordinate system.
+                */
+               for (i = 0; i < state->history_len; i++)
+                       error[i] = state->y_history[i] -
+                               div_s64(state->median_m * state->x_history[i],
+                                       M_PRECISION);
+               state->median_error =
+                       cros_ec_sensor_ring_median(error, state->history_len);
+       } else {
+               state->median_m = 0;
+               state->median_error = 0;
+       }
+}
+
+/**
+ * cros_ec_sensor_ring_ts_filter() - Translate EC timebase timestamp to AP
+ *                                   timebase
+ *
+ * @state: filter information.
+ * @x: any ec timestamp (us):
+ *
+ * cros_ec_sensor_ring_ts_filter(a) => a' event timestamp, AP timebase
+ * cros_ec_sensor_ring_ts_filter(b) => calculated timestamp when the EC IRQ
+ *                           should have happened on the AP, with low jitter
+ *
+ * Note: The filter will only activate once state->history_len goes
+ * over TS_HISTORY_THRESHOLD. Otherwise it'll just do the naive c - b + a
+ * transform.
+ *
+ * How to derive the formula, starting from:
+ *   f(x) = median_m * x + median_error
+ * That's the calculated AP - EC offset (at the x point in time)
+ * Undo the coordinate system transform:
+ *   f(x) = median_m * (x - x_offset) + median_error + y_offset
+ * Remember to undo the "y = c - b * 1000" modification:
+ *   f(x) = median_m * (x - x_offset) + median_error + y_offset + x * 1000
+ *
+ * Return: timestamp in AP timebase (ns)
+ */
+static s64
+cros_ec_sensor_ring_ts_filter(struct cros_ec_sensors_ts_filter_state *state,
+                             s64 x)
+{
+       return div_s64(state->median_m * (x - state->x_offset), M_PRECISION)
+              + state->median_error + state->y_offset + x * 1000;
+}
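For illustration only (not part of the patch), a stand-alone user-space sketch of the median-slope / median-intercept estimate described in the comment block above. It uses doubles and a small fixed history, and skips the fixed-point scaling, history re-zeroing and overflow handling the driver needs:

	#include <stdio.h>
	#include <stdlib.h>

	#define HISTORY_MAX 64

	static int cmp_double(const void *a, const void *b)
	{
		double da = *(const double *)a, db = *(const double *)b;

		return (da > db) - (da < db);
	}

	static double median(double *v, size_t n)
	{
		qsort(v, n, sizeof(*v), cmp_double);
		return v[n / 2];
	}

	/* Predict y at point x from n history pairs (n must be 2..HISTORY_MAX). */
	static double predict(const double *xs, const double *ys, size_t n, double x)
	{
		double m[HISTORY_MAX], e[HISTORY_MAX], median_m, median_error;
		size_t i;

		/* Median of the slopes between consecutive history points. */
		for (i = 0; i + 1 < n; i++)
			m[i] = (ys[i + 1] - ys[i]) / (xs[i + 1] - xs[i]);
		median_m = median(m, n - 1);

		/* Median of the y-intercepts implied by that slope. */
		for (i = 0; i < n; i++)
			e[i] = ys[i] - median_m * xs[i];
		median_error = median(e, n);

		return median_m * x + median_error;
	}

	int main(void)
	{
		/* y = 2x + 5, with one jittery outlier at index 3. */
		double xs[] = { 0, 10, 20, 30, 40, 50, 60, 70 };
		double ys[] = { 5, 25, 45, 900, 85, 105, 125, 145 };

		printf("predicted y(80) = %g\n", predict(xs, ys, 8, 80.0));
		return 0;
	}

The single outlier barely moves the estimate (the program prints 165, i.e. 2*80 + 5), which is the point of using medians rather than a least-squares fit.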
+
+/*
+ * Since a and b were originally 32 bit values from the EC,
+ * they overflow relatively often, casting is not enough, so we need to
+ * add an offset.
+ */
+static void
+cros_ec_sensor_ring_fix_overflow(s64 *ts,
+                                const s64 overflow_period,
+                                struct cros_ec_sensors_ec_overflow_state
+                                *state)
+{
+       s64 adjust;
+
+       *ts += state->offset;
+       if (abs(state->last - *ts) > (overflow_period / 2)) {
+               adjust = state->last > *ts ? overflow_period : -overflow_period;
+               state->offset += adjust;
+               *ts += adjust;
+       }
+       state->last = *ts;
+}
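Worked example (illustrative numbers): the callers below pass overflow_period = 1LL << 32 because the raw EC timestamps are 32-bit. If state->last is 0xffffff00 and the next incoming value is 0x10, |last - *ts| exceeds half the period, so offset is bumped by 2^32 and *ts becomes 0x100000010, keeping the timeline monotonic across the counter wrap.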
+
+static void
+cros_ec_sensor_ring_check_for_past_timestamp(struct cros_ec_sensorhub
+                                            *sensorhub,
+                                            struct cros_ec_sensors_ring_sample
+                                            *sample)
+{
+       const u8 sensor_id = sample->sensor_id;
+
+       /* If this event is earlier than one we saw before... */
+       if (sensorhub->batch_state[sensor_id].newest_sensor_event >
+           sample->timestamp)
+               /* mark it for spreading. */
+               sample->timestamp =
+                       sensorhub->batch_state[sensor_id].last_ts;
+       else
+               sensorhub->batch_state[sensor_id].newest_sensor_event =
+                       sample->timestamp;
+}
+
+/**
+ * cros_ec_sensor_ring_process_event() - Process one EC FIFO event
+ *
+ * @sensorhub: Sensor Hub object.
+ * @fifo_info: FIFO information from the EC (includes b point, EC timebase).
+ * @fifo_timestamp: EC IRQ, kernel timebase (aka c).
+ * @current_timestamp: calculated event timestamp, kernel timebase (aka a').
+ * @in: incoming FIFO event from EC (includes a point, EC timebase).
+ * @out: outgoing event to user space (includes a').
+ *
+ * Process one EC event, add it in the ring if necessary.
+ *
+ * Return: true if out event has been populated.
+ */
+static bool
+cros_ec_sensor_ring_process_event(struct cros_ec_sensorhub *sensorhub,
+                               const struct ec_response_motion_sense_fifo_info
+                               *fifo_info,
+                               const ktime_t fifo_timestamp,
+                               ktime_t *current_timestamp,
+                               struct ec_response_motion_sensor_data *in,
+                               struct cros_ec_sensors_ring_sample *out)
+{
+       const s64 now = cros_ec_get_time_ns();
+       int axis, async_flags;
+
+       /* Do not populate the filter based on asynchronous events. */
+       async_flags = in->flags &
+               (MOTIONSENSE_SENSOR_FLAG_ODR | MOTIONSENSE_SENSOR_FLAG_FLUSH);
+
+       if (in->flags & MOTIONSENSE_SENSOR_FLAG_TIMESTAMP && !async_flags) {
+               s64 a = in->timestamp;
+               s64 b = fifo_info->timestamp;
+               s64 c = fifo_timestamp;
+
+               cros_ec_sensor_ring_fix_overflow(&a, 1LL << 32,
+                                         &sensorhub->overflow_a);
+               cros_ec_sensor_ring_fix_overflow(&b, 1LL << 32,
+                                         &sensorhub->overflow_b);
+
+               if (sensorhub->tight_timestamps) {
+                       cros_ec_sensor_ring_ts_filter_update(
+                                       &sensorhub->filter, b, c);
+                       *current_timestamp = cros_ec_sensor_ring_ts_filter(
+                                       &sensorhub->filter, a);
+               } else {
+                       s64 new_timestamp;
+
+                       /*
+                        * Disable filtering since we might add more jitter
+                        * if b is in a random point in time.
+                        */
+                       new_timestamp = fifo_timestamp -
+                                       fifo_info->timestamp  * 1000 +
+                                       in->timestamp * 1000;
+                       /*
+                        * The timestamp can be stale if we had to use the fifo
+                        * info timestamp.
+                        */
+                       if (new_timestamp - *current_timestamp > 0)
+                               *current_timestamp = new_timestamp;
+               }
+       }
+
+       if (in->flags & MOTIONSENSE_SENSOR_FLAG_ODR) {
+               if (sensorhub->tight_timestamps) {
+                       sensorhub->batch_state[in->sensor_num].last_len = 0;
+                       sensorhub->batch_state[in->sensor_num].penul_len = 0;
+               }
+               /*
+                * ODR change is only useful for the sensor_ring, it does not
+                * convey information to clients.
+                */
+               return false;
+       }
+
+       if (in->flags & MOTIONSENSE_SENSOR_FLAG_FLUSH) {
+               out->sensor_id = in->sensor_num;
+               out->timestamp = *current_timestamp;
+               out->flag = in->flags;
+               if (sensorhub->tight_timestamps)
+                       sensorhub->batch_state[out->sensor_id].last_len = 0;
+               /*
+                * No other payload information provided with
+                * flush ack.
+                */
+               return true;
+       }
+
+       if (in->flags & MOTIONSENSE_SENSOR_FLAG_TIMESTAMP)
+               /* If we just have a timestamp, skip this entry. */
+               return false;
+
+       /* Regular sample */
+       out->sensor_id = in->sensor_num;
+       if (*current_timestamp - now > 0) {
+               /*
+                * This fix is needed to overcome the timestamp filter putting
+                * events in the future.
+                */
+               sensorhub->future_timestamp_total_ns +=
+                       *current_timestamp - now;
+               if (++sensorhub->future_timestamp_count ==
+                               FUTURE_TS_ANALYTICS_COUNT_MAX) {
+                       s64 avg = div_s64(sensorhub->future_timestamp_total_ns,
+                                       sensorhub->future_timestamp_count);
+                       dev_warn_ratelimited(sensorhub->dev,
+                                            "100 timestamps in the future, %lldns shaved on average\n",
+                                            avg);
+                       sensorhub->future_timestamp_count = 0;
+                       sensorhub->future_timestamp_total_ns = 0;
+               }
+               out->timestamp = now;
+       } else {
+               out->timestamp = *current_timestamp;
+       }
+
+       out->flag = in->flags;
+       for (axis = 0; axis < 3; axis++)
+               out->vector[axis] = in->data[axis];
+
+       if (sensorhub->tight_timestamps)
+               cros_ec_sensor_ring_check_for_past_timestamp(sensorhub, out);
+       return true;
+}
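Worked example for the non-tight-timestamps branch above (illustrative numbers): with an EC event timestamp a = 900 us, an EC IRQ timestamp b = 1000 us and an AP IRQ timestamp c = 5,000,000 ns, the naive transform gives a' = c - b * 1000 + a * 1000 = 4,900,000 ns.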
+
+/*
+ * cros_ec_sensor_ring_spread_add: Calculate proper timestamps then add to
+ *                                 ringbuffer.
+ *
+ * This is the new spreading code; it assumes every sample's timestamp
+ * precedes the sample. Run if tight_timestamps == true.
+ *
+ * Sometimes the EC receives only one interrupt (hence timestamp) for
+ * a batch of samples. Only the first sample will have the correct
+ * timestamp. So we must interpolate the other samples.
+ * We use the previous batch timestamp and our current batch timestamp
+ * as a way to calculate period, then spread the samples evenly.
+ *
+ * s0 int, 0ms
+ * s1 int, 10ms
+ * s2 int, 20ms
+ * 30ms point goes by, no interrupt, previous one is still asserted
+ * downloading s2 and s3
+ * s3 sample, 20ms (incorrect timestamp)
+ * s4 int, 40ms
+ *
+ * The batches are [(s0), (s1), (s2, s3), (s4)]. Since the 3rd batch
+ * has 2 samples in it, we adjust the timestamp of s3.
+ * s2 - s1 = 10ms, so s3 must be s2 + 10ms => 30ms. Had s1 been
+ * part of a bigger batch, things would have gotten a little
+ * more complicated.
+ *
+ * Note: we also assume another sensor sample doesn't break up a batch
+ * in 2 or more partitions. Example, there can't ever be a sync sensor
+ * in between S2 and S3. This simplifies the following code.
+ */
+static void
+cros_ec_sensor_ring_spread_add(struct cros_ec_sensorhub *sensorhub,
+                              unsigned long sensor_mask,
+                              struct cros_ec_sensors_ring_sample *last_out)
+{
+       struct cros_ec_sensors_ring_sample *batch_start, *next_batch_start;
+       int id;
+
+       for_each_set_bit(id, &sensor_mask, sensorhub->sensor_num) {
+               for (batch_start = sensorhub->ring; batch_start < last_out;
+                    batch_start = next_batch_start) {
+                       /*
+                        * For each batch (where all samples have the same
+                        * timestamp).
+                        */
+                       int batch_len, sample_idx;
+                       struct cros_ec_sensors_ring_sample *batch_end =
+                               batch_start;
+                       struct cros_ec_sensors_ring_sample *s;
+                       s64 batch_timestamp = batch_start->timestamp;
+                       s64 sample_period;
+
+                       /*
+                        * Skip over batches that start with the sensor types
+                        * we're not looking at right now.
+                        */
+                       if (batch_start->sensor_id != id) {
+                               next_batch_start = batch_start + 1;
+                               continue;
+                       }
+
+                       /*
+                        * Do not start a batch
+                        * from a flush, as it happens asynchronously to the
+                        * regular flow of events.
+                        */
+                       if (batch_start->flag & MOTIONSENSE_SENSOR_FLAG_FLUSH) {
+                               cros_sensorhub_send_sample(sensorhub,
+                                                          batch_start);
+                               next_batch_start = batch_start + 1;
+                               continue;
+                       }
+
+                       if (batch_start->timestamp <=
+                           sensorhub->batch_state[id].last_ts) {
+                               batch_timestamp =
+                                       sensorhub->batch_state[id].last_ts;
+                               batch_len = sensorhub->batch_state[id].last_len;
+
+                               sample_idx = batch_len;
+
+                               sensorhub->batch_state[id].last_ts =
+                                 sensorhub->batch_state[id].penul_ts;
+                               sensorhub->batch_state[id].last_len =
+                                 sensorhub->batch_state[id].penul_len;
+                       } else {
+                               /*
+                                * Push the first sample in the batch to the
+                                * kfifo; it's guaranteed to be correct, the
+                                * rest will follow later on.
+                                */
+                               sample_idx = 1;
+                               batch_len = 1;
+                               cros_sensorhub_send_sample(sensorhub,
+                                                          batch_start);
+                               batch_start++;
+                       }
+
+                       /* Find all samples have the same timestamp. */
+                       for (s = batch_start; s < last_out; s++) {
+                               if (s->sensor_id != id)
+                                       /*
+                                        * Skip over other sensor types that
+                                        * are interleaved, don't count them.
+                                        */
+                                       continue;
+                               if (s->timestamp != batch_timestamp)
+                                       /* we discovered the next batch */
+                                       break;
+                               if (s->flag & MOTIONSENSE_SENSOR_FLAG_FLUSH)
+                                       /* break on flush packets */
+                                       break;
+                               batch_end = s;
+                               batch_len++;
+                       }
+
+                       if (batch_len == 1)
+                               goto done_with_this_batch;
+
+                       /* Can we calculate period? */
+                       if (sensorhub->batch_state[id].last_len == 0) {
+                               dev_warn(sensorhub->dev, "Sensor %d: lost %d samples when spreading\n",
+                                        id, batch_len - 1);
+                               goto done_with_this_batch;
+                               /*
+                                * Note: we're dropping the rest of the samples
+                                * in this batch since we have no idea where
+                                * they're supposed to go without a period
+                                * calculation.
+                                */
+                       }
+
+                       sample_period = div_s64(batch_timestamp -
+                               sensorhub->batch_state[id].last_ts,
+                               sensorhub->batch_state[id].last_len);
+                       dev_dbg(sensorhub->dev,
+                               "Adjusting %d samples, sensor %d last_batch @%lld (%d samples) batch_timestamp=%lld => period=%lld\n",
+                               batch_len, id,
+                               sensorhub->batch_state[id].last_ts,
+                               sensorhub->batch_state[id].last_len,
+                               batch_timestamp,
+                               sample_period);
+
+                       /*
+                        * Adjust timestamps of the samples then push them to
+                        * kfifo.
+                        */
+                       for (s = batch_start; s <= batch_end; s++) {
+                               if (s->sensor_id != id)
+                                       /*
+                                        * Skip over other sensor types that
+                                        * are interleaved, don't change them.
+                                        */
+                                       continue;
+
+                               s->timestamp = batch_timestamp +
+                                       sample_period * sample_idx;
+                               sample_idx++;
+
+                               cros_sensorhub_send_sample(sensorhub, s);
+                       }
+
+done_with_this_batch:
+                       sensorhub->batch_state[id].penul_ts =
+                               sensorhub->batch_state[id].last_ts;
+                       sensorhub->batch_state[id].penul_len =
+                               sensorhub->batch_state[id].last_len;
+
+                       sensorhub->batch_state[id].last_ts =
+                               batch_timestamp;
+                       sensorhub->batch_state[id].last_len = batch_len;
+
+                       next_batch_start = batch_end + 1;
+               }
+       }
+}
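Worked example of the re-stamping above (illustrative), continuing the s0..s4 scenario from the comment: for the (s2, s3) batch, batch_timestamp = 20 ms, while the previous batch leaves last_ts = 10 ms and last_len = 1, so sample_period = (20 - 10) / 1 = 10 ms and s3 is re-stamped to batch_timestamp + sample_period * 1 = 30 ms.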
+
+/*
+ * cros_ec_sensor_ring_spread_add_legacy: Calculate proper timestamps then
+ * add to ringbuffer (legacy).
+ *
+ * Note: This assumes we're running old firmware, where every sample's timestamp
+ * is after the sample. Run if tight_timestamps == false.
+ *
+ * If there is a sample with a proper timestamp
+ *
+ *                        timestamp | count
+ *                        -----------------
+ * older_unprocess_out --> TS1      | 1
+ *                         TS1      | 2
+ *                out -->  TS1      | 3
+ *           next_out -->  TS2      |
+ *
+ * We spread time for the samples [older_unprocess_out .. out]
+ * between TS1 and TS2: [TS1+1/4, TS1+2/4, TS1+3/4, TS2].
+ *
+ * If we reach the end of the samples, we compare with the
+ * current timestamp:
+ *
+ * older_unprocess_out --> TS1      | 1
+ *                         TS1      | 2
+ *                 out --> TS1      | 3
+ *
+ * We now have [TS1+1/3, TS1+2/3, current timestamp]
+ */
+static void
+cros_ec_sensor_ring_spread_add_legacy(struct cros_ec_sensorhub *sensorhub,
+                                     unsigned long sensor_mask,
+                                     s64 current_timestamp,
+                                     struct cros_ec_sensors_ring_sample
+                                     *last_out)
+{
+       struct cros_ec_sensors_ring_sample *out;
+       int i;
+
+       for_each_set_bit(i, &sensor_mask, sensorhub->sensor_num) {
+               s64 older_timestamp;
+               s64 timestamp;
+               struct cros_ec_sensors_ring_sample *older_unprocess_out =
+                       sensorhub->ring;
+               struct cros_ec_sensors_ring_sample *next_out;
+               int count = 1;
+
+               for (out = sensorhub->ring; out < last_out; out = next_out) {
+                       s64 time_period;
+
+                       next_out = out + 1;
+                       if (out->sensor_id != i)
+                               continue;
+
+                       /* Timestamp to start with */
+                       older_timestamp = out->timestamp;
+
+                       /* Find next sample. */
+                       while (next_out < last_out && next_out->sensor_id != i)
+                               next_out++;
+
+                       if (next_out >= last_out) {
+                               timestamp = current_timestamp;
+                       } else {
+                               timestamp = next_out->timestamp;
+                               if (timestamp == older_timestamp) {
+                                       count++;
+                                       continue;
+                               }
+                       }
+
+                       /*
+                        * The next sample has a new timestamp, spread the
+                        * unprocessed samples.
+                        */
+                       if (next_out < last_out)
+                               count++;
+                       time_period = div_s64(timestamp - older_timestamp,
+                                             count);
+
+                       for (; older_unprocess_out <= out;
+                                       older_unprocess_out++) {
+                               if (older_unprocess_out->sensor_id != i)
+                                       continue;
+                               older_timestamp += time_period;
+                               older_unprocess_out->timestamp =
+                                       older_timestamp;
+                       }
+                       count = 1;
+                       /* The next_out sample has a valid timestamp, skip. */
+                       next_out++;
+                       older_unprocess_out = next_out;
+               }
+       }
+
+       /* Push the event into the kfifo */
+       for (out = sensorhub->ring; out < last_out; out++)
+               cros_sensorhub_send_sample(sensorhub, out);
+}
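The spreading described in the comment block above reduces to simple integer arithmetic. Below is a minimal standalone sketch (ordinary userspace C, not kernel code; the function and variable names are invented for illustration) of how a run of samples sharing a stale timestamp is spread toward the next known timestamp, matching the division performed by cros_ec_sensor_ring_spread_add_legacy().

#include <stdio.h>
#include <stdint.h>

/*
 * Spread 'count' samples that all carry the stale timestamp 'ts1' evenly
 * up to 'ts2' (the next known timestamp). When a following sample is
 * present, the divisor is count + 1, mirroring the kernel code where the
 * next sample keeps ts2 for itself.
 */
static void spread_legacy(int64_t *samples, int count, int64_t ts1,
			  int64_t ts2, int next_sample_present)
{
	int divisor = next_sample_present ? count + 1 : count;
	int64_t period = (ts2 - ts1) / divisor;
	int k;

	for (k = 0; k < count; k++)
		samples[k] = ts1 + (k + 1) * period;
}

int main(void)
{
	int64_t samples[3] = { 1000, 1000, 1000 };	/* all stamped TS1 */

	/* A following sample in the FIFO carries TS2 = 2000. */
	spread_legacy(samples, 3, 1000, 2000, 1);
	printf("%lld %lld %lld\n", (long long)samples[0],
	       (long long)samples[1], (long long)samples[2]);	/* 1250 1500 1750 */

	/* End of FIFO: spread against the current timestamp instead. */
	samples[0] = samples[1] = samples[2] = 1000;
	spread_legacy(samples, 3, 1000, 1900, 0);
	printf("%lld %lld %lld\n", (long long)samples[0],
	       (long long)samples[1], (long long)samples[2]);	/* 1300 1600 1900 */
	return 0;
}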
+
+/**
+ * cros_ec_sensorhub_ring_handler() - The trigger handler function
+ *
+ * @sensorhub: Sensor Hub object.
+ *
+ * Called by the notifier to process the EC sensor FIFO queue.
+ */
+static void cros_ec_sensorhub_ring_handler(struct cros_ec_sensorhub *sensorhub)
+{
+       struct ec_response_motion_sense_fifo_info *fifo_info =
+               sensorhub->fifo_info;
+       struct cros_ec_dev *ec = sensorhub->ec;
+       ktime_t fifo_timestamp, current_timestamp;
+       int i, j, number_data, ret;
+       unsigned long sensor_mask = 0;
+       struct ec_response_motion_sensor_data *in;
+       struct cros_ec_sensors_ring_sample *out, *last_out;
+
+       mutex_lock(&sensorhub->cmd_lock);
+
+       /* Get FIFO information if there are lost vectors. */
+       if (fifo_info->total_lost) {
+               int fifo_info_length =
+                       sizeof(struct ec_response_motion_sense_fifo_info) +
+                       sizeof(u16) * sensorhub->sensor_num;
+
+               /* Need to retrieve the number of lost vectors per sensor */
+               sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INFO;
+               sensorhub->msg->outsize = 1;
+               sensorhub->msg->insize = fifo_info_length;
+
+               if (cros_ec_cmd_xfer_status(ec->ec_dev, sensorhub->msg) < 0)
+                       goto error;
+
+               memcpy(fifo_info, &sensorhub->resp->fifo_info,
+                      fifo_info_length);
+
+               /*
+                * Update collection time, will not be as precise as the
+                * non-error case.
+                */
+               fifo_timestamp = cros_ec_get_time_ns();
+       } else {
+               fifo_timestamp = sensorhub->fifo_timestamp[
+                       CROS_EC_SENSOR_NEW_TS];
+       }
+
+       if (fifo_info->count > sensorhub->fifo_size ||
+           fifo_info->size != sensorhub->fifo_size) {
+               dev_warn(sensorhub->dev,
+                        "Mismatched EC data: count %d, size %d - expected %d",
+                        fifo_info->count, fifo_info->size,
+                        sensorhub->fifo_size);
+               goto error;
+       }
+
+       /* Copy elements in the main fifo */
+       current_timestamp = sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS];
+       out = sensorhub->ring;
+       for (i = 0; i < fifo_info->count; i += number_data) {
+               sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_READ;
+               sensorhub->params->fifo_read.max_data_vector =
+                       fifo_info->count - i;
+               sensorhub->msg->outsize =
+                       sizeof(struct ec_params_motion_sense);
+               sensorhub->msg->insize =
+                       sizeof(sensorhub->resp->fifo_read) +
+                       sensorhub->params->fifo_read.max_data_vector *
+                         sizeof(struct ec_response_motion_sensor_data);
+               ret = cros_ec_cmd_xfer_status(ec->ec_dev, sensorhub->msg);
+               if (ret < 0) {
+                       dev_warn(sensorhub->dev, "Fifo error: %d\n", ret);
+                       break;
+               }
+               number_data = sensorhub->resp->fifo_read.number_data;
+               if (number_data == 0) {
+                       dev_dbg(sensorhub->dev, "Unexpected empty FIFO\n");
+                       break;
+               }
+               if (number_data > fifo_info->count - i) {
+                       dev_warn(sensorhub->dev,
+                                "Invalid EC data: too many entries received: %d, expected %d",
+                                number_data, fifo_info->count - i);
+                       break;
+               }
+               if (out + number_data >
+                   sensorhub->ring + fifo_info->count) {
+                       dev_warn(sensorhub->dev,
+                                "Too many samples: %d (%zd data) to %d entries for expected %d entries",
+                                i, out - sensorhub->ring, i + number_data,
+                                fifo_info->count);
+                       break;
+               }
+
+               for (in = sensorhub->resp->fifo_read.data, j = 0;
+                    j < number_data; j++, in++) {
+                       if (cros_ec_sensor_ring_process_event(
+                                               sensorhub, fifo_info,
+                                               fifo_timestamp,
+                                               &current_timestamp,
+                                               in, out)) {
+                               sensor_mask |= BIT(in->sensor_num);
+                               out++;
+                       }
+               }
+       }
+       mutex_unlock(&sensorhub->cmd_lock);
+       last_out = out;
+
+       if (out == sensorhub->ring)
+               /* Unexpected empty FIFO. */
+               goto ring_handler_end;
+
+       /*
+        * Check if current_timestamp is ahead of the last sample. Normally,
+        * the EC appends a timestamp after the last sample, but if the AP
+        * is slow to respond to the IRQ, the EC may have added new samples.
+        * Use the FIFO info timestamp as last timestamp then.
+        */
+       if (!sensorhub->tight_timestamps &&
+           (last_out - 1)->timestamp == current_timestamp)
+               current_timestamp = fifo_timestamp;
+
+       /* Warn on lost samples. */
+       if (fifo_info->total_lost)
+               for (i = 0; i < sensorhub->sensor_num; i++) {
+                       if (fifo_info->lost[i]) {
+                               dev_warn_ratelimited(sensorhub->dev,
+                                                    "Sensor %d: lost: %d out of %d\n",
+                                                    i, fifo_info->lost[i],
+                                                    fifo_info->total_lost);
+                               if (sensorhub->tight_timestamps)
+                                       sensorhub->batch_state[i].last_len = 0;
+                       }
+               }
+
+       /*
+        * Spread samples in case of batching, then add them to the
+        * ringbuffer.
+        */
+       if (sensorhub->tight_timestamps)
+               cros_ec_sensor_ring_spread_add(sensorhub, sensor_mask,
+                                              last_out);
+       else
+               cros_ec_sensor_ring_spread_add_legacy(sensorhub, sensor_mask,
+                                                     current_timestamp,
+                                                     last_out);
+
+ring_handler_end:
+       sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS] = current_timestamp;
+       return;
+
+error:
+       mutex_unlock(&sensorhub->cmd_lock);
+}
+
+static int cros_ec_sensorhub_event(struct notifier_block *nb,
+                                  unsigned long queued_during_suspend,
+                                  void *_notify)
+{
+       struct cros_ec_sensorhub *sensorhub;
+       struct cros_ec_device *ec_dev;
+
+       sensorhub = container_of(nb, struct cros_ec_sensorhub, notifier);
+       ec_dev = sensorhub->ec->ec_dev;
+
+       if (ec_dev->event_data.event_type != EC_MKBP_EVENT_SENSOR_FIFO)
+               return NOTIFY_DONE;
+
+       if (ec_dev->event_size != sizeof(ec_dev->event_data.data.sensor_fifo)) {
+               dev_warn(ec_dev->dev, "Invalid fifo info size\n");
+               return NOTIFY_DONE;
+       }
+
+       if (queued_during_suspend)
+               return NOTIFY_OK;
+
+       memcpy(sensorhub->fifo_info, &ec_dev->event_data.data.sensor_fifo.info,
+              sizeof(*sensorhub->fifo_info));
+       sensorhub->fifo_timestamp[CROS_EC_SENSOR_NEW_TS] =
+               ec_dev->last_event_time;
+       cros_ec_sensorhub_ring_handler(sensorhub);
+
+       return NOTIFY_OK;
+}
+
+/**
+ * cros_ec_sensorhub_ring_add() - Add the FIFO functionality if the EC
+ *                               supports it.
+ *
+ * @sensorhub: Sensor Hub object.
+ *
+ * Return: 0 on success.
+ */
+int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub)
+{
+       struct cros_ec_dev *ec = sensorhub->ec;
+       int ret;
+       int fifo_info_length =
+               sizeof(struct ec_response_motion_sense_fifo_info) +
+               sizeof(u16) * sensorhub->sensor_num;
+
+       /* Allocate the array for lost events. */
+       sensorhub->fifo_info = devm_kzalloc(sensorhub->dev, fifo_info_length,
+                                           GFP_KERNEL);
+       if (!sensorhub->fifo_info)
+               return -ENOMEM;
+
+       /* Retrieve FIFO information */
+       sensorhub->msg->version = 2;
+       sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INFO;
+       sensorhub->msg->outsize = 1;
+       sensorhub->msg->insize = fifo_info_length;
+
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, sensorhub->msg);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Allocate the full fifo. We need to copy the whole FIFO to set
+        * timestamps properly.
+        */
+       sensorhub->fifo_size = sensorhub->resp->fifo_info.size;
+       sensorhub->ring = devm_kcalloc(sensorhub->dev, sensorhub->fifo_size,
+                                      sizeof(*sensorhub->ring), GFP_KERNEL);
+       if (!sensorhub->ring)
+               return -ENOMEM;
+
+       /*
+        * Allocate the callback area based on the number of sensors.
+        */
+       sensorhub->push_data = devm_kcalloc(
+                       sensorhub->dev, sensorhub->sensor_num,
+                       sizeof(*sensorhub->push_data),
+                       GFP_KERNEL);
+       if (!sensorhub->push_data)
+               return -ENOMEM;
+
+       sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS] =
+               cros_ec_get_time_ns();
+
+       sensorhub->tight_timestamps = cros_ec_check_features(
+                       ec, EC_FEATURE_MOTION_SENSE_TIGHT_TIMESTAMPS);
+
+       if (sensorhub->tight_timestamps) {
+               sensorhub->batch_state = devm_kcalloc(sensorhub->dev,
+                               sensorhub->sensor_num,
+                               sizeof(*sensorhub->batch_state),
+                               GFP_KERNEL);
+               if (!sensorhub->batch_state)
+                       return -ENOMEM;
+       }
+
+       /* Register the notifier that will act as a top half interrupt. */
+       sensorhub->notifier.notifier_call = cros_ec_sensorhub_event;
+       ret = blocking_notifier_chain_register(&ec->ec_dev->event_notifier,
+                                              &sensorhub->notifier);
+       if (ret < 0)
+               return ret;
+
+       /* Start collecting samples. */
+       return cros_ec_sensorhub_ring_fifo_enable(sensorhub, true);
+}
+
+void cros_ec_sensorhub_ring_remove(void *arg)
+{
+       struct cros_ec_sensorhub *sensorhub = arg;
+       struct cros_ec_device *ec_dev = sensorhub->ec->ec_dev;
+
+       /* Disable the ring; prevent the EC from interrupting the AP needlessly. */
+       cros_ec_sensorhub_ring_fifo_enable(sensorhub, false);
+       blocking_notifier_chain_unregister(&ec_dev->event_notifier,
+                                          &sensorhub->notifier);
+}
index 46786d2..debea5c 100644 (file)
@@ -127,7 +127,8 @@ static int terminate_request(struct cros_ec_device *ec_dev)
         */
        spi_message_init(&msg);
        memset(&trans, 0, sizeof(trans));
-       trans.delay_usecs = ec_spi->end_of_msg_delay;
+       trans.delay.value = ec_spi->end_of_msg_delay;
+       trans.delay.unit = SPI_DELAY_UNIT_USECS;
        spi_message_add_tail(&trans, &msg);
 
        ret = spi_sync_locked(ec_spi->spi, &msg);
@@ -416,7 +417,8 @@ static int do_cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
        spi_message_init(&msg);
        if (ec_spi->start_of_msg_delay) {
                memset(&trans_delay, 0, sizeof(trans_delay));
-               trans_delay.delay_usecs = ec_spi->start_of_msg_delay;
+               trans_delay.delay.value = ec_spi->start_of_msg_delay;
+               trans_delay.delay.unit = SPI_DELAY_UNIT_USECS;
                spi_message_add_tail(&trans_delay, &msg);
        }
 
index 07dac97..d45ea5d 100644 (file)
@@ -149,14 +149,14 @@ static ssize_t version_show(struct device *dev,
        /* Get build info. */
        msg->command = EC_CMD_GET_BUILD_INFO + ec->cmd_offset;
        msg->insize = EC_HOST_PARAM_SIZE;
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
-       if (ret < 0)
-               count += scnprintf(buf + count, PAGE_SIZE - count,
-                                  "Build info:    XFER ERROR %d\n", ret);
-       else if (msg->result != EC_RES_SUCCESS)
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+       if (ret == -EPROTO) {
                count += scnprintf(buf + count, PAGE_SIZE - count,
                                   "Build info:    EC error %d\n", msg->result);
-       else {
+       } else if (ret < 0) {
+               count += scnprintf(buf + count, PAGE_SIZE - count,
+                                  "Build info:    XFER ERROR %d\n", ret);
+       } else {
                msg->data[EC_HOST_PARAM_SIZE - 1] = '\0';
                count += scnprintf(buf + count, PAGE_SIZE - count,
                                   "Build info:    %s\n", msg->data);
@@ -165,14 +165,14 @@ static ssize_t version_show(struct device *dev,
        /* Get chip info. */
        msg->command = EC_CMD_GET_CHIP_INFO + ec->cmd_offset;
        msg->insize = sizeof(*r_chip);
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
-       if (ret < 0)
-               count += scnprintf(buf + count, PAGE_SIZE - count,
-                                  "Chip info:     XFER ERROR %d\n", ret);
-       else if (msg->result != EC_RES_SUCCESS)
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+       if (ret == -EPROTO) {
                count += scnprintf(buf + count, PAGE_SIZE - count,
                                   "Chip info:     EC error %d\n", msg->result);
-       else {
+       } else if (ret < 0) {
+               count += scnprintf(buf + count, PAGE_SIZE - count,
+                                  "Chip info:     XFER ERROR %d\n", ret);
+       } else {
                r_chip = (struct ec_response_get_chip_info *)msg->data;
 
                r_chip->vendor[sizeof(r_chip->vendor) - 1] = '\0';
@@ -189,14 +189,14 @@ static ssize_t version_show(struct device *dev,
        /* Get board version */
        msg->command = EC_CMD_GET_BOARD_VERSION + ec->cmd_offset;
        msg->insize = sizeof(*r_board);
-       ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
-       if (ret < 0)
-               count += scnprintf(buf + count, PAGE_SIZE - count,
-                                  "Board version: XFER ERROR %d\n", ret);
-       else if (msg->result != EC_RES_SUCCESS)
+       ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg);
+       if (ret == -EPROTO) {
                count += scnprintf(buf + count, PAGE_SIZE - count,
                                   "Board version: EC error %d\n", msg->result);
-       else {
+       } else if (ret < 0) {
+               count += scnprintf(buf + count, PAGE_SIZE - count,
+                                  "Board version: XFER ERROR %d\n", ret);
+       } else {
                r_board = (struct ec_response_board_version *)msg->data;
 
                count += scnprintf(buf + count, PAGE_SIZE - count,
diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c
new file mode 100644 (file)
index 0000000..874269c
--- /dev/null
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Google LLC
+ *
+ * This driver provides the ability to view and manage Type C ports through the
+ * Chrome OS EC.
+ */
+
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_data/cros_ec_commands.h>
+#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_device.h>
+#include <linux/usb/typec.h>
+
+#define DRV_NAME "cros-ec-typec"
+
+/* Platform-specific data for the Chrome OS EC Type C controller. */
+struct cros_typec_data {
+       struct device *dev;
+       struct cros_ec_device *ec;
+       int num_ports;
+       unsigned int cmd_ver;
+       /* Array of ports, indexed by port number. */
+       struct typec_port *ports[EC_USB_PD_MAX_PORTS];
+       /* Initial capabilities for each port. */
+       struct typec_capability *caps[EC_USB_PD_MAX_PORTS];
+};
+
+static int cros_typec_parse_port_props(struct typec_capability *cap,
+                                      struct fwnode_handle *fwnode,
+                                      struct device *dev)
+{
+       const char *buf;
+       int ret;
+
+       memset(cap, 0, sizeof(*cap));
+       ret = fwnode_property_read_string(fwnode, "power-role", &buf);
+       if (ret) {
+               dev_err(dev, "power-role not found: %d\n", ret);
+               return ret;
+       }
+
+       ret = typec_find_port_power_role(buf);
+       if (ret < 0)
+               return ret;
+       cap->type = ret;
+
+       ret = fwnode_property_read_string(fwnode, "data-role", &buf);
+       if (ret) {
+               dev_err(dev, "data-role not found: %d\n", ret);
+               return ret;
+       }
+
+       ret = typec_find_port_data_role(buf);
+       if (ret < 0)
+               return ret;
+       cap->data = ret;
+
+       ret = fwnode_property_read_string(fwnode, "try-power-role", &buf);
+       if (ret) {
+               dev_err(dev, "try-power-role not found: %d\n", ret);
+               return ret;
+       }
+
+       ret = typec_find_power_role(buf);
+       if (ret < 0)
+               return ret;
+       cap->prefer_role = ret;
+
+       cap->fwnode = fwnode;
+
+       return 0;
+}
+
+static int cros_typec_init_ports(struct cros_typec_data *typec)
+{
+       struct device *dev = typec->dev;
+       struct typec_capability *cap;
+       struct fwnode_handle *fwnode;
+       const char *port_prop;
+       int ret;
+       int i;
+       int nports;
+       u32 port_num = 0;
+
+       nports = device_get_child_node_count(dev);
+       if (nports == 0) {
+               dev_err(dev, "No port entries found.\n");
+               return -ENODEV;
+       }
+
+       if (nports > typec->num_ports) {
+               dev_err(dev, "More ports listed than can be supported.\n");
+               return -EINVAL;
+       }
+
+       /* DT uses "reg" to specify port number. */
+       port_prop = dev->of_node ? "reg" : "port-number";
+       device_for_each_child_node(dev, fwnode) {
+               if (fwnode_property_read_u32(fwnode, port_prop, &port_num)) {
+                       ret = -EINVAL;
+                       dev_err(dev, "No port-number for port, aborting.\n");
+                       goto unregister_ports;
+               }
+
+               if (port_num >= typec->num_ports) {
+                       dev_err(dev, "Invalid port number.\n");
+                       ret = -EINVAL;
+                       goto unregister_ports;
+               }
+
+               dev_dbg(dev, "Registering port %d\n", port_num);
+
+               cap = devm_kzalloc(dev, sizeof(*cap), GFP_KERNEL);
+               if (!cap) {
+                       ret = -ENOMEM;
+                       goto unregister_ports;
+               }
+
+               typec->caps[port_num] = cap;
+
+               ret = cros_typec_parse_port_props(cap, fwnode, dev);
+               if (ret < 0)
+                       goto unregister_ports;
+
+               typec->ports[port_num] = typec_register_port(dev, cap);
+               if (IS_ERR(typec->ports[port_num])) {
+                       dev_err(dev, "Failed to register port %d\n", port_num);
+                       ret = PTR_ERR(typec->ports[port_num]);
+                       goto unregister_ports;
+               }
+       }
+
+       return 0;
+
+unregister_ports:
+       for (i = 0; i < typec->num_ports; i++)
+               typec_unregister_port(typec->ports[i]);
+       return ret;
+}
+
+static int cros_typec_ec_command(struct cros_typec_data *typec,
+                                unsigned int version,
+                                unsigned int command,
+                                void *outdata,
+                                unsigned int outsize,
+                                void *indata,
+                                unsigned int insize)
+{
+       struct cros_ec_command *msg;
+       int ret;
+
+       msg = kzalloc(sizeof(*msg) + max(outsize, insize), GFP_KERNEL);
+       if (!msg)
+               return -ENOMEM;
+
+       msg->version = version;
+       msg->command = command;
+       msg->outsize = outsize;
+       msg->insize = insize;
+
+       if (outsize)
+               memcpy(msg->data, outdata, outsize);
+
+       ret = cros_ec_cmd_xfer_status(typec->ec, msg);
+       if (ret >= 0 && insize)
+               memcpy(indata, msg->data, insize);
+
+       kfree(msg);
+       return ret;
+}
+
+static void cros_typec_set_port_params_v0(struct cros_typec_data *typec,
+               int port_num, struct ec_response_usb_pd_control *resp)
+{
+       struct typec_port *port = typec->ports[port_num];
+       enum typec_orientation polarity;
+
+       if (!resp->enabled)
+               polarity = TYPEC_ORIENTATION_NONE;
+       else if (!resp->polarity)
+               polarity = TYPEC_ORIENTATION_NORMAL;
+       else
+               polarity = TYPEC_ORIENTATION_REVERSE;
+
+       typec_set_pwr_role(port, resp->role ? TYPEC_SOURCE : TYPEC_SINK);
+       typec_set_orientation(port, polarity);
+}
+
+static void cros_typec_set_port_params_v1(struct cros_typec_data *typec,
+               int port_num, struct ec_response_usb_pd_control_v1 *resp)
+{
+       struct typec_port *port = typec->ports[port_num];
+       enum typec_orientation polarity;
+
+       if (!(resp->enabled & PD_CTRL_RESP_ENABLED_CONNECTED))
+               polarity = TYPEC_ORIENTATION_NONE;
+       else if (!resp->polarity)
+               polarity = TYPEC_ORIENTATION_NORMAL;
+       else
+               polarity = TYPEC_ORIENTATION_REVERSE;
+       typec_set_orientation(port, polarity);
+       typec_set_data_role(port, resp->role & PD_CTRL_RESP_ROLE_DATA ?
+                       TYPEC_HOST : TYPEC_DEVICE);
+       typec_set_pwr_role(port, resp->role & PD_CTRL_RESP_ROLE_POWER ?
+                       TYPEC_SOURCE : TYPEC_SINK);
+       typec_set_vconn_role(port, resp->role & PD_CTRL_RESP_ROLE_VCONN ?
+                       TYPEC_SOURCE : TYPEC_SINK);
+}
+
+static int cros_typec_port_update(struct cros_typec_data *typec, int port_num)
+{
+       struct ec_params_usb_pd_control req;
+       struct ec_response_usb_pd_control_v1 resp;
+       int ret;
+
+       if (port_num < 0 || port_num >= typec->num_ports) {
+               dev_err(typec->dev, "cannot get status for invalid port %d\n",
+                       port_num);
+               return -EINVAL;
+       }
+
+       req.port = port_num;
+       req.role = USB_PD_CTRL_ROLE_NO_CHANGE;
+       req.mux = USB_PD_CTRL_MUX_NO_CHANGE;
+       req.swap = USB_PD_CTRL_SWAP_NONE;
+
+       ret = cros_typec_ec_command(typec, typec->cmd_ver,
+                                   EC_CMD_USB_PD_CONTROL, &req, sizeof(req),
+                                   &resp, sizeof(resp));
+       if (ret < 0)
+               return ret;
+
+       dev_dbg(typec->dev, "Enabled %d: 0x%hhx\n", port_num, resp.enabled);
+       dev_dbg(typec->dev, "Role %d: 0x%hhx\n", port_num, resp.role);
+       dev_dbg(typec->dev, "Polarity %d: 0x%hhx\n", port_num, resp.polarity);
+       dev_dbg(typec->dev, "State %d: %s\n", port_num, resp.state);
+
+       if (typec->cmd_ver == 1)
+               cros_typec_set_port_params_v1(typec, port_num, &resp);
+       else
+               cros_typec_set_port_params_v0(typec, port_num,
+                       (struct ec_response_usb_pd_control *) &resp);
+
+       return 0;
+}
+
+static int cros_typec_get_cmd_version(struct cros_typec_data *typec)
+{
+       struct ec_params_get_cmd_versions_v1 req_v1;
+       struct ec_response_get_cmd_versions resp;
+       int ret;
+
+       /* We're interested in the PD control command version. */
+       req_v1.cmd = EC_CMD_USB_PD_CONTROL;
+       ret = cros_typec_ec_command(typec, 1, EC_CMD_GET_CMD_VERSIONS,
+                                   &req_v1, sizeof(req_v1), &resp,
+                                   sizeof(resp));
+       if (ret < 0)
+               return ret;
+
+       if (resp.version_mask & EC_VER_MASK(1))
+               typec->cmd_ver = 1;
+       else
+               typec->cmd_ver = 0;
+
+       dev_dbg(typec->dev, "PD Control has version mask 0x%hhx\n",
+               typec->cmd_ver);
+
+       return 0;
+}
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id cros_typec_acpi_id[] = {
+       { "GOOG0014", 0 },
+       {}
+};
+MODULE_DEVICE_TABLE(acpi, cros_typec_acpi_id);
+#endif
+
+#ifdef CONFIG_OF
+static const struct of_device_id cros_typec_of_match[] = {
+       { .compatible = "google,cros-ec-typec", },
+       {}
+};
+MODULE_DEVICE_TABLE(of, cros_typec_of_match);
+#endif
+
+static int cros_typec_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct cros_typec_data *typec;
+       struct ec_response_usb_pd_ports resp;
+       int ret, i;
+
+       typec = devm_kzalloc(dev, sizeof(*typec), GFP_KERNEL);
+       if (!typec)
+               return -ENOMEM;
+
+       typec->dev = dev;
+       typec->ec = dev_get_drvdata(pdev->dev.parent);
+       platform_set_drvdata(pdev, typec);
+
+       ret = cros_typec_get_cmd_version(typec);
+       if (ret < 0) {
+               dev_err(dev, "failed to get PD command version info\n");
+               return ret;
+       }
+
+       ret = cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_PORTS, NULL, 0,
+                                   &resp, sizeof(resp));
+       if (ret < 0)
+               return ret;
+
+       typec->num_ports = resp.num_ports;
+       if (typec->num_ports > EC_USB_PD_MAX_PORTS) {
+               dev_warn(typec->dev,
+                        "Too many ports reported: %d, limiting to max: %d\n",
+                        typec->num_ports, EC_USB_PD_MAX_PORTS);
+               typec->num_ports = EC_USB_PD_MAX_PORTS;
+       }
+
+       ret = cros_typec_init_ports(typec);
+       if (ret < 0)
+               return ret;
+
+       for (i = 0; i < typec->num_ports; i++) {
+               ret = cros_typec_port_update(typec, i);
+               if (ret < 0)
+                       goto unregister_ports;
+       }
+
+       return 0;
+
+unregister_ports:
+       for (i = 0; i < typec->num_ports; i++)
+               if (typec->ports[i])
+                       typec_unregister_port(typec->ports[i]);
+       return ret;
+}
+
+static struct platform_driver cros_typec_driver = {
+       .driver = {
+               .name = DRV_NAME,
+               .acpi_match_table = ACPI_PTR(cros_typec_acpi_id),
+               .of_match_table = of_match_ptr(cros_typec_of_match),
+       },
+       .probe = cros_typec_probe,
+};
+
+module_platform_driver(cros_typec_driver);
+
+MODULE_AUTHOR("Prashant Malani <pmalani@chromium.org>");
+MODULE_DESCRIPTION("Chrome OS EC Type C control");
+MODULE_LICENSE("GPL");
index 8edae46..46482d1 100644 (file)
@@ -40,7 +40,7 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj,
        msg->outsize = para_sz;
        msg->insize = resp_sz;
 
-       err = cros_ec_cmd_xfer(ecdev, msg);
+       err = cros_ec_cmd_xfer_status(ecdev, msg);
        if (err < 0) {
                dev_err(dev, "Error sending read request: %d\n", err);
                kfree(msg);
@@ -83,7 +83,7 @@ static ssize_t vboot_context_write(struct file *filp, struct kobject *kobj,
        msg->outsize = para_sz;
        msg->insize = 0;
 
-       err = cros_ec_cmd_xfer(ecdev, msg);
+       err = cros_ec_cmd_xfer_status(ecdev, msg);
        if (err < 0) {
                dev_err(dev, "Error sending write request: %d\n", err);
                kfree(msg);
diff --git a/drivers/platform/chrome/cros_usbpd_notify.c b/drivers/platform/chrome/cros_usbpd_notify.c
new file mode 100644 (file)
index 0000000..7f36142
--- /dev/null
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2020 Google LLC
+ *
+ * This driver serves as the receiver of cros_ec PD host events.
+ */
+
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_usbpd_notify.h>
+#include <linux/platform_device.h>
+
+#define DRV_NAME "cros-usbpd-notify"
+#define DRV_NAME_PLAT_ACPI "cros-usbpd-notify-acpi"
+#define ACPI_DRV_NAME "GOOG0003"
+
+static BLOCKING_NOTIFIER_HEAD(cros_usbpd_notifier_list);
+
+struct cros_usbpd_notify_data {
+       struct device *dev;
+       struct cros_ec_device *ec;
+       struct notifier_block nb;
+};
+
+/**
+ * cros_usbpd_register_notify - Register a notifier callback for PD events.
+ * @nb: Notifier block pointer to register
+ *
+ * On ACPI platforms this corresponds to host events on the ECPD
+ * "GOOG0003" ACPI device. On non-ACPI platforms this will filter mkbp events
+ * for USB PD events.
+ *
+ * Return: 0 on success or negative error code.
+ */
+int cros_usbpd_register_notify(struct notifier_block *nb)
+{
+       return blocking_notifier_chain_register(&cros_usbpd_notifier_list,
+                                               nb);
+}
+EXPORT_SYMBOL_GPL(cros_usbpd_register_notify);
+
+/**
+ * cros_usbpd_unregister_notify - Unregister notifier callback for PD events.
+ * @nb: Notifier block pointer to unregister
+ *
+ * Unregister a notifier callback that was previously registered with
+ * cros_usbpd_register_notify().
+ */
+void cros_usbpd_unregister_notify(struct notifier_block *nb)
+{
+       blocking_notifier_chain_unregister(&cros_usbpd_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(cros_usbpd_unregister_notify);
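For context, a consumer of these helpers registers an ordinary notifier block; the cros_usbpd_charger changes later in this merge follow the same pattern. The sketch below is illustrative only: the my_pd_* names are hypothetical and error handling is reduced to the bare minimum.

#include <linux/notifier.h>
#include <linux/platform_data/cros_usbpd_notify.h>

/* Called for every PD host event delivered through the chain above. */
static int my_pd_event(struct notifier_block *nb, unsigned long host_event,
		       void *data)
{
	/*
	 * host_event carries the EC_CMD_PD_HOST_EVENT_STATUS bits, or 0 on
	 * older platforms where the EC device could not be queried.
	 */
	return NOTIFY_OK;
}

static struct notifier_block my_pd_nb = {
	.notifier_call = my_pd_event,
};

static int my_consumer_init(void)
{
	return cros_usbpd_register_notify(&my_pd_nb);
}

static void my_consumer_exit(void)
{
	cros_usbpd_unregister_notify(&my_pd_nb);
}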
+
+/**
+ * cros_ec_pd_command - Send a command to the EC.
+ *
+ * @ec_dev: EC device
+ * @command: EC command
+ * @outdata: EC command output data
+ * @outsize: Size of outdata
+ * @indata: EC command input data
+ * @insize: Size of indata
+ *
+ * Return: >= 0 on success, negative error number on failure.
+ */
+static int cros_ec_pd_command(struct cros_ec_device *ec_dev,
+                             int command,
+                             uint8_t *outdata,
+                             int outsize,
+                             uint8_t *indata,
+                             int insize)
+{
+       struct cros_ec_command *msg;
+       int ret;
+
+       msg = kzalloc(sizeof(*msg) + max(insize, outsize), GFP_KERNEL);
+       if (!msg)
+               return -ENOMEM;
+
+       msg->command = command;
+       msg->outsize = outsize;
+       msg->insize = insize;
+
+       if (outsize)
+               memcpy(msg->data, outdata, outsize);
+
+       ret = cros_ec_cmd_xfer_status(ec_dev, msg);
+       if (ret < 0)
+               goto error;
+
+       if (insize)
+               memcpy(indata, msg->data, insize);
+error:
+       kfree(msg);
+       return ret;
+}
+
+static void cros_usbpd_get_event_and_notify(struct device  *dev,
+                                           struct cros_ec_device *ec_dev)
+{
+       struct ec_response_host_event_status host_event_status;
+       u32 event = 0;
+       int ret;
+
+       /*
+        * We still send a 0 event out to older devices which don't
+        * have the updated device hierarchy.
+        */
+       if (!ec_dev) {
+               dev_dbg(dev,
+                       "EC device inaccessible; sending 0 event status.\n");
+               goto send_notify;
+       }
+
+       /* Check for PD host events on EC. */
+       ret = cros_ec_pd_command(ec_dev, EC_CMD_PD_HOST_EVENT_STATUS,
+                                NULL, 0,
+                                (uint8_t *)&host_event_status,
+                                sizeof(host_event_status));
+       if (ret < 0) {
+               dev_warn(dev, "Can't get host event status (err: %d)\n", ret);
+               goto send_notify;
+       }
+
+       event = host_event_status.status;
+
+send_notify:
+       blocking_notifier_call_chain(&cros_usbpd_notifier_list, event, NULL);
+}
+
+#ifdef CONFIG_ACPI
+
+static void cros_usbpd_notify_acpi(acpi_handle device, u32 event, void *data)
+{
+       struct cros_usbpd_notify_data *pdnotify = data;
+
+       cros_usbpd_get_event_and_notify(pdnotify->dev, pdnotify->ec);
+}
+
+static int cros_usbpd_notify_probe_acpi(struct platform_device *pdev)
+{
+       struct cros_usbpd_notify_data *pdnotify;
+       struct device *dev = &pdev->dev;
+       struct acpi_device *adev;
+       struct cros_ec_device *ec_dev;
+       acpi_status status;
+
+       adev = ACPI_COMPANION(dev);
+
+       pdnotify = devm_kzalloc(dev, sizeof(*pdnotify), GFP_KERNEL);
+       if (!pdnotify)
+               return -ENOMEM;
+
+       /* Get the EC device pointer needed to talk to the EC. */
+       ec_dev = dev_get_drvdata(dev->parent);
+       if (!ec_dev) {
+               /*
+                * We continue even for older devices which don't have the
+                 * correct device hierarchy, namely, GOOG0003 is a child
+                * of GOOG0004.
+                */
+               dev_warn(dev, "Couldn't get Chrome EC device pointer.\n");
+       }
+
+       pdnotify->dev = dev;
+       pdnotify->ec = ec_dev;
+
+       status = acpi_install_notify_handler(adev->handle,
+                                            ACPI_ALL_NOTIFY,
+                                            cros_usbpd_notify_acpi,
+                                            pdnotify);
+       if (ACPI_FAILURE(status)) {
+               dev_warn(dev, "Failed to register notify handler %08x\n",
+                        status);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int cros_usbpd_notify_remove_acpi(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct acpi_device *adev = ACPI_COMPANION(dev);
+
+       acpi_remove_notify_handler(adev->handle, ACPI_ALL_NOTIFY,
+                                  cros_usbpd_notify_acpi);
+
+       return 0;
+}
+
+static const struct acpi_device_id cros_usbpd_notify_acpi_device_ids[] = {
+       { ACPI_DRV_NAME, 0 },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, cros_usbpd_notify_acpi_device_ids);
+
+static struct platform_driver cros_usbpd_notify_acpi_driver = {
+       .driver = {
+               .name = DRV_NAME_PLAT_ACPI,
+               .acpi_match_table = cros_usbpd_notify_acpi_device_ids,
+       },
+       .probe = cros_usbpd_notify_probe_acpi,
+       .remove = cros_usbpd_notify_remove_acpi,
+};
+
+#endif /* CONFIG_ACPI */
+
+static int cros_usbpd_notify_plat(struct notifier_block *nb,
+                                 unsigned long queued_during_suspend,
+                                 void *data)
+{
+       struct cros_usbpd_notify_data *pdnotify = container_of(nb,
+                       struct cros_usbpd_notify_data, nb);
+       struct cros_ec_device *ec_dev = (struct cros_ec_device *)data;
+       u32 host_event = cros_ec_get_host_event(ec_dev);
+
+       if (!host_event)
+               return NOTIFY_DONE;
+
+       if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) {
+               cros_usbpd_get_event_and_notify(pdnotify->dev, ec_dev);
+               return NOTIFY_OK;
+       }
+       return NOTIFY_DONE;
+}
+
+static int cros_usbpd_notify_probe_plat(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct cros_ec_dev *ecdev = dev_get_drvdata(dev->parent);
+       struct cros_usbpd_notify_data *pdnotify;
+       int ret;
+
+       pdnotify = devm_kzalloc(dev, sizeof(*pdnotify), GFP_KERNEL);
+       if (!pdnotify)
+               return -ENOMEM;
+
+       pdnotify->dev = dev;
+       pdnotify->ec = ecdev->ec_dev;
+       pdnotify->nb.notifier_call = cros_usbpd_notify_plat;
+
+       dev_set_drvdata(dev, pdnotify);
+
+       ret = blocking_notifier_chain_register(&ecdev->ec_dev->event_notifier,
+                                              &pdnotify->nb);
+       if (ret < 0) {
+               dev_err(dev, "Failed to register notifier\n");
+               return ret;
+       }
+
+       return 0;
+}
+
+static int cros_usbpd_notify_remove_plat(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct cros_ec_dev *ecdev = dev_get_drvdata(dev->parent);
+       struct cros_usbpd_notify_data *pdnotify =
+               (struct cros_usbpd_notify_data *)dev_get_drvdata(dev);
+
+       blocking_notifier_chain_unregister(&ecdev->ec_dev->event_notifier,
+                                          &pdnotify->nb);
+
+       return 0;
+}
+
+static struct platform_driver cros_usbpd_notify_plat_driver = {
+       .driver = {
+               .name = DRV_NAME,
+       },
+       .probe = cros_usbpd_notify_probe_plat,
+       .remove = cros_usbpd_notify_remove_plat,
+};
+
+static int __init cros_usbpd_notify_init(void)
+{
+       int ret;
+
+       ret = platform_driver_register(&cros_usbpd_notify_plat_driver);
+       if (ret < 0)
+               return ret;
+
+#ifdef CONFIG_ACPI
+       platform_driver_register(&cros_usbpd_notify_acpi_driver);
+#endif
+       return 0;
+}
+
+static void __exit cros_usbpd_notify_exit(void)
+{
+#ifdef CONFIG_ACPI
+       platform_driver_unregister(&cros_usbpd_notify_acpi_driver);
+#endif
+       platform_driver_unregister(&cros_usbpd_notify_plat_driver);
+}
+
+module_init(cros_usbpd_notify_init);
+module_exit(cros_usbpd_notify_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ChromeOS power delivery notifier device");
+MODULE_AUTHOR("Jon Flatley <jflat@chromium.org>");
+MODULE_ALIAS("platform:" DRV_NAME);
index dba3d44..8145185 100644 (file)
@@ -79,7 +79,7 @@ static DEFINE_IDA(event_ida);
 struct ec_event {
        u16 size;
        u16 type;
-       u16 event[0];
+       u16 event[];
 } __packed;
 
 #define ec_event_num_words(ev) (ev->size - 1)
@@ -96,7 +96,7 @@ struct ec_event_queue {
        int capacity;
        int head;
        int tail;
-       struct ec_event *entries[0];
+       struct ec_event *entries[];
 };
 
 /* Maximum number of events to store in ec_event_queue */
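The [0] to [] change above switches these trailing arrays to C99 flexible array members. A structure such as ec_event is then sized at allocation time; a hedged sketch of the usual kernel idiom (struct_size() from <linux/overflow.h>) is shown below, assuming it sits in the same source file as the struct definition, with the helper name invented for illustration.

#include <linux/overflow.h>
#include <linux/slab.h>

/* Allocate an ec_event with room for 'nwords' payload words. */
static struct ec_event *alloc_ec_event(size_t nwords)
{
	struct ec_event *ev;

	/* struct_size() accounts for the flexible 'event' array member. */
	ev = kzalloc(struct_size(ev, event, nwords), GFP_KERNEL);
	return ev;
}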
index 62f2761..c2bf4c9 100644 (file)
@@ -3,8 +3,11 @@
  * Copyright 2019 Google LLC
  */
 
+#include <linux/errno.h>
+#include <linux/export.h>
 #include <linux/platform_data/wilco-ec.h>
 #include <linux/string.h>
+#include <linux/types.h>
 #include <asm/unaligned.h>
 
 /* Operation code; what the EC should do with the property */
index f0d174b..3c587b4 100644 (file)
@@ -8,8 +8,12 @@
  * See Documentation/ABI/testing/sysfs-platform-wilco-ec for more information.
  */
 
+#include <linux/device.h>
+#include <linux/kernel.h>
 #include <linux/platform_data/wilco-ec.h>
+#include <linux/string.h>
 #include <linux/sysfs.h>
+#include <linux/types.h>
 
 #define CMD_KB_CMOS                    0x7C
 #define SUB_CMD_KB_CMOS_AUTO_ON                0x03
index 195bc04..f3424fd 100644 (file)
@@ -659,7 +659,7 @@ config CHARGER_RT9455
 
 config CHARGER_CROS_USBPD
        tristate "ChromeOS EC based USBPD charger"
-       depends on CROS_EC
+       depends on CROS_USBPD_NOTIFY
        default n
        help
          Say Y here to enable ChromeOS EC based USBPD charger
index 30c3d37..2a45e84 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/platform_data/cros_ec_commands.h>
 #include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_usbpd_notify.h>
 #include <linux/platform_device.h>
 #include <linux/power_supply.h>
 #include <linux/slab.h>
@@ -517,32 +518,21 @@ static int cros_usbpd_charger_property_is_writeable(struct power_supply *psy,
 }
 
 static int cros_usbpd_charger_ec_event(struct notifier_block *nb,
-                                      unsigned long queued_during_suspend,
+                                      unsigned long host_event,
                                       void *_notify)
 {
-       struct cros_ec_device *ec_device;
-       struct charger_data *charger;
-       u32 host_event;
+       struct charger_data *charger = container_of(nb, struct charger_data,
+                                                   notifier);
 
-       charger = container_of(nb, struct charger_data, notifier);
-       ec_device = charger->ec_device;
-
-       host_event = cros_ec_get_host_event(ec_device);
-       if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) {
-               cros_usbpd_charger_power_changed(charger->ports[0]->psy);
-               return NOTIFY_OK;
-       } else {
-               return NOTIFY_DONE;
-       }
+       cros_usbpd_charger_power_changed(charger->ports[0]->psy);
+       return NOTIFY_OK;
 }
 
 static void cros_usbpd_charger_unregister_notifier(void *data)
 {
        struct charger_data *charger = data;
-       struct cros_ec_device *ec_device = charger->ec_device;
 
-       blocking_notifier_chain_unregister(&ec_device->event_notifier,
-                                          &charger->notifier);
+       cros_usbpd_unregister_notify(&charger->notifier);
 }
 
 static int cros_usbpd_charger_probe(struct platform_device *pd)
@@ -676,21 +666,17 @@ static int cros_usbpd_charger_probe(struct platform_device *pd)
                goto fail;
        }
 
-       if (ec_device->mkbp_event_supported) {
-               /* Get PD events from the EC */
-               charger->notifier.notifier_call = cros_usbpd_charger_ec_event;
-               ret = blocking_notifier_chain_register(
-                                               &ec_device->event_notifier,
-                                               &charger->notifier);
-               if (ret < 0) {
-                       dev_warn(dev, "failed to register notifier\n");
-               } else {
-                       ret = devm_add_action_or_reset(dev,
-                                       cros_usbpd_charger_unregister_notifier,
-                                       charger);
-                       if (ret < 0)
-                               goto fail;
-               }
+       /* Get PD events from the EC */
+       charger->notifier.notifier_call = cros_usbpd_charger_ec_event;
+       ret = cros_usbpd_register_notify(&charger->notifier);
+       if (ret < 0) {
+               dev_warn(dev, "failed to register notifier\n");
+       } else {
+               ret = devm_add_action_or_reset(dev,
+                               cros_usbpd_charger_unregister_notifier,
+                               charger);
+               if (ret < 0)
+                       goto fail;
        }
 
        return 0;
index 24709c5..e061b7d 100644 (file)
@@ -31,7 +31,7 @@ void ps3_sys_manager_register_ops(const struct ps3_sys_manager_ops *ops)
 {
        BUG_ON(!ops);
        BUG_ON(!ops->dev);
-       ps3_sys_manager_ops = ops ? *ops : ps3_sys_manager_ops;
+       ps3_sys_manager_ops = *ops;
 }
 EXPORT_SYMBOL_GPL(ps3_sys_manager_register_ops);
 
index f942a33..ec873f0 100644 (file)
@@ -590,6 +590,16 @@ config RTC_DRV_RC5T583
          This driver can also be built as a module. If so, the module
          will be called rtc-rc5t583.
 
+config RTC_DRV_RC5T619
+       tristate "RICOH RC5T619 RTC driver"
+       depends on MFD_RN5T618
+       help
+         If you say yes here you get support for the RTC on the
+         RICOH RC5T619 chips.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-rc5t619.
+
 config RTC_DRV_S35390A
        tristate "Seiko Instruments S-35390A"
        select BITREVERSE
index 3b66ee9..0721752 100644 (file)
@@ -133,6 +133,7 @@ obj-$(CONFIG_RTC_DRV_PXA)   += rtc-pxa.o
 obj-$(CONFIG_RTC_DRV_R7301)    += rtc-r7301.o
 obj-$(CONFIG_RTC_DRV_R9701)    += rtc-r9701.o
 obj-$(CONFIG_RTC_DRV_RC5T583)  += rtc-rc5t583.o
+obj-$(CONFIG_RTC_DRV_RC5T619)  += rtc-rc5t619.o
 obj-$(CONFIG_RTC_DRV_RK808)    += rtc-rk808.o
 obj-$(CONFIG_RTC_DRV_RP5C01)   += rtc-rp5c01.o
 obj-$(CONFIG_RTC_DRV_RS5C313)  += rtc-rs5c313.o
diff --git a/drivers/rtc/rtc-rc5t619.c b/drivers/rtc/rtc-rc5t619.c
new file mode 100644 (file)
index 0000000..24e386e
--- /dev/null
@@ -0,0 +1,444 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * drivers/rtc/rtc-rc5t619.c
+ *
+ * Real time clock driver for RICOH RC5T619 power management chip.
+ *
+ * Copyright (C) 2019 Andreas Kemnade
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mfd/rn5t618.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
+#include <linux/irqdomain.h>
+
+struct rc5t619_rtc {
+       int                     irq;
+       struct rtc_device       *rtc;
+       struct rn5t618 *rn5t618;
+};
+
+#define CTRL1_ALARM_ENABLED 0x40
+#define CTRL1_24HR 0x20
+#define CTRL1_PERIODIC_MASK 0xf
+
+#define CTRL2_PON 0x10
+#define CTRL2_ALARM_STATUS 0x80
+#define CTRL2_CTFG 0x4
+#define CTRL2_CTC 0x1
+
+#define MONTH_CENTFLAG 0x80
+#define HOUR_PMFLAG 0x20
+#define MDAY_DAL_EXT 0x80
+
+static uint8_t rtc5t619_12hour_bcd2bin(uint8_t hour)
+{
+       if (hour & HOUR_PMFLAG) {
+               hour = bcd2bin(hour & ~HOUR_PMFLAG);
+               return hour == 12 ? 12 : 12 + hour;
+       }
+
+       hour = bcd2bin(hour);
+       return hour == 12 ? 0 : hour;
+}
+
+static uint8_t rtc5t619_12hour_bin2bcd(uint8_t hour)
+{
+       if (!hour)
+               return 0x12;
+
+       if (hour < 12)
+               return bin2bcd(hour);
+
+       if (hour == 12)
+               return 0x12 | HOUR_PMFLAG;
+
+       return bin2bcd(hour - 12) | HOUR_PMFLAG;
+}
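The two helpers above implement the chip's 12-hour register encoding, where 0x12 with the PM flag clear means midnight and 0x12 with HOUR_PMFLAG set means noon. The standalone userspace sketch below (with the BCD helpers reimplemented only to keep it self-contained) round-trips every hour of the day through that encoding.

#include <stdio.h>
#include <stdint.h>

#define HOUR_PMFLAG 0x20

static uint8_t bcd2bin(uint8_t v) { return (v >> 4) * 10 + (v & 0x0f); }
static uint8_t bin2bcd(uint8_t v) { return ((v / 10) << 4) | (v % 10); }

/* Decode the 12-hour register value to a 0-23 hour. */
static uint8_t reg_to_hour24(uint8_t reg)
{
	uint8_t hour;

	if (reg & HOUR_PMFLAG) {
		hour = bcd2bin(reg & ~HOUR_PMFLAG);
		return hour == 12 ? 12 : 12 + hour;
	}
	hour = bcd2bin(reg);
	return hour == 12 ? 0 : hour;
}

/* Encode a 0-23 hour into the 12-hour register value. */
static uint8_t hour24_to_reg(uint8_t hour)
{
	if (!hour)
		return 0x12;
	if (hour < 12)
		return bin2bcd(hour);
	if (hour == 12)
		return 0x12 | HOUR_PMFLAG;
	return bin2bcd(hour - 12) | HOUR_PMFLAG;
}

int main(void)
{
	int h;

	/* Round-trip every hour of the day through the register encoding. */
	for (h = 0; h < 24; h++)
		printf("%02d -> 0x%02x -> %02d\n", h, hour24_to_reg(h),
		       reg_to_hour24(hour24_to_reg(h)));
	return 0;
}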
+
+static int rc5t619_rtc_periodic_disable(struct device *dev)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+       int err;
+
+       /* disable function */
+       err = regmap_update_bits(rtc->rn5t618->regmap,
+                                RN5T618_RTC_CTRL1, CTRL1_PERIODIC_MASK, 0);
+       if (err < 0)
+               return err;
+
+       /* clear alarm flag and CTFG */
+       err = regmap_update_bits(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2,
+                                CTRL2_ALARM_STATUS | CTRL2_CTFG | CTRL2_CTC,
+                                0);
+       if (err < 0)
+               return err;
+
+       return 0;
+}
+
+/* things to be done once after power on */
+static int rc5t619_rtc_pon_setup(struct device *dev)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+       int err;
+       unsigned int reg_data;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &reg_data);
+       if (err < 0)
+               return err;
+
+       /* clear VDET PON */
+       reg_data &= ~(CTRL2_PON | CTRL2_CTC | 0x4a);    /* 0101-1011 */
+       reg_data |= 0x20;       /* 0010-0000 */
+       err = regmap_write(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, reg_data);
+       if (err < 0)
+               return err;
+
+       /* clearing RTC Adjust register */
+       err = regmap_write(rtc->rn5t618->regmap, RN5T618_RTC_ADJUST, 0);
+       if (err)
+               return err;
+
+       return regmap_update_bits(rtc->rn5t618->regmap,
+                                       RN5T618_RTC_CTRL1,
+                                       CTRL1_24HR, CTRL1_24HR);
+}
+
+static int rc5t619_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+       u8 buff[7];
+       int err;
+       int cent_flag;
+       unsigned int ctrl1;
+       unsigned int ctrl2;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &ctrl2);
+       if (err < 0)
+               return err;
+
+       if (ctrl2 & CTRL2_PON)
+               return -EINVAL;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+       if (err < 0)
+               return err;
+
+       err = regmap_bulk_read(rtc->rn5t618->regmap, RN5T618_RTC_SECONDS,
+                              buff, sizeof(buff));
+       if (err < 0)
+               return err;
+
+       if (buff[5] & MONTH_CENTFLAG)
+               cent_flag = 1;
+       else
+               cent_flag = 0;
+
+       tm->tm_sec  = bcd2bin(buff[0]);
+       tm->tm_min  = bcd2bin(buff[1]);
+
+       if (ctrl1 & CTRL1_24HR)
+               tm->tm_hour = bcd2bin(buff[2]);
+       else
+               tm->tm_hour = rtc5t619_12hour_bcd2bin(buff[2]);
+
+       tm->tm_wday = bcd2bin(buff[3]);
+       tm->tm_mday = bcd2bin(buff[4]);
+       tm->tm_mon  = bcd2bin(buff[5] & 0x1f) - 1; /* back to system 0-11 */
+       tm->tm_year = bcd2bin(buff[6]) + 100 * cent_flag;
+
+       return 0;
+}
+
+static int rc5t619_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+       u8 buff[7];
+       int err;
+       int cent_flag;
+       unsigned int ctrl1;
+       unsigned int ctrl2;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &ctrl2);
+       if (err < 0)
+               return err;
+
+       if (ctrl2 & CTRL2_PON)
+               rc5t619_rtc_pon_setup(dev);
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+       if (err < 0)
+               return err;
+
+       if (tm->tm_year >= 100)
+               cent_flag = 1;
+       else
+               cent_flag = 0;
+
+       buff[0] = bin2bcd(tm->tm_sec);
+       buff[1] = bin2bcd(tm->tm_min);
+
+       if (ctrl1 & CTRL1_24HR)
+               buff[2] = bin2bcd(tm->tm_hour);
+       else
+               buff[2] = rtc5t619_12hour_bin2bcd(tm->tm_hour);
+
+       buff[3] = bin2bcd(tm->tm_wday);
+       buff[4] = bin2bcd(tm->tm_mday);
+       buff[5] = bin2bcd(tm->tm_mon + 1);      /* system set 0-11 */
+       buff[6] = bin2bcd(tm->tm_year - cent_flag * 100);
+
+       if (cent_flag)
+               buff[5] |= MONTH_CENTFLAG;
+
+       err = regmap_bulk_write(rtc->rn5t618->regmap, RN5T618_RTC_SECONDS,
+                               buff, sizeof(buff));
+       if (err < 0) {
+               dev_err(dev, "failed to program new time: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/* 0-disable, 1-enable */
+static int rc5t619_rtc_alarm_enable(struct device *dev, unsigned int enabled)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+
+       return regmap_update_bits(rtc->rn5t618->regmap,
+                       RN5T618_RTC_CTRL1,
+                       CTRL1_ALARM_ENABLED,
+                       enabled ? CTRL1_ALARM_ENABLED : 0);
+}
+
+static int rc5t619_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+       u8 buff[6];
+       unsigned int buff_cent;
+       int err;
+       int cent_flag;
+       unsigned int ctrl1;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+       if (err)
+               return err;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_MONTH, &buff_cent);
+       if (err < 0) {
+               dev_err(dev, "failed to read time: %d\n", err);
+               return err;
+       }
+
+       if (buff_cent & MONTH_CENTFLAG)
+               cent_flag = 1;
+       else
+               cent_flag = 0;
+
+       err = regmap_bulk_read(rtc->rn5t618->regmap, RN5T618_RTC_ALARM_Y_SEC,
+                              buff, sizeof(buff));
+       if (err)
+               return err;
+
+       buff[3] = buff[3] & 0x3f;
+
+       alrm->time.tm_sec  = bcd2bin(buff[0]);
+       alrm->time.tm_min  = bcd2bin(buff[1]);
+
+       if (ctrl1 & CTRL1_24HR)
+               alrm->time.tm_hour = bcd2bin(buff[2]);
+       else
+               alrm->time.tm_hour = rtc5t619_12hour_bcd2bin(buff[2]);
+
+       alrm->time.tm_mday = bcd2bin(buff[3]);
+       alrm->time.tm_mon  = bcd2bin(buff[4]) - 1;
+       alrm->time.tm_year = bcd2bin(buff[5]) + 100 * cent_flag;
+       alrm->enabled = !!(ctrl1 & CTRL1_ALARM_ENABLED);
+       dev_dbg(dev, "read alarm: %ptR\n", &alrm->time);
+
+       return 0;
+}
+
+static int rc5t619_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+       u8 buff[6];
+       int err;
+       int cent_flag;
+       unsigned int ctrl1;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL1, &ctrl1);
+       if (err)
+               return err;
+
+       err = rc5t619_rtc_alarm_enable(dev, 0);
+       if (err < 0)
+               return err;
+
+       if (rtc->irq == -1)
+               return -EINVAL;
+
+       if (alrm->enabled == 0)
+               return 0;
+
+       if (alrm->time.tm_year >= 100)
+               cent_flag = 1;
+       else
+               cent_flag = 0;
+
+       alrm->time.tm_mon += 1;
+       buff[0] = bin2bcd(alrm->time.tm_sec);
+       buff[1] = bin2bcd(alrm->time.tm_min);
+
+       if (ctrl1 & CTRL1_24HR)
+               buff[2] = bin2bcd(alrm->time.tm_hour);
+       else
+               buff[2] = rtc5t619_12hour_bin2bcd(alrm->time.tm_hour);
+
+       buff[3] = bin2bcd(alrm->time.tm_mday);
+       buff[4] = bin2bcd(alrm->time.tm_mon);
+       buff[5] = bin2bcd(alrm->time.tm_year - 100 * cent_flag);
+       buff[3] |= MDAY_DAL_EXT;
+
+       err = regmap_bulk_write(rtc->rn5t618->regmap, RN5T618_RTC_ALARM_Y_SEC,
+                               buff, sizeof(buff));
+       if (err < 0)
+               return err;
+
+       return rc5t619_rtc_alarm_enable(dev, alrm->enabled);
+}
+
+static const struct rtc_class_ops rc5t619_rtc_ops = {
+       .read_time      = rc5t619_rtc_read_time,
+       .set_time       = rc5t619_rtc_set_time,
+       .set_alarm      = rc5t619_rtc_set_alarm,
+       .read_alarm     = rc5t619_rtc_read_alarm,
+       .alarm_irq_enable = rc5t619_rtc_alarm_enable,
+};
+
+static int rc5t619_rtc_alarm_flag_clr(struct device *dev)
+{
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+
+       /* clear alarm-D status bits. */
+       return regmap_update_bits(rtc->rn5t618->regmap,
+                               RN5T618_RTC_CTRL2,
+                               CTRL2_ALARM_STATUS | CTRL2_CTC, 0);
+}
+
+static irqreturn_t rc5t619_rtc_irq(int irq, void *data)
+{
+       struct device *dev = data;
+       struct rc5t619_rtc *rtc = dev_get_drvdata(dev);
+
+       rc5t619_rtc_alarm_flag_clr(dev);
+
+       rtc_update_irq(rtc->rtc, 1, RTC_IRQF | RTC_AF);
+       return IRQ_HANDLED;
+}
+
+static int rc5t619_rtc_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct rn5t618 *rn5t618 = dev_get_drvdata(pdev->dev.parent);
+       struct rc5t619_rtc *rtc;
+       unsigned int ctrl2;
+       int err;
+
+       rtc = devm_kzalloc(dev, sizeof(*rtc), GFP_KERNEL);
+       if (!rtc)
+               return -ENOMEM;
+
+       rtc->rn5t618 = rn5t618;
+
+       dev_set_drvdata(dev, rtc);
+       rtc->irq = -1;
+
+       if (rn5t618->irq_data)
+               rtc->irq = regmap_irq_get_virq(rn5t618->irq_data,
+                                              RN5T618_IRQ_RTC);
+
+       if (rtc->irq < 0)
+               rtc->irq = -1;
+
+       err = regmap_read(rtc->rn5t618->regmap, RN5T618_RTC_CTRL2, &ctrl2);
+       if (err < 0)
+               return err;
+
+       /* disable rtc periodic function */
+       err = rc5t619_rtc_periodic_disable(&pdev->dev);
+       if (err)
+               return err;
+
+       if (ctrl2 & CTRL2_PON) {
+               err = rc5t619_rtc_alarm_flag_clr(&pdev->dev);
+               if (err)
+                       return err;
+       }
+
+       rtc->rtc = devm_rtc_allocate_device(&pdev->dev);
+       if (IS_ERR(rtc->rtc)) {
+               err = PTR_ERR(rtc->rtc);
+               dev_err(dev, "failed to allocate RTC device: %d\n", err);
+               return err;
+       }
+
+       rtc->rtc->ops = &rc5t619_rtc_ops;
+       rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_1900;
+       rtc->rtc->range_max = RTC_TIMESTAMP_END_2099;
+
+       /* set interrupt and enable it */
+       if (rtc->irq != -1) {
+               err = devm_request_threaded_irq(&pdev->dev, rtc->irq, NULL,
+                                               rc5t619_rtc_irq,
+                                               IRQF_ONESHOT,
+                                               "rtc-rc5t619",
+                                               &pdev->dev);
+               if (err < 0) {
+                       dev_err(&pdev->dev, "failed to request IRQ %d\n", rtc->irq);
+                       rtc->irq = -1;
+
+                       err = rc5t619_rtc_alarm_enable(&pdev->dev, 0);
+                       if (err)
+                               return err;
+
+               } else {
+                       /* enable wake */
+                       device_init_wakeup(&pdev->dev, 1);
+                       enable_irq_wake(rtc->irq);
+               }
+       } else {
+               /* the system does not use the alarm interrupt, so disable it */
+               err = rc5t619_rtc_alarm_enable(&pdev->dev, 0);
+               if (err)
+                       return err;
+
+               dev_warn(&pdev->dev, "rc5t619 interrupt is disabled\n");
+       }
+
+       return rtc_register_device(rtc->rtc);
+}
+
+static struct platform_driver rc5t619_rtc_driver = {
+       .driver = {
+               .name   = "rc5t619-rtc",
+       },
+       .probe  = rc5t619_rtc_probe,
+};
+
+module_platform_driver(rc5t619_rtc_driver);
+MODULE_ALIAS("platform:rc5t619-rtc");
+MODULE_DESCRIPTION("RICOH RC5T619 RTC driver");
+MODULE_LICENSE("GPL");
index 80d2229..384edff 100644 (file)
@@ -57,11 +57,26 @@ static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
        return copy_to_iter(addr, bytes, i);
 }
 
+static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,
+                                      pgoff_t pgoff, size_t nr_pages)
+{
+       long rc;
+       void *kaddr;
+
+       rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+       if (rc < 0)
+               return rc;
+       memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+       dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+       return 0;
+}
+
 static const struct dax_operations dcssblk_dax_ops = {
        .direct_access = dcssblk_dax_direct_access,
        .dax_supported = generic_fsdax_supported,
        .copy_from_iter = dcssblk_dax_copy_from_iter,
        .copy_to_iter = dcssblk_dax_copy_to_iter,
+       .zero_page_range = dcssblk_dax_zero_page_range,
 };
 
 struct dcssblk_dev_info {
@@ -680,8 +695,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 
        dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
                        &dcssblk_dax_ops, DAXDEV_F_SYNC);
-       if (!dev_info->dax_dev) {
-               rc = -ENOMEM;
+       if (IS_ERR(dev_info->dax_dev)) {
+               rc = PTR_ERR(dev_info->dax_dev);
+               dev_info->dax_dev = NULL;
                goto put_dev;
        }
 
index 50007cb..b29fe8d 100644 (file)
@@ -849,8 +849,10 @@ static void io_subchannel_register(struct ccw_device *cdev)
         * Now we know this subchannel will stay, we can throw
         * our delayed uevent.
         */
-       dev_set_uevent_suppress(&sch->dev, 0);
-       kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+       if (dev_get_uevent_suppress(&sch->dev)) {
+               dev_set_uevent_suppress(&sch->dev, 0);
+               kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+       }
        /* make it known to the system */
        ret = ccw_device_add(cdev);
        if (ret) {
@@ -1058,8 +1060,11 @@ static int io_subchannel_probe(struct subchannel *sch)
                 * Throw the delayed uevent for the subchannel, register
                 * the ccw_device and exit.
                 */
-               dev_set_uevent_suppress(&sch->dev, 0);
-               kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+               if (dev_get_uevent_suppress(&sch->dev)) {
+                       /* should always be the case for the console */
+                       dev_set_uevent_suppress(&sch->dev, 0);
+                       kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+               }
                cdev = sch_get_cdev(sch);
                rc = ccw_device_add(cdev);
                if (rc) {
index b0beafc..b8453b5 100644 (file)
@@ -374,7 +374,6 @@ int tiqdio_allocate_memory(void);
 void tiqdio_free_memory(void);
 int tiqdio_register_thinints(void);
 void tiqdio_unregister_thinints(void);
-void clear_nonshared_ind(struct qdio_irq *);
 int test_nonshared_ind(struct qdio_irq *);
 
 /* prototypes for setup */
index 5a3d9ee..286b044 100644 (file)
@@ -58,25 +58,11 @@ static void qdio_clear_dbf_list(void)
        mutex_unlock(&qdio_dbf_list_mutex);
 }
 
-int qdio_allocate_dbf(struct qdio_initialize *init_data,
-                      struct qdio_irq *irq_ptr)
+int qdio_allocate_dbf(struct qdio_irq *irq_ptr)
 {
        char text[QDIO_DBF_NAME_LEN];
        struct qdio_dbf_entry *new_entry;
 
-       DBF_EVENT("qfmt:%1d", init_data->q_format);
-       DBF_HEX(init_data->adapter_name, 8);
-       DBF_EVENT("qpff%4x", init_data->qib_param_field_format);
-       DBF_HEX(&init_data->qib_param_field, sizeof(void *));
-       DBF_HEX(&init_data->input_slib_elements, sizeof(void *));
-       DBF_HEX(&init_data->output_slib_elements, sizeof(void *));
-       DBF_EVENT("niq:%1d noq:%1d", init_data->no_input_qs,
-                 init_data->no_output_qs);
-       DBF_HEX(&init_data->input_handler, sizeof(void *));
-       DBF_HEX(&init_data->output_handler, sizeof(void *));
-       DBF_HEX(&init_data->int_parm, sizeof(long));
-       DBF_HEX(&init_data->input_sbal_addr_array, sizeof(void *));
-       DBF_HEX(&init_data->output_sbal_addr_array, sizeof(void *));
        DBF_EVENT("irq:%8lx", (unsigned long)irq_ptr);
 
        /* allocate trace view for the interface */
index 122450b..0dfba08 100644 (file)
@@ -64,8 +64,7 @@ static inline void DBF_DEV_HEX(struct qdio_irq *dev, void *addr,
        debug_event(dev->debug_area, level, addr, len);
 }
 
-int qdio_allocate_dbf(struct qdio_initialize *init_data,
-                      struct qdio_irq *irq_ptr);
+int qdio_allocate_dbf(struct qdio_irq *irq_ptr);
 void qdio_setup_debug_entries(struct qdio_irq *irq_ptr);
 void qdio_shutdown_debug_entries(struct qdio_irq *irq_ptr);
 int qdio_debug_init(void);
index c890848..bcc3ab1 100644 (file)
@@ -1220,27 +1220,21 @@ EXPORT_SYMBOL_GPL(qdio_free);
 
 /**
  * qdio_allocate - allocate qdio queues and associated data
- * @init_data: initialization data
+ * @cdev: associated ccw device
+ * @no_input_qs: allocate this number of Input Queues
+ * @no_output_qs: allocate this number of Output Queues
  */
-int qdio_allocate(struct qdio_initialize *init_data)
+int qdio_allocate(struct ccw_device *cdev, unsigned int no_input_qs,
+                 unsigned int no_output_qs)
 {
-       struct ccw_device *cdev = init_data->cdev;
        struct subchannel_id schid;
        struct qdio_irq *irq_ptr;
 
        ccw_device_get_schid(cdev, &schid);
        DBF_EVENT("qallocate:%4x", schid.sch_no);
 
-       if ((init_data->no_input_qs && !init_data->input_handler) ||
-           (init_data->no_output_qs && !init_data->output_handler))
-               return -EINVAL;
-
-       if ((init_data->no_input_qs > QDIO_MAX_QUEUES_PER_IRQ) ||
-           (init_data->no_output_qs > QDIO_MAX_QUEUES_PER_IRQ))
-               return -EINVAL;
-
-       if ((!init_data->input_sbal_addr_array) ||
-           (!init_data->output_sbal_addr_array))
+       if (no_input_qs > QDIO_MAX_QUEUES_PER_IRQ ||
+           no_output_qs > QDIO_MAX_QUEUES_PER_IRQ)
                return -EINVAL;
 
        /* irq_ptr must be in GFP_DMA since it contains ccw1.cda */
@@ -1250,9 +1244,12 @@ int qdio_allocate(struct qdio_initialize *init_data)
 
        irq_ptr->cdev = cdev;
        mutex_init(&irq_ptr->setup_mutex);
-       if (qdio_allocate_dbf(init_data, irq_ptr))
+       if (qdio_allocate_dbf(irq_ptr))
                goto out_rel;
 
+       DBF_DEV_EVENT(DBF_ERR, irq_ptr, "alloc niq:%1u noq:%1u", no_input_qs,
+                     no_output_qs);
+
        /*
         * Allocate a page for the chsc calls in qdio_establish.
         * Must be pre-allocated since a zfcp recovery will call
@@ -1268,8 +1265,7 @@ int qdio_allocate(struct qdio_initialize *init_data)
        if (!irq_ptr->qdr)
                goto out_rel;
 
-       if (qdio_allocate_qs(irq_ptr, init_data->no_input_qs,
-                            init_data->no_output_qs))
+       if (qdio_allocate_qs(irq_ptr, no_input_qs, no_output_qs))
                goto out_rel;
 
        INIT_LIST_HEAD(&irq_ptr->entry);
@@ -1305,13 +1301,33 @@ static void qdio_detect_hsicq(struct qdio_irq *irq_ptr)
        DBF_EVENT("use_cq:%d", use_cq);
 }
 
+static void qdio_trace_init_data(struct qdio_irq *irq,
+                                struct qdio_initialize *data)
+{
+       DBF_DEV_EVENT(DBF_ERR, irq, "qfmt:%1u", data->q_format);
+       DBF_DEV_HEX(irq, data->adapter_name, 8, DBF_ERR);
+       DBF_DEV_EVENT(DBF_ERR, irq, "qpff%4x", data->qib_param_field_format);
+       DBF_DEV_HEX(irq, &data->qib_param_field, sizeof(void *), DBF_ERR);
+       DBF_DEV_HEX(irq, &data->input_slib_elements, sizeof(void *), DBF_ERR);
+       DBF_DEV_HEX(irq, &data->output_slib_elements, sizeof(void *), DBF_ERR);
+       DBF_DEV_EVENT(DBF_ERR, irq, "niq:%1u noq:%1u", data->no_input_qs,
+                     data->no_output_qs);
+       DBF_DEV_HEX(irq, &data->input_handler, sizeof(void *), DBF_ERR);
+       DBF_DEV_HEX(irq, &data->output_handler, sizeof(void *), DBF_ERR);
+       DBF_DEV_HEX(irq, &data->int_parm, sizeof(long), DBF_ERR);
+       DBF_DEV_HEX(irq, &data->input_sbal_addr_array, sizeof(void *), DBF_ERR);
+       DBF_DEV_HEX(irq, &data->output_sbal_addr_array, sizeof(void *),
+                   DBF_ERR);
+}
+
 /**
  * qdio_establish - establish queues on a qdio subchannel
+ * @cdev: associated ccw device
  * @init_data: initialization data
  */
-int qdio_establish(struct qdio_initialize *init_data)
+int qdio_establish(struct ccw_device *cdev,
+                  struct qdio_initialize *init_data)
 {
-       struct ccw_device *cdev = init_data->cdev;
        struct qdio_irq *irq_ptr = cdev->private->qdio_data;
        struct subchannel_id schid;
        int rc;
@@ -1322,7 +1338,16 @@ int qdio_establish(struct qdio_initialize *init_data)
        if (!irq_ptr)
                return -ENODEV;
 
+       if ((init_data->no_input_qs && !init_data->input_handler) ||
+           (init_data->no_output_qs && !init_data->output_handler))
+               return -EINVAL;
+
+       if (!init_data->input_sbal_addr_array ||
+           !init_data->output_sbal_addr_array)
+               return -EINVAL;
+
        mutex_lock(&irq_ptr->setup_mutex);
+       qdio_trace_init_data(irq_ptr, init_data);
        qdio_setup_irq(irq_ptr, init_data);
 
        rc = qdio_establish_thinint(irq_ptr);
@@ -1618,8 +1643,6 @@ int qdio_start_irq(struct ccw_device *cdev)
        if (!irq_ptr)
                return -ENODEV;
 
-       clear_nonshared_ind(irq_ptr);
-
        for_each_input_queue(irq_ptr, q, i)
                qdio_stop_polling(q);
 
index bbbefc9..3083edd 100644 (file)
@@ -213,8 +213,6 @@ static void setup_queues(struct qdio_irq *irq_ptr,
                         struct qdio_initialize *qdio_init)
 {
        struct qdio_q *q;
-       struct qdio_buffer **input_sbal_array = qdio_init->input_sbal_addr_array;
-       struct qdio_buffer **output_sbal_array = qdio_init->output_sbal_addr_array;
        struct qdio_outbuf_state *output_sbal_state_array =
                                  qdio_init->output_sbal_state_array;
        int i;
@@ -225,8 +223,8 @@ static void setup_queues(struct qdio_irq *irq_ptr,
 
                q->is_input_q = 1;
 
-               setup_storage_lists(q, irq_ptr, input_sbal_array, i);
-               input_sbal_array += QDIO_MAX_BUFFERS_PER_Q;
+               setup_storage_lists(q, irq_ptr,
+                                   qdio_init->input_sbal_addr_array[i], i);
 
                if (is_thinint_irq(irq_ptr)) {
                        tasklet_init(&q->tasklet, tiqdio_inbound_processing,
@@ -245,8 +243,8 @@ static void setup_queues(struct qdio_irq *irq_ptr,
                output_sbal_state_array += QDIO_MAX_BUFFERS_PER_Q;
 
                q->is_input_q = 0;
-               setup_storage_lists(q, irq_ptr, output_sbal_array, i);
-               output_sbal_array += QDIO_MAX_BUFFERS_PER_Q;
+               setup_storage_lists(q, irq_ptr,
+                                   qdio_init->output_sbal_addr_array[i], i);
 
                tasklet_init(&q->tasklet, qdio_outbound_processing,
                             (unsigned long) q);
index ea09df7..ae50373 100644 (file)
@@ -82,36 +82,16 @@ void tiqdio_remove_device(struct qdio_irq *irq_ptr)
        INIT_LIST_HEAD(&irq_ptr->entry);
 }
 
-static inline int has_multiple_inq_on_dsci(struct qdio_irq *irq_ptr)
-{
-       return irq_ptr->nr_input_qs > 1;
-}
-
 static inline int references_shared_dsci(struct qdio_irq *irq_ptr)
 {
        return irq_ptr->dsci == &q_indicators[TIQDIO_SHARED_IND].ind;
 }
 
-static inline int shared_ind(struct qdio_irq *irq_ptr)
-{
-       return references_shared_dsci(irq_ptr) ||
-               has_multiple_inq_on_dsci(irq_ptr);
-}
-
-void clear_nonshared_ind(struct qdio_irq *irq_ptr)
-{
-       if (!is_thinint_irq(irq_ptr))
-               return;
-       if (shared_ind(irq_ptr))
-               return;
-       xchg(irq_ptr->dsci, 0);
-}
-
 int test_nonshared_ind(struct qdio_irq *irq_ptr)
 {
        if (!is_thinint_irq(irq_ptr))
                return 0;
-       if (shared_ind(irq_ptr))
+       if (references_shared_dsci(irq_ptr))
                return 0;
        if (*irq_ptr->dsci)
                return 1;
@@ -131,8 +111,7 @@ static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq)
        struct qdio_q *q;
        int i;
 
-       if (!references_shared_dsci(irq) &&
-           has_multiple_inq_on_dsci(irq))
+       if (!references_shared_dsci(irq))
                xchg(irq->dsci, 0);
 
        if (irq->irq_poll) {
@@ -145,9 +124,6 @@ static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq)
        }
 
        for_each_input_queue(irq, q, i) {
-               if (!shared_ind(irq))
-                       xchg(irq->dsci, 0);
-
                /*
                 * Call inbound processing but not directly
                 * since that could starve other thinint queues.
index e401a3d..339a6bc 100644 (file)
@@ -167,6 +167,11 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
        if (ret)
                goto out_disable;
 
+       if (dev_get_uevent_suppress(&sch->dev)) {
+               dev_set_uevent_suppress(&sch->dev, 0);
+               kobject_uevent(&sch->dev.kobj, KOBJ_ADD);
+       }
+
        VFIO_CCW_MSG_EVENT(4, "bound to subchannel %x.%x.%04x\n",
                           sch->schid.cssid, sch->schid.ssid,
                           sch->schid.sch_no);
index acda230..e0b2631 100644 (file)
@@ -181,11 +181,12 @@ struct qeth_vnicc_info {
 /*****************************************************************************/
 /* QDIO queue and buffer handling                                            */
 /*****************************************************************************/
-#define QETH_MAX_QUEUES 4
+#define QETH_MAX_OUT_QUEUES    4
 #define QETH_IQD_MIN_TXQ       2       /* One for ucast, one for mcast. */
 #define QETH_IQD_MCAST_TXQ     0
 #define QETH_IQD_MIN_UCAST_TXQ 1
 
+#define QETH_MAX_IN_QUEUES     2
 #define QETH_RX_COPYBREAK      (PAGE_SIZE >> 1)
 #define QETH_IN_BUF_SIZE_DEFAULT 65536
 #define QETH_IN_BUF_COUNT_DEFAULT 64
@@ -539,7 +540,7 @@ struct qeth_qdio_info {
 
        /* output */
        int no_out_queues;
-       struct qeth_qdio_out_q *out_qs[QETH_MAX_QUEUES];
+       struct qeth_qdio_out_q *out_qs[QETH_MAX_OUT_QUEUES];
        struct qdio_outbuf_state *out_bufstates;
 
        /* priority queueing */
index 24fd17b..f768946 100644 (file)
@@ -4812,28 +4812,13 @@ out:
        return;
 }
 
-static void qeth_qdio_establish_cq(struct qeth_card *card,
-                                  struct qdio_buffer **in_sbal_ptrs)
-{
-       int i;
-
-       if (card->options.cq == QETH_CQ_ENABLED) {
-               int offset = QDIO_MAX_BUFFERS_PER_Q *
-                            (card->qdio.no_in_queues - 1);
-
-               for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++)
-                       in_sbal_ptrs[offset + i] =
-                               card->qdio.c_q->bufs[i].buffer;
-       }
-}
-
 static int qeth_qdio_establish(struct qeth_card *card)
 {
+       struct qdio_buffer **out_sbal_ptrs[QETH_MAX_OUT_QUEUES];
+       struct qdio_buffer **in_sbal_ptrs[QETH_MAX_IN_QUEUES];
        struct qdio_initialize init_data;
        char *qib_param_field;
-       struct qdio_buffer **in_sbal_ptrs;
-       struct qdio_buffer **out_sbal_ptrs;
-       int i, j, k;
+       unsigned int i;
        int rc = 0;
 
        QETH_CARD_TEXT(card, 2, "qdioest");
@@ -4847,35 +4832,14 @@ static int qeth_qdio_establish(struct qeth_card *card)
        qeth_create_qib_param_field(card, qib_param_field);
        qeth_create_qib_param_field_blkt(card, qib_param_field);
 
-       in_sbal_ptrs = kcalloc(card->qdio.no_in_queues * QDIO_MAX_BUFFERS_PER_Q,
-                              sizeof(void *),
-                              GFP_KERNEL);
-       if (!in_sbal_ptrs) {
-               rc = -ENOMEM;
-               goto out_free_qib_param;
-       }
-
-       for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++)
-               in_sbal_ptrs[i] = card->qdio.in_q->bufs[i].buffer;
-
-       qeth_qdio_establish_cq(card, in_sbal_ptrs);
-
-       out_sbal_ptrs =
-               kcalloc(card->qdio.no_out_queues * QDIO_MAX_BUFFERS_PER_Q,
-                       sizeof(void *),
-                       GFP_KERNEL);
-       if (!out_sbal_ptrs) {
-               rc = -ENOMEM;
-               goto out_free_in_sbals;
-       }
+       in_sbal_ptrs[0] = card->qdio.in_q->qdio_bufs;
+       if (card->options.cq == QETH_CQ_ENABLED)
+               in_sbal_ptrs[1] = card->qdio.c_q->qdio_bufs;
 
-       for (i = 0, k = 0; i < card->qdio.no_out_queues; ++i)
-               for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++, k++)
-                       out_sbal_ptrs[k] =
-                               card->qdio.out_qs[i]->bufs[j]->buffer;
+       for (i = 0; i < card->qdio.no_out_queues; i++)
+               out_sbal_ptrs[i] = card->qdio.out_qs[i]->qdio_bufs;
 
        memset(&init_data, 0, sizeof(struct qdio_initialize));
-       init_data.cdev                   = CARD_DDEV(card);
        init_data.q_format               = IS_IQD(card) ? QDIO_IQDIO_QFMT :
                                                          QDIO_QETH_QFMT;
        init_data.qib_param_field_format = 0;
@@ -4893,12 +4857,13 @@ static int qeth_qdio_establish(struct qeth_card *card)
 
        if (atomic_cmpxchg(&card->qdio.state, QETH_QDIO_ALLOCATED,
                QETH_QDIO_ESTABLISHED) == QETH_QDIO_ALLOCATED) {
-               rc = qdio_allocate(&init_data);
+               rc = qdio_allocate(CARD_DDEV(card), init_data.no_input_qs,
+                                  init_data.no_output_qs);
                if (rc) {
                        atomic_set(&card->qdio.state, QETH_QDIO_ALLOCATED);
                        goto out;
                }
-               rc = qdio_establish(&init_data);
+               rc = qdio_establish(CARD_DDEV(card), &init_data);
                if (rc) {
                        atomic_set(&card->qdio.state, QETH_QDIO_ALLOCATED);
                        qdio_free(CARD_DDEV(card));
@@ -4916,10 +4881,6 @@ static int qeth_qdio_establish(struct qeth_card *card)
                break;
        }
 out:
-       kfree(out_sbal_ptrs);
-out_free_in_sbals:
-       kfree(in_sbal_ptrs);
-out_free_qib_param:
        kfree(qib_param_field);
 out_free_nothing:
        return rc;
@@ -5985,7 +5946,7 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card)
        switch (card->info.type) {
        case QETH_CARD_TYPE_IQD:
                dev = alloc_netdev_mqs(sizeof(*priv), "hsi%d", NET_NAME_UNKNOWN,
-                                      ether_setup, QETH_MAX_QUEUES, 1);
+                                      ether_setup, QETH_MAX_OUT_QUEUES, 1);
                break;
        case QETH_CARD_TYPE_OSM:
                dev = alloc_etherdev(sizeof(*priv));
@@ -5995,7 +5956,7 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card)
                                   ether_setup);
                break;
        default:
-               dev = alloc_etherdev_mqs(sizeof(*priv), QETH_MAX_QUEUES, 1);
+               dev = alloc_etherdev_mqs(sizeof(*priv), QETH_MAX_OUT_QUEUES, 1);
        }
 
        if (!dev)
index f0d6296..26702b5 100644 (file)
@@ -277,29 +277,6 @@ int zfcp_qdio_send(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req)
        return 0;
 }
 
-
-static void zfcp_qdio_setup_init_data(struct qdio_initialize *id,
-                                     struct zfcp_qdio *qdio)
-{
-       memset(id, 0, sizeof(*id));
-       id->cdev = qdio->adapter->ccw_device;
-       id->q_format = QDIO_ZFCP_QFMT;
-       memcpy(id->adapter_name, dev_name(&id->cdev->dev), 8);
-       ASCEBC(id->adapter_name, 8);
-       id->qib_rflags = QIB_RFLAGS_ENABLE_DATA_DIV;
-       if (enable_multibuffer)
-               id->qdr_ac |= QDR_AC_MULTI_BUFFER_ENABLE;
-       id->no_input_qs = 1;
-       id->no_output_qs = 1;
-       id->input_handler = zfcp_qdio_int_resp;
-       id->output_handler = zfcp_qdio_int_req;
-       id->int_parm = (unsigned long) qdio;
-       id->input_sbal_addr_array = qdio->res_q;
-       id->output_sbal_addr_array = qdio->req_q;
-       id->scan_threshold =
-               QDIO_MAX_BUFFERS_PER_Q - ZFCP_QDIO_MAX_SBALS_PER_REQ * 2;
-}
-
 /**
  * zfcp_qdio_allocate - allocate queue memory and initialize QDIO data
  * @qdio: pointer to struct zfcp_qdio
@@ -308,7 +285,6 @@ static void zfcp_qdio_setup_init_data(struct qdio_initialize *id,
  */
 static int zfcp_qdio_allocate(struct zfcp_qdio *qdio)
 {
-       struct qdio_initialize init_data;
        int ret;
 
        ret = qdio_alloc_buffers(qdio->req_q, QDIO_MAX_BUFFERS_PER_Q);
@@ -319,10 +295,9 @@ static int zfcp_qdio_allocate(struct zfcp_qdio *qdio)
        if (ret)
                goto free_req_q;
 
-       zfcp_qdio_setup_init_data(&init_data, qdio);
        init_waitqueue_head(&qdio->req_q_wq);
 
-       ret = qdio_allocate(&init_data);
+       ret = qdio_allocate(qdio->adapter->ccw_device, 1, 1);
        if (ret)
                goto free_res_q;
 
@@ -374,8 +349,10 @@ void zfcp_qdio_close(struct zfcp_qdio *qdio)
  */
 int zfcp_qdio_open(struct zfcp_qdio *qdio)
 {
+       struct qdio_buffer **input_sbals[1] = {qdio->res_q};
+       struct qdio_buffer **output_sbals[1] = {qdio->req_q};
        struct qdio_buffer_element *sbale;
-       struct qdio_initialize init_data;
+       struct qdio_initialize init_data = {0};
        struct zfcp_adapter *adapter = qdio->adapter;
        struct ccw_device *cdev = adapter->ccw_device;
        struct qdio_ssqd_desc ssqd;
@@ -387,12 +364,26 @@ int zfcp_qdio_open(struct zfcp_qdio *qdio)
        atomic_andnot(ZFCP_STATUS_ADAPTER_SIOSL_ISSUED,
                          &qdio->adapter->status);
 
-       zfcp_qdio_setup_init_data(&init_data, qdio);
+       init_data.q_format = QDIO_ZFCP_QFMT;
+       memcpy(init_data.adapter_name, dev_name(&cdev->dev), 8);
+       ASCEBC(init_data.adapter_name, 8);
+       init_data.qib_rflags = QIB_RFLAGS_ENABLE_DATA_DIV;
+       if (enable_multibuffer)
+               init_data.qdr_ac |= QDR_AC_MULTI_BUFFER_ENABLE;
+       init_data.no_input_qs = 1;
+       init_data.no_output_qs = 1;
+       init_data.input_handler = zfcp_qdio_int_resp;
+       init_data.output_handler = zfcp_qdio_int_req;
+       init_data.int_parm = (unsigned long) qdio;
+       init_data.input_sbal_addr_array = input_sbals;
+       init_data.output_sbal_addr_array = output_sbals;
+       init_data.scan_threshold =
+               QDIO_MAX_BUFFERS_PER_Q - ZFCP_QDIO_MAX_SBALS_PER_REQ * 2;
 
-       if (qdio_establish(&init_data))
+       if (qdio_establish(cdev, &init_data))
                goto failed_establish;
 
-       if (qdio_get_ssqd_desc(init_data.cdev, &ssqd))
+       if (qdio_get_ssqd_desc(cdev, &ssqd))
                goto failed_qdio;
 
        if (ssqd.qdioac2 & CHSC_AC2_DATA_DIV_ENABLED)
index f6c8963..db4a04a 100644 (file)
@@ -1985,8 +1985,6 @@ out_unlock:
 
 /* Declare and initialization an instance of the FC NVME template. */
 static struct nvme_fc_port_template lpfc_nvme_template = {
-       .module = THIS_MODULE,
-
        /* initiator-based functions */
        .localport_delete  = lpfc_nvme_localport_delete,
        .remoteport_delete = lpfc_nvme_remoteport_delete,
index 84e2a98..4886d24 100644 (file)
@@ -610,7 +610,6 @@ static void qla_nvme_remoteport_delete(struct nvme_fc_remote_port *rport)
 }
 
 static struct nvme_fc_port_template qla_nvme_fc_transport = {
-       .module = THIS_MODULE,
        .localport_delete = qla_nvme_localport_delete,
        .remoteport_delete = qla_nvme_remoteport_delete,
        .create_queue   = qla_nvme_alloc_queue,
index 1778f8c..425ab6f 100644 (file)
@@ -22,5 +22,6 @@ source "drivers/soc/ux500/Kconfig"
 source "drivers/soc/versatile/Kconfig"
 source "drivers/soc/xilinx/Kconfig"
 source "drivers/soc/zte/Kconfig"
+source "drivers/soc/kendryte/Kconfig"
 
 endmenu
index a39f17c..36452be 100644 (file)
@@ -28,3 +28,4 @@ obj-$(CONFIG_ARCH_U8500)      += ux500/
 obj-$(CONFIG_PLAT_VERSATILE)   += versatile/
 obj-y                          += xilinx/
 obj-$(CONFIG_ARCH_ZX)          += zte/
+obj-$(CONFIG_SOC_KENDRYTE)     += kendryte/
diff --git a/drivers/soc/kendryte/Kconfig b/drivers/soc/kendryte/Kconfig
new file mode 100644 (file)
index 0000000..49785b1
--- /dev/null
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+if SOC_KENDRYTE
+
+config K210_SYSCTL
+       bool "Kendryte K210 system controller"
+       default y
+       depends on RISCV
+       help
+         Enables control of the various K210 clocks and general-purpose
+         use of the extra 2MB of SRAM normally reserved for the AI
+         engine.
+
+endif
diff --git a/drivers/soc/kendryte/Makefile b/drivers/soc/kendryte/Makefile
new file mode 100644 (file)
index 0000000..002d9ce
--- /dev/null
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_K210_SYSCTL)      += k210-sysctl.o
diff --git a/drivers/soc/kendryte/k210-sysctl.c b/drivers/soc/kendryte/k210-sysctl.c
new file mode 100644 (file)
index 0000000..4608fbc
--- /dev/null
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2019 Christoph Hellwig.
+ * Copyright (c) 2019 Western Digital Corporation or its affiliates.
+ */
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/clk-provider.h>
+#include <linux/clkdev.h>
+#include <linux/bitfield.h>
+#include <asm/soc.h>
+
+#define K210_SYSCTL_CLK0_FREQ          26000000UL
+
+/* Registers base address */
+#define K210_SYSCTL_SYSCTL_BASE_ADDR   0x50440000ULL
+
+/* Registers */
+#define K210_SYSCTL_PLL0               0x08
+#define K210_SYSCTL_PLL1               0x0c
+/* clkr: 4 bits, clkf: 6 bits, clkod: 4 bits, bwadj: 6 bits */
+#define   PLL_RESET            (1 << 20)
+#define   PLL_PWR              (1 << 21)
+#define   PLL_INTFB            (1 << 22)
+#define   PLL_BYPASS           (1 << 23)
+#define   PLL_TEST             (1 << 24)
+#define   PLL_OUT_EN           (1 << 25)
+#define   PLL_TEST_EN          (1 << 26)
+#define K210_SYSCTL_PLL_LOCK           0x18
+#define   PLL0_LOCK1           (1 << 0)
+#define   PLL0_LOCK2           (1 << 1)
+#define   PLL0_SLIP_CLEAR      (1 << 2)
+#define   PLL0_TEST_CLK_OUT    (1 << 3)
+#define   PLL1_LOCK1           (1 << 8)
+#define   PLL1_LOCK2           (1 << 9)
+#define   PLL1_SLIP_CLEAR      (1 << 10)
+#define   PLL1_TEST_CLK_OUT    (1 << 11)
+#define   PLL2_LOCK1           (1 << 16)
+#define   PLL2_LOCK2           (1 << 17)
+#define   PLL2_SLIP_CLEAR      (1 << 18)
+#define   PLL2_TEST_CLK_OUT    (1 << 19)
+#define K210_SYSCTL_CLKSEL0    0x20
+#define   CLKSEL_ACLK          (1 << 0)
+#define K210_SYSCTL_CLKEN_CENT         0x28
+#define   CLKEN_CPU            (1 << 0)
+#define   CLKEN_SRAM0          (1 << 1)
+#define   CLKEN_SRAM1          (1 << 2)
+#define   CLKEN_APB0           (1 << 3)
+#define   CLKEN_APB1           (1 << 4)
+#define   CLKEN_APB2           (1 << 5)
+#define K210_SYSCTL_CLKEN_PERI         0x2c
+#define   CLKEN_ROM            (1 << 0)
+#define   CLKEN_DMA            (1 << 1)
+#define   CLKEN_AI             (1 << 2)
+#define   CLKEN_DVP            (1 << 3)
+#define   CLKEN_FFT            (1 << 4)
+#define   CLKEN_GPIO           (1 << 5)
+#define   CLKEN_SPI0           (1 << 6)
+#define   CLKEN_SPI1           (1 << 7)
+#define   CLKEN_SPI2           (1 << 8)
+#define   CLKEN_SPI3           (1 << 9)
+#define   CLKEN_I2S0           (1 << 10)
+#define   CLKEN_I2S1           (1 << 11)
+#define   CLKEN_I2S2           (1 << 12)
+#define   CLKEN_I2C0           (1 << 13)
+#define   CLKEN_I2C1           (1 << 14)
+#define   CLKEN_I2C2           (1 << 15)
+#define   CLKEN_UART1          (1 << 16)
+#define   CLKEN_UART2          (1 << 17)
+#define   CLKEN_UART3          (1 << 18)
+#define   CLKEN_AES            (1 << 19)
+#define   CLKEN_FPIO           (1 << 20)
+#define   CLKEN_TIMER0         (1 << 21)
+#define   CLKEN_TIMER1         (1 << 22)
+#define   CLKEN_TIMER2         (1 << 23)
+#define   CLKEN_WDT0           (1 << 24)
+#define   CLKEN_WDT1           (1 << 25)
+#define   CLKEN_SHA            (1 << 26)
+#define   CLKEN_OTP            (1 << 27)
+#define   CLKEN_RTC            (1 << 29)
+
+struct k210_sysctl {
+       void __iomem            *regs;
+       struct clk_hw           hw;
+};
+
+static void k210_set_bits(u32 val, void __iomem *reg)
+{
+       writel(readl(reg) | val, reg);
+}
+
+static void k210_clear_bits(u32 val, void __iomem *reg)
+{
+       writel(readl(reg) & ~val, reg);
+}
+
+static void k210_pll1_enable(void __iomem *regs)
+{
+       u32 val;
+
+       val = readl(regs + K210_SYSCTL_PLL1);
+       val &= ~GENMASK(19, 0);                         /* clkr1 = 0 */
+       val |= FIELD_PREP(GENMASK(9, 4), 0x3B);         /* clkf1 = 59 */
+       val |= FIELD_PREP(GENMASK(13, 10), 0x3);        /* clkod1 = 3 */
+       val |= FIELD_PREP(GENMASK(19, 14), 0x3B);       /* bwadj1 = 59 */
+       writel(val, regs + K210_SYSCTL_PLL1);
+
+       k210_clear_bits(PLL_BYPASS, regs + K210_SYSCTL_PLL1);
+       k210_set_bits(PLL_PWR, regs + K210_SYSCTL_PLL1);
+
+       /*
+        * Reset the pll. The magic NOPs come from the Kendryte reference SDK.
+        */
+       k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
+       k210_set_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
+       nop();
+       nop();
+       k210_clear_bits(PLL_RESET, regs + K210_SYSCTL_PLL1);
+
+       for (;;) {
+               val = readl(regs + K210_SYSCTL_PLL_LOCK);
+               if (val & PLL1_LOCK2)
+                       break;
+               writel(val | PLL1_SLIP_CLEAR, regs + K210_SYSCTL_PLL_LOCK);
+       }
+
+       k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL1);
+}
+
+static unsigned long k210_sysctl_clk_recalc_rate(struct clk_hw *hw,
+               unsigned long parent_rate)
+{
+       struct k210_sysctl *s = container_of(hw, struct k210_sysctl, hw);
+       u32 clksel0, pll0;
+       u64 pll0_freq, clkr0, clkf0, clkod0;
+
+       /*
+        * If the clock selector is not set, use the base frequency.
+        * Otherwise, use PLL0 frequency with a frequency divisor.
+        */
+       clksel0 = readl(s->regs + K210_SYSCTL_CLKSEL0);
+       if (!(clksel0 & CLKSEL_ACLK))
+               return K210_SYSCTL_CLK0_FREQ;
+
+       /*
+        * Get PLL0 frequency:
+        * freq = base frequency * clkf0 / (clkr0 * clkod0)
+        */
+       pll0 = readl(s->regs + K210_SYSCTL_PLL0);
+       clkr0 = 1 + FIELD_GET(GENMASK(3, 0), pll0);
+       clkf0 = 1 + FIELD_GET(GENMASK(9, 4), pll0);
+       clkod0 = 1 + FIELD_GET(GENMASK(13, 10), pll0);
+       pll0_freq = clkf0 * K210_SYSCTL_CLK0_FREQ / (clkr0 * clkod0);
+
+       /* Get the frequency divisor from the clock selector */
+       return pll0_freq / (2ULL << FIELD_GET(0x00000006, clksel0));
+}
+
+static const struct clk_ops k210_sysctl_clk_ops = {
+       .recalc_rate    = k210_sysctl_clk_recalc_rate,
+};
+
+static const struct clk_init_data k210_clk_init_data = {
+       .name           = "k210-sysctl-pll1",
+       .ops            = &k210_sysctl_clk_ops,
+};
+
+static int k210_sysctl_probe(struct platform_device *pdev)
+{
+       struct k210_sysctl *s;
+       int error;
+
+       pr_info("Kendryte K210 SoC sysctl\n");
+
+       s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       s->regs = devm_ioremap_resource(&pdev->dev,
+                       platform_get_resource(pdev, IORESOURCE_MEM, 0));
+       if (IS_ERR(s->regs))
+               return PTR_ERR(s->regs);
+
+       s->hw.init = &k210_clk_init_data;
+       error = devm_clk_hw_register(&pdev->dev, &s->hw);
+       if (error) {
+               dev_err(&pdev->dev, "failed to register clk");
+               return error;
+       }
+
+       error = devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get,
+                                           &s->hw);
+       if (error) {
+               dev_err(&pdev->dev, "adding clk provider failed\n");
+               return error;
+       }
+
+       return 0;
+}
+
+static const struct of_device_id k210_sysctl_of_match[] = {
+       { .compatible = "kendryte,k210-sysctl", },
+       {}
+};
+
+static struct platform_driver k210_sysctl_driver = {
+       .driver = {
+               .name           = "k210-sysctl",
+               .of_match_table = k210_sysctl_of_match,
+       },
+       .probe                  = k210_sysctl_probe,
+};
+
+static int __init k210_sysctl_init(void)
+{
+       return platform_driver_register(&k210_sysctl_driver);
+}
+core_initcall(k210_sysctl_init);
+
+/*
+ * This needs to be called very early during initialization, given that
+ * PLL1 needs to be enabled to be able to use all SRAM.
+ */
+static void __init k210_soc_early_init(const void *fdt)
+{
+       void __iomem *regs;
+
+       regs = ioremap(K210_SYSCTL_SYSCTL_BASE_ADDR, 0x1000);
+       if (!regs)
+               panic("K210 sysctl ioremap");
+
+       /* Enable PLL1 to make the KPU SRAM usable */
+       k210_pll1_enable(regs);
+
+       k210_set_bits(PLL_OUT_EN, regs + K210_SYSCTL_PLL0);
+
+       k210_set_bits(CLKEN_CPU | CLKEN_SRAM0 | CLKEN_SRAM1,
+                     regs + K210_SYSCTL_CLKEN_CENT);
+       k210_set_bits(CLKEN_ROM | CLKEN_TIMER0 | CLKEN_RTC,
+                     regs + K210_SYSCTL_CLKEN_PERI);
+
+       k210_set_bits(CLKSEL_ACLK, regs + K210_SYSCTL_CLKSEL0);
+
+       iounmap(regs);
+}
+SOC_EARLY_INIT_DECLARE(generic_k210, "kendryte,k210", k210_soc_early_init);
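
k210_sysctl_clk_recalc_rate() above derives the PLL0 rate as base * clkf0 / (clkr0 * clkod0) and then divides by 2 << sel from CLKSEL0. A stand-alone sketch with made-up field values (not read from hardware) walks through the arithmetic:

/* Worked example of the PLL0/ACLK math in k210_sysctl_clk_recalc_rate(). */
#include <stdio.h>
#include <stdint.h>

#define K210_SYSCTL_CLK0_FREQ	26000000ULL

int main(void)
{
	/* Assumed decoded PLL0 fields: clkr0 = 1, clkf0 = 60, clkod0 = 2. */
	uint64_t clkr0 = 1, clkf0 = 60, clkod0 = 2;
	uint64_t pll0 = clkf0 * K210_SYSCTL_CLK0_FREQ / (clkr0 * clkod0);

	/* Assumed CLKSEL0 divisor field of 0, which selects a divide-by-2. */
	unsigned int sel = 0;
	uint64_t aclk = pll0 / (2ULL << sel);

	/* Prints 780000000 Hz for PLL0 and 390000000 Hz for ACLK. */
	printf("pll0 = %llu Hz, aclk = %llu Hz\n",
	       (unsigned long long)pll0, (unsigned long long)aclk);
	return 0;
}
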
index 5a05db5..91af271 100644 (file)
@@ -1,17 +1,18 @@
 # SPDX-License-Identifier: GPL-2.0-only
 #
-# Generic thermal sysfs drivers configuration
+# Generic thermal drivers configuration
 #
 
 menuconfig THERMAL
-       bool "Generic Thermal sysfs driver"
+       bool "Thermal drivers"
        help
-         Generic Thermal Sysfs driver offers a generic mechanism for
+         Thermal drivers offer a generic mechanism for
          thermal management. Usually it's made up of one or more thermal
-         zone and cooling device.
+         zones and cooling devices.
          Each thermal zone contains its own temperature, trip points,
-         cooling devices.
-         All platforms with ACPI thermal support can use this driver.
+         and cooling devices.
+         All platforms with ACPI or Open Firmware thermal support can use
+         this driver.
          If you want this support, you should say Y here.
 
 if THERMAL
@@ -251,6 +252,27 @@ config IMX_THERMAL
          cpufreq is used as the cooling device to throttle CPUs when the
          passive trip is crossed.
 
+config IMX_SC_THERMAL
+       tristate "Temperature sensor driver for NXP i.MX SoCs with System Controller"
+       depends on IMX_SCU
+       depends on OF
+       help
+         Support for the Temperature Monitor (TEMPMON) found on NXP i.MX SoCs
+         that contain a system controller; the kernel communicates with the
+         system controller via MU (message unit) IPC to read the thermal
+         sensors. Each sensor supports one critical trip point and one
+         passive trip point.
+
+config IMX8MM_THERMAL
+       tristate "Temperature sensor driver for Freescale i.MX8MM SoC"
+       depends on ARCH_MXC || COMPILE_TEST
+       depends on OF
+       help
+         Support for Thermal Monitoring Unit (TMU) found on Freescale i.MX8MM SoC.
+         It supports one critical trip point and one passive trip point. The
+         cpufreq is used as the cooling device to throttle CPUs when the passive
+         trip is crossed.
+
 config MAX77620_THERMAL
        tristate "Temperature sensor driver for Maxim MAX77620 PMIC"
        depends on MFD_MAX77620
@@ -265,6 +287,7 @@ config QORIQ_THERMAL
        tristate "QorIQ Thermal Monitoring Unit"
        depends on THERMAL_OF
        depends on HAS_IOMEM
+       select REGMAP_MMIO
        help
          Support for Thermal Monitoring Unit (TMU) found on QorIQ platforms.
          It supports one critical trip point and one passive trip point. The
@@ -460,4 +483,11 @@ config UNIPHIER_THERMAL
          Enable this to plug in UniPhier on-chip PVT thermal driver into the
          thermal framework. The driver supports CPU thermal zone temperature
          reporting and a couple of trip points.
+
+config SPRD_THERMAL
+       tristate "Temperature sensor on Spreadtrum SoCs"
+       depends on ARCH_SPRD || COMPILE_TEST
+       help
+         Support for the Spreadtrum thermal sensor driver in the Linux thermal
+         framework.
 endif
index 9fb88e2..8c8ed7b 100644 (file)
@@ -43,6 +43,8 @@ obj-$(CONFIG_DB8500_THERMAL)  += db8500_thermal.o
 obj-$(CONFIG_ARMADA_THERMAL)   += armada_thermal.o
 obj-$(CONFIG_TANGO_THERMAL)    += tango_thermal.o
 obj-$(CONFIG_IMX_THERMAL)      += imx_thermal.o
+obj-$(CONFIG_IMX_SC_THERMAL)   += imx_sc_thermal.o
+obj-$(CONFIG_IMX8MM_THERMAL)   += imx8mm_thermal.o
 obj-$(CONFIG_MAX77620_THERMAL) += max77620_thermal.o
 obj-$(CONFIG_QORIQ_THERMAL)    += qoriq_thermal.o
 obj-$(CONFIG_DA9062_THERMAL)   += da9062-thermal.o
@@ -57,3 +59,4 @@ obj-$(CONFIG_GENERIC_ADC_THERMAL)     += thermal-generic-adc.o
 obj-$(CONFIG_ZX2967_THERMAL)   += zx2967_thermal.o
 obj-$(CONFIG_UNIPHIER_THERMAL) += uniphier_thermal.o
 obj-$(CONFIG_AMLOGIC_THERMAL)     += amlogic_thermal.o
+obj-$(CONFIG_SPRD_THERMAL)     += sprd_thermal.o
index 4ae8c85..e297e13 100644 (file)
@@ -273,7 +273,7 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
        struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 
        /* Request state should be less than max_level */
-       if (WARN_ON(state > cpufreq_cdev->max_level))
+       if (state > cpufreq_cdev->max_level)
                return -EINVAL;
 
        num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
@@ -437,7 +437,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
        int ret;
 
        /* Request state should be less than max_level */
-       if (WARN_ON(state > cpufreq_cdev->max_level))
+       if (state > cpufreq_cdev->max_level)
                return -EINVAL;
 
        /* Check if the old cooling action is same as new cooling action */
@@ -456,6 +456,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
                capacity = frequency * max_capacity;
                capacity /= cpufreq_cdev->policy->cpuinfo.max_freq;
                arch_set_thermal_pressure(cpus, max_capacity - capacity);
+               ret = 0;
        }
 
        return ret;
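
The ret = 0 added above belongs to the branch that also reports thermal pressure as max_capacity - frequency * max_capacity / cpuinfo.max_freq. A stand-alone sketch with invented numbers shows the scaling:

/* Worked example of the thermal-pressure scaling in cpufreq_set_cur_state(). */
#include <stdio.h>

int main(void)
{
	unsigned long max_capacity = 1024;	/* full capacity of the CPUs   */
	unsigned long max_freq = 2000000;	/* cpuinfo.max_freq, in kHz    */
	unsigned long frequency = 1500000;	/* frequency for the new state */

	unsigned long capacity = frequency * max_capacity / max_freq;

	/* capacity = 768, so 256 capacity units are reported as pressure. */
	printf("capacity = %lu, thermal pressure = %lu\n",
	       capacity, max_capacity - capacity);
	return 0;
}
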
diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c
new file mode 100644 (file)
index 0000000..0d60f8d
--- /dev/null
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 NXP.
+ *
+ * Author: Anson Huang <Anson.Huang@nxp.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define TER                    0x0     /* TMU enable */
+#define TPS                    0x4
+#define TRITSR                 0x20    /* TMU immediate temp */
+
+#define TER_EN                 BIT(31)
+#define TRITSR_TEMP0_VAL_MASK  0xff
+#define TRITSR_TEMP1_VAL_MASK  0xff0000
+
+#define PROBE_SEL_ALL          GENMASK(31, 30)
+
+#define probe_status_offset(x) (30 + x)
+#define SIGN_BIT               BIT(7)
+#define TEMP_VAL_MASK          GENMASK(6, 0)
+
+#define VER1_TEMP_LOW_LIMIT    10000
+#define VER2_TEMP_LOW_LIMIT    -40000
+#define VER2_TEMP_HIGH_LIMIT   125000
+
+#define TMU_VER1               0x1
+#define TMU_VER2               0x2
+
+struct thermal_soc_data {
+       u32 num_sensors;
+       u32 version;
+       int (*get_temp)(void *, int *);
+};
+
+struct tmu_sensor {
+       struct imx8mm_tmu *priv;
+       u32 hw_id;
+       struct thermal_zone_device *tzd;
+};
+
+struct imx8mm_tmu {
+       void __iomem *base;
+       struct clk *clk;
+       const struct thermal_soc_data *socdata;
+       struct tmu_sensor sensors[0];
+};
+
+static int imx8mm_tmu_get_temp(void *data, int *temp)
+{
+       struct tmu_sensor *sensor = data;
+       struct imx8mm_tmu *tmu = sensor->priv;
+       u32 val;
+
+       val = readl_relaxed(tmu->base + TRITSR) & TRITSR_TEMP0_VAL_MASK;
+       *temp = val * 1000;
+       if (*temp < VER1_TEMP_LOW_LIMIT)
+               return -EAGAIN;
+
+       return 0;
+}
+
+static int imx8mp_tmu_get_temp(void *data, int *temp)
+{
+       struct tmu_sensor *sensor = data;
+       struct imx8mm_tmu *tmu = sensor->priv;
+       unsigned long val;
+       bool ready;
+
+       val = readl_relaxed(tmu->base + TRITSR);
+       ready = test_bit(probe_status_offset(sensor->hw_id), &val);
+       if (!ready)
+               return -EAGAIN;
+
+       val = sensor->hw_id ? FIELD_GET(TRITSR_TEMP1_VAL_MASK, val) :
+             FIELD_GET(TRITSR_TEMP0_VAL_MASK, val);
+       if (val & SIGN_BIT) /* negative */
+               val = (~(val & TEMP_VAL_MASK) + 1);
+
+       *temp = val * 1000;
+       if (*temp < VER2_TEMP_LOW_LIMIT || *temp > VER2_TEMP_HIGH_LIMIT)
+               return -EAGAIN;
+
+       return 0;
+}
+
+static int tmu_get_temp(void *data, int *temp)
+{
+       struct tmu_sensor *sensor = data;
+       struct imx8mm_tmu *tmu = sensor->priv;
+
+       return tmu->socdata->get_temp(data, temp);
+}
+
+static struct thermal_zone_of_device_ops tmu_tz_ops = {
+       .get_temp = tmu_get_temp,
+};
+
+static void imx8mm_tmu_enable(struct imx8mm_tmu *tmu, bool enable)
+{
+       u32 val;
+
+       val = readl_relaxed(tmu->base + TER);
+       val = enable ? (val | TER_EN) : (val & ~TER_EN);
+       writel_relaxed(val, tmu->base + TER);
+}
+
+static void imx8mm_tmu_probe_sel_all(struct imx8mm_tmu *tmu)
+{
+       u32 val;
+
+       val = readl_relaxed(tmu->base + TPS);
+       val |= PROBE_SEL_ALL;
+       writel_relaxed(val, tmu->base + TPS);
+}
+
+static int imx8mm_tmu_probe(struct platform_device *pdev)
+{
+       const struct thermal_soc_data *data;
+       struct imx8mm_tmu *tmu;
+       int ret;
+       int i;
+
+       data = of_device_get_match_data(&pdev->dev);
+
+       tmu = devm_kzalloc(&pdev->dev, struct_size(tmu, sensors,
+                          data->num_sensors), GFP_KERNEL);
+       if (!tmu)
+               return -ENOMEM;
+
+       tmu->socdata = data;
+
+       tmu->base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(tmu->base))
+               return PTR_ERR(tmu->base);
+
+       tmu->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(tmu->clk)) {
+               ret = PTR_ERR(tmu->clk);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(&pdev->dev,
+                               "failed to get tmu clock: %d\n", ret);
+               return ret;
+       }
+
+       ret = clk_prepare_enable(tmu->clk);
+       if (ret) {
+               dev_err(&pdev->dev, "failed to enable tmu clock: %d\n", ret);
+               return ret;
+       }
+
+       /* disable the monitor during initialization */
+       imx8mm_tmu_enable(tmu, false);
+
+       for (i = 0; i < data->num_sensors; i++) {
+               tmu->sensors[i].priv = tmu;
+               tmu->sensors[i].tzd =
+                       devm_thermal_zone_of_sensor_register(&pdev->dev, i,
+                                                            &tmu->sensors[i],
+                                                            &tmu_tz_ops);
+               if (IS_ERR(tmu->sensors[i].tzd)) {
+                       ret = PTR_ERR(tmu->sensors[i].tzd);
+                       dev_err(&pdev->dev,
+                               "failed to register thermal zone sensor[%d]: %d\n",
+                               i, ret);
+                       return ret;
+               }
+               tmu->sensors[i].hw_id = i;
+       }
+
+       platform_set_drvdata(pdev, tmu);
+
+       /* enable all the probes for V2 TMU */
+       if (tmu->socdata->version == TMU_VER2)
+               imx8mm_tmu_probe_sel_all(tmu);
+
+       /* enable the monitor */
+       imx8mm_tmu_enable(tmu, true);
+
+       return 0;
+}
+
+static int imx8mm_tmu_remove(struct platform_device *pdev)
+{
+       struct imx8mm_tmu *tmu = platform_get_drvdata(pdev);
+
+       /* disable TMU */
+       imx8mm_tmu_enable(tmu, false);
+
+       clk_disable_unprepare(tmu->clk);
+       platform_set_drvdata(pdev, NULL);
+
+       return 0;
+}
+
+static struct thermal_soc_data imx8mm_tmu_data = {
+       .num_sensors = 1,
+       .version = TMU_VER1,
+       .get_temp = imx8mm_tmu_get_temp,
+};
+
+static struct thermal_soc_data imx8mp_tmu_data = {
+       .num_sensors = 2,
+       .version = TMU_VER2,
+       .get_temp = imx8mp_tmu_get_temp,
+};
+
+static const struct of_device_id imx8mm_tmu_table[] = {
+       { .compatible = "fsl,imx8mm-tmu", .data = &imx8mm_tmu_data, },
+       { .compatible = "fsl,imx8mp-tmu", .data = &imx8mp_tmu_data, },
+       { },
+};
+
+static struct platform_driver imx8mm_tmu = {
+       .driver = {
+               .name   = "i.mx8mm_thermal",
+               .of_match_table = imx8mm_tmu_table,
+       },
+       .probe = imx8mm_tmu_probe,
+       .remove = imx8mm_tmu_remove,
+};
+module_platform_driver(imx8mm_tmu);
+
+MODULE_AUTHOR("Anson Huang <Anson.Huang@nxp.com>");
+MODULE_DESCRIPTION("i.MX8MM Thermal Monitor Unit driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/imx_sc_thermal.c b/drivers/thermal/imx_sc_thermal.c
new file mode 100644 (file)
index 0000000..a8723b1
--- /dev/null
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018-2020 NXP.
+ */
+
+#include <linux/err.h>
+#include <linux/firmware/imx/sci.h>
+#include <linux/firmware/imx/types.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define IMX_SC_MISC_FUNC_GET_TEMP      13
+
+static struct imx_sc_ipc *thermal_ipc_handle;
+
+struct imx_sc_sensor {
+       struct thermal_zone_device *tzd;
+       u32 resource_id;
+};
+
+struct req_get_temp {
+       u16 resource_id;
+       u8 type;
+} __packed __aligned(4);
+
+struct resp_get_temp {
+       s16 celsius;
+       s8 tenths;
+} __packed __aligned(4);
+
+struct imx_sc_msg_misc_get_temp {
+       struct imx_sc_rpc_msg hdr;
+       union {
+               struct req_get_temp req;
+               struct resp_get_temp resp;
+       } data;
+} __packed __aligned(4);
+
+static int imx_sc_thermal_get_temp(void *data, int *temp)
+{
+       struct imx_sc_msg_misc_get_temp msg;
+       struct imx_sc_rpc_msg *hdr = &msg.hdr;
+       struct imx_sc_sensor *sensor = data;
+       int ret;
+
+       msg.data.req.resource_id = sensor->resource_id;
+       msg.data.req.type = IMX_SC_C_TEMP;
+
+       hdr->ver = IMX_SC_RPC_VERSION;
+       hdr->svc = IMX_SC_RPC_SVC_MISC;
+       hdr->func = IMX_SC_MISC_FUNC_GET_TEMP;
+       hdr->size = 2;
+
+       ret = imx_scu_call_rpc(thermal_ipc_handle, &msg, true);
+       if (ret) {
+               dev_err(&sensor->tzd->device, "failed to read temp sensor %d, ret %d\n",
+                       sensor->resource_id, ret);
+               return ret;
+       }
+
+       *temp = msg.data.resp.celsius * 1000 + msg.data.resp.tenths * 100;
+
+       return 0;
+}
+
+static const struct thermal_zone_of_device_ops imx_sc_thermal_ops = {
+       .get_temp = imx_sc_thermal_get_temp,
+};
+
+static int imx_sc_thermal_probe(struct platform_device *pdev)
+{
+       struct device_node *np, *child, *sensor_np;
+       struct imx_sc_sensor *sensor;
+       int ret;
+
+       ret = imx_scu_get_handle(&thermal_ipc_handle);
+       if (ret)
+               return ret;
+
+       np = of_find_node_by_name(NULL, "thermal-zones");
+       if (!np)
+               return -ENODEV;
+
+       sensor_np = of_node_get(pdev->dev.of_node);
+
+       for_each_available_child_of_node(np, child) {
+               sensor = devm_kzalloc(&pdev->dev, sizeof(*sensor), GFP_KERNEL);
+               if (!sensor) {
+                       of_node_put(sensor_np);
+                       return -ENOMEM;
+               }
+
+               ret = thermal_zone_of_get_sensor_id(child,
+                                                   sensor_np,
+                                                   &sensor->resource_id);
+               if (ret < 0) {
+                       dev_err(&pdev->dev,
+                               "failed to get valid sensor resource id: %d\n",
+                               ret);
+                       break;
+               }
+
+               sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
+                                                                  sensor->resource_id,
+                                                                  sensor,
+                                                                  &imx_sc_thermal_ops);
+               if (IS_ERR(sensor->tzd)) {
+                       dev_err(&pdev->dev, "failed to register thermal zone\n");
+                       ret = PTR_ERR(sensor->tzd);
+                       break;
+               }
+       }
+
+       of_node_put(sensor_np);
+
+       return ret;
+}
+
+static int imx_sc_thermal_remove(struct platform_device *pdev)
+{
+       return 0;
+}
+
+static const struct of_device_id imx_sc_thermal_table[] = {
+       { .compatible = "fsl,imx-sc-thermal", },
+       {}
+};
+MODULE_DEVICE_TABLE(of, imx_sc_thermal_table);
+
+static struct platform_driver imx_sc_thermal_driver = {
+               .probe = imx_sc_thermal_probe,
+               .remove = imx_sc_thermal_remove,
+               .driver = {
+                       .name = "imx-sc-thermal",
+                       .of_match_table = imx_sc_thermal_table,
+               },
+};
+module_platform_driver(imx_sc_thermal_driver);
+
+MODULE_AUTHOR("Anson Huang <Anson.Huang@nxp.com>");
+MODULE_DESCRIPTION("Thermal driver for NXP i.MX SoCs with system controller");
+MODULE_LICENSE("GPL v2");
index bb6754a..e761c9b 100644 (file)
@@ -3,24 +3,17 @@
 // Copyright 2013 Freescale Semiconductor, Inc.
 
 #include <linux/clk.h>
-#include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/cpu_cooling.h>
 #include <linux/delay.h>
-#include <linux/device.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
-#include <linux/kernel.h>
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
-#include <linux/platform_device.h>
 #include <linux/regmap.h>
-#include <linux/slab.h>
 #include <linux/thermal.h>
-#include <linux/types.h>
 #include <linux/nvmem-consumer.h>
 
 #define REG_SET                0x4
@@ -872,14 +865,12 @@ static int imx_thermal_remove(struct platform_device *pdev)
                clk_disable_unprepare(data->thermal_clk);
 
        thermal_zone_device_unregister(data->tz);
-       cpufreq_cooling_unregister(data->cdev);
-       cpufreq_cpu_put(data->policy);
+       imx_thermal_unregister_legacy_cooling(data);
 
        return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int imx_thermal_suspend(struct device *dev)
+static int __maybe_unused imx_thermal_suspend(struct device *dev)
 {
        struct imx_thermal_data *data = dev_get_drvdata(dev);
        struct regmap *map = data->tempmon;
@@ -900,7 +891,7 @@ static int imx_thermal_suspend(struct device *dev)
        return 0;
 }
 
-static int imx_thermal_resume(struct device *dev)
+static int __maybe_unused imx_thermal_resume(struct device *dev)
 {
        struct imx_thermal_data *data = dev_get_drvdata(dev);
        struct regmap *map = data->tempmon;
@@ -918,7 +909,6 @@ static int imx_thermal_resume(struct device *dev)
 
        return 0;
 }
-#endif
 
 static SIMPLE_DEV_PM_OPS(imx_thermal_pm_ops,
                         imx_thermal_suspend, imx_thermal_resume);
index 6cad15e..ceef89c 100644 (file)
@@ -65,7 +65,7 @@ static ssize_t available_uuids_show(struct device *dev,
        for (i = 0; i < INT3400_THERMAL_MAXIMUM_UUID; i++) {
                if (priv->uuid_bitmap & (1 << i))
                        if (PAGE_SIZE - length > 0)
-                               length += snprintf(&buf[length],
+                               length += scnprintf(&buf[length],
                                                   PAGE_SIZE - length,
                                                   "%s\n",
                                                   int3400_thermal_uuids[i]);
index b1fd345..297db1d 100644 (file)
@@ -45,6 +45,9 @@
 /* JasperLake thermal reporting device */
 #define PCI_DEVICE_ID_PROC_JSL_THERMAL 0x4503
 
+/* TigerLake thermal reporting device */
+#define PCI_DEVICE_ID_PROC_TGL_THERMAL 0x9A03
+
 #define DRV_NAME "proc_thermal"
 
 struct power_config {
@@ -728,6 +731,8 @@ static const struct pci_device_id proc_thermal_pci_ids[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_ICL_THERMAL),
                .driver_data = (kernel_ulong_t)&rapl_mmio_hsw, },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_JSL_THERMAL)},
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_TGL_THERMAL),
+               .driver_data = (kernel_ulong_t)&rapl_mmio_hsw, },
        { 0, },
 };
 
index ef0baa9..874a47d 100644 (file)
@@ -449,6 +449,50 @@ thermal_zone_of_add_sensor(struct device_node *zone,
 }
 
 /**
+ * thermal_zone_of_get_sensor_id - get sensor ID from a DT thermal zone
+ * @tz_np: a valid thermal zone device node.
+ * @sensor_np: a sensor node of a valid sensor device.
+ * @id: the sensor ID returned if success.
+ *
+ * This function looks up the sensor ID in the given thermal zone node;
+ * the zone's sensor phandle must match the temperature provider @sensor_np.
+ *
+ * Return: 0 on success, proper error code otherwise.
+ */
+
+int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+                                 struct device_node *sensor_np,
+                                 u32 *id)
+{
+       struct of_phandle_args sensor_specs;
+       int ret;
+
+       ret = of_parse_phandle_with_args(tz_np,
+                                        "thermal-sensors",
+                                        "#thermal-sensor-cells",
+                                        0,
+                                        &sensor_specs);
+       if (ret)
+               return ret;
+
+       if (sensor_specs.np != sensor_np) {
+               of_node_put(sensor_specs.np);
+               return -ENODEV;
+       }
+
+       if (sensor_specs.args_count > 1)
+               pr_warn("%pOFn: too many cells in sensor specifier %d\n",
+                    sensor_specs.np, sensor_specs.args_count);
+
+       *id = sensor_specs.args_count ? sensor_specs.args[0] : 0;
+
+       of_node_put(sensor_specs.np);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_of_get_sensor_id);
+
+/**
  * thermal_zone_of_sensor_register - registers a sensor to a DT thermal zone
  * @dev: a valid struct device pointer of a sensor device. Must contain
  *       a valid .of_node, for the sensor node.
@@ -499,36 +543,22 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
        sensor_np = of_node_get(dev->of_node);
 
        for_each_available_child_of_node(np, child) {
-               struct of_phandle_args sensor_specs;
                int ret, id;
 
                /* For now, thermal framework supports only 1 sensor per zone */
-               ret = of_parse_phandle_with_args(child, "thermal-sensors",
-                                                "#thermal-sensor-cells",
-                                                0, &sensor_specs);
+               ret = thermal_zone_of_get_sensor_id(child, sensor_np, &id);
                if (ret)
                        continue;
 
-               if (sensor_specs.args_count >= 1) {
-                       id = sensor_specs.args[0];
-                       WARN(sensor_specs.args_count > 1,
-                            "%pOFn: too many cells in sensor specifier %d\n",
-                            sensor_specs.np, sensor_specs.args_count);
-               } else {
-                       id = 0;
-               }
-
-               if (sensor_specs.np == sensor_np && id == sensor_id) {
+               if (id == sensor_id) {
                        tzd = thermal_zone_of_add_sensor(child, sensor_np,
                                                         data, ops);
                        if (!IS_ERR(tzd))
                                tzd->ops->set_mode(tzd, THERMAL_DEVICE_ENABLED);
 
-                       of_node_put(sensor_specs.np);
                        of_node_put(child);
                        goto exit;
                }
-               of_node_put(sensor_specs.np);
        }
 exit:
        of_node_put(sensor_np);
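
The helper added above, thermal_zone_of_get_sensor_id(), is exported so that a
temperature provider can ask which of its channels a DT thermal zone points at,
instead of open-coding the phandle parsing. A hedged usage sketch follows; apart
from the helper itself, every name here is hypothetical:

    static int example_map_zone_to_channel(struct device *dev,
                                           struct device_node *tz_np)
    {
            u32 id;
            int ret;

            /* Fails with -ENODEV if the zone references some other sensor */
            ret = thermal_zone_of_get_sensor_id(tz_np, dev->of_node, &id);
            if (ret)
                    return ret;

            dev_info(dev, "zone %pOFn uses sensor channel %u\n", tz_np, id);
            return 0;
    }
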
index fb77acb..2a28a5a 100644
@@ -245,7 +245,7 @@ static inline int code_to_mdegC(u32 adc_code, const struct tsens_sensor *s)
        return adc_code * slope + offset;
 }
 
-static int get_temp_8960(struct tsens_sensor *s, int *temp)
+static int get_temp_8960(const struct tsens_sensor *s, int *temp)
 {
        int ret;
        u32 code, trdy;
@@ -279,7 +279,7 @@ static const struct tsens_ops ops_8960 = {
        .resume         = resume_8960,
 };
 
-const struct tsens_plat_data data_8960 = {
+struct tsens_plat_data data_8960 = {
        .num_sensors    = 11,
        .ops            = &ops_8960,
 };
index c8d57ee..1725453 100644
  * @low_thresh:     lower threshold temperature value
  * @low_irq_mask:   mask register for lower threshold irqs
  * @low_irq_clear:  clear register for lower threshold irqs
+ * @crit_viol:      critical threshold violated
+ * @crit_thresh:    critical threshold temperature value
+ * @crit_irq_mask:  mask register for critical threshold irqs
+ * @crit_irq_clear: clear register for critical threshold irqs
  *
  * Structure containing data about temperature threshold settings and
  * irq status if they were violated.
@@ -36,6 +40,10 @@ struct tsens_irq_data {
        int low_thresh;
        u32 low_irq_mask;
        u32 low_irq_clear;
+       u32 crit_viol;
+       u32 crit_thresh;
+       u32 crit_irq_mask;
+       u32 crit_irq_clear;
 };
 
 char *qfprom_read(struct device *dev, const char *cname)
@@ -128,7 +136,7 @@ static inline int code_to_degc(u32 adc_code, const struct tsens_sensor *s)
  * Return: Temperature in milliCelsius on success, a negative errno will
  * be returned in error cases
  */
-static int tsens_hw_to_mC(struct tsens_sensor *s, int field)
+static int tsens_hw_to_mC(const struct tsens_sensor *s, int field)
 {
        struct tsens_priv *priv = s->priv;
        u32 resolution;
@@ -160,7 +168,7 @@ static int tsens_hw_to_mC(struct tsens_sensor *s, int field)
  *
  * Return: ADC code or temperature in deciCelsius.
  */
-static int tsens_mC_to_hw(struct tsens_sensor *s, int temp)
+static int tsens_mC_to_hw(const struct tsens_sensor *s, int temp)
 {
        struct tsens_priv *priv = s->priv;
 
@@ -189,6 +197,9 @@ static void tsens_set_interrupt_v1(struct tsens_priv *priv, u32 hw_id,
        case LOWER:
                index = LOW_INT_CLEAR_0 + hw_id;
                break;
+       case CRITICAL:
+               /* No critical interrupts before v2 */
+               return;
        }
        regmap_field_write(priv->rf[index], enable ? 0 : 1);
 }
@@ -214,6 +225,10 @@ static void tsens_set_interrupt_v2(struct tsens_priv *priv, u32 hw_id,
                index_mask  = LOW_INT_MASK_0 + hw_id;
                index_clear = LOW_INT_CLEAR_0 + hw_id;
                break;
+       case CRITICAL:
+               index_mask  = CRIT_INT_MASK_0 + hw_id;
+               index_clear = CRIT_INT_CLEAR_0 + hw_id;
+               break;
        }
 
        if (enable) {
@@ -268,14 +283,23 @@ static int tsens_threshold_violated(struct tsens_priv *priv, u32 hw_id,
        ret = regmap_field_read(priv->rf[LOWER_STATUS_0 + hw_id], &d->low_viol);
        if (ret)
                return ret;
-       if (d->up_viol || d->low_viol)
+
+       if (priv->feat->crit_int) {
+               ret = regmap_field_read(priv->rf[CRITICAL_STATUS_0 + hw_id],
+                                       &d->crit_viol);
+               if (ret)
+                       return ret;
+       }
+
+       if (d->up_viol || d->low_viol || d->crit_viol)
                return 1;
 
        return 0;
 }
 
 static int tsens_read_irq_state(struct tsens_priv *priv, u32 hw_id,
-                               struct tsens_sensor *s, struct tsens_irq_data *d)
+                               const struct tsens_sensor *s,
+                               struct tsens_irq_data *d)
 {
        int ret;
 
@@ -292,22 +316,37 @@ static int tsens_read_irq_state(struct tsens_priv *priv, u32 hw_id,
                ret = regmap_field_read(priv->rf[LOW_INT_MASK_0 + hw_id], &d->low_irq_mask);
                if (ret)
                        return ret;
+               ret = regmap_field_read(priv->rf[CRIT_INT_CLEAR_0 + hw_id],
+                                       &d->crit_irq_clear);
+               if (ret)
+                       return ret;
+               ret = regmap_field_read(priv->rf[CRIT_INT_MASK_0 + hw_id],
+                                       &d->crit_irq_mask);
+               if (ret)
+                       return ret;
+
+               d->crit_thresh = tsens_hw_to_mC(s, CRIT_THRESH_0 + hw_id);
        } else {
                /* No mask register on older TSENS */
                d->up_irq_mask = 0;
                d->low_irq_mask = 0;
+               d->crit_irq_clear = 0;
+               d->crit_irq_mask = 0;
+               d->crit_thresh = 0;
        }
 
        d->up_thresh  = tsens_hw_to_mC(s, UP_THRESH_0 + hw_id);
        d->low_thresh = tsens_hw_to_mC(s, LOW_THRESH_0 + hw_id);
 
-       dev_dbg(priv->dev, "[%u] %s%s: status(%u|%u) | clr(%u|%u) | mask(%u|%u)\n",
-               hw_id, __func__, (d->up_viol || d->low_viol) ? "(V)" : "",
-               d->low_viol, d->up_viol, d->low_irq_clear, d->up_irq_clear,
-               d->low_irq_mask, d->up_irq_mask);
-       dev_dbg(priv->dev, "[%u] %s%s: thresh: (%d:%d)\n", hw_id, __func__,
-               (d->up_viol || d->low_viol) ? "(violation)" : "",
-               d->low_thresh, d->up_thresh);
+       dev_dbg(priv->dev, "[%u] %s%s: status(%u|%u|%u) | clr(%u|%u|%u) | mask(%u|%u|%u)\n",
+               hw_id, __func__,
+               (d->up_viol || d->low_viol || d->crit_viol) ? "(V)" : "",
+               d->low_viol, d->up_viol, d->crit_viol,
+               d->low_irq_clear, d->up_irq_clear, d->crit_irq_clear,
+               d->low_irq_mask, d->up_irq_mask, d->crit_irq_mask);
+       dev_dbg(priv->dev, "[%u] %s%s: thresh: (%d:%d:%d)\n", hw_id, __func__,
+               (d->up_viol || d->low_viol || d->crit_viol) ? "(V)" : "",
+               d->low_thresh, d->up_thresh, d->crit_thresh);
 
        return 0;
 }
@@ -322,6 +361,76 @@ static inline u32 masked_irq(u32 hw_id, u32 mask, enum tsens_ver ver)
 }
 
 /**
+ * tsens_critical_irq_thread() - Threaded handler for critical interrupts
+ * @irq: irq number
+ * @data: tsens controller private data
+ *
+ * Check FSM watchdog bark status and clear if needed.
+ * Check all sensors to find ones that violated their critical threshold limits.
+ * Clear and then re-enable the interrupt.
+ *
+ * The level-triggered interrupt might deassert if the temperature returned to
+ * within the threshold limits by the time the handler got scheduled. We
+ * consider the irq to have been handled in that case.
+ *
+ * Return: IRQ_HANDLED
+ */
+irqreturn_t tsens_critical_irq_thread(int irq, void *data)
+{
+       struct tsens_priv *priv = data;
+       struct tsens_irq_data d;
+       int temp, ret, i;
+       u32 wdog_status, wdog_count;
+
+       if (priv->feat->has_watchdog) {
+               ret = regmap_field_read(priv->rf[WDOG_BARK_STATUS],
+                                       &wdog_status);
+               if (ret)
+                       return ret;
+
+               if (wdog_status) {
+                       /* Clear WDOG interrupt */
+                       regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 1);
+                       regmap_field_write(priv->rf[WDOG_BARK_CLEAR], 0);
+                       ret = regmap_field_read(priv->rf[WDOG_BARK_COUNT],
+                                               &wdog_count);
+                       if (ret)
+                               return ret;
+                       if (wdog_count)
+                               dev_dbg(priv->dev, "%s: watchdog count: %d\n",
+                                       __func__, wdog_count);
+
+                       /* Fall through to handle critical interrupts if any */
+               }
+       }
+
+       for (i = 0; i < priv->num_sensors; i++) {
+               const struct tsens_sensor *s = &priv->sensor[i];
+               u32 hw_id = s->hw_id;
+
+               if (IS_ERR(s->tzd))
+                       continue;
+               if (!tsens_threshold_violated(priv, hw_id, &d))
+                       continue;
+               ret = get_temp_tsens_valid(s, &temp);
+               if (ret) {
+                       dev_err(priv->dev, "[%u] %s: error reading sensor\n",
+                               hw_id, __func__);
+                       continue;
+               }
+
+               tsens_read_irq_state(priv, hw_id, s, &d);
+               if (d.crit_viol &&
+                   !masked_irq(hw_id, d.crit_irq_mask, tsens_version(priv))) {
+                       /* Mask critical interrupts, unused on Linux */
+                       tsens_set_interrupt(priv, hw_id, CRITICAL, false);
+               }
+       }
+
+       return IRQ_HANDLED;
+}
+
+/**
  * tsens_irq_thread - Threaded interrupt handler for uplow interrupts
  * @irq: irq number
  * @data: tsens controller private data
@@ -346,10 +455,10 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
 
        for (i = 0; i < priv->num_sensors; i++) {
                bool trigger = false;
-               struct tsens_sensor *s = &priv->sensor[i];
+               const struct tsens_sensor *s = &priv->sensor[i];
                u32 hw_id = s->hw_id;
 
-               if (IS_ERR(priv->sensor[i].tzd))
+               if (IS_ERR(s->tzd))
                        continue;
                if (!tsens_threshold_violated(priv, hw_id, &d))
                        continue;
@@ -368,7 +477,7 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
                        tsens_set_interrupt(priv, hw_id, UPPER, disable);
                        if (d.up_thresh > temp) {
                                dev_dbg(priv->dev, "[%u] %s: re-arm upper\n",
-                                       priv->sensor[i].hw_id, __func__);
+                                       hw_id, __func__);
                                tsens_set_interrupt(priv, hw_id, UPPER, enable);
                        } else {
                                trigger = true;
@@ -379,7 +488,7 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
                        tsens_set_interrupt(priv, hw_id, LOWER, disable);
                        if (d.low_thresh < temp) {
                                dev_dbg(priv->dev, "[%u] %s: re-arm low\n",
-                                       priv->sensor[i].hw_id, __func__);
+                                       hw_id, __func__);
                                tsens_set_interrupt(priv, hw_id, LOWER, enable);
                        } else {
                                trigger = true;
@@ -392,7 +501,7 @@ irqreturn_t tsens_irq_thread(int irq, void *data)
                if (trigger) {
                        dev_dbg(priv->dev, "[%u] %s: TZ update trigger (%d mC)\n",
                                hw_id, __func__, temp);
-                       thermal_zone_device_update(priv->sensor[i].tzd,
+                       thermal_zone_device_update(s->tzd,
                                                   THERMAL_EVENT_UNSPECIFIED);
                } else {
                        dev_dbg(priv->dev, "[%u] %s: no violation:  %d\n",
@@ -435,7 +544,7 @@ int tsens_set_trips(void *_sensor, int low, int high)
        spin_unlock_irqrestore(&priv->ul_lock, flags);
 
        dev_dbg(dev, "[%u] %s: (%d:%d)->(%d:%d)\n",
-               s->hw_id, __func__, d.low_thresh, d.up_thresh, cl_low, cl_high);
+               hw_id, __func__, d.low_thresh, d.up_thresh, cl_low, cl_high);
 
        return 0;
 }
@@ -457,7 +566,7 @@ void tsens_disable_irq(struct tsens_priv *priv)
        regmap_field_write(priv->rf[INT_EN], 0);
 }
 
-int get_temp_tsens_valid(struct tsens_sensor *s, int *temp)
+int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp)
 {
        struct tsens_priv *priv = s->priv;
        int hw_id = s->hw_id;
@@ -486,7 +595,7 @@ int get_temp_tsens_valid(struct tsens_sensor *s, int *temp)
        return 0;
 }
 
-int get_temp_common(struct tsens_sensor *s, int *temp)
+int get_temp_common(const struct tsens_sensor *s, int *temp)
 {
        struct tsens_priv *priv = s->priv;
        int hw_id = s->hw_id;
@@ -590,6 +699,7 @@ int __init init_common(struct tsens_priv *priv)
 {
        void __iomem *tm_base, *srot_base;
        struct device *dev = priv->dev;
+       u32 ver_minor;
        struct resource *res;
        u32 enabled;
        int ret, i, j;
@@ -602,7 +712,7 @@ int __init init_common(struct tsens_priv *priv)
                /* DT with separate SROT and TM address space */
                priv->tm_offset = 0;
                res = platform_get_resource(op, IORESOURCE_MEM, 1);
-               srot_base = devm_ioremap_resource(&op->dev, res);
+               srot_base = devm_ioremap_resource(dev, res);
                if (IS_ERR(srot_base)) {
                        ret = PTR_ERR(srot_base);
                        goto err_put_device;
@@ -620,7 +730,7 @@ int __init init_common(struct tsens_priv *priv)
        }
 
        res = platform_get_resource(op, IORESOURCE_MEM, 0);
-       tm_base = devm_ioremap_resource(&op->dev, res);
+       tm_base = devm_ioremap_resource(dev, res);
        if (IS_ERR(tm_base)) {
                ret = PTR_ERR(tm_base);
                goto err_put_device;
@@ -639,6 +749,9 @@ int __init init_common(struct tsens_priv *priv)
                        if (IS_ERR(priv->rf[i]))
                                return PTR_ERR(priv->rf[i]);
                }
+               ret = regmap_field_read(priv->rf[VER_MINOR], &ver_minor);
+               if (ret)
+                       goto err_put_device;
        }
 
        priv->rf[TSENS_EN] = devm_regmap_field_alloc(dev, priv->srot_map,
@@ -683,12 +796,47 @@ int __init init_common(struct tsens_priv *priv)
                }
        }
 
+       if (priv->feat->crit_int) {
+               /* Loop might need changes if enum regfield_ids is reordered */
+               for (j = CRITICAL_STATUS_0; j <= CRIT_THRESH_15; j += 16) {
+                       for (i = 0; i < priv->feat->max_sensors; i++) {
+                               int idx = j + i;
+
+                               priv->rf[idx] =
+                                       devm_regmap_field_alloc(dev,
+                                                               priv->tm_map,
+                                                               priv->fields[idx]);
+                               if (IS_ERR(priv->rf[idx])) {
+                                       ret = PTR_ERR(priv->rf[idx]);
+                                       goto err_put_device;
+                               }
+                       }
+               }
+       }
+
+       if (tsens_version(priv) > VER_1_X &&  ver_minor > 2) {
+               /* Watchdog is present only on v2.3+ */
+               priv->feat->has_watchdog = 1;
+               for (i = WDOG_BARK_STATUS; i <= CC_MON_MASK; i++) {
+                       priv->rf[i] = devm_regmap_field_alloc(dev, priv->tm_map,
+                                                             priv->fields[i]);
+                       if (IS_ERR(priv->rf[i])) {
+                               ret = PTR_ERR(priv->rf[i]);
+                               goto err_put_device;
+                       }
+               }
+               /*
+                * Watchdog is already enabled, unmask the bark.
+                * Disable cycle completion monitoring
+                */
+               regmap_field_write(priv->rf[WDOG_BARK_MASK], 0);
+               regmap_field_write(priv->rf[CC_MON_MASK], 1);
+       }
+
        spin_lock_init(&priv->ul_lock);
        tsens_enable_irq(priv);
        tsens_debug_init(op);
 
-       return 0;
-
 err_put_device:
        put_device(&op->dev);
        return ret;
index 4b8dd6d..959a937 100644
@@ -327,7 +327,7 @@ static int calibrate_8974(struct tsens_priv *priv)
 
 /* v0.1: 8916, 8974 */
 
-static const struct tsens_features tsens_v0_1_feat = {
+static struct tsens_features tsens_v0_1_feat = {
        .ver_major      = VER_0_1,
        .crit_int       = 0,
        .adc            = 1,
@@ -377,7 +377,7 @@ static const struct tsens_ops ops_8916 = {
        .get_temp       = get_temp_common,
 };
 
-const struct tsens_plat_data data_8916 = {
+struct tsens_plat_data data_8916 = {
        .num_sensors    = 5,
        .ops            = &ops_8916,
        .hw_ids         = (unsigned int []){0, 1, 2, 4, 5 },
@@ -392,7 +392,7 @@ static const struct tsens_ops ops_8974 = {
        .get_temp       = get_temp_common,
 };
 
-const struct tsens_plat_data data_8974 = {
+struct tsens_plat_data data_8974 = {
        .num_sensors    = 11,
        .ops            = &ops_8974,
        .feat           = &tsens_v0_1_feat,
index bd2ddb6..b682a4d 100644
@@ -299,7 +299,7 @@ static int calibrate_8976(struct tsens_priv *priv)
 
 /* v1.x: msm8956,8976,qcs404,405 */
 
-static const struct tsens_features tsens_v1_feat = {
+static struct tsens_features tsens_v1_feat = {
        .ver_major      = VER_1_X,
        .crit_int       = 0,
        .adc            = 1,
@@ -368,7 +368,7 @@ static const struct tsens_ops ops_generic_v1 = {
        .get_temp       = get_temp_tsens_valid,
 };
 
-const struct tsens_plat_data data_tsens_v1 = {
+struct tsens_plat_data data_tsens_v1 = {
        .ops            = &ops_generic_v1,
        .feat           = &tsens_v1_feat,
        .fields = tsens_v1_regfields,
@@ -381,7 +381,7 @@ static const struct tsens_ops ops_8976 = {
 };
 
 /* Valid for both MSM8956 and MSM8976. Sensor ID 3 is unused. */
-const struct tsens_plat_data data_8976 = {
+struct tsens_plat_data data_8976 = {
        .num_sensors    = 11,
        .ops            = &ops_8976,
        .hw_ids         = (unsigned int[]){0, 1, 2, 4, 5, 6, 7, 8, 9, 10},
index a4d15e1..b293ed3 100644
 #define TM_Sn_CRITICAL_THRESHOLD_OFF   0x0060
 #define TM_Sn_STATUS_OFF               0x00a0
 #define TM_TRDY_OFF                    0x00e4
+#define TM_WDOG_LOG_OFF                0x013c
 
 /* v2.x: 8996, 8998, sdm845 */
 
-static const struct tsens_features tsens_v2_feat = {
+static struct tsens_features tsens_v2_feat = {
        .ver_major      = VER_2_X,
        .crit_int       = 1,
        .adc            = 0,
@@ -51,8 +52,9 @@ static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
        [INT_EN]  = REG_FIELD(TM_INT_EN_OFF, 0, 2),
 
        /* TEMPERATURE THRESHOLDS */
-       REG_FIELD_FOR_EACH_SENSOR16(LOW_THRESH, TM_Sn_UPPER_LOWER_THRESHOLD_OFF,  0,  11),
-       REG_FIELD_FOR_EACH_SENSOR16(UP_THRESH,  TM_Sn_UPPER_LOWER_THRESHOLD_OFF, 12,  23),
+       REG_FIELD_FOR_EACH_SENSOR16(LOW_THRESH,  TM_Sn_UPPER_LOWER_THRESHOLD_OFF,  0,  11),
+       REG_FIELD_FOR_EACH_SENSOR16(UP_THRESH,   TM_Sn_UPPER_LOWER_THRESHOLD_OFF, 12,  23),
+       REG_FIELD_FOR_EACH_SENSOR16(CRIT_THRESH, TM_Sn_CRITICAL_THRESHOLD_OFF,     0,  11),
 
        /* INTERRUPTS [CLEAR/STATUS/MASK] */
        REG_FIELD_SPLIT_BITS_0_15(LOW_INT_STATUS,  TM_UPPER_LOWER_INT_STATUS_OFF),
@@ -61,6 +63,18 @@ static const struct reg_field tsens_v2_regfields[MAX_REGFIELDS] = {
        REG_FIELD_SPLIT_BITS_16_31(UP_INT_STATUS,  TM_UPPER_LOWER_INT_STATUS_OFF),
        REG_FIELD_SPLIT_BITS_16_31(UP_INT_CLEAR,   TM_UPPER_LOWER_INT_CLEAR_OFF),
        REG_FIELD_SPLIT_BITS_16_31(UP_INT_MASK,    TM_UPPER_LOWER_INT_MASK_OFF),
+       REG_FIELD_SPLIT_BITS_0_15(CRIT_INT_STATUS, TM_CRITICAL_INT_STATUS_OFF),
+       REG_FIELD_SPLIT_BITS_0_15(CRIT_INT_CLEAR,  TM_CRITICAL_INT_CLEAR_OFF),
+       REG_FIELD_SPLIT_BITS_0_15(CRIT_INT_MASK,   TM_CRITICAL_INT_MASK_OFF),
+
+       /* WATCHDOG on v2.3 or later */
+       [WDOG_BARK_STATUS] = REG_FIELD(TM_CRITICAL_INT_STATUS_OFF, 31, 31),
+       [WDOG_BARK_CLEAR]  = REG_FIELD(TM_CRITICAL_INT_CLEAR_OFF,  31, 31),
+       [WDOG_BARK_MASK]   = REG_FIELD(TM_CRITICAL_INT_MASK_OFF,   31, 31),
+       [CC_MON_STATUS]    = REG_FIELD(TM_CRITICAL_INT_STATUS_OFF, 30, 30),
+       [CC_MON_CLEAR]     = REG_FIELD(TM_CRITICAL_INT_CLEAR_OFF,  30, 30),
+       [CC_MON_MASK]      = REG_FIELD(TM_CRITICAL_INT_MASK_OFF,   30, 30),
+       [WDOG_BARK_COUNT]  = REG_FIELD(TM_WDOG_LOG_OFF,             0,  7),
 
        /* Sn_STATUS */
        REG_FIELD_FOR_EACH_SENSOR16(LAST_TEMP,       TM_Sn_STATUS_OFF,  0,  11),
@@ -81,14 +95,14 @@ static const struct tsens_ops ops_generic_v2 = {
        .get_temp       = get_temp_tsens_valid,
 };
 
-const struct tsens_plat_data data_tsens_v2 = {
+struct tsens_plat_data data_tsens_v2 = {
        .ops            = &ops_generic_v2,
        .feat           = &tsens_v2_feat,
        .fields = tsens_v2_regfields,
 };
 
 /* Kept around for backward compatibility with old msm8996.dtsi */
-const struct tsens_plat_data data_8996 = {
+struct tsens_plat_data data_8996 = {
        .num_sensors    = 13,
        .ops            = &ops_generic_v2,
        .feat           = &tsens_v2_feat,
index 0e7cf52..2f77d23 100644
@@ -85,11 +85,42 @@ static const struct thermal_zone_of_device_ops tsens_of_ops = {
        .set_trips = tsens_set_trips,
 };
 
+static int tsens_register_irq(struct tsens_priv *priv, char *irqname,
+                             irq_handler_t thread_fn)
+{
+       struct platform_device *pdev;
+       int ret, irq;
+
+       pdev = of_find_device_by_node(priv->dev->of_node);
+       if (!pdev)
+               return -ENODEV;
+
+       irq = platform_get_irq_byname(pdev, irqname);
+       if (irq < 0) {
+               ret = irq;
+               /* For old DTs with no IRQ defined */
+               if (irq == -ENXIO)
+                       ret = 0;
+       } else {
+               ret = devm_request_threaded_irq(&pdev->dev, irq,
+                                               NULL, thread_fn,
+                                               IRQF_ONESHOT,
+                                               dev_name(&pdev->dev), priv);
+               if (ret)
+                       dev_err(&pdev->dev, "%s: failed to get irq\n",
+                               __func__);
+               else
+                       enable_irq_wake(irq);
+       }
+
+       put_device(&pdev->dev);
+       return ret;
+}
+
 static int tsens_register(struct tsens_priv *priv)
 {
-       int i, ret, irq;
+       int i, ret;
        struct thermal_zone_device *tzd;
-       struct platform_device *pdev;
 
        for (i = 0;  i < priv->num_sensors; i++) {
                priv->sensor[i].priv = priv;
@@ -103,32 +134,14 @@ static int tsens_register(struct tsens_priv *priv)
                        priv->ops->enable(priv, i);
        }
 
-       pdev = of_find_device_by_node(priv->dev->of_node);
-       if (!pdev)
-               return -ENODEV;
-
-       irq = platform_get_irq_byname(pdev, "uplow");
-       if (irq < 0) {
-               ret = irq;
-               /* For old DTs with no IRQ defined */
-               if (irq == -ENXIO)
-                       ret = 0;
-               goto err_put_device;
-       }
-
-       ret = devm_request_threaded_irq(&pdev->dev, irq,
-                                       NULL, tsens_irq_thread,
-                                       IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
-                                       dev_name(&pdev->dev), priv);
-       if (ret) {
-               dev_err(&pdev->dev, "%s: failed to get irq\n", __func__);
-               goto err_put_device;
-       }
+       ret = tsens_register_irq(priv, "uplow", tsens_irq_thread);
+       if (ret < 0)
+               return ret;
 
-       enable_irq_wake(irq);
+       if (priv->feat->crit_int)
+               ret = tsens_register_irq(priv, "critical",
+                                        tsens_critical_irq_thread);
 
-err_put_device:
-       put_device(&pdev->dev);
        return ret;
 }
 
index e24a865..502acf0 100644
@@ -23,6 +23,7 @@
 
 struct tsens_priv;
 
+/* IP version numbers in ascending order */
 enum tsens_ver {
        VER_0_1 = 0,
        VER_1_X,
@@ -32,6 +33,7 @@ enum tsens_ver {
 enum tsens_irq_type {
        LOWER,
        UPPER,
+       CRITICAL,
 };
 
 /**
@@ -67,7 +69,7 @@ struct tsens_ops {
        /* mandatory callbacks */
        int (*init)(struct tsens_priv *priv);
        int (*calibrate)(struct tsens_priv *priv);
-       int (*get_temp)(struct tsens_sensor *s, int *temp);
+       int (*get_temp)(const struct tsens_sensor *s, int *temp);
        /* optional callbacks */
        int (*enable)(struct tsens_priv *priv, int i);
        void (*disable)(struct tsens_priv *priv);
@@ -374,6 +376,82 @@ enum regfield_ids {
        CRITICAL_STATUS_13,
        CRITICAL_STATUS_14,
        CRITICAL_STATUS_15,
+       CRIT_INT_STATUS_0,      /* CRITICAL interrupt status */
+       CRIT_INT_STATUS_1,
+       CRIT_INT_STATUS_2,
+       CRIT_INT_STATUS_3,
+       CRIT_INT_STATUS_4,
+       CRIT_INT_STATUS_5,
+       CRIT_INT_STATUS_6,
+       CRIT_INT_STATUS_7,
+       CRIT_INT_STATUS_8,
+       CRIT_INT_STATUS_9,
+       CRIT_INT_STATUS_10,
+       CRIT_INT_STATUS_11,
+       CRIT_INT_STATUS_12,
+       CRIT_INT_STATUS_13,
+       CRIT_INT_STATUS_14,
+       CRIT_INT_STATUS_15,
+       CRIT_INT_CLEAR_0,       /* CRITICAL interrupt clear */
+       CRIT_INT_CLEAR_1,
+       CRIT_INT_CLEAR_2,
+       CRIT_INT_CLEAR_3,
+       CRIT_INT_CLEAR_4,
+       CRIT_INT_CLEAR_5,
+       CRIT_INT_CLEAR_6,
+       CRIT_INT_CLEAR_7,
+       CRIT_INT_CLEAR_8,
+       CRIT_INT_CLEAR_9,
+       CRIT_INT_CLEAR_10,
+       CRIT_INT_CLEAR_11,
+       CRIT_INT_CLEAR_12,
+       CRIT_INT_CLEAR_13,
+       CRIT_INT_CLEAR_14,
+       CRIT_INT_CLEAR_15,
+       CRIT_INT_MASK_0,        /* CRITICAL interrupt mask */
+       CRIT_INT_MASK_1,
+       CRIT_INT_MASK_2,
+       CRIT_INT_MASK_3,
+       CRIT_INT_MASK_4,
+       CRIT_INT_MASK_5,
+       CRIT_INT_MASK_6,
+       CRIT_INT_MASK_7,
+       CRIT_INT_MASK_8,
+       CRIT_INT_MASK_9,
+       CRIT_INT_MASK_10,
+       CRIT_INT_MASK_11,
+       CRIT_INT_MASK_12,
+       CRIT_INT_MASK_13,
+       CRIT_INT_MASK_14,
+       CRIT_INT_MASK_15,
+       CRIT_THRESH_0,          /* CRITICAL threshold values */
+       CRIT_THRESH_1,
+       CRIT_THRESH_2,
+       CRIT_THRESH_3,
+       CRIT_THRESH_4,
+       CRIT_THRESH_5,
+       CRIT_THRESH_6,
+       CRIT_THRESH_7,
+       CRIT_THRESH_8,
+       CRIT_THRESH_9,
+       CRIT_THRESH_10,
+       CRIT_THRESH_11,
+       CRIT_THRESH_12,
+       CRIT_THRESH_13,
+       CRIT_THRESH_14,
+       CRIT_THRESH_15,
+
+       /* WATCHDOG */
+       WDOG_BARK_STATUS,
+       WDOG_BARK_CLEAR,
+       WDOG_BARK_MASK,
+       WDOG_BARK_COUNT,
+
+       /* CYCLE COMPLETION MONITOR */
+       CC_MON_STATUS,
+       CC_MON_CLEAR,
+       CC_MON_MASK,
+
        MIN_STATUS_0,           /* MIN threshold violated */
        MIN_STATUS_1,
        MIN_STATUS_2,
@@ -418,6 +496,7 @@ enum regfield_ids {
  * @adc:      do the sensors only output adc code (instead of temperature)?
  * @srot_split: does the IP neatly split the register space into SROT and TM,
  *              with SROT only being available to secure boot firmware?
+ * @has_watchdog: does this IP support watchdog functionality?
  * @max_sensors: maximum sensors supported by this version of the IP
  */
 struct tsens_features {
@@ -425,6 +504,7 @@ struct tsens_features {
        unsigned int crit_int:1;
        unsigned int adc:1;
        unsigned int srot_split:1;
+       unsigned int has_watchdog:1;
        unsigned int max_sensors;
 };
 
@@ -440,12 +520,14 @@ struct tsens_plat_data {
        const u32               num_sensors;
        const struct tsens_ops  *ops;
        unsigned int            *hw_ids;
-       const struct tsens_features     *feat;
+       struct tsens_features   *feat;
        const struct reg_field          *fields;
 };
 
 /**
  * struct tsens_context - Registers to be saved/restored across a context loss
+ * @threshold: Threshold register value
+ * @control: Control register value
  */
 struct tsens_context {
        int     threshold;
@@ -460,6 +542,8 @@ struct tsens_context {
  * @srot_map: pointer to SROT register address space
  * @tm_offset: deal with old device trees that don't address TM and SROT
  *             address space separately
+ * @ul_lock: lock while processing upper/lower threshold interrupts
+ * @crit_lock: lock while processing critical threshold interrupts
  * @rf: array of regmap_fields used to store value of the field
  * @ctx: registers to be saved and restored during suspend/resume
  * @feat: features of the IP
@@ -481,36 +565,37 @@ struct tsens_priv {
 
        struct regmap_field             *rf[MAX_REGFIELDS];
        struct tsens_context            ctx;
-       const struct tsens_features     *feat;
+       struct tsens_features           *feat;
        const struct reg_field          *fields;
        const struct tsens_ops          *ops;
 
        struct dentry                   *debug_root;
        struct dentry                   *debug;
 
-       struct tsens_sensor             sensor[0];
+       struct tsens_sensor             sensor[];
 };
 
 char *qfprom_read(struct device *dev, const char *cname);
 void compute_intercept_slope(struct tsens_priv *priv, u32 *pt1, u32 *pt2, u32 mode);
 int init_common(struct tsens_priv *priv);
-int get_temp_tsens_valid(struct tsens_sensor *s, int *temp);
-int get_temp_common(struct tsens_sensor *s, int *temp);
+int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp);
+int get_temp_common(const struct tsens_sensor *s, int *temp);
 int tsens_enable_irq(struct tsens_priv *priv);
 void tsens_disable_irq(struct tsens_priv *priv);
 int tsens_set_trips(void *_sensor, int low, int high);
 irqreturn_t tsens_irq_thread(int irq, void *data);
+irqreturn_t tsens_critical_irq_thread(int irq, void *data);
 
 /* TSENS target */
-extern const struct tsens_plat_data data_8960;
+extern struct tsens_plat_data data_8960;
 
 /* TSENS v0.1 targets */
-extern const struct tsens_plat_data data_8916, data_8974;
+extern struct tsens_plat_data data_8916, data_8974;
 
 /* TSENS v1 targets */
-extern const struct tsens_plat_data data_tsens_v1, data_8976;
+extern struct tsens_plat_data data_tsens_v1, data_8976;
 
 /* TSENS v2 targets */
-extern const struct tsens_plat_data data_8996, data_tsens_v2;
+extern struct tsens_plat_data data_8996, data_tsens_v2;
 
 #endif /* __QCOM_TSENS_H__ */
index 874bc46..028a6bb 100644
@@ -3,12 +3,11 @@
 // Copyright 2016 Freescale Semiconductor, Inc.
 
 #include <linux/clk.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
 #include <linux/err.h>
 #include <linux/io.h>
+#include <linux/module.h>
 #include <linux/of.h>
-#include <linux/of_address.h>
+#include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/sizes.h>
 #include <linux/thermal.h>
@@ -228,6 +227,14 @@ static const struct regmap_access_table qoriq_rd_table = {
        .n_yes_ranges   = ARRAY_SIZE(qoriq_yes_ranges),
 };
 
+static void qoriq_tmu_action(void *p)
+{
+       struct qoriq_tmu_data *data = p;
+
+       regmap_write(data->regmap, REGS_TMR, TMR_DISABLE);
+       clk_disable_unprepare(data->clk);
+}
+
 static int qoriq_tmu_probe(struct platform_device *pdev)
 {
        int ret;
@@ -278,6 +285,10 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
                return ret;
        }
 
+       ret = devm_add_action_or_reset(dev, qoriq_tmu_action, data);
+       if (ret)
+               return ret;
+
        /* version register offset at: 0xbf8 on both v1 and v2 */
        ret = regmap_read(data->regmap, REGS_IPBRR(0), &ver);
        if (ret) {
@@ -290,35 +301,17 @@ static int qoriq_tmu_probe(struct platform_device *pdev)
 
        ret = qoriq_tmu_calibration(dev, data); /* TMU calibration */
        if (ret < 0)
-               goto err;
+               return ret;
 
        ret = qoriq_tmu_register_tmu_zone(dev, data);
        if (ret < 0) {
                dev_err(dev, "Failed to register sensors\n");
-               ret = -ENODEV;
-               goto err;
+               return ret;
        }
 
        platform_set_drvdata(pdev, data);
 
        return 0;
-
-err:
-       clk_disable_unprepare(data->clk);
-
-       return ret;
-}
-
-static int qoriq_tmu_remove(struct platform_device *pdev)
-{
-       struct qoriq_tmu_data *data = platform_get_drvdata(pdev);
-
-       /* Disable monitoring */
-       regmap_write(data->regmap, REGS_TMR, TMR_DISABLE);
-
-       clk_disable_unprepare(data->clk);
-
-       return 0;
 }
 
 static int __maybe_unused qoriq_tmu_suspend(struct device *dev)
@@ -365,7 +358,6 @@ static struct platform_driver qoriq_tmu = {
                .of_match_table = qoriq_tmu_match,
        },
        .probe  = qoriq_tmu_probe,
-       .remove = qoriq_tmu_remove,
 };
 module_platform_driver(qoriq_tmu);
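
The qoriq change above removes the .remove() callback entirely by registering the
hardware teardown as a device-managed action right after the resources are brought
up, so both probe error paths and driver unbind run it automatically. A minimal
sketch of that pattern, assuming a hypothetical driver; only
devm_add_action_or_reset(), clk_prepare_enable() and clk_disable_unprepare() are
real kernel APIs:

    static void example_teardown(void *data)
    {
            struct clk *clk = data;

            clk_disable_unprepare(clk);
    }

    static int example_enable_clock(struct device *dev, struct clk *clk)
    {
            int ret;

            ret = clk_prepare_enable(clk);
            if (ret)
                    return ret;

            /* Runs example_teardown(clk) immediately if registration fails */
            return devm_add_action_or_reset(dev, example_teardown, clk);
    }
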
 
index 72877bd..58fe7c1 100644
@@ -81,8 +81,6 @@ struct rcar_gen3_thermal_tsc {
        void __iomem *base;
        struct thermal_zone_device *zone;
        struct equation_coefs coef;
-       int low;
-       int high;
        int tj_t;
        int id; /* thermal channel id */
 };
@@ -204,12 +202,14 @@ static int rcar_gen3_thermal_mcelsius_to_temp(struct rcar_gen3_thermal_tsc *tsc,
        return INT_FIXPT(val);
 }
 
-static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
+static int rcar_gen3_thermal_update_range(struct rcar_gen3_thermal_tsc *tsc)
 {
-       struct rcar_gen3_thermal_tsc *tsc = devdata;
+       int temperature, low, high;
+
+       rcar_gen3_thermal_get_temp(tsc, &temperature);
 
-       low = clamp_val(low, -40000, 120000);
-       high = clamp_val(high, -40000, 120000);
+       low = temperature - MCELSIUS(1);
+       high = temperature + MCELSIUS(1);
 
        rcar_gen3_thermal_write(tsc, REG_GEN3_IRQTEMP1,
                                rcar_gen3_thermal_mcelsius_to_temp(tsc, low));
@@ -217,15 +217,11 @@ static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
        rcar_gen3_thermal_write(tsc, REG_GEN3_IRQTEMP2,
                                rcar_gen3_thermal_mcelsius_to_temp(tsc, high));
 
-       tsc->low = low;
-       tsc->high = high;
-
        return 0;
 }
 
 static const struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
        .get_temp       = rcar_gen3_thermal_get_temp,
-       .set_trips      = rcar_gen3_thermal_set_trips,
 };
 
 static void rcar_thermal_irq_set(struct rcar_gen3_thermal_priv *priv, bool on)
@@ -246,9 +242,11 @@ static irqreturn_t rcar_gen3_thermal_irq(int irq, void *data)
        for (i = 0; i < priv->num_tscs; i++) {
                status = rcar_gen3_thermal_read(priv->tscs[i], REG_GEN3_IRQSTR);
                rcar_gen3_thermal_write(priv->tscs[i], REG_GEN3_IRQSTR, 0);
-               if (status)
+               if (status) {
+                       rcar_gen3_thermal_update_range(priv->tscs[i]);
                        thermal_zone_device_update(priv->tscs[i]->zone,
                                                   THERMAL_EVENT_UNSPECIFIED);
+               }
        }
 
        return IRQ_HANDLED;
@@ -325,6 +323,10 @@ static const struct of_device_id rcar_gen3_thermal_dt_ids[] = {
                .data = &rcar_gen3_ths_tj_1_m3_w,
        },
        {
+               .compatible = "renesas,r8a77961-thermal",
+               .data = &rcar_gen3_ths_tj_1_m3_w,
+       },
+       {
                .compatible = "renesas,r8a77965-thermal",
                .data = &rcar_gen3_ths_tj_1,
        },
@@ -446,14 +448,15 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
                        goto error_unregister;
 
                ret = devm_add_action_or_reset(dev, rcar_gen3_hwmon_action, zone);
-               if (ret) {
+               if (ret)
                        goto error_unregister;
-               }
 
                ret = of_thermal_get_ntrips(tsc->zone);
                if (ret < 0)
                        goto error_unregister;
 
+               rcar_gen3_thermal_update_range(tsc);
+
                dev_info(dev, "TSC%d: Loaded %d trip points\n", i, ret);
        }
 
@@ -492,7 +495,7 @@ static int __maybe_unused rcar_gen3_thermal_resume(struct device *dev)
                struct rcar_gen3_thermal_tsc *tsc = priv->tscs[i];
 
                priv->thermal_init(tsc);
-               rcar_gen3_thermal_set_trips(tsc, tsc->low, tsc->high);
+               rcar_gen3_thermal_update_range(tsc);
        }
 
        rcar_thermal_irq_set(priv, true);
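
With the .set_trips() callback dropped above, the driver now maintains its own
interrupt window: rcar_gen3_thermal_update_range() reads the current temperature and
programs IRQTEMP1/IRQTEMP2 one degree below and above it, and the window is
re-centred from the interrupt handler and on resume. Illustrative arithmetic only,
assuming MCELSIUS() scales degrees Celsius to millidegrees:

    temperature = 55000 mC  ->  IRQTEMP1 threshold = 54000 mC
                                IRQTEMP2 threshold = 56000 mC
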
index 8f1aafa..e0c1f24 100644
@@ -95,7 +95,6 @@ struct rcar_thermal_priv {
        struct mutex lock;
        struct list_head list;
        int id;
-       u32 ctemp;
 };
 
 #define rcar_thermal_for_each_priv(pos, common)        \
@@ -201,7 +200,6 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv)
        struct device *dev = rcar_priv_to_dev(priv);
        int i;
        u32 ctemp, old, new;
-       int ret = -EINVAL;
 
        mutex_lock(&priv->lock);
 
@@ -247,37 +245,29 @@ static int rcar_thermal_update_temp(struct rcar_thermal_priv *priv)
                                                   ((ctemp - 1) << 0)));
        }
 
-       dev_dbg(dev, "thermal%d  %d -> %d\n", priv->id, priv->ctemp, ctemp);
-
-       priv->ctemp = ctemp;
-       ret = 0;
 err_out_unlock:
        mutex_unlock(&priv->lock);
-       return ret;
+
+       return ctemp ? ctemp : -EINVAL;
 }
 
 static int rcar_thermal_get_current_temp(struct rcar_thermal_priv *priv,
                                         int *temp)
 {
-       int tmp;
-       int ret;
-
-       ret = rcar_thermal_update_temp(priv);
-       if (ret < 0)
-               return ret;
+       int ctemp;
 
-       mutex_lock(&priv->lock);
-       if (priv->chip->ctemp_bands == 1)
-               tmp = MCELSIUS((priv->ctemp * 5) - 65);
-       else if (priv->ctemp < 24)
-               tmp = MCELSIUS(((priv->ctemp * 55) - 720) / 10);
-       else
-               tmp = MCELSIUS((priv->ctemp * 5) - 60);
-       mutex_unlock(&priv->lock);
+       ctemp = rcar_thermal_update_temp(priv);
+       if (ctemp < 0)
+               return ctemp;
 
        /* Guaranteed operating range is -45C to 125C. */
 
-       *temp = tmp;
+       if (priv->chip->ctemp_bands == 1)
+               *temp = MCELSIUS((ctemp * 5) - 65);
+       else if (ctemp < 24)
+               *temp = MCELSIUS(((ctemp * 55) - 720) / 10);
+       else
+               *temp = MCELSIUS((ctemp * 5) - 60);
 
        return 0;
 }
@@ -387,28 +377,17 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable)
 static void rcar_thermal_work(struct work_struct *work)
 {
        struct rcar_thermal_priv *priv;
-       int cctemp, nctemp;
        int ret;
 
        priv = container_of(work, struct rcar_thermal_priv, work.work);
 
-       ret = rcar_thermal_get_current_temp(priv, &cctemp);
-       if (ret < 0)
-               return;
-
        ret = rcar_thermal_update_temp(priv);
        if (ret < 0)
                return;
 
        rcar_thermal_irq_enable(priv);
 
-       ret = rcar_thermal_get_current_temp(priv, &nctemp);
-       if (ret < 0)
-               return;
-
-       if (nctemp != cctemp)
-               thermal_zone_device_update(priv->zone,
-                                          THERMAL_EVENT_UNSPECIFIED);
+       thermal_zone_device_update(priv->zone, THERMAL_EVENT_UNSPECIFIED);
 }
 
 static u32 rcar_thermal_had_changed(struct rcar_thermal_priv *priv, u32 status)
@@ -521,8 +500,10 @@ static int rcar_thermal_probe(struct platform_device *pdev)
                        res = platform_get_resource(pdev, IORESOURCE_MEM,
                                                    mres++);
                        common->base = devm_ioremap_resource(dev, res);
-                       if (IS_ERR(common->base))
-                               return PTR_ERR(common->base);
+                       if (IS_ERR(common->base)) {
+                               ret = PTR_ERR(common->base);
+                               goto error_unregister;
+                       }
 
                        idle = 0; /* polling delay is not needed */
                }
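
After the refactor above, rcar_thermal_update_temp() returns the raw ctemp code (or
-EINVAL) and the conversion to millidegrees happens in the caller. Worked examples
of that conversion, with raw codes chosen purely for illustration and assuming
MCELSIUS() multiplies by 1000:

    ctemp_bands == 1:         ctemp = 25  ->  (25 * 5) - 65          = 60  ->  60000 mC
    multi-band, ctemp < 24:   ctemp = 20  ->  ((20 * 55) - 720) / 10 = 38  ->  38000 mC
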
index fd4a178..e9a90bc 100644
@@ -1094,7 +1094,9 @@ static int exynos_tmu_probe(struct platform_device *pdev)
                                                    &exynos_sensor_ops);
        if (IS_ERR(data->tzd)) {
                ret = PTR_ERR(data->tzd);
-               dev_err(&pdev->dev, "Failed to register sensor: %d\n", ret);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(&pdev->dev, "Failed to register sensor: %d\n",
+                               ret);
                goto err_sclk;
        }
 
diff --git a/drivers/thermal/sprd_thermal.c b/drivers/thermal/sprd_thermal.c
new file mode 100644 (file)
index 0000000..a340374
--- /dev/null
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 Spreadtrum Communications Inc.
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#define SPRD_THM_CTL                   0x0
+#define SPRD_THM_INT_EN                        0x4
+#define SPRD_THM_INT_STS               0x8
+#define SPRD_THM_INT_RAW_STS           0xc
+#define SPRD_THM_DET_PERIOD            0x10
+#define SPRD_THM_INT_CLR               0x14
+#define SPRD_THM_INT_CLR_ST            0x18
+#define SPRD_THM_MON_PERIOD            0x4c
+#define SPRD_THM_MON_CTL               0x50
+#define SPRD_THM_INTERNAL_STS1         0x54
+#define SPRD_THM_RAW_READ_MSK          0x3ff
+
+#define SPRD_THM_OFFSET(id)            ((id) * 0x4)
+#define SPRD_THM_TEMP(id)              (SPRD_THM_OFFSET(id) + 0x5c)
+#define SPRD_THM_THRES(id)             (SPRD_THM_OFFSET(id) + 0x2c)
+
+#define SPRD_THM_SEN(id)               BIT((id) + 2)
+#define SPRD_THM_SEN_OVERHEAT_EN(id)   BIT((id) + 8)
+#define SPRD_THM_SEN_OVERHEAT_ALARM_EN(id)     BIT((id) + 0)
+
+/* bits definitions for register THM_CTL */
+#define SPRD_THM_SET_RDY_ST            BIT(13)
+#define SPRD_THM_SET_RDY               BIT(12)
+#define SPRD_THM_MON_EN                        BIT(1)
+#define SPRD_THM_EN                    BIT(0)
+
+/* bits definitions for register THM_INT_CTL */
+#define SPRD_THM_BIT_INT_EN            BIT(26)
+#define SPRD_THM_OVERHEAT_EN           BIT(25)
+#define SPRD_THM_OTP_TRIP_SHIFT                10
+
+/* bits definitions for register SPRD_THM_INTERNAL_STS1 */
+#define SPRD_THM_TEMPER_RDY            BIT(0)
+
+#define SPRD_THM_DET_PERIOD_DATA       0x800
+#define SPRD_THM_DET_PERIOD_MASK       GENMASK(19, 0)
+#define SPRD_THM_MON_MODE              0x7
+#define SPRD_THM_MON_MODE_MASK         GENMASK(3, 0)
+#define SPRD_THM_MON_PERIOD_DATA       0x10
+#define SPRD_THM_MON_PERIOD_MASK       GENMASK(15, 0)
+#define SPRD_THM_THRES_MASK            GENMASK(19, 0)
+#define SPRD_THM_INT_CLR_MASK          GENMASK(24, 0)
+
+/* thermal sensor calibration parameters */
+#define SPRD_THM_TEMP_LOW              -40000
+#define SPRD_THM_TEMP_HIGH             120000
+#define SPRD_THM_OTP_TEMP              120000
+#define SPRD_THM_HOT_TEMP              75000
+#define SPRD_THM_RAW_DATA_LOW          0
+#define SPRD_THM_RAW_DATA_HIGH         1000
+#define SPRD_THM_SEN_NUM               8
+#define SPRD_THM_DT_OFFSET             24
+#define SPRD_THM_RATION_OFFSET         17
+#define SPRD_THM_RATION_SIGN           16
+
+#define SPRD_THM_RDYST_POLLING_TIME    10
+#define SPRD_THM_RDYST_TIMEOUT         700
+#define SPRD_THM_TEMP_READY_POLL_TIME  10000
+#define SPRD_THM_TEMP_READY_TIMEOUT    600000
+#define SPRD_THM_MAX_SENSOR            8
+
+struct sprd_thermal_sensor {
+       struct thermal_zone_device *tzd;
+       struct sprd_thermal_data *data;
+       struct device *dev;
+       int cal_slope;
+       int cal_offset;
+       int id;
+};
+
+struct sprd_thermal_data {
+       const struct sprd_thm_variant_data *var_data;
+       struct sprd_thermal_sensor *sensor[SPRD_THM_MAX_SENSOR];
+       struct clk *clk;
+       void __iomem *base;
+       u32 ratio_off;
+       int ratio_sign;
+       int nr_sensors;
+};
+
+/*
+ * The conversion between ADC code and temperature is based on a linear
+ * relationship, using ideal_k to specify the slope and ideal_b the offset.
+ *
+ * Since different Spreadtrum SoCs have different ideal_k and ideal_b,
+ * we should save ideal_k and ideal_b in the device data structure.
+ */
+struct sprd_thm_variant_data {
+       u32 ideal_k;
+       u32 ideal_b;
+};
+
+static const struct sprd_thm_variant_data ums512_data = {
+       .ideal_k = 262,
+       .ideal_b = 66400,
+};
+
+static inline void sprd_thm_update_bits(void __iomem *reg, u32 mask, u32 val)
+{
+       u32 tmp, orig;
+
+       orig = readl(reg);
+       tmp = orig & ~mask;
+       tmp |= val & mask;
+       writel(tmp, reg);
+}
+
+static int sprd_thm_cal_read(struct device_node *np, const char *cell_id,
+                            u32 *val)
+{
+       struct nvmem_cell *cell;
+       void *buf;
+       size_t len;
+
+       cell = of_nvmem_cell_get(np, cell_id);
+       if (IS_ERR(cell))
+               return PTR_ERR(cell);
+
+       buf = nvmem_cell_read(cell, &len);
+       nvmem_cell_put(cell);
+       if (IS_ERR(buf))
+               return PTR_ERR(buf);
+
+       if (len > sizeof(u32)) {
+               kfree(buf);
+               return -EINVAL;
+       }
+
+       memcpy(val, buf, len);
+
+       kfree(buf);
+       return 0;
+}
+
+static int sprd_thm_sensor_calibration(struct device_node *np,
+                                      struct sprd_thermal_data *thm,
+                                      struct sprd_thermal_sensor *sen)
+{
+       int ret;
+       /*
+        * According to the thermal datasheet, the default calibration offset is 64,
+        * and the default ratio is 1000.
+        */
+       int dt_offset = 64, ratio = 1000;
+
+       ret = sprd_thm_cal_read(np, "sen_delta_cal", &dt_offset);
+       if (ret)
+               return ret;
+
+       ratio += thm->ratio_sign * thm->ratio_off;
+
+       /*
+        * Combine the ideal slope K and ideal offset B with the thermal
+        * calibration values read from the efuse to derive the real slope
+        * k and offset b:
+        * k_cal = (k * ratio) / 1000.
+        * b_cal = b + (dt_offset - 64) * 500.
+        */
+       sen->cal_slope = (thm->var_data->ideal_k * ratio) / 1000;
+       sen->cal_offset = thm->var_data->ideal_b + (dt_offset - 128) * 250;
+
+       return 0;
+}
+
+static int sprd_thm_rawdata_to_temp(struct sprd_thermal_sensor *sen,
+                                   u32 rawdata)
+{
+       clamp(rawdata, (u32)SPRD_THM_RAW_DATA_LOW, (u32)SPRD_THM_RAW_DATA_HIGH);
+
+       /*
+        * According to the thermal datasheet, the formula for converting the
+        * raw ADC value x to a temperature is:
+        * T_final = k_cal * x - b_cal.
+        */
+       return sen->cal_slope * rawdata - sen->cal_offset;
+}
+
+static int sprd_thm_temp_to_rawdata(int temp, struct sprd_thermal_sensor *sen)
+{
+       u32 val;
+
+       clamp(temp, (int)SPRD_THM_TEMP_LOW, (int)SPRD_THM_TEMP_HIGH);
+
+       /*
+        * Inverse of the conversion above: solve the datasheet formula
+        * T_final = k_cal * x - b_cal for the raw ADC value x:
+        * x = (T_final + b_cal) / k_cal.
+        */
+       val = (temp + sen->cal_offset) / sen->cal_slope;
+
+       return clamp(val, val, (u32)(SPRD_THM_RAW_DATA_HIGH - 1));
+}
+
+static int sprd_thm_read_temp(void *devdata, int *temp)
+{
+       struct sprd_thermal_sensor *sen = devdata;
+       u32 data;
+
+       data = readl(sen->data->base + SPRD_THM_TEMP(sen->id)) &
+               SPRD_THM_RAW_READ_MSK;
+
+       *temp = sprd_thm_rawdata_to_temp(sen, data);
+
+       return 0;
+}
+
+static const struct thermal_zone_of_device_ops sprd_thm_ops = {
+       .get_temp = sprd_thm_read_temp,
+};
+
+static int sprd_thm_poll_ready_status(struct sprd_thermal_data *thm)
+{
+       u32 val;
+       int ret;
+
+       /*
+        * Wait for thermal ready status before configuring thermal parameters.
+        */
+       ret = readl_poll_timeout(thm->base + SPRD_THM_CTL, val,
+                                !(val & SPRD_THM_SET_RDY_ST),
+                                SPRD_THM_RDYST_POLLING_TIME,
+                                SPRD_THM_RDYST_TIMEOUT);
+       if (ret)
+               return ret;
+
+       sprd_thm_update_bits(thm->base + SPRD_THM_CTL, SPRD_THM_MON_EN,
+                            SPRD_THM_MON_EN);
+       sprd_thm_update_bits(thm->base + SPRD_THM_CTL, SPRD_THM_SET_RDY,
+                            SPRD_THM_SET_RDY);
+       return 0;
+}
+
+static int sprd_thm_wait_temp_ready(struct sprd_thermal_data *thm)
+{
+       u32 val;
+
+       /* Wait for first temperature data ready before reading temperature */
+       return readl_poll_timeout(thm->base + SPRD_THM_INTERNAL_STS1, val,
+                                 !(val & SPRD_THM_TEMPER_RDY),
+                                 SPRD_THM_TEMP_READY_POLL_TIME,
+                                 SPRD_THM_TEMP_READY_TIMEOUT);
+}
+
+static int sprd_thm_set_ready(struct sprd_thermal_data *thm)
+{
+       int ret;
+
+       ret = sprd_thm_poll_ready_status(thm);
+       if (ret)
+               return ret;
+
+       /*
+        * Clear interrupt status, enable thermal interrupt and enable thermal.
+        *
+        * The SPRD thermal controller integrates a hardware interrupt signal,
+        * which means that on overheat it generates an interrupt and notifies
+        * the PMIC automatically to shut down the system. So here we should
+        * enable the interrupt bits, though we have
+        * not registered an irq handler.
+        */
+       writel(SPRD_THM_INT_CLR_MASK, thm->base + SPRD_THM_INT_CLR);
+       sprd_thm_update_bits(thm->base + SPRD_THM_INT_EN,
+                            SPRD_THM_BIT_INT_EN, SPRD_THM_BIT_INT_EN);
+       sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+                            SPRD_THM_EN, SPRD_THM_EN);
+       return 0;
+}
+
+static void sprd_thm_sensor_init(struct sprd_thermal_data *thm,
+                                struct sprd_thermal_sensor *sen)
+{
+       u32 otp_rawdata, hot_rawdata;
+
+       otp_rawdata = sprd_thm_temp_to_rawdata(SPRD_THM_OTP_TEMP, sen);
+       hot_rawdata = sprd_thm_temp_to_rawdata(SPRD_THM_HOT_TEMP, sen);
+
+       /* Enable the sensor's overheat temperature protection interrupt */
+       sprd_thm_update_bits(thm->base + SPRD_THM_INT_EN,
+                            SPRD_THM_SEN_OVERHEAT_ALARM_EN(sen->id),
+                            SPRD_THM_SEN_OVERHEAT_ALARM_EN(sen->id));
+
+       /* Set the sensor's overheat and hot threshold temperatures */
+       sprd_thm_update_bits(thm->base + SPRD_THM_THRES(sen->id),
+                            SPRD_THM_THRES_MASK,
+                            (otp_rawdata << SPRD_THM_OTP_TRIP_SHIFT) |
+                            hot_rawdata);
+
+       /* Enable the corresponding sensor */
+       sprd_thm_update_bits(thm->base + SPRD_THM_CTL, SPRD_THM_SEN(sen->id),
+                            SPRD_THM_SEN(sen->id));
+}
+
+static void sprd_thm_para_config(struct sprd_thermal_data *thm)
+{
+       /* Set the period between two valid temperature detection actions */
+       sprd_thm_update_bits(thm->base + SPRD_THM_DET_PERIOD,
+                            SPRD_THM_DET_PERIOD_MASK, SPRD_THM_DET_PERIOD);
+
+       /* Set the sensors' monitor mode */
+       sprd_thm_update_bits(thm->base + SPRD_THM_MON_CTL,
+                            SPRD_THM_MON_MODE_MASK, SPRD_THM_MON_MODE);
+
+       /* Set the sensors' monitor period */
+       sprd_thm_update_bits(thm->base + SPRD_THM_MON_PERIOD,
+                            SPRD_THM_MON_PERIOD_MASK, SPRD_THM_MON_PERIOD);
+}
+
+static void sprd_thm_toggle_sensor(struct sprd_thermal_sensor *sen, bool on)
+{
+       struct thermal_zone_device *tzd = sen->tzd;
+
+       tzd->ops->set_mode(tzd,
+               on ? THERMAL_DEVICE_ENABLED : THERMAL_DEVICE_DISABLED);
+}
+
+static int sprd_thm_probe(struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct device_node *sen_child;
+       struct sprd_thermal_data *thm;
+       struct sprd_thermal_sensor *sen;
+       const struct sprd_thm_variant_data *pdata;
+       int ret, i;
+       u32 val;
+
+       pdata = of_device_get_match_data(&pdev->dev);
+       if (!pdata) {
+               dev_err(&pdev->dev, "No matching driver data found\n");
+               return -EINVAL;
+       }
+
+       thm = devm_kzalloc(&pdev->dev, sizeof(*thm), GFP_KERNEL);
+       if (!thm)
+               return -ENOMEM;
+
+       thm->var_data = pdata;
+       thm->base = devm_platform_ioremap_resource(pdev, 0);
+       if (!thm->base)
+               return -ENOMEM;
+
+       thm->nr_sensors = of_get_child_count(np);
+       if (thm->nr_sensors == 0 || thm->nr_sensors > SPRD_THM_MAX_SENSOR) {
+               dev_err(&pdev->dev, "incorrect sensor count\n");
+               return -EINVAL;
+       }
+
+       thm->clk = devm_clk_get(&pdev->dev, "enable");
+       if (IS_ERR(thm->clk)) {
+               dev_err(&pdev->dev, "failed to get enable clock\n");
+               return PTR_ERR(thm->clk);
+       }
+
+       ret = clk_prepare_enable(thm->clk);
+       if (ret)
+               return ret;
+
+       sprd_thm_para_config(thm);
+
+       ret = sprd_thm_cal_read(np, "thm_sign_cal", &val);
+       if (ret)
+               goto disable_clk;
+
+       if (val > 0)
+               thm->ratio_sign = -1;
+       else
+               thm->ratio_sign = 1;
+
+       ret = sprd_thm_cal_read(np, "thm_ratio_cal", &thm->ratio_off);
+       if (ret)
+               goto disable_clk;
+
+       for_each_child_of_node(np, sen_child) {
+               sen = devm_kzalloc(&pdev->dev, sizeof(*sen), GFP_KERNEL);
+               if (!sen) {
+                       ret = -ENOMEM;
+                       goto disable_clk;
+               }
+
+               sen->data = thm;
+               sen->dev = &pdev->dev;
+
+               ret = of_property_read_u32(sen_child, "reg", &sen->id);
+               if (ret) {
+                       dev_err(&pdev->dev, "get sensor reg failed");
+                       goto disable_clk;
+               }
+
+               ret = sprd_thm_sensor_calibration(sen_child, thm, sen);
+               if (ret) {
+                       dev_err(&pdev->dev, "efuse cal analysis failed");
+                       goto disable_clk;
+               }
+
+               sprd_thm_sensor_init(thm, sen);
+
+               sen->tzd = devm_thermal_zone_of_sensor_register(sen->dev,
+                                                               sen->id,
+                                                               sen,
+                                                               &sprd_thm_ops);
+               if (IS_ERR(sen->tzd)) {
+                       dev_err(&pdev->dev, "register thermal zone failed %d\n",
+                               sen->id);
+                       ret = PTR_ERR(sen->tzd);
+                       goto disable_clk;
+               }
+
+               thm->sensor[sen->id] = sen;
+       }
+
+       ret = sprd_thm_set_ready(thm);
+       if (ret)
+               goto disable_clk;
+
+       ret = sprd_thm_wait_temp_ready(thm);
+       if (ret)
+               goto disable_clk;
+
+       for (i = 0; i < thm->nr_sensors; i++)
+               sprd_thm_toggle_sensor(thm->sensor[i], true);
+
+       platform_set_drvdata(pdev, thm);
+       return 0;
+
+disable_clk:
+       clk_disable_unprepare(thm->clk);
+       return ret;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static void sprd_thm_hw_suspend(struct sprd_thermal_data *thm)
+{
+       int i;
+
+       for (i = 0; i < thm->nr_sensors; i++) {
+               sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+                                    SPRD_THM_SEN(thm->sensor[i]->id), 0);
+       }
+
+       sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+                            SPRD_THM_EN, 0x0);
+}
+
+static int sprd_thm_suspend(struct device *dev)
+{
+       struct sprd_thermal_data *thm = dev_get_drvdata(dev);
+       int i;
+
+       for (i = 0; i < thm->nr_sensors; i++)
+               sprd_thm_toggle_sensor(thm->sensor[i], false);
+
+       sprd_thm_hw_suspend(thm);
+       clk_disable_unprepare(thm->clk);
+
+       return 0;
+}
+
+static int sprd_thm_hw_resume(struct sprd_thermal_data *thm)
+{
+       int ret, i;
+
+       for (i = 0; i < thm->nr_sensors; i++) {
+               sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+                                    SPRD_THM_SEN(thm->sensor[i]->id),
+                                    SPRD_THM_SEN(thm->sensor[i]->id));
+       }
+
+       ret = sprd_thm_poll_ready_status(thm);
+       if (ret)
+               return ret;
+
+       writel(SPRD_THM_INT_CLR_MASK, thm->base + SPRD_THM_INT_CLR);
+       sprd_thm_update_bits(thm->base + SPRD_THM_CTL,
+                            SPRD_THM_EN, SPRD_THM_EN);
+       return sprd_thm_wait_temp_ready(thm);
+}
+
+static int sprd_thm_resume(struct device *dev)
+{
+       struct sprd_thermal_data *thm = dev_get_drvdata(dev);
+       int ret, i;
+
+       ret = clk_prepare_enable(thm->clk);
+       if (ret)
+               return ret;
+
+       ret = sprd_thm_hw_resume(thm);
+       if (ret)
+               goto disable_clk;
+
+       for (i = 0; i < thm->nr_sensors; i++)
+               sprd_thm_toggle_sensor(thm->sensor[i], true);
+
+       return 0;
+
+disable_clk:
+       clk_disable_unprepare(thm->clk);
+       return ret;
+}
+#endif
+
+static int sprd_thm_remove(struct platform_device *pdev)
+{
+       struct sprd_thermal_data *thm = platform_get_drvdata(pdev);
+       int i;
+
+       for (i = 0; i < thm->nr_sensors; i++) {
+               sprd_thm_toggle_sensor(thm->sensor[i], false);
+               devm_thermal_zone_of_sensor_unregister(&pdev->dev,
+                                                      thm->sensor[i]->tzd);
+       }
+
+       clk_disable_unprepare(thm->clk);
+       return 0;
+}
+
+static const struct of_device_id sprd_thermal_of_match[] = {
+       { .compatible = "sprd,ums512-thermal", .data = &ums512_data },
+       { },
+};
+
+static const struct dev_pm_ops sprd_thermal_pm_ops = {
+       SET_SYSTEM_SLEEP_PM_OPS(sprd_thm_suspend, sprd_thm_resume)
+};
+
+static struct platform_driver sprd_thermal_driver = {
+       .probe = sprd_thm_probe,
+       .remove = sprd_thm_remove,
+       .driver = {
+               .name = "sprd-thermal",
+               .pm = &sprd_thermal_pm_ops,
+               .of_match_table = sprd_thermal_of_match,
+       },
+};
+
+module_platform_driver(sprd_thermal_driver);
+
+MODULE_AUTHOR("Freeman Liu <freeman.liu@unisoc.com>");
+MODULE_DESCRIPTION("Spreadtrum thermal driver");
+MODULE_LICENSE("GPL v2");
index ad9e3bf..9314e3d 100644 (file)
@@ -478,7 +478,8 @@ static int stm_thermal_resume(struct device *dev)
 }
 #endif /* CONFIG_PM_SLEEP */
 
-SIMPLE_DEV_PM_OPS(stm_thermal_pm_ops, stm_thermal_suspend, stm_thermal_resume);
+static SIMPLE_DEV_PM_OPS(stm_thermal_pm_ops,
+                        stm_thermal_suspend, stm_thermal_resume);
 
 static const struct thermal_zone_of_device_ops stm_tz_ops = {
        .get_temp       = stm_thermal_get_temp,
index 2fa78f7..263b042 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
 #include <linux/clk.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/platform_device.h>
 #include <linux/err.h>
 #include <linux/types.h>
@@ -24,7 +24,6 @@
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/of_irq.h>
-#include <linux/of_gpio.h>
 #include <linux/io.h>
 
 #include "ti-bandgap.h"
@@ -743,27 +742,13 @@ exit:
 static int ti_bandgap_tshut_init(struct ti_bandgap *bgp,
                                 struct platform_device *pdev)
 {
-       int gpio_nr = bgp->tshut_gpio;
        int status;
 
-       /* Request for gpio_86 line */
-       status = gpio_request(gpio_nr, "tshut");
-       if (status < 0) {
-               dev_err(bgp->dev, "Could not request for TSHUT GPIO:%i\n", 86);
-               return status;
-       }
-       status = gpio_direction_input(gpio_nr);
-       if (status) {
-               dev_err(bgp->dev, "Cannot set input TSHUT GPIO %d\n", gpio_nr);
-               return status;
-       }
-
-       status = request_irq(gpio_to_irq(gpio_nr), ti_bandgap_tshut_irq_handler,
+       status = request_irq(gpiod_to_irq(bgp->tshut_gpiod),
+                            ti_bandgap_tshut_irq_handler,
                             IRQF_TRIGGER_RISING, "tshut", NULL);
-       if (status) {
-               gpio_free(gpio_nr);
+       if (status)
                dev_err(bgp->dev, "request irq failed for TSHUT");
-       }
 
        return 0;
 }
@@ -860,11 +845,10 @@ static struct ti_bandgap *ti_bandgap_build(struct platform_device *pdev)
        } while (res);
 
        if (TI_BANDGAP_HAS(bgp, TSHUT)) {
-               bgp->tshut_gpio = of_get_gpio(node, 0);
-               if (!gpio_is_valid(bgp->tshut_gpio)) {
-                       dev_err(&pdev->dev, "invalid gpio for tshut (%d)\n",
-                               bgp->tshut_gpio);
-                       return ERR_PTR(-EINVAL);
+               bgp->tshut_gpiod = devm_gpiod_get(&pdev->dev, NULL, GPIOD_IN);
+               if (IS_ERR(bgp->tshut_gpiod)) {
+                       dev_err(&pdev->dev, "invalid gpio for tshut\n");
+                       return ERR_CAST(bgp->tshut_gpiod);
                }
        }
 
@@ -1046,10 +1030,8 @@ put_clks:
 put_fclock:
        clk_put(bgp->fclock);
 free_irqs:
-       if (TI_BANDGAP_HAS(bgp, TSHUT)) {
-               free_irq(gpio_to_irq(bgp->tshut_gpio), NULL);
-               gpio_free(bgp->tshut_gpio);
-       }
+       if (TI_BANDGAP_HAS(bgp, TSHUT))
+               free_irq(gpiod_to_irq(bgp->tshut_gpiod), NULL);
 
        return ret;
 }
@@ -1079,10 +1061,8 @@ int ti_bandgap_remove(struct platform_device *pdev)
        if (TI_BANDGAP_HAS(bgp, TALERT))
                free_irq(bgp->irq, bgp);
 
-       if (TI_BANDGAP_HAS(bgp, TSHUT)) {
-               free_irq(gpio_to_irq(bgp->tshut_gpio), NULL);
-               gpio_free(bgp->tshut_gpio);
-       }
+       if (TI_BANDGAP_HAS(bgp, TSHUT))
+               free_irq(gpiod_to_irq(bgp->tshut_gpiod), NULL);
 
        return 0;
 }
index bb9b0f7..fce4657 100644 (file)
@@ -13,6 +13,8 @@
 #include <linux/types.h>
 #include <linux/err.h>
 
+struct gpio_desc;
+
 /**
  * DOC: bandgap driver data structure
  * ==================================
@@ -199,7 +201,7 @@ struct ti_bandgap {
        struct clk                      *div_clk;
        spinlock_t                      lock; /* shields this struct */
        int                             irq;
-       int                             tshut_gpio;
+       struct gpio_desc                *tshut_gpiod;
        u32                             clk_rate;
 };
 
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
new file mode 100644 (file)
index 0000000..7db1460
--- /dev/null
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config VDPA
+       tristate
+       help
+         Enable this module to support vDPA devices that use a
+         datapath which complies with the virtio specification, with
+         a vendor-specific control path.
+
+menuconfig VDPA_MENU
+       bool "VDPA drivers"
+       default n
+
+if VDPA_MENU
+
+config VDPA_SIM
+       tristate "vDPA device simulator"
+       depends on RUNTIME_TESTING_MENU
+       select VDPA
+       select VHOST_RING
+       default n
+       help
+         vDPA networking device simulator which loops TX traffic back
+         to RX. This device is used for testing, prototyping and
+         development of vDPA.
+
+config IFCVF
+       tristate "Intel IFC VF VDPA driver"
+       depends on PCI_MSI
+       select VDPA
+       default n
+       help
+         This kernel module can drive the Intel IFC VF NIC to offload
+         virtio dataplane traffic to hardware.
+
+         To compile this driver as a module, choose M here: the module
+         will be called ifcvf.
+
+endif # VDPA_MENU
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
new file mode 100644 (file)
index 0000000..8bbb686
--- /dev/null
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA) += vdpa.o
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim/
+obj-$(CONFIG_IFCVF)    += ifcvf/
diff --git a/drivers/vdpa/ifcvf/Makefile b/drivers/vdpa/ifcvf/Makefile
new file mode 100644 (file)
index 0000000..d709915
--- /dev/null
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_IFCVF) += ifcvf.o
+ifcvf-$(CONFIG_IFCVF) += ifcvf_main.o ifcvf_base.o
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
new file mode 100644 (file)
index 0000000..b61b06e
--- /dev/null
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include "ifcvf_base.h"
+
+static inline u8 ifc_ioread8(u8 __iomem *addr)
+{
+       return ioread8(addr);
+}
+
+static inline u16 ifc_ioread16(__le16 __iomem *addr)
+{
+       return ioread16(addr);
+}
+
+static inline u32 ifc_ioread32(__le32 __iomem *addr)
+{
+       return ioread32(addr);
+}
+
+static inline void ifc_iowrite8(u8 value, u8 __iomem *addr)
+{
+       iowrite8(value, addr);
+}
+
+static inline void ifc_iowrite16(u16 value, __le16 __iomem *addr)
+{
+       iowrite16(value, addr);
+}
+
+static inline void ifc_iowrite32(u32 value, __le32 __iomem *addr)
+{
+       iowrite32(value, addr);
+}
+
+static void ifc_iowrite64_twopart(u64 val,
+                                 __le32 __iomem *lo, __le32 __iomem *hi)
+{
+       ifc_iowrite32((u32)val, lo);
+       ifc_iowrite32(val >> 32, hi);
+}
+
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw)
+{
+       return container_of(hw, struct ifcvf_adapter, vf);
+}
+
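+/*
+ * Translate a virtio PCI capability into an ioremapped address inside the
+ * VF's BARs, rejecting capabilities that point outside of the mapped BAR.
+ */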
+static void __iomem *get_cap_addr(struct ifcvf_hw *hw,
+                                 struct virtio_pci_cap *cap)
+{
+       struct ifcvf_adapter *ifcvf;
+       struct pci_dev *pdev;
+       u32 length, offset;
+       u8 bar;
+
+       length = le32_to_cpu(cap->length);
+       offset = le32_to_cpu(cap->offset);
+       bar = cap->bar;
+
+       ifcvf = vf_to_adapter(hw);
+       pdev = ifcvf->pdev;
+
+       if (bar >= IFCVF_PCI_MAX_RESOURCE) {
+               IFCVF_DBG(pdev,
+                         "Invalid bar number %u to get capabilities\n", bar);
+               return NULL;
+       }
+
+       if (offset + length > pci_resource_len(pdev, bar)) {
+               IFCVF_DBG(pdev,
+                         "offset(%u) + len(%u) overflows bar%u's capability\n",
+                         offset, length, bar);
+               return NULL;
+       }
+
+       return hw->base[bar] + offset;
+}
+
+static int ifcvf_read_config_range(struct pci_dev *dev,
+                                  uint32_t *val, int size, int where)
+{
+       int ret, i;
+
+       for (i = 0; i < size; i += 4) {
+               ret = pci_read_config_dword(dev, where + i, val + i / 4);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
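+/*
+ * Walk the PCI vendor-specific capability list to locate the virtio
+ * common, notify, ISR and device config windows, then precompute the
+ * notification address of every virtqueue.
+ */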
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
+{
+       struct virtio_pci_cap cap;
+       u16 notify_off;
+       int ret;
+       u8 pos;
+       u32 i;
+
+       ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
+       if (ret < 0) {
+               IFCVF_ERR(pdev, "Failed to read PCI capability list\n");
+               return -EIO;
+       }
+
+       while (pos) {
+               ret = ifcvf_read_config_range(pdev, (u32 *)&cap,
+                                             sizeof(cap), pos);
+               if (ret < 0) {
+                       IFCVF_ERR(pdev,
+                                 "Failed to get PCI capability at %x\n", pos);
+                       break;
+               }
+
+               if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+                       goto next;
+
+               switch (cap.cfg_type) {
+               case VIRTIO_PCI_CAP_COMMON_CFG:
+                       hw->common_cfg = get_cap_addr(hw, &cap);
+                       IFCVF_DBG(pdev, "hw->common_cfg = %p\n",
+                                 hw->common_cfg);
+                       break;
+               case VIRTIO_PCI_CAP_NOTIFY_CFG:
+                       pci_read_config_dword(pdev, pos + sizeof(cap),
+                                             &hw->notify_off_multiplier);
+                       hw->notify_bar = cap.bar;
+                       hw->notify_base = get_cap_addr(hw, &cap);
+                       IFCVF_DBG(pdev, "hw->notify_base = %p\n",
+                                 hw->notify_base);
+                       break;
+               case VIRTIO_PCI_CAP_ISR_CFG:
+                       hw->isr = get_cap_addr(hw, &cap);
+                       IFCVF_DBG(pdev, "hw->isr = %p\n", hw->isr);
+                       break;
+               case VIRTIO_PCI_CAP_DEVICE_CFG:
+                       hw->net_cfg = get_cap_addr(hw, &cap);
+                       IFCVF_DBG(pdev, "hw->net_cfg = %p\n", hw->net_cfg);
+                       break;
+               }
+
+next:
+               pos = cap.cap_next;
+       }
+
+       if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+           hw->isr == NULL || hw->net_cfg == NULL) {
+               IFCVF_ERR(pdev, "Incomplete PCI capabilities\n");
+               return -EIO;
+       }
+
+       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+               ifc_iowrite16(i, &hw->common_cfg->queue_select);
+               notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off);
+               hw->vring[i].notify_addr = hw->notify_base +
+                       notify_off * hw->notify_off_multiplier;
+       }
+
+       hw->lm_cfg = hw->base[IFCVF_LM_BAR];
+
+       IFCVF_DBG(pdev,
+                 "PCI capability mapping: common cfg: %p, notify base: %p, isr cfg: %p, device cfg: %p, multiplier: %u\n",
+                 hw->common_cfg, hw->notify_base, hw->isr,
+                 hw->net_cfg, hw->notify_off_multiplier);
+
+       return 0;
+}
+
+u8 ifcvf_get_status(struct ifcvf_hw *hw)
+{
+       return ifc_ioread8(&hw->common_cfg->device_status);
+}
+
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
+{
+       ifc_iowrite8(status, &hw->common_cfg->device_status);
+}
+
+void ifcvf_reset(struct ifcvf_hw *hw)
+{
+       ifcvf_set_status(hw, 0);
+       /* flush set_status, make sure VF is stopped, reset */
+       ifcvf_get_status(hw);
+}
+
+static void ifcvf_add_status(struct ifcvf_hw *hw, u8 status)
+{
+       if (status != 0)
+               status |= ifcvf_get_status(hw);
+
+       ifcvf_set_status(hw, status);
+       ifcvf_get_status(hw);
+}
+
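+/*
+ * The 64-bit device feature word is read 32 bits at a time through the
+ * device_feature_select window.
+ */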
+u64 ifcvf_get_features(struct ifcvf_hw *hw)
+{
+       struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+       u32 features_lo, features_hi;
+
+       ifc_iowrite32(0, &cfg->device_feature_select);
+       features_lo = ifc_ioread32(&cfg->device_feature);
+
+       ifc_iowrite32(1, &cfg->device_feature_select);
+       features_hi = ifc_ioread32(&cfg->device_feature);
+
+       return ((u64)features_hi << 32) | features_lo;
+}
+
+void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
+                          void *dst, int length)
+{
+       u8 old_gen, new_gen, *p;
+       int i;
+
+       WARN_ON(offset + length > sizeof(struct virtio_net_config));
+       do {
+               old_gen = ifc_ioread8(&hw->common_cfg->config_generation);
+               p = dst;
+               for (i = 0; i < length; i++)
+                       *p++ = ifc_ioread8(hw->net_cfg + offset + i);
+
+               new_gen = ifc_ioread8(&hw->common_cfg->config_generation);
+       } while (old_gen != new_gen);
+}
+
+void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
+                           const void *src, int length)
+{
+       const u8 *p;
+       int i;
+
+       p = src;
+       WARN_ON(offset + length > sizeof(struct virtio_net_config));
+       for (i = 0; i < length; i++)
+               ifc_iowrite8(*p++, hw->net_cfg + offset + i);
+}
+
+static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
+{
+       struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
+
+       ifc_iowrite32(0, &cfg->guest_feature_select);
+       ifc_iowrite32((u32)features, &cfg->guest_feature);
+
+       ifc_iowrite32(1, &cfg->guest_feature_select);
+       ifc_iowrite32(features >> 32, &cfg->guest_feature);
+}
+
+static int ifcvf_config_features(struct ifcvf_hw *hw)
+{
+       struct ifcvf_adapter *ifcvf;
+
+       ifcvf = vf_to_adapter(hw);
+       ifcvf_set_features(hw, hw->req_features);
+       ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK);
+
+       if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+               IFCVF_ERR(ifcvf->pdev, "Failed to set FEATURES_OK status\n");
+               return -EIO;
+       }
+
+       return 0;
+}
+
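+/*
+ * The last available index of each virtqueue lives in the live migration
+ * (LM) region; the two queues of a pair share one ifcvf_vring_lm_cfg slot.
+ */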
+u64 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
+{
+       struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+       void __iomem *avail_idx_addr;
+       u16 last_avail_idx;
+       u32 q_pair_id;
+
+       ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+       q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+       avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
+       last_avail_idx = ifc_ioread16(avail_idx_addr);
+
+       return last_avail_idx;
+}
+
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u64 num)
+{
+       struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+       void __iomem *avail_idx_addr;
+       u32 q_pair_id;
+
+       ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+       q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2);
+       avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
+       hw->vring[qid].last_avail_idx = num;
+       ifc_iowrite16(num, avail_idx_addr);
+
+       return 0;
+}
+
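+/*
+ * Program the MSI-X config vector, then the addresses, size and vector of
+ * every ready virtqueue before enabling it.
+ */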
+static int ifcvf_hw_enable(struct ifcvf_hw *hw)
+{
+       struct ifcvf_lm_cfg __iomem *ifcvf_lm;
+       struct virtio_pci_common_cfg __iomem *cfg;
+       struct ifcvf_adapter *ifcvf;
+       u32 i;
+
+       ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
+       ifcvf = vf_to_adapter(hw);
+       cfg = hw->common_cfg;
+       ifc_iowrite16(IFCVF_MSI_CONFIG_OFF, &cfg->msix_config);
+
+       if (ifc_ioread16(&cfg->msix_config) == VIRTIO_MSI_NO_VECTOR) {
+               IFCVF_ERR(ifcvf->pdev, "No msix vector for device config\n");
+               return -EINVAL;
+       }
+
+       for (i = 0; i < hw->nr_vring; i++) {
+               if (!hw->vring[i].ready)
+                       break;
+
+               ifc_iowrite16(i, &cfg->queue_select);
+               ifc_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
+                                    &cfg->queue_desc_hi);
+               ifc_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
+                                     &cfg->queue_avail_hi);
+               ifc_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
+                                    &cfg->queue_used_hi);
+               ifc_iowrite16(hw->vring[i].size, &cfg->queue_size);
+               ifc_iowrite16(i + IFCVF_MSI_QUEUE_OFF, &cfg->queue_msix_vector);
+
+               if (ifc_ioread16(&cfg->queue_msix_vector) ==
+                   VIRTIO_MSI_NO_VECTOR) {
+                       IFCVF_ERR(ifcvf->pdev,
+                                 "No msix vector for queue %u\n", i);
+                       return -EINVAL;
+               }
+
+               ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx);
+               ifc_iowrite16(1, &cfg->queue_enable);
+       }
+
+       return 0;
+}
+
+static void ifcvf_hw_disable(struct ifcvf_hw *hw)
+{
+       struct virtio_pci_common_cfg __iomem *cfg;
+       u32 i;
+
+       cfg = hw->common_cfg;
+       ifc_iowrite16(VIRTIO_MSI_NO_VECTOR, &cfg->msix_config);
+
+       for (i = 0; i < hw->nr_vring; i++) {
+               ifc_iowrite16(i, &cfg->queue_select);
+               ifc_iowrite16(VIRTIO_MSI_NO_VECTOR, &cfg->queue_msix_vector);
+       }
+
+       ifc_ioread16(&cfg->queue_msix_vector);
+}
+
+int ifcvf_start_hw(struct ifcvf_hw *hw)
+{
+       ifcvf_reset(hw);
+       ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+       ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
+
+       if (ifcvf_config_features(hw) < 0)
+               return -EINVAL;
+
+       if (ifcvf_hw_enable(hw) < 0)
+               return -EINVAL;
+
+       ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
+
+       return 0;
+}
+
+void ifcvf_stop_hw(struct ifcvf_hw *hw)
+{
+       ifcvf_hw_disable(hw);
+       ifcvf_reset(hw);
+}
+
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
+{
+       ifc_iowrite16(qid, hw->vring[qid].notify_addr);
+}
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
new file mode 100644 (file)
index 0000000..e803070
--- /dev/null
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#ifndef _IFCVF_H_
+#define _IFCVF_H_
+
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+#include <linux/vdpa.h>
+#include <uapi/linux/virtio_net.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_pci.h>
+
+#define IFCVF_VENDOR_ID                0x1AF4
+#define IFCVF_DEVICE_ID                0x1041
+#define IFCVF_SUBSYS_VENDOR_ID 0x8086
+#define IFCVF_SUBSYS_DEVICE_ID 0x001A
+
+#define IFCVF_SUPPORTED_FEATURES \
+               ((1ULL << VIRTIO_NET_F_MAC)                     | \
+                (1ULL << VIRTIO_F_ANY_LAYOUT)                  | \
+                (1ULL << VIRTIO_F_VERSION_1)                   | \
+                (1ULL << VIRTIO_F_ORDER_PLATFORM)              | \
+                (1ULL << VIRTIO_F_IOMMU_PLATFORM)              | \
+                (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+
+/* Only one queue pair for now. */
+#define IFCVF_MAX_QUEUE_PAIRS  1
+
+#define IFCVF_QUEUE_ALIGNMENT  PAGE_SIZE
+#define IFCVF_QUEUE_MAX                32768
+#define IFCVF_MSI_CONFIG_OFF   0
+#define IFCVF_MSI_QUEUE_OFF    1
+#define IFCVF_PCI_MAX_RESOURCE 6
+
+#define IFCVF_LM_CFG_SIZE              0x40
+#define IFCVF_LM_RING_STATE_OFFSET     0x20
+#define IFCVF_LM_BAR                   4
+
+#define IFCVF_ERR(pdev, fmt, ...)      dev_err(&pdev->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_DBG(pdev, fmt, ...)      dev_dbg(&pdev->dev, fmt, ##__VA_ARGS__)
+#define IFCVF_INFO(pdev, fmt, ...)     dev_info(&pdev->dev, fmt, ##__VA_ARGS__)
+
+#define ifcvf_private_to_vf(adapter) \
+       (&((struct ifcvf_adapter *)adapter)->vf)
+
+#define IFCVF_MAX_INTR (IFCVF_MAX_QUEUE_PAIRS * 2 + 1)
+
+struct vring_info {
+       u64 desc;
+       u64 avail;
+       u64 used;
+       u16 size;
+       u16 last_avail_idx;
+       bool ready;
+       void __iomem *notify_addr;
+       u32 irq;
+       struct vdpa_callback cb;
+       char msix_name[256];
+};
+
+struct ifcvf_hw {
+       u8 __iomem *isr;
+       /* Live migration */
+       u8 __iomem *lm_cfg;
+       u16 nr_vring;
+       /* Notification bar number */
+       u8 notify_bar;
+       /* Notification bar address */
+       void __iomem *notify_base;
+       u32 notify_off_multiplier;
+       u64 req_features;
+       struct virtio_pci_common_cfg __iomem *common_cfg;
+       void __iomem *net_cfg;
+       struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2];
+       void __iomem * const *base;
+};
+
+struct ifcvf_adapter {
+       struct vdpa_device vdpa;
+       struct pci_dev *pdev;
+       struct ifcvf_hw vf;
+};
+
+struct ifcvf_vring_lm_cfg {
+       u32 idx_addr[2];
+       u8 reserved[IFCVF_LM_CFG_SIZE - 8];
+};
+
+struct ifcvf_lm_cfg {
+       u8 reserved[IFCVF_LM_RING_STATE_OFFSET];
+       struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUE_PAIRS];
+};
+
+int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev);
+int ifcvf_start_hw(struct ifcvf_hw *hw);
+void ifcvf_stop_hw(struct ifcvf_hw *hw);
+void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid);
+void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset,
+                          void *dst, int length);
+void ifcvf_write_net_config(struct ifcvf_hw *hw, u64 offset,
+                           const void *src, int length);
+u8 ifcvf_get_status(struct ifcvf_hw *hw);
+void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
+void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
+void ifcvf_reset(struct ifcvf_hw *hw);
+u64 ifcvf_get_features(struct ifcvf_hw *hw);
+u64 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
+int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u64 num);
+struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw);
+#endif /* _IFCVF_H_ */
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
new file mode 100644 (file)
index 0000000..8d54dc5
--- /dev/null
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Intel IFC VF NIC driver for virtio dataplane offloading
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Author: Zhu Lingshan <lingshan.zhu@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+#include "ifcvf_base.h"
+
+#define VERSION_STRING  "0.1"
+#define DRIVER_AUTHOR   "Intel Corporation"
+#define IFCVF_DRIVER_NAME       "ifcvf"
+
+static irqreturn_t ifcvf_intr_handler(int irq, void *arg)
+{
+       struct vring_info *vring = arg;
+
+       if (vring->cb.callback)
+               return vring->cb.callback(vring->cb.private);
+
+       return IRQ_HANDLED;
+}
+
+static int ifcvf_start_datapath(void *private)
+{
+       struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
+       struct ifcvf_adapter *ifcvf;
+       u8 status;
+       int ret;
+
+       ifcvf = vf_to_adapter(vf);
+       vf->nr_vring = IFCVF_MAX_QUEUE_PAIRS * 2;
+       ret = ifcvf_start_hw(vf);
+       if (ret < 0) {
+               status = ifcvf_get_status(vf);
+               status |= VIRTIO_CONFIG_S_FAILED;
+               ifcvf_set_status(vf, status);
+       }
+
+       return ret;
+}
+
+static int ifcvf_stop_datapath(void *private)
+{
+       struct ifcvf_hw *vf = ifcvf_private_to_vf(private);
+       int i;
+
+       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++)
+               vf->vring[i].cb.callback = NULL;
+
+       ifcvf_stop_hw(vf);
+
+       return 0;
+}
+
+static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
+{
+       struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter);
+       int i;
+
+       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+               vf->vring[i].last_avail_idx = 0;
+               vf->vring[i].desc = 0;
+               vf->vring[i].avail = 0;
+               vf->vring[i].used = 0;
+               vf->vring[i].ready = 0;
+               vf->vring[i].cb.callback = NULL;
+               vf->vring[i].cb.private = NULL;
+       }
+
+       ifcvf_reset(vf);
+}
+
+static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev)
+{
+       return container_of(vdpa_dev, struct ifcvf_adapter, vdpa);
+}
+
+static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev);
+
+       return &adapter->vf;
+}
+
+static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+       u64 features;
+
+       features = ifcvf_get_features(vf) & IFCVF_SUPPORTED_FEATURES;
+
+       return features;
+}
+
+static int ifcvf_vdpa_set_features(struct vdpa_device *vdpa_dev, u64 features)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       vf->req_features = features;
+
+       return 0;
+}
+
+static u8 ifcvf_vdpa_get_status(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       return ifcvf_get_status(vf);
+}
+
+static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status)
+{
+       struct ifcvf_adapter *adapter;
+       struct ifcvf_hw *vf;
+
+       vf  = vdpa_to_vf(vdpa_dev);
+       adapter = dev_get_drvdata(vdpa_dev->dev.parent);
+
+       if (status == 0) {
+               ifcvf_stop_datapath(adapter);
+               ifcvf_reset_vring(adapter);
+               return;
+       }
+
+       if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+               if (ifcvf_start_datapath(adapter) < 0)
+                       IFCVF_ERR(adapter->pdev,
+                                 "Failed to set ifcvf vdpa status %u\n",
+                                 status);
+       }
+
+       ifcvf_set_status(vf, status);
+}
+
+static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
+{
+       return IFCVF_QUEUE_MAX;
+}
+
+static u64 ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       return ifcvf_get_vq_state(vf, qid);
+}
+
+static int ifcvf_vdpa_set_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
+                                  u64 num)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       return ifcvf_set_vq_state(vf, qid, num);
+}
+
+static void ifcvf_vdpa_set_vq_cb(struct vdpa_device *vdpa_dev, u16 qid,
+                                struct vdpa_callback *cb)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       vf->vring[qid].cb = *cb;
+}
+
+static void ifcvf_vdpa_set_vq_ready(struct vdpa_device *vdpa_dev,
+                                   u16 qid, bool ready)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       vf->vring[qid].ready = ready;
+}
+
+static bool ifcvf_vdpa_get_vq_ready(struct vdpa_device *vdpa_dev, u16 qid)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       return vf->vring[qid].ready;
+}
+
+static void ifcvf_vdpa_set_vq_num(struct vdpa_device *vdpa_dev, u16 qid,
+                                 u32 num)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       vf->vring[qid].size = num;
+}
+
+static int ifcvf_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
+                                    u64 desc_area, u64 driver_area,
+                                    u64 device_area)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       vf->vring[qid].desc = desc_area;
+       vf->vring[qid].avail = driver_area;
+       vf->vring[qid].used = device_area;
+
+       return 0;
+}
+
+static void ifcvf_vdpa_kick_vq(struct vdpa_device *vdpa_dev, u16 qid)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       ifcvf_notify_queue(vf, qid);
+}
+
+static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       return ioread8(&vf->common_cfg->config_generation);
+}
+
+static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev)
+{
+       return VIRTIO_ID_NET;
+}
+
+static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev)
+{
+       return IFCVF_SUBSYS_VENDOR_ID;
+}
+
+static u16 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev)
+{
+       return IFCVF_QUEUE_ALIGNMENT;
+}
+
+static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
+                                 unsigned int offset,
+                                 void *buf, unsigned int len)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       WARN_ON(offset + len > sizeof(struct virtio_net_config));
+       ifcvf_read_net_config(vf, offset, buf, len);
+}
+
+static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev,
+                                 unsigned int offset, const void *buf,
+                                 unsigned int len)
+{
+       struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
+
+       WARN_ON(offset + len > sizeof(struct virtio_net_config));
+       ifcvf_write_net_config(vf, offset, buf, len);
+}
+
+static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev,
+                                    struct vdpa_callback *cb)
+{
+       /* We don't support config interrupt */
+}
+
+/*
+ * IFCVF currently doesn't have an on-chip IOMMU, so set_map(),
+ * dma_map() and dma_unmap() are not implemented.
+ */
+static const struct vdpa_config_ops ifc_vdpa_ops = {
+       .get_features   = ifcvf_vdpa_get_features,
+       .set_features   = ifcvf_vdpa_set_features,
+       .get_status     = ifcvf_vdpa_get_status,
+       .set_status     = ifcvf_vdpa_set_status,
+       .get_vq_num_max = ifcvf_vdpa_get_vq_num_max,
+       .get_vq_state   = ifcvf_vdpa_get_vq_state,
+       .set_vq_state   = ifcvf_vdpa_set_vq_state,
+       .set_vq_cb      = ifcvf_vdpa_set_vq_cb,
+       .set_vq_ready   = ifcvf_vdpa_set_vq_ready,
+       .get_vq_ready   = ifcvf_vdpa_get_vq_ready,
+       .set_vq_num     = ifcvf_vdpa_set_vq_num,
+       .set_vq_address = ifcvf_vdpa_set_vq_address,
+       .kick_vq        = ifcvf_vdpa_kick_vq,
+       .get_generation = ifcvf_vdpa_get_generation,
+       .get_device_id  = ifcvf_vdpa_get_device_id,
+       .get_vendor_id  = ifcvf_vdpa_get_vendor_id,
+       .get_vq_align   = ifcvf_vdpa_get_vq_align,
+       .get_config     = ifcvf_vdpa_get_config,
+       .set_config     = ifcvf_vdpa_set_config,
+       .set_config_cb  = ifcvf_vdpa_set_config_cb,
+};
+
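+/*
+ * One MSI-X vector per virtqueue; vector 0 (IFCVF_MSI_CONFIG_OFF) is
+ * reserved for the device config space.
+ */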
+static int ifcvf_request_irq(struct ifcvf_adapter *adapter)
+{
+       struct pci_dev *pdev = adapter->pdev;
+       struct ifcvf_hw *vf = &adapter->vf;
+       int vector, i, ret, irq;
+
+       for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) {
+               snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d",
+                        pci_name(pdev), i);
+               vector = i + IFCVF_MSI_QUEUE_OFF;
+               irq = pci_irq_vector(pdev, vector);
+               ret = devm_request_irq(&pdev->dev, irq,
+                                      ifcvf_intr_handler, 0,
+                                      vf->vring[i].msix_name,
+                                      &vf->vring[i]);
+               if (ret) {
+                       IFCVF_ERR(pdev,
+                                 "Failed to request irq for vq %d\n", i);
+                       return ret;
+               }
+               vf->vring[i].irq = irq;
+       }
+
+       return 0;
+}
+
+static void ifcvf_free_irq_vectors(void *data)
+{
+       pci_free_irq_vectors(data);
+}
+
+static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+       struct device *dev = &pdev->dev;
+       struct ifcvf_adapter *adapter;
+       struct ifcvf_hw *vf;
+       int ret;
+
+       ret = pcim_enable_device(pdev);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to enable device\n");
+               return ret;
+       }
+
+       ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
+                                IFCVF_DRIVER_NAME);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to request MMIO region\n");
+               return ret;
+       }
+
+       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (ret) {
+               IFCVF_ERR(pdev, "No usable DMA configuration\n");
+               return ret;
+       }
+
+       ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (ret) {
+               IFCVF_ERR(pdev,
+                         "No usable coherent DMA configuration\n");
+               return ret;
+       }
+
+       ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR,
+                                   IFCVF_MAX_INTR, PCI_IRQ_MSIX);
+       if (ret < 0) {
+               IFCVF_ERR(pdev, "Failed to alloc irq vectors\n");
+               return ret;
+       }
+
+       ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
+       if (ret) {
+               IFCVF_ERR(pdev,
+                         "Failed for adding devres for freeing irq vectors\n");
+               return ret;
+       }
+
+       adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
+                                   dev, &ifc_vdpa_ops);
+       if (adapter == NULL) {
+               IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
+               return -ENOMEM;
+       }
+
+       pci_set_master(pdev);
+       pci_set_drvdata(pdev, adapter);
+
+       vf = &adapter->vf;
+       vf->base = pcim_iomap_table(pdev);
+
+       adapter->pdev = pdev;
+       adapter->vdpa.dma_dev = &pdev->dev;
+
+       ret = ifcvf_request_irq(adapter);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to request MSI-X irq\n");
+               goto err;
+       }
+
+       ret = ifcvf_init_hw(vf, pdev);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to init IFCVF hw\n");
+               goto err;
+       }
+
+       ret = vdpa_register_device(&adapter->vdpa);
+       if (ret) {
+               IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus");
+               goto err;
+       }
+
+       return 0;
+
+err:
+       put_device(&adapter->vdpa.dev);
+       return ret;
+}
+
+static void ifcvf_remove(struct pci_dev *pdev)
+{
+       struct ifcvf_adapter *adapter = pci_get_drvdata(pdev);
+
+       vdpa_unregister_device(&adapter->vdpa);
+}
+
+static struct pci_device_id ifcvf_pci_ids[] = {
+       { PCI_DEVICE_SUB(IFCVF_VENDOR_ID,
+               IFCVF_DEVICE_ID,
+               IFCVF_SUBSYS_VENDOR_ID,
+               IFCVF_SUBSYS_DEVICE_ID) },
+       { 0 },
+};
+MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids);
+
+static struct pci_driver ifcvf_driver = {
+       .name     = IFCVF_DRIVER_NAME,
+       .id_table = ifcvf_pci_ids,
+       .probe    = ifcvf_probe,
+       .remove   = ifcvf_remove,
+};
+
+module_pci_driver(ifcvf_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(VERSION_STRING);
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
new file mode 100644 (file)
index 0000000..e9ed6a2
--- /dev/null
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bus.
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/vdpa.h>
+
+static DEFINE_IDA(vdpa_index_ida);
+
+static int vdpa_dev_probe(struct device *d)
+{
+       struct vdpa_device *vdev = dev_to_vdpa(d);
+       struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+       int ret = 0;
+
+       if (drv && drv->probe)
+               ret = drv->probe(vdev);
+
+       return ret;
+}
+
+static int vdpa_dev_remove(struct device *d)
+{
+       struct vdpa_device *vdev = dev_to_vdpa(d);
+       struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+
+       if (drv && drv->remove)
+               drv->remove(vdev);
+
+       return 0;
+}
+
+static struct bus_type vdpa_bus = {
+       .name  = "vdpa",
+       .probe = vdpa_dev_probe,
+       .remove = vdpa_dev_remove,
+};
+
+static void vdpa_release_dev(struct device *d)
+{
+       struct vdpa_device *vdev = dev_to_vdpa(d);
+       const struct vdpa_config_ops *ops = vdev->config;
+
+       if (ops->free)
+               ops->free(vdev);
+
+       ida_simple_remove(&vdpa_index_ida, vdev->index);
+       kfree(vdev);
+}
+
+/**
+ * __vdpa_alloc_device - allocate and initialize a vDPA device
+ * @parent: the parent device
+ * @config: the bus operations that are supported by this device
+ * @size: size of the parent structure that contains private data
+ *
+ * This allows the driver to do some preparation after the device is
+ * initialized but before it is registered.
+ *
+ * Driver should use the vdpa_alloc_device() wrapper macro instead of
+ * using this directly.
+ *
+ * Returns an error when @config is missing or inconsistent, or when an
+ * ida index cannot be allocated.
+ */
+struct vdpa_device *__vdpa_alloc_device(struct device *parent,
+                                       const struct vdpa_config_ops *config,
+                                       size_t size)
+{
+       struct vdpa_device *vdev;
+       int err = -EINVAL;
+
+       if (!config)
+               goto err;
+
+       if (!!config->dma_map != !!config->dma_unmap)
+               goto err;
+
+       err = -ENOMEM;
+       vdev = kzalloc(size, GFP_KERNEL);
+       if (!vdev)
+               goto err;
+
+       err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
+       if (err < 0)
+               goto err_ida;
+
+       vdev->dev.bus = &vdpa_bus;
+       vdev->dev.parent = parent;
+       vdev->dev.release = vdpa_release_dev;
+       vdev->index = err;
+       vdev->config = config;
+
+       err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index);
+       if (err)
+               goto err_name;
+
+       device_initialize(&vdev->dev);
+
+       return vdev;
+
+err_name:
+       ida_simple_remove(&vdpa_index_ida, vdev->index);
+err_ida:
+       kfree(vdev);
+err:
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(__vdpa_alloc_device);
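+
+/*
+ * Example (a sketch mirroring how the IFC VF driver in this series
+ * allocates its device through the vdpa_alloc_device() wrapper):
+ *
+ *     adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
+ *                                 dev, &ifc_vdpa_ops);
+ */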
+
+/**
+ * vdpa_register_device - register a vDPA device
+ * @vdev: the vdpa device to be registered to the vDPA bus
+ *
+ * The caller must have made a successful call to __vdpa_alloc_device()
+ * before.
+ *
+ * Returns an error when the device cannot be added to the vDPA bus.
+ */
+int vdpa_register_device(struct vdpa_device *vdev)
+{
+       return device_add(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(vdpa_register_device);
+
+/**
+ * vdpa_unregister_device - unregister a vDPA device
+ * @vdev: the vdpa device to be unregistered from the vDPA bus
+ */
+void vdpa_unregister_device(struct vdpa_device *vdev)
+{
+       device_unregister(&vdev->dev);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_device);
+
+/**
+ * __vdpa_register_driver - register a vDPA device driver
+ * @drv: the vdpa device driver to be registered
+ * @owner: module owner of the driver
+ *
+ * Returns an error when the registration fails.
+ */
+int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner)
+{
+       drv->driver.bus = &vdpa_bus;
+       drv->driver.owner = owner;
+
+       return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(__vdpa_register_driver);
+
+/**
+ * vdpa_unregister_driver - unregister a vDPA device driver
+ * @drv: the vdpa device driver to be unregistered
+ */
+void vdpa_unregister_driver(struct vdpa_driver *drv)
+{
+       driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(vdpa_unregister_driver);
+
+static int vdpa_init(void)
+{
+       return bus_register(&vdpa_bus);
+}
+
+static void __exit vdpa_exit(void)
+{
+       bus_unregister(&vdpa_bus);
+       ida_destroy(&vdpa_index_ida);
+}
+core_initcall(vdpa_init);
+module_exit(vdpa_exit);
+
+MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile
new file mode 100644 (file)
index 0000000..b40278f
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
new file mode 100644 (file)
index 0000000..6e8a0cf
--- /dev/null
@@ -0,0 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VDPA networking device simulator.
+ *
+ * Copyright (c) 2020, Red Hat Inc. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/sysfs.h>
+#include <linux/file.h>
+#include <linux/etherdevice.h>
+#include <linux/vringh.h>
+#include <linux/vdpa.h>
+#include <linux/vhost_iotlb.h>
+#include <uapi/linux/virtio_config.h>
+#include <uapi/linux/virtio_net.h>
+
+#define DRV_VERSION  "0.1"
+#define DRV_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define DRV_DESC     "vDPA Device Simulator"
+#define DRV_LICENSE  "GPL v2"
+
+struct vdpasim_virtqueue {
+       struct vringh vring;
+       struct vringh_kiov iov;
+       unsigned short head;
+       bool ready;
+       u64 desc_addr;
+       u64 device_addr;
+       u64 driver_addr;
+       u32 num;
+       void *private;
+       irqreturn_t (*cb)(void *data);
+};
+
+#define VDPASIM_QUEUE_ALIGN PAGE_SIZE
+#define VDPASIM_QUEUE_MAX 256
+#define VDPASIM_DEVICE_ID 0x1
+#define VDPASIM_VENDOR_ID 0
+#define VDPASIM_VQ_NUM 0x2
+#define VDPASIM_NAME "vdpasim-netdev"
+
+static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
+                             (1ULL << VIRTIO_F_VERSION_1)  |
+                             (1ULL << VIRTIO_F_IOMMU_PLATFORM);
+
+/* State of each vdpasim device */
+struct vdpasim {
+       struct vdpa_device vdpa;
+       struct vdpasim_virtqueue vqs[2];
+       struct work_struct work;
+       /* spinlock to synchronize virtqueue state */
+       spinlock_t lock;
+       struct virtio_net_config config;
+       struct vhost_iotlb *iommu;
+       void *buffer;
+       u32 status;
+       u32 generation;
+       u64 features;
+};
+
+static struct vdpasim *vdpasim_dev;
+
+static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
+{
+       return container_of(vdpa, struct vdpasim, vdpa);
+}
+
+static struct vdpasim *dev_to_sim(struct device *dev)
+{
+       struct vdpa_device *vdpa = dev_to_vdpa(dev);
+
+       return vdpa_to_sim(vdpa);
+}
+
+static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
+{
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+       int ret;
+
+       ret = vringh_init_iotlb(&vq->vring, vdpasim_features,
+                               VDPASIM_QUEUE_MAX, false,
+                               (struct vring_desc *)(uintptr_t)vq->desc_addr,
+                               (struct vring_avail *)
+                               (uintptr_t)vq->driver_addr,
+                               (struct vring_used *)
+                               (uintptr_t)vq->device_addr);
+}
+
+static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
+{
+       vq->ready = 0;
+       vq->desc_addr = 0;
+       vq->driver_addr = 0;
+       vq->device_addr = 0;
+       vq->cb = NULL;
+       vq->private = NULL;
+       vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
+                         false, NULL, NULL, NULL);
+}
+
+static void vdpasim_reset(struct vdpasim *vdpasim)
+{
+       int i;
+
+       for (i = 0; i < VDPASIM_VQ_NUM; i++)
+               vdpasim_vq_reset(&vdpasim->vqs[i]);
+
+       vhost_iotlb_reset(vdpasim->iommu);
+
+       vdpasim->features = 0;
+       vdpasim->status = 0;
+       ++vdpasim->generation;
+}
+
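+/*
+ * Loop TX descriptors back into the RX queue through a page-sized bounce
+ * buffer, rescheduling the work after a handful of packets so that the
+ * workqueue is not monopolized.
+ */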
+static void vdpasim_work(struct work_struct *work)
+{
+       struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
+       struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
+       struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
+       size_t read, write, total_write;
+       int err;
+       int pkts = 0;
+
+       spin_lock(&vdpasim->lock);
+
+       if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
+               goto out;
+
+       if (!txq->ready || !rxq->ready)
+               goto out;
+
+       while (true) {
+               total_write = 0;
+               err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL,
+                                          &txq->head, GFP_ATOMIC);
+               if (err <= 0)
+                       break;
+
+               err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov,
+                                          &rxq->head, GFP_ATOMIC);
+               if (err <= 0) {
+                       vringh_complete_iotlb(&txq->vring, txq->head, 0);
+                       break;
+               }
+
+               while (true) {
+                       read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov,
+                                                    vdpasim->buffer,
+                                                    PAGE_SIZE);
+                       if (read <= 0)
+                               break;
+
+                       write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov,
+                                                     vdpasim->buffer, read);
+                       if (write <= 0)
+                               break;
+
+                       total_write += write;
+               }
+
+               /* Make sure data is written before advancing index */
+               smp_wmb();
+
+               vringh_complete_iotlb(&txq->vring, txq->head, 0);
+               vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
+
+               /* Make sure used is visible before raising the interrupt. */
+               smp_wmb();
+
+               local_bh_disable();
+               if (txq->cb)
+                       txq->cb(txq->private);
+               if (rxq->cb)
+                       rxq->cb(rxq->private);
+               local_bh_enable();
+
+               if (++pkts > 4) {
+                       schedule_work(&vdpasim->work);
+                       goto out;
+               }
+       }
+
+out:
+       spin_unlock(&vdpasim->lock);
+}
+
+static int dir_to_perm(enum dma_data_direction dir)
+{
+       int perm = -EFAULT;
+
+       switch (dir) {
+       case DMA_FROM_DEVICE:
+               perm = VHOST_MAP_WO;
+               break;
+       case DMA_TO_DEVICE:
+               perm = VHOST_MAP_RO;
+               break;
+       case DMA_BIDIRECTIONAL:
+               perm = VHOST_MAP_RW;
+               break;
+       default:
+               break;
+       }
+
+       return perm;
+}
+
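+/*
+ * The simulator "maps" DMA by recording an identity (IOVA == PA) entry in
+ * its software IOTLB, so no real IOVA allocator is needed.
+ */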
+static dma_addr_t vdpasim_map_page(struct device *dev, struct page *page,
+                                  unsigned long offset, size_t size,
+                                  enum dma_data_direction dir,
+                                  unsigned long attrs)
+{
+       struct vdpasim *vdpasim = dev_to_sim(dev);
+       struct vhost_iotlb *iommu = vdpasim->iommu;
+       u64 pa = (page_to_pfn(page) << PAGE_SHIFT) + offset;
+       int ret, perm = dir_to_perm(dir);
+
+       if (perm < 0)
+               return DMA_MAPPING_ERROR;
+
+       /* For simplicity, use identical mapping to avoid e.g iova
+        * allocator.
+        */
+       ret = vhost_iotlb_add_range(iommu, pa, pa + size - 1,
+                                   pa, perm);
+       if (ret)
+               return DMA_MAPPING_ERROR;
+
+       return (dma_addr_t)(pa);
+}
+
+static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr,
+                              size_t size, enum dma_data_direction dir,
+                              unsigned long attrs)
+{
+       struct vdpasim *vdpasim = dev_to_sim(dev);
+       struct vhost_iotlb *iommu = vdpasim->iommu;
+
+       vhost_iotlb_del_range(iommu, (u64)dma_addr,
+                             (u64)dma_addr + size - 1);
+}
+
+static void *vdpasim_alloc_coherent(struct device *dev, size_t size,
+                                   dma_addr_t *dma_addr, gfp_t flag,
+                                   unsigned long attrs)
+{
+       struct vdpasim *vdpasim = dev_to_sim(dev);
+       struct vhost_iotlb *iommu = vdpasim->iommu;
+       void *addr = kmalloc(size, flag);
+       int ret;
+
+       if (!addr)
+               *dma_addr = DMA_MAPPING_ERROR;
+       else {
+               u64 pa = virt_to_phys(addr);
+
+               ret = vhost_iotlb_add_range(iommu, (u64)pa,
+                                           (u64)pa + size - 1,
+                                           pa, VHOST_MAP_RW);
+               if (ret) {
+                       *dma_addr = DMA_MAPPING_ERROR;
+                       kfree(addr);
+                       addr = NULL;
+               } else
+                       *dma_addr = (dma_addr_t)pa;
+       }
+
+       return addr;
+}
+
+static void vdpasim_free_coherent(struct device *dev, size_t size,
+                                 void *vaddr, dma_addr_t dma_addr,
+                                 unsigned long attrs)
+{
+       struct vdpasim *vdpasim = dev_to_sim(dev);
+       struct vhost_iotlb *iommu = vdpasim->iommu;
+
+       vhost_iotlb_del_range(iommu, (u64)dma_addr,
+                             (u64)dma_addr + size - 1);
+       kfree(phys_to_virt((uintptr_t)dma_addr));
+}
+
+static const struct dma_map_ops vdpasim_dma_ops = {
+       .map_page = vdpasim_map_page,
+       .unmap_page = vdpasim_unmap_page,
+       .alloc = vdpasim_alloc_coherent,
+       .free = vdpasim_free_coherent,
+};
+
+static const struct vdpa_config_ops vdpasim_net_config_ops;
+
+static struct vdpasim *vdpasim_create(void)
+{
+       struct virtio_net_config *config;
+       struct vdpasim *vdpasim;
+       struct device *dev;
+       int ret = -ENOMEM;
+
+       vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL,
+                                   &vdpasim_net_config_ops);
+       if (!vdpasim)
+               goto err_alloc;
+
+       INIT_WORK(&vdpasim->work, vdpasim_work);
+       spin_lock_init(&vdpasim->lock);
+
+       dev = &vdpasim->vdpa.dev;
+       dev->coherent_dma_mask = DMA_BIT_MASK(64);
+       set_dma_ops(dev, &vdpasim_dma_ops);
+
+       vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
+       if (!vdpasim->iommu)
+               goto err_iommu;
+
+       vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!vdpasim->buffer)
+               goto err_iommu;
+
+       config = &vdpasim->config;
+       config->mtu = 1500;
+       config->status = VIRTIO_NET_S_LINK_UP;
+       eth_random_addr(config->mac);
+
+       vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
+       vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
+
+       vdpasim->vdpa.dma_dev = dev;
+       ret = vdpa_register_device(&vdpasim->vdpa);
+       if (ret)
+               goto err_iommu;
+
+       return vdpasim;
+
+err_iommu:
+       put_device(dev);
+err_alloc:
+       return ERR_PTR(ret);
+}
+
+static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
+                                 u64 desc_area, u64 driver_area,
+                                 u64 device_area)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+       vq->desc_addr = desc_area;
+       vq->driver_addr = driver_area;
+       vq->device_addr = device_area;
+
+       return 0;
+}
+
+static void vdpasim_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+       vq->num = num;
+}
+
+static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+       if (vq->ready)
+               schedule_work(&vdpasim->work);
+}
+
+static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
+                             struct vdpa_callback *cb)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+       vq->cb = cb->callback;
+       vq->private = cb->private;
+}
+
+static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+       spin_lock(&vdpasim->lock);
+       vq->ready = ready;
+       if (vq->ready)
+               vdpasim_queue_ready(vdpasim, idx);
+       spin_unlock(&vdpasim->lock);
+}
+
+static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+
+       return vq->ready;
+}
+
+static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx, u64 state)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+       struct vringh *vrh = &vq->vring;
+
+       spin_lock(&vdpasim->lock);
+       vrh->last_avail_idx = state;
+       spin_unlock(&vdpasim->lock);
+
+       return 0;
+}
+
+static u64 vdpasim_get_vq_state(struct vdpa_device *vdpa, u16 idx)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
+       struct vringh *vrh = &vq->vring;
+
+       return vrh->last_avail_idx;
+}
+
+static u16 vdpasim_get_vq_align(struct vdpa_device *vdpa)
+{
+       return VDPASIM_QUEUE_ALIGN;
+}
+
+static u64 vdpasim_get_features(struct vdpa_device *vdpa)
+{
+       return vdpasim_features;
+}
+
+static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       /* DMA mapping must be done by driver */
+       if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)))
+               return -EINVAL;
+
+       vdpasim->features = features & vdpasim_features;
+
+       return 0;
+}
+
+static void vdpasim_set_config_cb(struct vdpa_device *vdpa,
+                                 struct vdpa_callback *cb)
+{
+       /* We don't support config interrupt */
+}
+
+static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)
+{
+       return VDPASIM_QUEUE_MAX;
+}
+
+static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
+{
+       return VDPASIM_DEVICE_ID;
+}
+
+static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
+{
+       return VDPASIM_VENDOR_ID;
+}
+
+static u8 vdpasim_get_status(struct vdpa_device *vdpa)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       u8 status;
+
+       spin_lock(&vdpasim->lock);
+       status = vdpasim->status;
+       spin_unlock(&vdpasim->lock);
+
+       return status;
+}
+
+static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       spin_lock(&vdpasim->lock);
+       vdpasim->status = status;
+       if (status == 0)
+               vdpasim_reset(vdpasim);
+       spin_unlock(&vdpasim->lock);
+}
+
+static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
+                            void *buf, unsigned int len)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       if (offset + len <= sizeof(struct virtio_net_config))
+               memcpy(buf, (u8 *)&vdpasim->config + offset, len);
+}
+
+static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
+                            const void *buf, unsigned int len)
+{
+       /* No writable config supported by vdpasim */
+}
+
+static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       return vdpasim->generation;
+}
+
+static int vdpasim_set_map(struct vdpa_device *vdpa,
+                          struct vhost_iotlb *iotlb)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+       struct vhost_iotlb_map *map;
+       u64 start = 0ULL, last = 0ULL - 1;
+       int ret;
+
+       vhost_iotlb_reset(vdpasim->iommu);
+
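+       /* Mirror the incoming IOTLB into the simulator's own IOTLB */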
+       for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+            map = vhost_iotlb_itree_next(map, start, last)) {
+               ret = vhost_iotlb_add_range(vdpasim->iommu, map->start,
+                                           map->last, map->addr, map->perm);
+               if (ret)
+                       goto err;
+       }
+       return 0;
+
+err:
+       vhost_iotlb_reset(vdpasim->iommu);
+       return ret;
+}
+
+static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size,
+                          u64 pa, u32 perm)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       return vhost_iotlb_add_range(vdpasim->iommu, iova,
+                                    iova + size - 1, pa, perm);
+}
+
+static int vdpasim_dma_unmap(struct vdpa_device *vdpa, u64 iova, u64 size)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       vhost_iotlb_del_range(vdpasim->iommu, iova, iova + size - 1);
+
+       return 0;
+}
+
+static void vdpasim_free(struct vdpa_device *vdpa)
+{
+       struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+       cancel_work_sync(&vdpasim->work);
+       kfree(vdpasim->buffer);
+       if (vdpasim->iommu)
+               vhost_iotlb_free(vdpasim->iommu);
+}
+
+static const struct vdpa_config_ops vdpasim_net_config_ops = {
+       .set_vq_address         = vdpasim_set_vq_address,
+       .set_vq_num             = vdpasim_set_vq_num,
+       .kick_vq                = vdpasim_kick_vq,
+       .set_vq_cb              = vdpasim_set_vq_cb,
+       .set_vq_ready           = vdpasim_set_vq_ready,
+       .get_vq_ready           = vdpasim_get_vq_ready,
+       .set_vq_state           = vdpasim_set_vq_state,
+       .get_vq_state           = vdpasim_get_vq_state,
+       .get_vq_align           = vdpasim_get_vq_align,
+       .get_features           = vdpasim_get_features,
+       .set_features           = vdpasim_set_features,
+       .set_config_cb          = vdpasim_set_config_cb,
+       .get_vq_num_max         = vdpasim_get_vq_num_max,
+       .get_device_id          = vdpasim_get_device_id,
+       .get_vendor_id          = vdpasim_get_vendor_id,
+       .get_status             = vdpasim_get_status,
+       .set_status             = vdpasim_set_status,
+       .get_config             = vdpasim_get_config,
+       .set_config             = vdpasim_set_config,
+       .get_generation         = vdpasim_get_generation,
+       .set_map                = vdpasim_set_map,
+       .dma_map                = vdpasim_dma_map,
+       .dma_unmap              = vdpasim_dma_unmap,
+       .free                   = vdpasim_free,
+};
+
+static int __init vdpasim_dev_init(void)
+{
+       vdpasim_dev = vdpasim_create();
+
+       if (!IS_ERR(vdpasim_dev))
+               return 0;
+
+       return PTR_ERR(vdpasim_dev);
+}
+
+static void __exit vdpasim_dev_exit(void)
+{
+       struct vdpa_device *vdpa = &vdpasim_dev->vdpa;
+
+       vdpa_unregister_device(vdpa);
+}
+
+module_init(vdpasim_dev_init);
+module_exit(vdpasim_dev_exit);
+
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE(DRV_LICENSE);
+MODULE_AUTHOR(DRV_AUTHOR);
+MODULE_DESCRIPTION(DRV_DESC);
index 3d03ccb..362b832 100644
@@ -1,4 +1,29 @@
 # SPDX-License-Identifier: GPL-2.0-only
+config VHOST_IOTLB
+       tristate
+       help
+         Generic IOTLB implementation for vhost and vringh.
+
+config VHOST_RING
+       tristate
+       select VHOST_IOTLB
+       help
+         This option is selected by any driver which needs to access
+         the host side of a virtio ring.
+
+config VHOST
+       tristate
+       select VHOST_IOTLB
+       help
+         This option is selected by any driver which needs to access
+         the core of vhost.
+
+menuconfig VHOST_MENU
+       bool "VHOST drivers"
+       default y
+
+if VHOST_MENU
+
 config VHOST_NET
        tristate "Host kernel accelerator for virtio net"
        depends on NET && EVENTFD && (TUN || !TUN) && (TAP || !TAP)
@@ -23,8 +48,8 @@ config VHOST_SCSI
 config VHOST_VSOCK
        tristate "vhost virtio-vsock driver"
        depends on VSOCKETS && EVENTFD
-       select VIRTIO_VSOCKETS_COMMON
        select VHOST
+       select VIRTIO_VSOCKETS_COMMON
        default n
        ---help---
        This kernel module can be loaded in the host kernel to provide AF_VSOCK
@@ -34,11 +59,17 @@ config VHOST_VSOCK
        To compile this driver as a module, choose M here: the module will be called
        vhost_vsock.
 
-config VHOST
-       tristate
-       ---help---
-         This option is selected by any driver which needs to access
-         the core of vhost.
+config VHOST_VDPA
+       tristate "Vhost driver for vDPA-based backend"
+       depends on EVENTFD
+       select VHOST
+       select VDPA
+       help
+         This kernel module can be loaded in the host kernel to accelerate
+         guest virtio devices with vDPA-based backends.
+
+         To compile this driver as a module, choose M here: the module
+         will be called vhost_vdpa.
 
 config VHOST_CROSS_ENDIAN_LEGACY
        bool "Cross-endian support for vhost"
@@ -54,3 +85,5 @@ config VHOST_CROSS_ENDIAN_LEGACY
          adds some overhead, it is disabled by default.
 
          If unsure, say "N".
+
+endif
diff --git a/drivers/vhost/Kconfig.vringh b/drivers/vhost/Kconfig.vringh
deleted file mode 100644
index c1fe36a..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config VHOST_RING
-       tristate
-       ---help---
-         This option is selected by any driver which needs to access
-         the host side of a virtio ring.
index 6c6df24..f3e1897 100644
@@ -10,4 +10,10 @@ vhost_vsock-y := vsock.o
 
 obj-$(CONFIG_VHOST_RING) += vringh.o
 
+obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o
+vhost_vdpa-y := vdpa.o
+
 obj-$(CONFIG_VHOST)    += vhost.o
+
+obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
+vhost_iotlb-y := iotlb.o
diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c
new file mode 100644
index 0000000..1f0ca6e
--- /dev/null
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2020 Red Hat, Inc.
+ * Author: Jason Wang <jasowang@redhat.com>
+ *
+ * IOTLB implementation for vhost.
+ */
+#include <linux/slab.h>
+#include <linux/vhost_iotlb.h>
+#include <linux/module.h>
+
+#define MOD_VERSION  "0.1"
+#define MOD_DESC     "VHOST IOTLB"
+#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define MOD_LICENSE  "GPL v2"
+
+#define START(map) ((map)->start)
+#define LAST(map) ((map)->last)
+
+INTERVAL_TREE_DEFINE(struct vhost_iotlb_map,
+                    rb, __u64, __subtree_last,
+                    START, LAST, static inline, vhost_iotlb_itree);
+
+/**
+ * vhost_iotlb_map_free - remove a map node and free it
+ * @iotlb: the IOTLB
+ * @map: the map to be removed and freed
+ */
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+                         struct vhost_iotlb_map *map)
+{
+       vhost_iotlb_itree_remove(map, &iotlb->root);
+       list_del(&map->link);
+       kfree(map);
+       iotlb->nmaps--;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_map_free);
+
+/**
+ * vhost_iotlb_add_range - add a new range to vhost IOTLB
+ * @iotlb: the IOTLB
+ * @start: start of the IOVA range
+ * @last: last of IOVA range
+ * @addr: the address that is mapped to @start
+ * @perm: access permission of this range
+ *
+ * Returns an error if @last is smaller than @start or if memory
+ * allocation fails
+ */
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb,
+                         u64 start, u64 last,
+                         u64 addr, unsigned int perm)
+{
+       struct vhost_iotlb_map *map;
+
+       if (last < start)
+               return -EFAULT;
+
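+       /* If the IOTLB is full and FLAG_RETIRE is set, evict the oldest entry */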
+       if (iotlb->limit &&
+           iotlb->nmaps == iotlb->limit &&
+           iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) {
+               map = list_first_entry(&iotlb->list, typeof(*map), link);
+               vhost_iotlb_map_free(iotlb, map);
+       }
+
+       map = kmalloc(sizeof(*map), GFP_ATOMIC);
+       if (!map)
+               return -ENOMEM;
+
+       map->start = start;
+       map->size = last - start + 1;
+       map->last = last;
+       map->addr = addr;
+       map->perm = perm;
+
+       iotlb->nmaps++;
+       vhost_iotlb_itree_insert(map, &iotlb->root);
+
+       INIT_LIST_HEAD(&map->link);
+       list_add_tail(&map->link, &iotlb->list);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_add_range);
+
+/**
+ * vhost_iotlb_del_range - delete overlapped ranges from vhost IOTLB
+ * @iotlb: the IOTLB
+ * @start: start of the IOVA range
+ * @last: last of IOVA range
+ */
+void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last)
+{
+       struct vhost_iotlb_map *map;
+
+       while ((map = vhost_iotlb_itree_iter_first(&iotlb->root,
+                                                  start, last)))
+               vhost_iotlb_map_free(iotlb, map);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_del_range);
+
+/**
+ * vhost_iotlb_alloc - add a new vhost IOTLB
+ * @limit: maximum number of IOTLB entries
+ * @flags: VHOST_IOTLB_FLAG_XXX
+ *
+ * Returns NULL if memory allocation fails
+ */
+struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags)
+{
+       struct vhost_iotlb *iotlb = kzalloc(sizeof(*iotlb), GFP_KERNEL);
+
+       if (!iotlb)
+               return NULL;
+
+       iotlb->root = RB_ROOT_CACHED;
+       iotlb->limit = limit;
+       iotlb->nmaps = 0;
+       iotlb->flags = flags;
+       INIT_LIST_HEAD(&iotlb->list);
+
+       return iotlb;
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_alloc);
+
+/**
+ * vhost_iotlb_reset - reset vhost IOTLB (free all IOTLB entries)
+ * @iotlb: the IOTLB to be reset
+ */
+void vhost_iotlb_reset(struct vhost_iotlb *iotlb)
+{
+       vhost_iotlb_del_range(iotlb, 0ULL, 0ULL - 1);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_reset);
+
+/**
+ * vhost_iotlb_free - reset and free vhost IOTLB
+ * @iotlb: the IOTLB to be freed
+ */
+void vhost_iotlb_free(struct vhost_iotlb *iotlb)
+{
+       if (iotlb) {
+               vhost_iotlb_reset(iotlb);
+               kfree(iotlb);
+       }
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_free);
+
+/**
+ * vhost_iotlb_itree_first - return the first overlapped range
+ * @iotlb: the IOTLB
+ * @start: start of IOVA range
+ * @last: last of the IOVA range
+ */
+struct vhost_iotlb_map *
+vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last)
+{
+       return vhost_iotlb_itree_iter_first(&iotlb->root, start, last);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_itree_first);
+
+/**
+ * vhost_iotlb_itree_next - return the next overlapped range
+ * @map: the starting map node
+ * @start: start of IOVA range
+ * @last: last of the IOVA range
+ */
+struct vhost_iotlb_map *
+vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last)
+{
+       return vhost_iotlb_itree_iter_next(map, start, last);
+}
+EXPORT_SYMBOL_GPL(vhost_iotlb_itree_next);
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_DESCRIPTION(MOD_DESC);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_LICENSE(MOD_LICENSE);
index 18e205e..87469d6 100644
@@ -1324,7 +1324,8 @@ static int vhost_net_open(struct inode *inode, struct file *f)
        }
        vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
                       UIO_MAXIOV + VHOST_NET_BATCH,
-                      VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT);
+                      VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT,
+                      NULL);
 
        vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
        vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
@@ -1586,7 +1587,7 @@ static long vhost_net_reset_owner(struct vhost_net *n)
        struct socket *tx_sock = NULL;
        struct socket *rx_sock = NULL;
        long err;
-       struct vhost_umem *umem;
+       struct vhost_iotlb *umem;
 
        mutex_lock(&n->dev.mutex);
        err = vhost_dev_check_owner(&n->dev);
index 0b949a1..7653667 100644
@@ -1628,7 +1628,7 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
                vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
        }
        vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV,
-                      VHOST_SCSI_WEIGHT, 0);
+                      VHOST_SCSI_WEIGHT, 0, NULL);
 
        vhost_scsi_init_inflight(vs, NULL);
 
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
new file mode 100644
index 0000000..421f02a
--- /dev/null
@@ -0,0 +1,883 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018-2020 Intel Corporation.
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * Author: Tiwei Bie <tiwei.bie@intel.com>
+ *         Jason Wang <jasowang@redhat.com>
+ *
+ * Thanks to Michael S. Tsirkin for the valuable comments and
+ * suggestions, and to Cunming Liang and Zhihong Wang for all
+ * their support.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/uuid.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+
+#include "vhost.h"
+
+enum {
+       VHOST_VDPA_FEATURES =
+               (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+               (1ULL << VIRTIO_F_ANY_LAYOUT) |
+               (1ULL << VIRTIO_F_VERSION_1) |
+               (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+               (1ULL << VIRTIO_F_RING_PACKED) |
+               (1ULL << VIRTIO_F_ORDER_PLATFORM) |
+               (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+               (1ULL << VIRTIO_RING_F_EVENT_IDX),
+
+       VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES |
+               (1ULL << VIRTIO_NET_F_CSUM) |
+               (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
+               (1ULL << VIRTIO_NET_F_MTU) |
+               (1ULL << VIRTIO_NET_F_MAC) |
+               (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
+               (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
+               (1ULL << VIRTIO_NET_F_GUEST_ECN) |
+               (1ULL << VIRTIO_NET_F_GUEST_UFO) |
+               (1ULL << VIRTIO_NET_F_HOST_TSO4) |
+               (1ULL << VIRTIO_NET_F_HOST_TSO6) |
+               (1ULL << VIRTIO_NET_F_HOST_ECN) |
+               (1ULL << VIRTIO_NET_F_HOST_UFO) |
+               (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+               (1ULL << VIRTIO_NET_F_STATUS) |
+               (1ULL << VIRTIO_NET_F_SPEED_DUPLEX),
+};
+
+/* Currently, only network backend w/o multiqueue is supported. */
+#define VHOST_VDPA_VQ_MAX      2
+
+#define VHOST_VDPA_DEV_MAX (1U << MINORBITS)
+
+struct vhost_vdpa {
+       struct vhost_dev vdev;
+       struct iommu_domain *domain;
+       struct vhost_virtqueue *vqs;
+       struct completion completion;
+       struct vdpa_device *vdpa;
+       struct device dev;
+       struct cdev cdev;
+       atomic_t opened;
+       int nvqs;
+       int virtio_id;
+       int minor;
+};
+
+static DEFINE_IDA(vhost_vdpa_ida);
+
+static dev_t vhost_vdpa_major;
+
+static const u64 vhost_vdpa_features[] = {
+       [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES,
+};
+
+static void handle_vq_kick(struct vhost_work *work)
+{
+       struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+                                                 poll.work);
+       struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
+       const struct vdpa_config_ops *ops = v->vdpa->config;
+
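+       /* vq - v->vqs recovers this queue's index in the device's vq array */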
+       ops->kick_vq(v->vdpa, vq - v->vqs);
+}
+
+static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
+{
+       struct vhost_virtqueue *vq = private;
+       struct eventfd_ctx *call_ctx = vq->call_ctx;
+
+       if (call_ctx)
+               eventfd_signal(call_ctx, 1);
+
+       return IRQ_HANDLED;
+}
+
+static void vhost_vdpa_reset(struct vhost_vdpa *v)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       ops->set_status(vdpa, 0);
+}
+
+static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       u32 device_id;
+
+       device_id = ops->get_device_id(vdpa);
+
+       if (copy_to_user(argp, &device_id, sizeof(device_id)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       u8 status;
+
+       status = ops->get_status(vdpa);
+
+       if (copy_to_user(statusp, &status, sizeof(status)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       u8 status;
+
+       if (copy_from_user(&status, statusp, sizeof(status)))
+               return -EFAULT;
+
+       /*
+        * Userspace shouldn't remove status bits unless it resets
+        * the status to 0.
+        */
+       if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
+               return -EINVAL;
+
+       ops->set_status(vdpa, status);
+
+       return 0;
+}
+
+static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
+                                     struct vhost_vdpa_config *c)
+{
+       long size = 0;
+
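+       /* Only virtio-net has a known config space size for now */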
+       switch (v->virtio_id) {
+       case VIRTIO_ID_NET:
+               size = sizeof(struct virtio_net_config);
+               break;
+       }
+
+       if (c->len == 0)
+               return -EINVAL;
+
+       if (c->len > size - c->off)
+               return -E2BIG;
+
+       return 0;
+}
+
+static long vhost_vdpa_get_config(struct vhost_vdpa *v,
+                                 struct vhost_vdpa_config __user *c)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct vhost_vdpa_config config;
+       unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+       u8 *buf;
+
+       if (copy_from_user(&config, c, size))
+               return -EFAULT;
+       if (vhost_vdpa_config_validate(v, &config))
+               return -EINVAL;
+       buf = kvzalloc(config.len, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       ops->get_config(vdpa, config.off, buf, config.len);
+
+       if (copy_to_user(c->buf, buf, config.len)) {
+               kvfree(buf);
+               return -EFAULT;
+       }
+
+       kvfree(buf);
+       return 0;
+}
+
+static long vhost_vdpa_set_config(struct vhost_vdpa *v,
+                                 struct vhost_vdpa_config __user *c)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct vhost_vdpa_config config;
+       unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+       u8 *buf;
+
+       if (copy_from_user(&config, c, size))
+               return -EFAULT;
+       if (vhost_vdpa_config_validate(v, &config))
+               return -EINVAL;
+       buf = kvzalloc(config.len, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       if (copy_from_user(buf, c->buf, config.len)) {
+               kvfree(buf);
+               return -EFAULT;
+       }
+
+       ops->set_config(vdpa, config.off, buf, config.len);
+
+       kvfree(buf);
+       return 0;
+}
+
+static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       u64 features;
+
+       features = ops->get_features(vdpa);
+       features &= vhost_vdpa_features[v->virtio_id];
+
+       if (copy_to_user(featurep, &features, sizeof(features)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       u64 features;
+
+       /*
+        * It's not allowed to change the features after they have
+        * been negotiated.
+        */
+       if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
+               return -EBUSY;
+
+       if (copy_from_user(&features, featurep, sizeof(features)))
+               return -EFAULT;
+
+       if (features & ~vhost_vdpa_features[v->virtio_id])
+               return -EINVAL;
+
+       if (ops->set_features(vdpa, features))
+               return -EINVAL;
+
+       return 0;
+}
+
+static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       u16 num;
+
+       num = ops->get_vq_num_max(vdpa);
+
+       if (copy_to_user(argp, &num, sizeof(num)))
+               return -EFAULT;
+
+       return 0;
+}
+
+static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
+                                  void __user *argp)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct vdpa_callback cb;
+       struct vhost_virtqueue *vq;
+       struct vhost_vring_state s;
+       u8 status;
+       u32 idx;
+       long r;
+
+       r = get_user(idx, (u32 __user *)argp);
+       if (r < 0)
+               return r;
+
+       if (idx >= v->nvqs)
+               return -ENOBUFS;
+
+       idx = array_index_nospec(idx, v->nvqs);
+       vq = &v->vqs[idx];
+
+       status = ops->get_status(vdpa);
+
+       if (cmd == VHOST_VDPA_SET_VRING_ENABLE) {
+               if (copy_from_user(&s, argp, sizeof(s)))
+                       return -EFAULT;
+               ops->set_vq_ready(vdpa, idx, s.num);
+               return 0;
+       }
+
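+       /* Sync last_avail_idx from the device before the core handler reads it */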
+       if (cmd == VHOST_GET_VRING_BASE)
+               vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx);
+
+       r = vhost_vring_ioctl(&v->vdev, cmd, argp);
+       if (r)
+               return r;
+
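+       /* Push the state the core handler just updated down to the vDPA device */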
+       switch (cmd) {
+       case VHOST_SET_VRING_ADDR:
+               if (ops->set_vq_address(vdpa, idx,
+                                       (u64)(uintptr_t)vq->desc,
+                                       (u64)(uintptr_t)vq->avail,
+                                       (u64)(uintptr_t)vq->used))
+                       r = -EINVAL;
+               break;
+
+       case VHOST_SET_VRING_BASE:
+               if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx))
+                       r = -EINVAL;
+               break;
+
+       case VHOST_SET_VRING_CALL:
+               if (vq->call_ctx) {
+                       cb.callback = vhost_vdpa_virtqueue_cb;
+                       cb.private = vq;
+               } else {
+                       cb.callback = NULL;
+                       cb.private = NULL;
+               }
+               ops->set_vq_cb(vdpa, idx, &cb);
+               break;
+
+       case VHOST_SET_VRING_NUM:
+               ops->set_vq_num(vdpa, idx, vq->num);
+               break;
+       }
+
+       return r;
+}
+
+static long vhost_vdpa_unlocked_ioctl(struct file *filep,
+                                     unsigned int cmd, unsigned long arg)
+{
+       struct vhost_vdpa *v = filep->private_data;
+       struct vhost_dev *d = &v->vdev;
+       void __user *argp = (void __user *)arg;
+       long r;
+
+       mutex_lock(&d->mutex);
+
+       switch (cmd) {
+       case VHOST_VDPA_GET_DEVICE_ID:
+               r = vhost_vdpa_get_device_id(v, argp);
+               break;
+       case VHOST_VDPA_GET_STATUS:
+               r = vhost_vdpa_get_status(v, argp);
+               break;
+       case VHOST_VDPA_SET_STATUS:
+               r = vhost_vdpa_set_status(v, argp);
+               break;
+       case VHOST_VDPA_GET_CONFIG:
+               r = vhost_vdpa_get_config(v, argp);
+               break;
+       case VHOST_VDPA_SET_CONFIG:
+               r = vhost_vdpa_set_config(v, argp);
+               break;
+       case VHOST_GET_FEATURES:
+               r = vhost_vdpa_get_features(v, argp);
+               break;
+       case VHOST_SET_FEATURES:
+               r = vhost_vdpa_set_features(v, argp);
+               break;
+       case VHOST_VDPA_GET_VRING_NUM:
+               r = vhost_vdpa_get_vring_num(v, argp);
+               break;
+       case VHOST_SET_LOG_BASE:
+       case VHOST_SET_LOG_FD:
+               r = -ENOIOCTLCMD;
+               break;
+       default:
+               r = vhost_dev_ioctl(&v->vdev, cmd, argp);
+               if (r == -ENOIOCTLCMD)
+                       r = vhost_vdpa_vring_ioctl(v, cmd, argp);
+               break;
+       }
+
+       mutex_unlock(&d->mutex);
+       return r;
+}
+
+static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vhost_iotlb *iotlb = dev->iotlb;
+       struct vhost_iotlb_map *map;
+       struct page *page;
+       unsigned long pfn, pinned;
+
+       while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) {
+               pinned = map->size >> PAGE_SHIFT;
+               for (pfn = map->addr >> PAGE_SHIFT;
+                    pinned > 0; pfn++, pinned--) {
+                       page = pfn_to_page(pfn);
+                       if (map->perm & VHOST_ACCESS_WO)
+                               set_page_dirty_lock(page);
+                       unpin_user_page(page);
+               }
+               atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm);
+               vhost_iotlb_map_free(iotlb, map);
+       }
+}
+
+static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v)
+{
+       struct vhost_dev *dev = &v->vdev;
+
+       vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1);
+       kfree(dev->iotlb);
+       dev->iotlb = NULL;
+}
+
+static int perm_to_iommu_flags(u32 perm)
+{
+       int flags = 0;
+
+       switch (perm) {
+       case VHOST_ACCESS_WO:
+               flags |= IOMMU_WRITE;
+               break;
+       case VHOST_ACCESS_RO:
+               flags |= IOMMU_READ;
+               break;
+       case VHOST_ACCESS_RW:
+               flags |= (IOMMU_WRITE | IOMMU_READ);
+               break;
+       default:
+               WARN(1, "invalidate vhost IOTLB permission\n");
+               break;
+       }
+
+       return flags | IOMMU_CACHE;
+}
+
+static int vhost_vdpa_map(struct vhost_vdpa *v,
+                         u64 iova, u64 size, u64 pa, u32 perm)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       int r = 0;
+
+       r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1,
+                                 pa, perm);
+       if (r)
+               return r;
+
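+       /* Prefer the device's own DMA ops, else fall back to the IOMMU domain */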
+       if (ops->dma_map)
+               r = ops->dma_map(vdpa, iova, size, pa, perm);
+       else if (ops->set_map)
+               r = ops->set_map(vdpa, dev->iotlb);
+       else
+               r = iommu_map(v->domain, iova, pa, size,
+                             perm_to_iommu_flags(perm));
+
+       return r;
+}
+
+static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1);
+
+       if (ops->dma_map)
+               ops->dma_unmap(vdpa, iova, size);
+       else if (ops->set_map)
+               ops->set_map(vdpa, dev->iotlb);
+       else
+               iommu_unmap(v->domain, iova, size);
+}
+
+static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v,
+                                          struct vhost_iotlb_msg *msg)
+{
+       struct vhost_dev *dev = &v->vdev;
+       struct vhost_iotlb *iotlb = dev->iotlb;
+       struct page **page_list;
+       unsigned long list_size = PAGE_SIZE / sizeof(struct page *);
+       unsigned int gup_flags = FOLL_LONGTERM;
+       unsigned long npages, cur_base, map_pfn, last_pfn = 0;
+       unsigned long locked, lock_limit, pinned, i;
+       u64 iova = msg->iova;
+       int ret = 0;
+
+       if (vhost_iotlb_itree_first(iotlb, msg->iova,
+                                   msg->iova + msg->size - 1))
+               return -EEXIST;
+
+       page_list = (struct page **) __get_free_page(GFP_KERNEL);
+       if (!page_list)
+               return -ENOMEM;
+
+       if (msg->perm & VHOST_ACCESS_WO)
+               gup_flags |= FOLL_WRITE;
+
+       npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT;
+       if (!npages) {
+               free_page((unsigned long)page_list);
+               return -EINVAL;
+       }
+
+       down_read(&dev->mm->mmap_sem);
+
+       locked = atomic64_add_return(npages, &dev->mm->pinned_vm);
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if (locked > lock_limit) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       cur_base = msg->uaddr & PAGE_MASK;
+       iova &= PAGE_MASK;
+
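+       /* Pin in batches and map each physically contiguous run as one range */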
+       while (npages) {
+               pinned = min_t(unsigned long, npages, list_size);
+               ret = pin_user_pages(cur_base, pinned,
+                                    gup_flags, page_list, NULL);
+               if (ret != pinned)
+                       goto out;
+
+               if (!last_pfn)
+                       map_pfn = page_to_pfn(page_list[0]);
+
+               for (i = 0; i < ret; i++) {
+                       unsigned long this_pfn = page_to_pfn(page_list[i]);
+                       u64 csize;
+
+                       if (last_pfn && (this_pfn != last_pfn + 1)) {
+                               /* Map the contiguous run pinned so far */
+                               csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT;
+                               if (vhost_vdpa_map(v, iova, csize,
+                                                  map_pfn << PAGE_SHIFT,
+                                                  msg->perm))
+                                       goto out;
+                               map_pfn = this_pfn;
+                               iova += csize;
+                       }
+
+                       last_pfn = this_pfn;
+               }
+
+               cur_base += ret << PAGE_SHIFT;
+               npages -= ret;
+       }
+
+       /* Map the remaining contiguous chunk */
+       ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT,
+                            map_pfn << PAGE_SHIFT, msg->perm);
+out:
+       if (ret) {
+               vhost_vdpa_unmap(v, msg->iova, msg->size);
+               atomic64_sub(npages, &dev->mm->pinned_vm);
+       }
+       up_read(&dev->mm->mmap_sem);
+       free_page((unsigned long)page_list);
+       return ret;
+}
+
+static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev,
+                                       struct vhost_iotlb_msg *msg)
+{
+       struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev);
+       int r = 0;
+
+       r = vhost_dev_check_owner(dev);
+       if (r)
+               return r;
+
+       switch (msg->type) {
+       case VHOST_IOTLB_UPDATE:
+               r = vhost_vdpa_process_iotlb_update(v, msg);
+               break;
+       case VHOST_IOTLB_INVALIDATE:
+               vhost_vdpa_unmap(v, msg->iova, msg->size);
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+
+       return r;
+}
+
+static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb,
+                                        struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct vhost_vdpa *v = file->private_data;
+       struct vhost_dev *dev = &v->vdev;
+
+       return vhost_chr_write_iter(dev, from);
+}
+
+static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct device *dma_dev = vdpa_get_dma_dev(vdpa);
+       struct bus_type *bus;
+       int ret;
+
+       /* The device wants to handle DMA mapping by itself */
+       if (ops->set_map || ops->dma_map)
+               return 0;
+
+       bus = dma_dev->bus;
+       if (!bus)
+               return -EFAULT;
+
+       if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
+               return -ENOTSUPP;
+
+       v->domain = iommu_domain_alloc(bus);
+       if (!v->domain)
+               return -EIO;
+
+       ret = iommu_attach_device(v->domain, dma_dev);
+       if (ret)
+               goto err_attach;
+
+       return 0;
+
+err_attach:
+       iommu_domain_free(v->domain);
+       return ret;
+}
+
+static void vhost_vdpa_free_domain(struct vhost_vdpa *v)
+{
+       struct vdpa_device *vdpa = v->vdpa;
+       struct device *dma_dev = vdpa_get_dma_dev(vdpa);
+
+       if (v->domain) {
+               iommu_detach_device(v->domain, dma_dev);
+               iommu_domain_free(v->domain);
+       }
+
+       v->domain = NULL;
+}
+
+static int vhost_vdpa_open(struct inode *inode, struct file *filep)
+{
+       struct vhost_vdpa *v;
+       struct vhost_dev *dev;
+       struct vhost_virtqueue **vqs;
+       int nvqs, i, r, opened;
+
+       v = container_of(inode->i_cdev, struct vhost_vdpa, cdev);
+       if (!v)
+               return -ENODEV;
+
+       opened = atomic_cmpxchg(&v->opened, 0, 1);
+       if (opened)
+               return -EBUSY;
+
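+       /* Reset to a clean device state before handing it to the new owner */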
+       nvqs = v->nvqs;
+       vhost_vdpa_reset(v);
+
+       vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
+       if (!vqs) {
+               r = -ENOMEM;
+               goto err;
+       }
+
+       dev = &v->vdev;
+       for (i = 0; i < nvqs; i++) {
+               vqs[i] = &v->vqs[i];
+               vqs[i]->handle_kick = handle_vq_kick;
+       }
+       vhost_dev_init(dev, vqs, nvqs, 0, 0, 0,
+                      vhost_vdpa_process_iotlb_msg);
+
+       dev->iotlb = vhost_iotlb_alloc(0, 0);
+       if (!dev->iotlb) {
+               r = -ENOMEM;
+               goto err_init_iotlb;
+       }
+
+       r = vhost_vdpa_alloc_domain(v);
+       if (r)
+               goto err_init_iotlb;
+
+       filep->private_data = v;
+
+       return 0;
+
+err_init_iotlb:
+       vhost_dev_cleanup(&v->vdev);
+err:
+       atomic_dec(&v->opened);
+       return r;
+}
+
+static int vhost_vdpa_release(struct inode *inode, struct file *filep)
+{
+       struct vhost_vdpa *v = filep->private_data;
+       struct vhost_dev *d = &v->vdev;
+
+       mutex_lock(&d->mutex);
+       filep->private_data = NULL;
+       vhost_vdpa_reset(v);
+       vhost_dev_stop(&v->vdev);
+       vhost_vdpa_iotlb_free(v);
+       vhost_vdpa_free_domain(v);
+       vhost_dev_cleanup(&v->vdev);
+       kfree(v->vdev.vqs);
+       mutex_unlock(&d->mutex);
+
+       atomic_dec(&v->opened);
+       complete(&v->completion);
+
+       return 0;
+}
+
+static const struct file_operations vhost_vdpa_fops = {
+       .owner          = THIS_MODULE,
+       .open           = vhost_vdpa_open,
+       .release        = vhost_vdpa_release,
+       .write_iter     = vhost_vdpa_chr_write_iter,
+       .unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
+       .compat_ioctl   = compat_ptr_ioctl,
+};
+
+static void vhost_vdpa_release_dev(struct device *device)
+{
+       struct vhost_vdpa *v =
+              container_of(device, struct vhost_vdpa, dev);
+
+       ida_simple_remove(&vhost_vdpa_ida, v->minor);
+       kfree(v->vqs);
+       kfree(v);
+}
+
+static int vhost_vdpa_probe(struct vdpa_device *vdpa)
+{
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct vhost_vdpa *v;
+       int minor, nvqs = VHOST_VDPA_VQ_MAX;
+       int r;
+
+       /* Currently, only network devices are supported. */
+       if (ops->get_device_id(vdpa) != VIRTIO_ID_NET)
+               return -ENOTSUPP;
+
+       v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+       if (!v)
+               return -ENOMEM;
+
+       minor = ida_simple_get(&vhost_vdpa_ida, 0,
+                              VHOST_VDPA_DEV_MAX, GFP_KERNEL);
+       if (minor < 0) {
+               kfree(v);
+               return minor;
+       }
+
+       atomic_set(&v->opened, 0);
+       v->minor = minor;
+       v->vdpa = vdpa;
+       v->nvqs = nvqs;
+       v->virtio_id = ops->get_device_id(vdpa);
+
+       device_initialize(&v->dev);
+       v->dev.release = vhost_vdpa_release_dev;
+       v->dev.parent = &vdpa->dev;
+       v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor);
+       v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
+                              GFP_KERNEL);
+       if (!v->vqs) {
+               r = -ENOMEM;
+               goto err;
+       }
+
+       r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor);
+       if (r)
+               goto err;
+
+       cdev_init(&v->cdev, &vhost_vdpa_fops);
+       v->cdev.owner = THIS_MODULE;
+
+       r = cdev_device_add(&v->cdev, &v->dev);
+       if (r)
+               goto err;
+
+       init_completion(&v->completion);
+       vdpa_set_drvdata(vdpa, v);
+
+       return 0;
+
+err:
+       put_device(&v->dev);
+       return r;
+}
+
+static void vhost_vdpa_remove(struct vdpa_device *vdpa)
+{
+       struct vhost_vdpa *v = vdpa_get_drvdata(vdpa);
+       int opened;
+
+       cdev_device_del(&v->cdev, &v->dev);
+
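+       /* Wait for the last opener to release the device before dropping it */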
+       do {
+               opened = atomic_cmpxchg(&v->opened, 0, 1);
+               if (!opened)
+                       break;
+               wait_for_completion(&v->completion);
+       } while (1);
+
+       put_device(&v->dev);
+}
+
+static struct vdpa_driver vhost_vdpa_driver = {
+       .driver = {
+               .name   = "vhost_vdpa",
+       },
+       .probe  = vhost_vdpa_probe,
+       .remove = vhost_vdpa_remove,
+};
+
+static int __init vhost_vdpa_init(void)
+{
+       int r;
+
+       r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX,
+                               "vhost-vdpa");
+       if (r)
+               goto err_alloc_chrdev;
+
+       r = vdpa_register_driver(&vhost_vdpa_driver);
+       if (r)
+               goto err_vdpa_register_driver;
+
+       return 0;
+
+err_vdpa_register_driver:
+       unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
+err_alloc_chrdev:
+       return r;
+}
+module_init(vhost_vdpa_init);
+
+static void __exit vhost_vdpa_exit(void)
+{
+       vdpa_unregister_driver(&vhost_vdpa_driver);
+       unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX);
+}
+module_exit(vhost_vdpa_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");
index f44340b..d450e16 100644
@@ -50,10 +50,6 @@ enum {
 #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
 
-INTERVAL_TREE_DEFINE(struct vhost_umem_node,
-                    rb, __u64, __subtree_last,
-                    START, LAST, static inline, vhost_umem_interval_tree);
-
 #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
 static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
 {
@@ -457,7 +453,9 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
 
 void vhost_dev_init(struct vhost_dev *dev,
                    struct vhost_virtqueue **vqs, int nvqs,
-                   int iov_limit, int weight, int byte_weight)
+                   int iov_limit, int weight, int byte_weight,
+                   int (*msg_handler)(struct vhost_dev *dev,
+                                      struct vhost_iotlb_msg *msg))
 {
        struct vhost_virtqueue *vq;
        int i;
@@ -473,6 +471,7 @@ void vhost_dev_init(struct vhost_dev *dev,
        dev->iov_limit = iov_limit;
        dev->weight = weight;
        dev->byte_weight = byte_weight;
+       dev->msg_handler = msg_handler;
        init_llist_head(&dev->work_list);
        init_waitqueue_head(&dev->wait);
        INIT_LIST_HEAD(&dev->read_list);
@@ -581,21 +580,25 @@ err_mm:
 }
 EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
 
-struct vhost_umem *vhost_dev_reset_owner_prepare(void)
+static struct vhost_iotlb *iotlb_alloc(void)
+{
+       return vhost_iotlb_alloc(max_iotlb_entries,
+                                VHOST_IOTLB_FLAG_RETIRE);
+}
+
+struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
 {
-       return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL);
+       return iotlb_alloc();
 }
 EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
 
 /* Caller should have device mutex */
-void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem)
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
 {
        int i;
 
        vhost_dev_cleanup(dev);
 
-       /* Restore memory to default empty mapping. */
-       INIT_LIST_HEAD(&umem->umem_list);
        dev->umem = umem;
        /* We don't need VQ locks below since vhost_dev_cleanup makes sure
         * VQs aren't running.
@@ -618,28 +621,6 @@ void vhost_dev_stop(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_stop);
 
-static void vhost_umem_free(struct vhost_umem *umem,
-                           struct vhost_umem_node *node)
-{
-       vhost_umem_interval_tree_remove(node, &umem->umem_tree);
-       list_del(&node->link);
-       kfree(node);
-       umem->numem--;
-}
-
-static void vhost_umem_clean(struct vhost_umem *umem)
-{
-       struct vhost_umem_node *node, *tmp;
-
-       if (!umem)
-               return;
-
-       list_for_each_entry_safe(node, tmp, &umem->umem_list, link)
-               vhost_umem_free(umem, node);
-
-       kvfree(umem);
-}
-
 static void vhost_clear_msg(struct vhost_dev *dev)
 {
        struct vhost_msg_node *node, *n;
@@ -677,9 +658,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
                eventfd_ctx_put(dev->log_ctx);
        dev->log_ctx = NULL;
        /* No one will access memory at this point */
-       vhost_umem_clean(dev->umem);
+       vhost_iotlb_free(dev->umem);
        dev->umem = NULL;
-       vhost_umem_clean(dev->iotlb);
+       vhost_iotlb_free(dev->iotlb);
        dev->iotlb = NULL;
        vhost_clear_msg(dev);
        wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
@@ -715,27 +696,26 @@ static bool vhost_overflow(u64 uaddr, u64 size)
 }
 
 /* Caller should have vq mutex and device mutex. */
-static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
+static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
                                int log_all)
 {
-       struct vhost_umem_node *node;
+       struct vhost_iotlb_map *map;
 
        if (!umem)
                return false;
 
-       list_for_each_entry(node, &umem->umem_list, link) {
-               unsigned long a = node->userspace_addr;
+       list_for_each_entry(map, &umem->list, link) {
+               unsigned long a = map->addr;
 
-               if (vhost_overflow(node->userspace_addr, node->size))
+               if (vhost_overflow(map->addr, map->size))
                        return false;
 
 
-               if (!access_ok((void __user *)a,
-                                   node->size))
+               if (!access_ok((void __user *)a, map->size))
                        return false;
                else if (log_all && !log_access_ok(log_base,
-                                                  node->start,
-                                                  node->size))
+                                                  map->start,
+                                                  map->size))
                        return false;
        }
        return true;
@@ -745,17 +725,17 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
                                               u64 addr, unsigned int size,
                                               int type)
 {
-       const struct vhost_umem_node *node = vq->meta_iotlb[type];
+       const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
 
-       if (!node)
+       if (!map)
                return NULL;
 
-       return (void *)(uintptr_t)(node->userspace_addr + addr - node->start);
+       return (void *)(uintptr_t)(map->addr + addr - map->start);
 }
 
 /* Can we switch to this memory table? */
 /* Caller should have device mutex but not vq mutex */
-static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
+static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
                             int log_all)
 {
        int i;
@@ -1020,47 +1000,6 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
        return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
 }
 
-static int vhost_new_umem_range(struct vhost_umem *umem,
-                               u64 start, u64 size, u64 end,
-                               u64 userspace_addr, int perm)
-{
-       struct vhost_umem_node *tmp, *node;
-
-       if (!size)
-               return -EFAULT;
-
-       node = kmalloc(sizeof(*node), GFP_ATOMIC);
-       if (!node)
-               return -ENOMEM;
-
-       if (umem->numem == max_iotlb_entries) {
-               tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link);
-               vhost_umem_free(umem, tmp);
-       }
-
-       node->start = start;
-       node->size = size;
-       node->last = end;
-       node->userspace_addr = userspace_addr;
-       node->perm = perm;
-       INIT_LIST_HEAD(&node->link);
-       list_add_tail(&node->link, &umem->umem_list);
-       vhost_umem_interval_tree_insert(node, &umem->umem_tree);
-       umem->numem++;
-
-       return 0;
-}
-
-static void vhost_del_umem_range(struct vhost_umem *umem,
-                                u64 start, u64 end)
-{
-       struct vhost_umem_node *node;
-
-       while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
-                                                          start, end)))
-               vhost_umem_free(umem, node);
-}
-
 static void vhost_iotlb_notify_vq(struct vhost_dev *d,
                                  struct vhost_iotlb_msg *msg)
 {
@@ -1117,9 +1056,9 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
                        break;
                }
                vhost_vq_meta_reset(dev);
-               if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size,
-                                        msg->iova + msg->size - 1,
-                                        msg->uaddr, msg->perm)) {
+               if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
+                                         msg->iova + msg->size - 1,
+                                         msg->uaddr, msg->perm)) {
                        ret = -ENOMEM;
                        break;
                }
@@ -1131,8 +1070,8 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
                        break;
                }
                vhost_vq_meta_reset(dev);
-               vhost_del_umem_range(dev->iotlb, msg->iova,
-                                    msg->iova + msg->size - 1);
+               vhost_iotlb_del_range(dev->iotlb, msg->iova,
+                                     msg->iova + msg->size - 1);
                break;
        default:
                ret = -EINVAL;
@@ -1178,7 +1117,12 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
                ret = -EINVAL;
                goto done;
        }
-       if (vhost_process_iotlb_msg(dev, &msg)) {
+
+       if (dev->msg_handler)
+               ret = dev->msg_handler(dev, &msg);
+       else
+               ret = vhost_process_iotlb_msg(dev, &msg);
+       if (ret) {
                ret = -EFAULT;
                goto done;
        }
@@ -1311,44 +1255,42 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
 }
 
 static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
-                                const struct vhost_umem_node *node,
+                                const struct vhost_iotlb_map *map,
                                 int type)
 {
        int access = (type == VHOST_ADDR_USED) ?
                     VHOST_ACCESS_WO : VHOST_ACCESS_RO;
 
-       if (likely(node->perm & access))
-               vq->meta_iotlb[type] = node;
+       if (likely(map->perm & access))
+               vq->meta_iotlb[type] = map;
 }
 
 static bool iotlb_access_ok(struct vhost_virtqueue *vq,
                            int access, u64 addr, u64 len, int type)
 {
-       const struct vhost_umem_node *node;
-       struct vhost_umem *umem = vq->iotlb;
+       const struct vhost_iotlb_map *map;
+       struct vhost_iotlb *umem = vq->iotlb;
        u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
 
        if (vhost_vq_meta_fetch(vq, addr, len, type))
                return true;
 
        while (len > s) {
-               node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
-                                                          addr,
-                                                          last);
-               if (node == NULL || node->start > addr) {
+               map = vhost_iotlb_itree_first(umem, addr, last);
+               if (map == NULL || map->start > addr) {
                        vhost_iotlb_miss(vq, addr, access);
                        return false;
-               } else if (!(node->perm & access)) {
+               } else if (!(map->perm & access)) {
                        /* Report the possible access violation by
                         * request another translation from userspace.
                         */
                        return false;
                }
 
-               size = node->size - addr + node->start;
+               size = map->size - addr + map->start;
 
                if (orig_addr == addr && size >= len)
-                       vhost_vq_meta_update(vq, node, type);
+                       vhost_vq_meta_update(vq, map, type);
 
                s += size;
                addr += size;
@@ -1364,12 +1306,12 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq)
        if (!vq->iotlb)
                return 1;
 
-       return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+       return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
                               vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
-              iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
+              iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
                               vhost_get_avail_size(vq, num),
                               VHOST_ADDR_AVAIL) &&
-              iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
+              iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
                               vhost_get_used_size(vq, num), VHOST_ADDR_USED);
 }
 EXPORT_SYMBOL_GPL(vq_meta_prefetch);
@@ -1408,25 +1350,11 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
 
-static struct vhost_umem *vhost_umem_alloc(void)
-{
-       struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL);
-
-       if (!umem)
-               return NULL;
-
-       umem->umem_tree = RB_ROOT_CACHED;
-       umem->numem = 0;
-       INIT_LIST_HEAD(&umem->umem_list);
-
-       return umem;
-}
-
 static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 {
        struct vhost_memory mem, *newmem;
        struct vhost_memory_region *region;
-       struct vhost_umem *newumem, *oldumem;
+       struct vhost_iotlb *newumem, *oldumem;
        unsigned long size = offsetof(struct vhost_memory, regions);
        int i;
 
@@ -1448,7 +1376,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
                return -EFAULT;
        }
 
-       newumem = vhost_umem_alloc();
+       newumem = iotlb_alloc();
        if (!newumem) {
                kvfree(newmem);
                return -ENOMEM;
@@ -1457,13 +1385,12 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
        for (region = newmem->regions;
             region < newmem->regions + mem.nregions;
             region++) {
-               if (vhost_new_umem_range(newumem,
-                                        region->guest_phys_addr,
-                                        region->memory_size,
-                                        region->guest_phys_addr +
-                                        region->memory_size - 1,
-                                        region->userspace_addr,
-                                        VHOST_ACCESS_RW))
+               if (vhost_iotlb_add_range(newumem,
+                                         region->guest_phys_addr,
+                                         region->guest_phys_addr +
+                                         region->memory_size - 1,
+                                         region->userspace_addr,
+                                         VHOST_MAP_RW))
                        goto err;
        }
 
@@ -1481,11 +1408,11 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
        }
 
        kvfree(newmem);
-       vhost_umem_clean(oldumem);
+       vhost_iotlb_free(oldumem);
        return 0;
 
 err:
-       vhost_umem_clean(newumem);
+       vhost_iotlb_free(newumem);
        kvfree(newmem);
        return -EFAULT;
 }
@@ -1726,10 +1653,10 @@ EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
 
 int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
 {
-       struct vhost_umem *niotlb, *oiotlb;
+       struct vhost_iotlb *niotlb, *oiotlb;
        int i;
 
-       niotlb = vhost_umem_alloc();
+       niotlb = iotlb_alloc();
        if (!niotlb)
                return -ENOMEM;
 
@@ -1745,7 +1672,7 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
                mutex_unlock(&vq->mutex);
        }
 
-       vhost_umem_clean(oiotlb);
+       vhost_iotlb_free(oiotlb);
 
        return 0;
 }
@@ -1875,8 +1802,8 @@ static int log_write(void __user *log_base,
 
 static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
 {
-       struct vhost_umem *umem = vq->umem;
-       struct vhost_umem_node *u;
+       struct vhost_iotlb *umem = vq->umem;
+       struct vhost_iotlb_map *u;
        u64 start, end, l, min;
        int r;
        bool hit = false;
@@ -1886,16 +1813,15 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
                /* More than one GPAs can be mapped into a single HVA. So
                 * iterate all possible umems here to be safe.
                 */
-               list_for_each_entry(u, &umem->umem_list, link) {
-                       if (u->userspace_addr > hva - 1 + len ||
-                           u->userspace_addr - 1 + u->size < hva)
+               list_for_each_entry(u, &umem->list, link) {
+                       if (u->addr > hva - 1 + len ||
+                           u->addr - 1 + u->size < hva)
                                continue;
-                       start = max(u->userspace_addr, hva);
-                       end = min(u->userspace_addr - 1 + u->size,
-                                 hva - 1 + len);
+                       start = max(u->addr, hva);
+                       end = min(u->addr - 1 + u->size, hva - 1 + len);
                        l = end - start + 1;
                        r = log_write(vq->log_base,
-                                     u->start + start - u->userspace_addr,
+                                     u->start + start - u->addr,
                                      l);
                        if (r < 0)
                                return r;
@@ -2046,9 +1972,9 @@ EXPORT_SYMBOL_GPL(vhost_vq_init_access);
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
                          struct iovec iov[], int iov_size, int access)
 {
-       const struct vhost_umem_node *node;
+       const struct vhost_iotlb_map *map;
        struct vhost_dev *dev = vq->dev;
-       struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem;
+       struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
        struct iovec *_iov;
        u64 s = 0;
        int ret = 0;
@@ -2060,25 +1986,24 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
                        break;
                }
 
-               node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
-                                                       addr, addr + len - 1);
-               if (node == NULL || node->start > addr) {
+               map = vhost_iotlb_itree_first(umem, addr, addr + len - 1);
+               if (map == NULL || map->start > addr) {
                        if (umem != dev->iotlb) {
                                ret = -EFAULT;
                                break;
                        }
                        ret = -EAGAIN;
                        break;
-               } else if (!(node->perm & access)) {
+               } else if (!(map->perm & access)) {
                        ret = -EPERM;
                        break;
                }
 
                _iov = iov + ret;
-               size = node->size - addr + node->start;
+               size = map->size - addr + map->start;
                _iov->iov_len = min((u64)len - s, size);
                _iov->iov_base = (void __user *)(unsigned long)
-                       (node->userspace_addr + addr - node->start);
+                                (map->addr + addr - map->start);
                s += size;
                addr += size;
                ++ret;
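
A worked example of the translation arithmetic above, using the same variable names and hypothetical map values: size is the number of bytes left in the map starting at addr, and the iovec base is the matching offset into the userspace mapping.

    /* Hypothetical map: guest range [0x1000, 0x4fff] backed at 0x7f00aa000000,
     * i.e. map->start = 0x1000, map->size = 0x4000, map->addr = 0x7f00aa000000.
     * Translating a descriptor at addr = 0x1800 with len = 0x100 (s = 0):
     */
    size = map->size - addr + map->start;             /* 0x4000 - 0x1800 + 0x1000 = 0x3800 */
    _iov->iov_len  = min((u64)len - s, size);         /* min(0x100, 0x3800) = 0x100 */
    _iov->iov_base = (void __user *)(unsigned long)
                     (map->addr + addr - map->start); /* 0x7f00aa000000 + 0x800 */
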
index a123fd7..1813821 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/atomic.h>
+#include <linux/vhost_iotlb.h>
 
 struct vhost_work;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
@@ -52,27 +53,6 @@ struct vhost_log {
        u64 len;
 };
 
-#define START(node) ((node)->start)
-#define LAST(node) ((node)->last)
-
-struct vhost_umem_node {
-       struct rb_node rb;
-       struct list_head link;
-       __u64 start;
-       __u64 last;
-       __u64 size;
-       __u64 userspace_addr;
-       __u32 perm;
-       __u32 flags_padding;
-       __u64 __subtree_last;
-};
-
-struct vhost_umem {
-       struct rb_root_cached umem_tree;
-       struct list_head umem_list;
-       int numem;
-};
-
 enum vhost_uaddr_type {
        VHOST_ADDR_DESC = 0,
        VHOST_ADDR_AVAIL = 1,
@@ -90,7 +70,7 @@ struct vhost_virtqueue {
        struct vring_desc __user *desc;
        struct vring_avail __user *avail;
        struct vring_used __user *used;
-       const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
+       const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS];
        struct file *kick;
        struct eventfd_ctx *call_ctx;
        struct eventfd_ctx *error_ctx;
@@ -128,8 +108,8 @@ struct vhost_virtqueue {
        struct iovec *indirect;
        struct vring_used_elem *heads;
        /* Protected by virtqueue mutex. */
-       struct vhost_umem *umem;
-       struct vhost_umem *iotlb;
+       struct vhost_iotlb *umem;
+       struct vhost_iotlb *iotlb;
        void *private_data;
        u64 acked_features;
        u64 acked_backend_features;
@@ -164,8 +144,8 @@ struct vhost_dev {
        struct eventfd_ctx *log_ctx;
        struct llist_head work_list;
        struct task_struct *worker;
-       struct vhost_umem *umem;
-       struct vhost_umem *iotlb;
+       struct vhost_iotlb *umem;
+       struct vhost_iotlb *iotlb;
        spinlock_t iotlb_lock;
        struct list_head read_list;
        struct list_head pending_list;
@@ -174,16 +154,20 @@ struct vhost_dev {
        int weight;
        int byte_weight;
        u64 kcov_handle;
+       int (*msg_handler)(struct vhost_dev *dev,
+                          struct vhost_iotlb_msg *msg);
 };
 
 bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
 void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
-                   int nvqs, int iov_limit, int weight, int byte_weight);
+                   int nvqs, int iov_limit, int weight, int byte_weight,
+                   int (*msg_handler)(struct vhost_dev *dev,
+                                      struct vhost_iotlb_msg *msg));
 long vhost_dev_set_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
-struct vhost_umem *vhost_dev_reset_owner_prepare(void);
-void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
+struct vhost_iotlb *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *iotlb);
 void vhost_dev_cleanup(struct vhost_dev *);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
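
vhost_dev_init() now takes an optional IOTLB message handler; in-tree callers that do not need it pass NULL (see the vhost-vsock hunk further down). A backend that wants to process struct vhost_iotlb_msg itself would supply a callback matching the new prototype. A hedged sketch with hypothetical names:

    /* Hypothetical handler matching the msg_handler prototype above. */
    static int my_iotlb_msg_handler(struct vhost_dev *dev,
                                    struct vhost_iotlb_msg *msg)
    {
            /* inspect or translate msg here */
            return 0;
    }

    vhost_dev_init(&my_dev, vqs, nvqs, UIO_MAXIOV,
                   MY_PKT_WEIGHT, MY_BYTE_WEIGHT, my_iotlb_msg_handler);
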
@@ -229,6 +213,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
                             struct iov_iter *from);
 int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled);
 
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+                         struct vhost_iotlb_map *map);
+
 #define vq_err(vq, fmt, ...) do {                                  \
                pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
                if ((vq)->error_ctx)                               \
index a0a2d74..ee0491f 100644 (file)
@@ -13,6 +13,9 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/bvec.h>
+#include <linux/highmem.h>
+#include <linux/vhost_iotlb.h>
 #include <uapi/linux/virtio_config.h>
 
 static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
@@ -71,9 +74,11 @@ static inline int __vringh_get_head(const struct vringh *vrh,
 }
 
 /* Copy some bytes to/from the iovec.  Returns num copied. */
-static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
+static inline ssize_t vringh_iov_xfer(struct vringh *vrh,
+                                     struct vringh_kiov *iov,
                                      void *ptr, size_t len,
-                                     int (*xfer)(void *addr, void *ptr,
+                                     int (*xfer)(const struct vringh *vrh,
+                                                 void *addr, void *ptr,
                                                  size_t len))
 {
        int err, done = 0;
@@ -82,7 +87,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
                size_t partlen;
 
                partlen = min(iov->iov[iov->i].iov_len, len);
-               err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
+               err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen);
                if (err)
                        return err;
                done += partlen;
@@ -96,6 +101,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
                        /* Fix up old iov element then increment. */
                        iov->iov[iov->i].iov_len = iov->consumed;
                        iov->iov[iov->i].iov_base -= iov->consumed;
+
                        iov->consumed = 0;
                        iov->i++;
@@ -227,7 +233,8 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src,
                                      u64 addr,
                                      struct vringh_range *r),
                     struct vringh_range *range,
-                    int (*copy)(void *dst, const void *src, size_t len))
+                    int (*copy)(const struct vringh *vrh,
+                                void *dst, const void *src, size_t len))
 {
        size_t part, len = sizeof(struct vring_desc);
 
@@ -241,7 +248,7 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src,
                if (!rcheck(vrh, addr, &part, range, getrange))
                        return -EINVAL;
 
-               err = copy(dst, src, part);
+               err = copy(vrh, dst, src, part);
                if (err)
                        return err;
 
@@ -262,7 +269,8 @@ __vringh_iov(struct vringh *vrh, u16 i,
                                             struct vringh_range *)),
             bool (*getrange)(struct vringh *, u64, struct vringh_range *),
             gfp_t gfp,
-            int (*copy)(void *dst, const void *src, size_t len))
+            int (*copy)(const struct vringh *vrh,
+                        void *dst, const void *src, size_t len))
 {
        int err, count = 0, up_next, desc_max;
        struct vring_desc desc, *descs;
@@ -291,7 +299,7 @@ __vringh_iov(struct vringh *vrh, u16 i,
                        err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
                                        &slowrange, copy);
                else
-                       err = copy(&desc, &descs[i], sizeof(desc));
+                       err = copy(vrh, &desc, &descs[i], sizeof(desc));
                if (unlikely(err))
                        goto fail;
 
@@ -404,7 +412,8 @@ static inline int __vringh_complete(struct vringh *vrh,
                                    unsigned int num_used,
                                    int (*putu16)(const struct vringh *vrh,
                                                  __virtio16 *p, u16 val),
-                                   int (*putused)(struct vring_used_elem *dst,
+                                   int (*putused)(const struct vringh *vrh,
+                                                  struct vring_used_elem *dst,
                                                   const struct vring_used_elem
                                                   *src, unsigned num))
 {
@@ -420,12 +429,12 @@ static inline int __vringh_complete(struct vringh *vrh,
        /* Compiler knows num_used == 1 sometimes, hence extra check */
        if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
                u16 part = vrh->vring.num - off;
-               err = putused(&used_ring->ring[off], used, part);
+               err = putused(vrh, &used_ring->ring[off], used, part);
                if (!err)
-                       err = putused(&used_ring->ring[0], used + part,
+                       err = putused(vrh, &used_ring->ring[0], used + part,
                                      num_used - part);
        } else
-               err = putused(&used_ring->ring[off], used, num_used);
+               err = putused(vrh, &used_ring->ring[off], used, num_used);
 
        if (err) {
                vringh_bad("Failed to write %u used entries %u at %p",
@@ -564,13 +573,15 @@ static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val)
        return put_user(v, (__force __virtio16 __user *)p);
 }
 
-static inline int copydesc_user(void *dst, const void *src, size_t len)
+static inline int copydesc_user(const struct vringh *vrh,
+                               void *dst, const void *src, size_t len)
 {
        return copy_from_user(dst, (__force void __user *)src, len) ?
                -EFAULT : 0;
 }
 
-static inline int putused_user(struct vring_used_elem *dst,
+static inline int putused_user(const struct vringh *vrh,
+                              struct vring_used_elem *dst,
                               const struct vring_used_elem *src,
                               unsigned int num)
 {
@@ -578,13 +589,15 @@ static inline int putused_user(struct vring_used_elem *dst,
                            sizeof(*dst) * num) ? -EFAULT : 0;
 }
 
-static inline int xfer_from_user(void *src, void *dst, size_t len)
+static inline int xfer_from_user(const struct vringh *vrh, void *src,
+                                void *dst, size_t len)
 {
        return copy_from_user(dst, (__force void __user *)src, len) ?
                -EFAULT : 0;
 }
 
-static inline int xfer_to_user(void *dst, void *src, size_t len)
+static inline int xfer_to_user(const struct vringh *vrh,
+                              void *dst, void *src, size_t len)
 {
        return copy_to_user((__force void __user *)dst, src, len) ?
                -EFAULT : 0;
@@ -706,7 +719,7 @@ EXPORT_SYMBOL(vringh_getdesc_user);
  */
 ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
 {
-       return vringh_iov_xfer((struct vringh_kiov *)riov,
+       return vringh_iov_xfer(NULL, (struct vringh_kiov *)riov,
                               dst, len, xfer_from_user);
 }
 EXPORT_SYMBOL(vringh_iov_pull_user);
@@ -722,7 +735,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user);
 ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
                             const void *src, size_t len)
 {
-       return vringh_iov_xfer((struct vringh_kiov *)wiov,
+       return vringh_iov_xfer(NULL, (struct vringh_kiov *)wiov,
                               (void *)src, len, xfer_to_user);
 }
 EXPORT_SYMBOL(vringh_iov_push_user);
@@ -832,13 +845,15 @@ static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val)
        return 0;
 }
 
-static inline int copydesc_kern(void *dst, const void *src, size_t len)
+static inline int copydesc_kern(const struct vringh *vrh,
+                               void *dst, const void *src, size_t len)
 {
        memcpy(dst, src, len);
        return 0;
 }
 
-static inline int putused_kern(struct vring_used_elem *dst,
+static inline int putused_kern(const struct vringh *vrh,
+                              struct vring_used_elem *dst,
                               const struct vring_used_elem *src,
                               unsigned int num)
 {
@@ -846,13 +861,15 @@ static inline int putused_kern(struct vring_used_elem *dst,
        return 0;
 }
 
-static inline int xfer_kern(void *src, void *dst, size_t len)
+static inline int xfer_kern(const struct vringh *vrh, void *src,
+                           void *dst, size_t len)
 {
        memcpy(dst, src, len);
        return 0;
 }
 
-static inline int kern_xfer(void *dst, void *src, size_t len)
+static inline int kern_xfer(const struct vringh *vrh, void *dst,
+                           void *src, size_t len)
 {
        memcpy(dst, src, len);
        return 0;
@@ -949,7 +966,7 @@ EXPORT_SYMBOL(vringh_getdesc_kern);
  */
 ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
 {
-       return vringh_iov_xfer(riov, dst, len, xfer_kern);
+       return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
 }
 EXPORT_SYMBOL(vringh_iov_pull_kern);
 
@@ -964,7 +981,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern);
 ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
                             const void *src, size_t len)
 {
-       return vringh_iov_xfer(wiov, (void *)src, len, kern_xfer);
+       return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
 }
 EXPORT_SYMBOL(vringh_iov_push_kern);
 
@@ -1042,4 +1059,362 @@ int vringh_need_notify_kern(struct vringh *vrh)
 }
 EXPORT_SYMBOL(vringh_need_notify_kern);
 
+static int iotlb_translate(const struct vringh *vrh,
+                          u64 addr, u64 len, struct bio_vec iov[],
+                          int iov_size, u32 perm)
+{
+       struct vhost_iotlb_map *map;
+       struct vhost_iotlb *iotlb = vrh->iotlb;
+       int ret = 0;
+       u64 s = 0;
+
+       while (len > s) {
+               u64 size, pa, pfn;
+
+               if (unlikely(ret >= iov_size)) {
+                       ret = -ENOBUFS;
+                       break;
+               }
+
+               map = vhost_iotlb_itree_first(iotlb, addr,
+                                             addr + len - 1);
+               if (!map || map->start > addr) {
+                       ret = -EINVAL;
+                       break;
+               } else if (!(map->perm & perm)) {
+                       ret = -EPERM;
+                       break;
+               }
+
+               size = map->size - addr + map->start;
+               pa = map->addr + addr - map->start;
+               pfn = pa >> PAGE_SHIFT;
+               iov[ret].bv_page = pfn_to_page(pfn);
+               iov[ret].bv_len = min(len - s, size);
+               iov[ret].bv_offset = pa & (PAGE_SIZE - 1);
+               s += size;
+               addr += size;
+               ++ret;
+       }
+
+       return ret;
+}
+
+static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
+                                 void *src, size_t len)
+{
+       struct iov_iter iter;
+       struct bio_vec iov[16];
+       int ret;
+
+       ret = iotlb_translate(vrh, (u64)(uintptr_t)src,
+                             len, iov, 16, VHOST_MAP_RO);
+       if (ret < 0)
+               return ret;
+
+       iov_iter_bvec(&iter, READ, iov, ret, len);
+
+       ret = copy_from_iter(dst, len, &iter);
+
+       return ret;
+}
+
+static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
+                               void *src, size_t len)
+{
+       struct iov_iter iter;
+       struct bio_vec iov[16];
+       int ret;
+
+       ret = iotlb_translate(vrh, (u64)(uintptr_t)dst,
+                             len, iov, 16, VHOST_MAP_WO);
+       if (ret < 0)
+               return ret;
+
+       iov_iter_bvec(&iter, WRITE, iov, ret, len);
+
+       return copy_to_iter(src, len, &iter);
+}
+
+static inline int getu16_iotlb(const struct vringh *vrh,
+                              u16 *val, const __virtio16 *p)
+{
+       struct bio_vec iov;
+       void *kaddr, *from;
+       int ret;
+
+       /* Atomic read is needed for getu16 */
+       ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+                             &iov, 1, VHOST_MAP_RO);
+       if (ret < 0)
+               return ret;
+
+       kaddr = kmap_atomic(iov.bv_page);
+       from = kaddr + iov.bv_offset;
+       *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from));
+       kunmap_atomic(kaddr);
+
+       return 0;
+}
+
+static inline int putu16_iotlb(const struct vringh *vrh,
+                              __virtio16 *p, u16 val)
+{
+       struct bio_vec iov;
+       void *kaddr, *to;
+       int ret;
+
+       /* Atomic write is needed for putu16 */
+       ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+                             &iov, 1, VHOST_MAP_WO);
+       if (ret < 0)
+               return ret;
+
+       kaddr = kmap_atomic(iov.bv_page);
+       to = kaddr + iov.bv_offset;
+       WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val));
+       kunmap_atomic(kaddr);
+
+       return 0;
+}
+
+static inline int copydesc_iotlb(const struct vringh *vrh,
+                                void *dst, const void *src, size_t len)
+{
+       int ret;
+
+       ret = copy_from_iotlb(vrh, dst, (void *)src, len);
+       if (ret != len)
+               return -EFAULT;
+
+       return 0;
+}
+
+static inline int xfer_from_iotlb(const struct vringh *vrh, void *src,
+                                 void *dst, size_t len)
+{
+       int ret;
+
+       ret = copy_from_iotlb(vrh, dst, src, len);
+       if (ret != len)
+               return -EFAULT;
+
+       return 0;
+}
+
+static inline int xfer_to_iotlb(const struct vringh *vrh,
+                              void *dst, void *src, size_t len)
+{
+       int ret;
+
+       ret = copy_to_iotlb(vrh, dst, src, len);
+       if (ret != len)
+               return -EFAULT;
+
+       return 0;
+}
+
+static inline int putused_iotlb(const struct vringh *vrh,
+                               struct vring_used_elem *dst,
+                               const struct vring_used_elem *src,
+                               unsigned int num)
+{
+       int size = num * sizeof(*dst);
+       int ret;
+
+       ret = copy_to_iotlb(vrh, dst, (void *)src, num * sizeof(*dst));
+       if (ret != size)
+               return -EFAULT;
+
+       return 0;
+}
+
+/**
+ * vringh_init_iotlb - initialize a vringh for a ring with IOTLB.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_iotlb(struct vringh *vrh, u64 features,
+                     unsigned int num, bool weak_barriers,
+                     struct vring_desc *desc,
+                     struct vring_avail *avail,
+                     struct vring_used *used)
+{
+       return vringh_init_kern(vrh, features, num, weak_barriers,
+                               desc, avail, used);
+}
+EXPORT_SYMBOL(vringh_init_iotlb);
+
+/**
+ * vringh_set_iotlb - associate an IOTLB with a vringh.
+ * @vrh: the vring
+ * @iotlb: iotlb associated with this vring
+ */
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb)
+{
+       vrh->iotlb = iotlb;
+}
+EXPORT_SYMBOL(vringh_set_iotlb);
+
+/**
+ * vringh_getdesc_iotlb - get next available descriptor from ring with
+ * IOTLB.
+ * @vrh: the kernelspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @head: head index we received, for passing to vringh_complete_iotlb().
+ * @gfp: flags for allocating larger riov/wiov.
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num.  You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_iotlb(struct vringh *vrh,
+                        struct vringh_kiov *riov,
+                        struct vringh_kiov *wiov,
+                        u16 *head,
+                        gfp_t gfp)
+{
+       int err;
+
+       err = __vringh_get_head(vrh, getu16_iotlb, &vrh->last_avail_idx);
+       if (err < 0)
+               return err;
+
+       /* Empty... */
+       if (err == vrh->vring.num)
+               return 0;
+
+       *head = err;
+       err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
+                          gfp, copydesc_iotlb);
+       if (err)
+               return err;
+
+       return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_iotlb);
+
+/**
+ * vringh_iov_pull_iotlb - copy bytes from vring_iov.
+ * @vrh: the vring.
+ * @riov: the riov as passed to vringh_getdesc_iotlb() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_iotlb(struct vringh *vrh,
+                             struct vringh_kiov *riov,
+                             void *dst, size_t len)
+{
+       return vringh_iov_xfer(vrh, riov, dst, len, xfer_from_iotlb);
+}
+EXPORT_SYMBOL(vringh_iov_pull_iotlb);
+
+/**
+ * vringh_iov_push_iotlb - copy bytes into vring_iov.
+ * @vrh: the vring.
+ * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume)
+ * @src: the place to copy from.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
+                             struct vringh_kiov *wiov,
+                             const void *src, size_t len)
+{
+       return vringh_iov_xfer(vrh, wiov, (void *)src, len, xfer_to_iotlb);
+}
+EXPORT_SYMBOL(vringh_iov_push_iotlb);
+
+/**
+ * vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (i.e. num
+ *      vringh_getdesc_iotlb() calls to undo).
+ *
+ * The next vringh_getdesc_iotlb() will return the old descriptor(s) again.
+ */
+void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
+{
+       /* We only update vring_avail_event(vr) when we want to be notified,
+        * so we haven't changed that yet.
+        */
+       vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_iotlb);
+
+/**
+ * vringh_complete_iotlb - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_iotlb.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_iotlb() after one or more calls
+ * to this function.
+ */
+int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
+{
+       struct vring_used_elem used;
+
+       used.id = cpu_to_vringh32(vrh, head);
+       used.len = cpu_to_vringh32(vrh, len);
+
+       return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb);
+}
+EXPORT_SYMBOL(vringh_complete_iotlb);
+
+/**
+ * vringh_notify_enable_iotlb - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_iotlb(struct vringh *vrh)
+{
+       return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_notify_enable_iotlb);
+
+/**
+ * vringh_notify_disable_iotlb - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_iotlb(struct vringh *vrh)
+{
+       __vringh_notify_disable(vrh, putu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_notify_disable_iotlb);
+
+/**
+ * vringh_need_notify_iotlb - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_iotlb() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_iotlb(struct vringh *vrh)
+{
+       return __vringh_need_notify(vrh, getu16_iotlb);
+}
+EXPORT_SYMBOL(vringh_need_notify_iotlb);
+
+
 MODULE_LICENSE("GPL");
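
Putting the new *_iotlb entry points together, a consumer of this API might look roughly like the sketch below. It assumes the vringh was set up with vringh_init_iotlb() and vringh_set_iotlb() beforehand; vringh_kiov_init()/vringh_kiov_cleanup() are the existing helpers from vringh.h, and buf/buflen are placeholders.

    static int consume_one(struct vringh *vrh, void *buf, size_t buflen)
    {
            struct vringh_kiov riov, wiov;
            ssize_t got;
            u16 head;
            int err;

            vringh_kiov_init(&riov, NULL, 0);
            vringh_kiov_init(&wiov, NULL, 0);

            err = vringh_getdesc_iotlb(vrh, &riov, &wiov, &head, GFP_KERNEL);
            if (err <= 0)                   /* 0: ring empty, < 0: error */
                    goto out;

            got = vringh_iov_pull_iotlb(vrh, &riov, buf, buflen);
            if (got < 0) {
                    err = got;
                    goto out;
            }

            vringh_complete_iotlb(vrh, head, got);
            if (vringh_need_notify_iotlb(vrh) > 0) {
                    /* kick the other side here (eventfd, interrupt, ...) */
            }
            err = 1;
    out:
            vringh_kiov_cleanup(&riov);
            vringh_kiov_cleanup(&wiov);
            return err;
    }
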
index c2d7d57..9766948 100644 (file)
@@ -621,7 +621,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
 
        vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
                       UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
-                      VHOST_VSOCK_WEIGHT);
+                      VHOST_VSOCK_WEIGHT, NULL);
 
        file->private_data = vsock;
        spin_lock_init(&vsock->send_pkt_list_lock);
index 68f7592..25ef0cb 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/delay.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/fb.h>
 #include <linux/lcd.h>
 #include <linux/spi/spi.h>
@@ -90,9 +90,8 @@ struct corgi_lcd {
        int     mode;
        char    buf[2];
 
-       int     gpio_backlight_on;
-       int     gpio_backlight_cont;
-       int     gpio_backlight_cont_inverted;
+       struct gpio_desc *backlight_on;
+       struct gpio_desc *backlight_cont;
 
        void (*kick_battery)(void);
 };
@@ -403,13 +402,13 @@ static int corgi_bl_set_intensity(struct corgi_lcd *lcd, int intensity)
        corgi_ssp_lcdtg_send(lcd, DUTYCTRL_ADRS, intensity);
 
        /* Bit 5 via GPIO_BACKLIGHT_CONT */
-       cont = !!(intensity & 0x20) ^ lcd->gpio_backlight_cont_inverted;
+       cont = !!(intensity & 0x20);
 
-       if (gpio_is_valid(lcd->gpio_backlight_cont))
-               gpio_set_value_cansleep(lcd->gpio_backlight_cont, cont);
+       if (lcd->backlight_cont)
+               gpiod_set_value_cansleep(lcd->backlight_cont, cont);
 
-       if (gpio_is_valid(lcd->gpio_backlight_on))
-               gpio_set_value_cansleep(lcd->gpio_backlight_on, intensity);
+       if (lcd->backlight_on)
+               gpiod_set_value_cansleep(lcd->backlight_on, intensity);
 
        if (lcd->kick_battery)
                lcd->kick_battery();
@@ -482,48 +481,17 @@ static int setup_gpio_backlight(struct corgi_lcd *lcd,
                                struct corgi_lcd_platform_data *pdata)
 {
        struct spi_device *spi = lcd->spi_dev;
-       int err;
-
-       lcd->gpio_backlight_on = -1;
-       lcd->gpio_backlight_cont = -1;
-
-       if (gpio_is_valid(pdata->gpio_backlight_on)) {
-               err = devm_gpio_request(&spi->dev, pdata->gpio_backlight_on,
-                                       "BL_ON");
-               if (err) {
-                       dev_err(&spi->dev,
-                               "failed to request GPIO%d for backlight_on\n",
-                               pdata->gpio_backlight_on);
-                       return err;
-               }
-
-               lcd->gpio_backlight_on = pdata->gpio_backlight_on;
-               gpio_direction_output(lcd->gpio_backlight_on, 0);
-       }
 
-       if (gpio_is_valid(pdata->gpio_backlight_cont)) {
-               err = devm_gpio_request(&spi->dev, pdata->gpio_backlight_cont,
-                                       "BL_CONT");
-               if (err) {
-                       dev_err(&spi->dev,
-                               "failed to request GPIO%d for backlight_cont\n",
-                               pdata->gpio_backlight_cont);
-                       return err;
-               }
-
-               lcd->gpio_backlight_cont = pdata->gpio_backlight_cont;
-
-               /* spitz and akita use both GPIOs for backlight, and
-                * have inverted polarity of GPIO_BACKLIGHT_CONT
-                */
-               if (gpio_is_valid(lcd->gpio_backlight_on)) {
-                       lcd->gpio_backlight_cont_inverted = 1;
-                       gpio_direction_output(lcd->gpio_backlight_cont, 1);
-               } else {
-                       lcd->gpio_backlight_cont_inverted = 0;
-                       gpio_direction_output(lcd->gpio_backlight_cont, 0);
-               }
-       }
+       lcd->backlight_on = devm_gpiod_get_optional(&spi->dev,
+                                                   "BL_ON", GPIOD_OUT_LOW);
+       if (IS_ERR(lcd->backlight_on))
+               return PTR_ERR(lcd->backlight_on);
+
+       lcd->backlight_cont = devm_gpiod_get_optional(&spi->dev, "BL_CONT",
+                                                     GPIOD_OUT_LOW);
+       if (IS_ERR(lcd->backlight_cont))
+               return PTR_ERR(lcd->backlight_cont);
+
        return 0;
 }
 
index efb4efc..82b8d75 100644 (file)
@@ -7,7 +7,6 @@
 
 #include <linux/delay.h>
 #include <linux/gpio/consumer.h>
-#include <linux/gpio.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -258,8 +257,6 @@ static int pwm_backlight_parse_dt(struct device *dev,
                             &data->post_pwm_on_delay);
        of_property_read_u32(node, "pwm-off-delay-ms", &data->pwm_off_delay);
 
-       data->enable_gpio = -EINVAL;
-
        /*
         * Determine the number of brightness levels, if this property is not
         * set a default table of brightness levels will be used.
@@ -503,22 +500,6 @@ static int pwm_backlight_probe(struct platform_device *pdev)
        }
 
        /*
-        * Compatibility fallback for drivers still using the integer GPIO
-        * platform data. Must go away soon.
-        */
-       if (!pb->enable_gpio && gpio_is_valid(data->enable_gpio)) {
-               ret = devm_gpio_request_one(&pdev->dev, data->enable_gpio,
-                                           GPIOF_OUT_INIT_HIGH, "enable");
-               if (ret < 0) {
-                       dev_err(&pdev->dev, "failed to request GPIO#%d: %d\n",
-                               data->enable_gpio, ret);
-                       goto err_alloc;
-               }
-
-               pb->enable_gpio = gpio_to_desc(data->enable_gpio);
-       }
-
-       /*
         * If the GPIO is not known to be already configured as output, that
         * is, if gpiod_get_direction returns either 1 or -EINVAL, change the
         * direction to output and set the GPIO as active.
index 2833578..9d28a8e 100644 (file)
@@ -1282,6 +1282,9 @@ finished:
        if (!con_is_bound(&fb_con))
                fbcon_exit();
 
+       if (vc->vc_num == logo_shown)
+               logo_shown = FBCON_LOGO_CANSHOW;
+
        return;
 }
 
index 078615c..2bbf94b 100644 (file)
@@ -43,6 +43,19 @@ config VIRTIO_PCI_LEGACY
 
          If unsure, say Y.
 
+config VIRTIO_VDPA
+       tristate "vDPA driver for virtio devices"
+       select VDPA
+       select VIRTIO
+       help
+         This driver provides support for virtio-based paravirtual
+         device drivers over the vDPA bus. For this to be useful, you
+         need an appropriate vDPA device implementation that operates
+         on a physical device to allow the virtio datapath to be
+         offloaded to hardware.
+
+         If unsure, say M.
+
 config VIRTIO_PMEM
        tristate "Support for virtio pmem driver"
        depends on VIRTIO
@@ -58,6 +71,7 @@ config VIRTIO_BALLOON
        tristate "Virtio balloon driver"
        depends on VIRTIO
        select MEMORY_BALLOON
+       select PAGE_REPORTING
        ---help---
         This driver supports increasing and decreasing the amount
         of memory within a KVM guest.
index 3a2b5c5..29a1386 100644 (file)
@@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
 virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
 obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
+obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o
index 341458f..0ef1656 100644 (file)
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/balloon_compaction.h>
+#include <linux/oom.h>
 #include <linux/wait.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
 #include <linux/magic.h>
 #include <linux/pseudo_fs.h>
+#include <linux/page_reporting.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -27,7 +29,9 @@
  */
 #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
 #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
-#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
+/* Maximum number of (4k) pages to deflate on OOM notifications. */
+#define VIRTIO_BALLOON_OOM_NR_PAGES 256
+#define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80
 
 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
                                             __GFP_NOMEMALLOC)
@@ -47,6 +51,7 @@ enum virtio_balloon_vq {
        VIRTIO_BALLOON_VQ_DEFLATE,
        VIRTIO_BALLOON_VQ_STATS,
        VIRTIO_BALLOON_VQ_FREE_PAGE,
+       VIRTIO_BALLOON_VQ_REPORTING,
        VIRTIO_BALLOON_VQ_MAX
 };
 
@@ -112,8 +117,15 @@ struct virtio_balloon {
        /* Memory statistics */
        struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
 
-       /* To register a shrinker to shrink memory upon memory pressure */
+       /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */
        struct shrinker shrinker;
+
+       /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */
+       struct notifier_block oom_nb;
+
+       /* Free page reporting device */
+       struct virtqueue *reporting_vq;
+       struct page_reporting_dev_info pr_dev_info;
 };
 
 static struct virtio_device_id id_table[] = {
@@ -153,6 +165,33 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 
 }
 
+int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+                                  struct scatterlist *sg, unsigned int nents)
+{
+       struct virtio_balloon *vb =
+               container_of(pr_dev_info, struct virtio_balloon, pr_dev_info);
+       struct virtqueue *vq = vb->reporting_vq;
+       unsigned int unused, err;
+
+       /* We should always be able to add these buffers to an empty queue. */
+       err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN);
+
+       /*
+        * In the extremely unlikely case that an error does occur, simply
+        * display a warning and exit without actually processing the pages.
+        */
+       if (WARN_ON_ONCE(err))
+               return err;
+
+       virtqueue_kick(vq);
+
+       /* When host has read buffer, this completes via balloon_ack */
+       wait_event(vb->acked, virtqueue_get_buf(vq, &unused));
+
+       return 0;
+}
+
 static void set_page_pfns(struct virtio_balloon *vb,
                          __virtio32 pfns[], struct page *page)
 {
@@ -481,6 +520,7 @@ static int init_vqs(struct virtio_balloon *vb)
        names[VIRTIO_BALLOON_VQ_STATS] = NULL;
        callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
        names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
+       names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
 
        if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
                names[VIRTIO_BALLOON_VQ_STATS] = "stats";
@@ -492,6 +532,11 @@ static int init_vqs(struct virtio_balloon *vb)
                callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
        }
 
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+               names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq";
+               callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
+       }
+
        err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
                                         vqs, callbacks, names, NULL, NULL);
        if (err)
@@ -524,6 +569,9 @@ static int init_vqs(struct virtio_balloon *vb)
        if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
                vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE];
 
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+               vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
+
        return 0;
 }
 
@@ -788,50 +836,13 @@ static unsigned long shrink_free_pages(struct virtio_balloon *vb,
        return blocks_freed * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
 }
 
-static unsigned long leak_balloon_pages(struct virtio_balloon *vb,
-                                          unsigned long pages_to_free)
-{
-       return leak_balloon(vb, pages_to_free * VIRTIO_BALLOON_PAGES_PER_PAGE) /
-               VIRTIO_BALLOON_PAGES_PER_PAGE;
-}
-
-static unsigned long shrink_balloon_pages(struct virtio_balloon *vb,
-                                         unsigned long pages_to_free)
-{
-       unsigned long pages_freed = 0;
-
-       /*
-        * One invocation of leak_balloon can deflate at most
-        * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
-        * multiple times to deflate pages till reaching pages_to_free.
-        */
-       while (vb->num_pages && pages_freed < pages_to_free)
-               pages_freed += leak_balloon_pages(vb,
-                                                 pages_to_free - pages_freed);
-
-       update_balloon_size(vb);
-
-       return pages_freed;
-}
-
 static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
                                                  struct shrink_control *sc)
 {
-       unsigned long pages_to_free, pages_freed = 0;
        struct virtio_balloon *vb = container_of(shrinker,
                                        struct virtio_balloon, shrinker);
 
-       pages_to_free = sc->nr_to_scan;
-
-       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
-               pages_freed = shrink_free_pages(vb, pages_to_free);
-
-       if (pages_freed >= pages_to_free)
-               return pages_freed;
-
-       pages_freed += shrink_balloon_pages(vb, pages_to_free - pages_freed);
-
-       return pages_freed;
+       return shrink_free_pages(vb, sc->nr_to_scan);
 }
 
 static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
@@ -839,12 +850,22 @@ static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
 {
        struct virtio_balloon *vb = container_of(shrinker,
                                        struct virtio_balloon, shrinker);
-       unsigned long count;
 
-       count = vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE;
-       count += vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+       return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES;
+}
+
+static int virtio_balloon_oom_notify(struct notifier_block *nb,
+                                    unsigned long dummy, void *parm)
+{
+       struct virtio_balloon *vb = container_of(nb,
+                                                struct virtio_balloon, oom_nb);
+       unsigned long *freed = parm;
+
+       *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) /
+                 VIRTIO_BALLOON_PAGES_PER_PAGE;
+       update_balloon_size(vb);
 
-       return count;
+       return NOTIFY_OK;
 }
 
 static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
@@ -864,7 +885,6 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
 static int virtballoon_probe(struct virtio_device *vdev)
 {
        struct virtio_balloon *vb;
-       __u32 poison_val;
        int err;
 
        if (!vdev->config->get) {
@@ -930,27 +950,65 @@ static int virtballoon_probe(struct virtio_device *vdev)
                                                  VIRTIO_BALLOON_CMD_ID_STOP);
                spin_lock_init(&vb->free_page_list_lock);
                INIT_LIST_HEAD(&vb->free_page_list);
-               if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
-                       memset(&poison_val, PAGE_POISON, sizeof(poison_val));
-                       virtio_cwrite(vb->vdev, struct virtio_balloon_config,
-                                     poison_val, &poison_val);
-               }
-       }
-       /*
-        * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a
-        * shrinker needs to be registered to relieve memory pressure.
-        */
-       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
+               /*
+                * We're allowed to reuse any free pages, even if they are
+                * still to be processed by the host.
+                */
                err = virtio_balloon_register_shrinker(vb);
                if (err)
                        goto out_del_balloon_wq;
        }
+
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
+               vb->oom_nb.notifier_call = virtio_balloon_oom_notify;
+               vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY;
+               err = register_oom_notifier(&vb->oom_nb);
+               if (err < 0)
+                       goto out_unregister_shrinker;
+       }
+
+       if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
+               /* Start with poison val of 0 representing general init */
+               __u32 poison_val = 0;
+
+               /*
+                * Let the hypervisor know that we are expecting a
+                * specific value to be written back in balloon pages.
+                */
+               if (!want_init_on_free())
+                       memset(&poison_val, PAGE_POISON, sizeof(poison_val));
+
+               virtio_cwrite(vb->vdev, struct virtio_balloon_config,
+                             poison_val, &poison_val);
+       }
+
+       vb->pr_dev_info.report = virtballoon_free_page_report;
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) {
+               unsigned int capacity;
+
+               capacity = virtqueue_get_vring_size(vb->reporting_vq);
+               if (capacity < PAGE_REPORTING_CAPACITY) {
+                       err = -ENOSPC;
+                       goto out_unregister_oom;
+               }
+
+               err = page_reporting_register(&vb->pr_dev_info);
+               if (err)
+                       goto out_unregister_oom;
+       }
+
        virtio_device_ready(vdev);
 
        if (towards_target(vb))
                virtballoon_changed(vdev);
        return 0;
 
+out_unregister_oom:
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+               unregister_oom_notifier(&vb->oom_nb);
+out_unregister_shrinker:
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
+               virtio_balloon_unregister_shrinker(vb);
 out_del_balloon_wq:
        if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
                destroy_workqueue(vb->balloon_wq);
@@ -989,7 +1047,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
 {
        struct virtio_balloon *vb = vdev->priv;
 
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+               page_reporting_unregister(&vb->pr_dev_info);
        if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+               unregister_oom_notifier(&vb->oom_nb);
+       if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
                virtio_balloon_unregister_shrinker(vb);
        spin_lock_irq(&vb->stop_update_lock);
        vb->stop_update = true;
@@ -1045,7 +1107,10 @@ static int virtballoon_restore(struct virtio_device *vdev)
 
 static int virtballoon_validate(struct virtio_device *vdev)
 {
-       if (!page_poisoning_enabled())
+       /* Tell the host whether we care about poisoned pages. */
+       if (!want_init_on_free() &&
+           (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY) ||
+            !page_poisoning_enabled()))
                __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
 
        __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
@@ -1058,6 +1123,7 @@ static unsigned int features[] = {
        VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
        VIRTIO_BALLOON_F_FREE_PAGE_HINT,
        VIRTIO_BALLOON_F_PAGE_POISON,
+       VIRTIO_BALLOON_F_REPORTING,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
new file mode 100644 (file)
index 0000000..c30eb55
--- /dev/null
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VIRTIO based driver for vDPA device
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/virtio.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+#define MOD_VERSION  "0.1"
+#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define MOD_DESC     "vDPA bus driver for virtio devices"
+#define MOD_LICENSE  "GPL v2"
+
+struct virtio_vdpa_device {
+       struct virtio_device vdev;
+       struct vdpa_device *vdpa;
+       u64 features;
+
+       /* The lock to protect virtqueue list */
+       spinlock_t lock;
+       /* List of virtio_vdpa_vq_info */
+       struct list_head virtqueues;
+};
+
+struct virtio_vdpa_vq_info {
+       /* the actual virtqueue */
+       struct virtqueue *vq;
+
+       /* the list node for the virtqueues list */
+       struct list_head node;
+};
+
+static inline struct virtio_vdpa_device *
+to_virtio_vdpa_device(struct virtio_device *dev)
+{
+       return container_of(dev, struct virtio_vdpa_device, vdev);
+}
+
+static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev)
+{
+       return to_virtio_vdpa_device(vdev)->vdpa;
+}
+
+static void virtio_vdpa_get(struct virtio_device *vdev, unsigned offset,
+                           void *buf, unsigned len)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       ops->get_config(vdpa, offset, buf, len);
+}
+
+static void virtio_vdpa_set(struct virtio_device *vdev, unsigned offset,
+                           const void *buf, unsigned len)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       ops->set_config(vdpa, offset, buf, len);
+}
+
+static u32 virtio_vdpa_generation(struct virtio_device *vdev)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       if (ops->get_generation)
+               return ops->get_generation(vdpa);
+
+       return 0;
+}
+
+static u8 virtio_vdpa_get_status(struct virtio_device *vdev)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       return ops->get_status(vdpa);
+}
+
+static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       return ops->set_status(vdpa, status);
+}
+
+static void virtio_vdpa_reset(struct virtio_device *vdev)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       return ops->set_status(vdpa, 0);
+}
+
+static bool virtio_vdpa_notify(struct virtqueue *vq)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       ops->kick_vq(vdpa, vq->index);
+
+       return true;
+}
+
+static irqreturn_t virtio_vdpa_config_cb(void *private)
+{
+       struct virtio_vdpa_device *vd_dev = private;
+
+       virtio_config_changed(&vd_dev->vdev);
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
+{
+       struct virtio_vdpa_vq_info *info = private;
+
+       return vring_interrupt(0, info->vq);
+}
+
+static struct virtqueue *
+virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
+                    void (*callback)(struct virtqueue *vq),
+                    const char *name, bool ctx)
+{
+       struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct virtio_vdpa_vq_info *info;
+       struct vdpa_callback cb;
+       struct virtqueue *vq;
+       u64 desc_addr, driver_addr, device_addr;
+       unsigned long flags;
+       u32 align, num;
+       int err;
+
+       if (!name)
+               return NULL;
+
+       /* Queue shouldn't already be set up. */
+       if (ops->get_vq_ready(vdpa, index))
+               return ERR_PTR(-ENOENT);
+
+       /* Allocate and fill out our active queue description */
+       info = kmalloc(sizeof(*info), GFP_KERNEL);
+       if (!info)
+               return ERR_PTR(-ENOMEM);
+
+       num = ops->get_vq_num_max(vdpa);
+       if (num == 0) {
+               err = -ENOENT;
+               goto error_new_virtqueue;
+       }
+
+       /* Create the vring */
+       align = ops->get_vq_align(vdpa);
+       vq = vring_create_virtqueue(index, num, align, vdev,
+                                   true, true, ctx,
+                                   virtio_vdpa_notify, callback, name);
+       if (!vq) {
+               err = -ENOMEM;
+               goto error_new_virtqueue;
+       }
+
+       /* Setup virtqueue callback */
+       cb.callback = virtio_vdpa_virtqueue_cb;
+       cb.private = info;
+       ops->set_vq_cb(vdpa, index, &cb);
+       ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
+
+       desc_addr = virtqueue_get_desc_addr(vq);
+       driver_addr = virtqueue_get_avail_addr(vq);
+       device_addr = virtqueue_get_used_addr(vq);
+
+       if (ops->set_vq_address(vdpa, index,
+                               desc_addr, driver_addr,
+                               device_addr)) {
+               err = -EINVAL;
+               goto err_vq;
+       }
+
+       ops->set_vq_ready(vdpa, index, 1);
+
+       vq->priv = info;
+       info->vq = vq;
+
+       spin_lock_irqsave(&vd_dev->lock, flags);
+       list_add(&info->node, &vd_dev->virtqueues);
+       spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+       return vq;
+
+err_vq:
+       vring_del_virtqueue(vq);
+error_new_virtqueue:
+       ops->set_vq_ready(vdpa, index, 0);
+       /* VDPA driver should make sure vq is stopped here */
+       WARN_ON(ops->get_vq_ready(vdpa, index));
+       kfree(info);
+       return ERR_PTR(err);
+}
+
+static void virtio_vdpa_del_vq(struct virtqueue *vq)
+{
+       struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
+       struct vdpa_device *vdpa = vd_dev->vdpa;
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct virtio_vdpa_vq_info *info = vq->priv;
+       unsigned int index = vq->index;
+       unsigned long flags;
+
+       spin_lock_irqsave(&vd_dev->lock, flags);
+       list_del(&info->node);
+       spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+       /* Select and deactivate the queue */
+       ops->set_vq_ready(vdpa, index, 0);
+       WARN_ON(ops->get_vq_ready(vdpa, index));
+
+       vring_del_virtqueue(vq);
+
+       kfree(info);
+}
+
+static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
+{
+       struct virtqueue *vq, *n;
+
+       list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+               virtio_vdpa_del_vq(vq);
+}
+
+static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+                               struct virtqueue *vqs[],
+                               vq_callback_t *callbacks[],
+                               const char * const names[],
+                               const bool *ctx,
+                               struct irq_affinity *desc)
+{
+       struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct vdpa_callback cb;
+       int i, err, queue_idx = 0;
+
+       for (i = 0; i < nvqs; ++i) {
+               if (!names[i]) {
+                       vqs[i] = NULL;
+                       continue;
+               }
+
+               vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++,
+                                             callbacks[i], names[i], ctx ?
+                                             ctx[i] : false);
+               if (IS_ERR(vqs[i])) {
+                       err = PTR_ERR(vqs[i]);
+                       goto err_setup_vq;
+               }
+       }
+
+       cb.callback = virtio_vdpa_config_cb;
+       cb.private = vd_dev;
+       ops->set_config_cb(vdpa, &cb);
+
+       return 0;
+
+err_setup_vq:
+       virtio_vdpa_del_vqs(vdev);
+       return err;
+}
+
+static u64 virtio_vdpa_get_features(struct virtio_device *vdev)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       return ops->get_features(vdpa);
+}
+
+static int virtio_vdpa_finalize_features(struct virtio_device *vdev)
+{
+       struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+       const struct vdpa_config_ops *ops = vdpa->config;
+
+       /* Give virtio_ring a chance to accept features. */
+       vring_transport_features(vdev);
+
+       return ops->set_features(vdpa, vdev->features);
+}
+
+static const char *virtio_vdpa_bus_name(struct virtio_device *vdev)
+{
+       struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+       struct vdpa_device *vdpa = vd_dev->vdpa;
+
+       return dev_name(&vdpa->dev);
+}
+
+static const struct virtio_config_ops virtio_vdpa_config_ops = {
+       .get            = virtio_vdpa_get,
+       .set            = virtio_vdpa_set,
+       .generation     = virtio_vdpa_generation,
+       .get_status     = virtio_vdpa_get_status,
+       .set_status     = virtio_vdpa_set_status,
+       .reset          = virtio_vdpa_reset,
+       .find_vqs       = virtio_vdpa_find_vqs,
+       .del_vqs        = virtio_vdpa_del_vqs,
+       .get_features   = virtio_vdpa_get_features,
+       .finalize_features = virtio_vdpa_finalize_features,
+       .bus_name       = virtio_vdpa_bus_name,
+};
+
+static void virtio_vdpa_release_dev(struct device *_d)
+{
+       struct virtio_device *vdev =
+              container_of(_d, struct virtio_device, dev);
+       struct virtio_vdpa_device *vd_dev =
+              container_of(vdev, struct virtio_vdpa_device, vdev);
+
+       kfree(vd_dev);
+}
+
+static int virtio_vdpa_probe(struct vdpa_device *vdpa)
+{
+       const struct vdpa_config_ops *ops = vdpa->config;
+       struct virtio_vdpa_device *vd_dev, *reg_dev = NULL;
+       int ret = -EINVAL;
+
+       vd_dev = kzalloc(sizeof(*vd_dev), GFP_KERNEL);
+       if (!vd_dev)
+               return -ENOMEM;
+
+       vd_dev->vdev.dev.parent = vdpa_get_dma_dev(vdpa);
+       vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
+       vd_dev->vdev.config = &virtio_vdpa_config_ops;
+       vd_dev->vdpa = vdpa;
+       INIT_LIST_HEAD(&vd_dev->virtqueues);
+       spin_lock_init(&vd_dev->lock);
+
+       vd_dev->vdev.id.device = ops->get_device_id(vdpa);
+       if (vd_dev->vdev.id.device == 0)
+               goto err;
+
+       vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
+       ret = register_virtio_device(&vd_dev->vdev);
+       reg_dev = vd_dev;
+       if (ret)
+               goto err;
+
+       vdpa_set_drvdata(vdpa, vd_dev);
+
+       return 0;
+
+err:
+       if (reg_dev)
+               put_device(&vd_dev->vdev.dev);
+       else
+               kfree(vd_dev);
+       return ret;
+}
+
+static void virtio_vdpa_remove(struct vdpa_device *vdpa)
+{
+       struct virtio_vdpa_device *vd_dev = vdpa_get_drvdata(vdpa);
+
+       unregister_virtio_device(&vd_dev->vdev);
+}
+
+static struct vdpa_driver virtio_vdpa_driver = {
+       .driver = {
+               .name   = "virtio_vdpa",
+       },
+       .probe  = virtio_vdpa_probe,
+       .remove = virtio_vdpa_remove,
+};
+
+module_vdpa_driver(virtio_vdpa_driver);
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_LICENSE(MOD_LICENSE);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_DESCRIPTION(MOD_DESC);
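
One subtlety in virtio_vdpa_find_vqs() above: a NULL entry in names[] means the caller does not want that virtqueue, so the slot is filled with NULL and queue_idx is not advanced; only named queues consume hardware queue indices. A standalone illustration of that indexing, written as plain userspace C with made-up queue names, is sketched below.

#include <stdio.h>

/*
 * Illustration only: mirrors the indexing loop of virtio_vdpa_find_vqs().
 * A NULL name yields a NULL virtqueue slot and does not consume a hardware
 * queue index, so "tx" below lands on hardware queue 1, not 2.
 */
int main(void)
{
        const char *names[] = { "rx", NULL, "tx" };
        int queue_idx = 0;

        for (int i = 0; i < 3; i++) {
                if (!names[i]) {
                        printf("vqs[%d] = NULL\n", i);
                        continue;
                }
                printf("vqs[%d] = \"%s\" -> hw queue %d\n", i, names[i], queue_idx++);
        }
        return 0;
}
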
index 9ea2b43..0663c60 100644 (file)
@@ -584,6 +584,14 @@ config DAVINCI_WATCHDOG
          NOTE: once enabled, this timer cannot be disabled.
          Say N if you are unsure.
 
+config K3_RTI_WATCHDOG
+       tristate "Texas Instruments K3 RTI watchdog"
+       depends on ARCH_K3 || COMPILE_TEST
+       select WATCHDOG_CORE
+       help
+         Say Y here if you want to include support for the K3 watchdog
+         timer (RTI module) available in the K3 generation of processors.
+
 config ORION_WATCHDOG
        tristate "Orion watchdog"
        depends on ARCH_ORION5X || ARCH_DOVE || MACH_DOVE || ARCH_MVEBU || (COMPILE_TEST && !ARCH_EBSA110)
index 2ee352b..6de2e4c 100644 (file)
@@ -57,6 +57,7 @@ obj-$(CONFIG_EP93XX_WATCHDOG) += ep93xx_wdt.o
 obj-$(CONFIG_PNX4008_WATCHDOG) += pnx4008_wdt.o
 obj-$(CONFIG_IOP_WATCHDOG) += iop_wdt.o
 obj-$(CONFIG_DAVINCI_WATCHDOG) += davinci_wdt.o
+obj-$(CONFIG_K3_RTI_WATCHDOG) += rti_wdt.o
 obj-$(CONFIG_ORION_WATCHDOG) += orion_wdt.o
 obj-$(CONFIG_SUNXI_WATCHDOG) += sunxi_wdt.o
 obj-$(CONFIG_RN5T618_WATCHDOG) += rn5t618_wdt.o
index f8d58bf..1fe472f 100644 (file)
@@ -244,6 +244,11 @@ static const struct regmap_config imx2_wdt_regmap_config = {
        .max_register = 0x8,
 };
 
+static void imx2_wdt_action(void *data)
+{
+       clk_disable_unprepare(data);
+}
+
 static int __init imx2_wdt_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
@@ -292,6 +297,10 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
        if (ret)
                return ret;
 
+       ret = devm_add_action_or_reset(dev, imx2_wdt_action, wdev->clk);
+       if (ret)
+               return ret;
+
        regmap_read(wdev->regmap, IMX2_WDT_WRSR, &val);
        wdog->bootstatus = val & IMX2_WDT_WRSR_TOUT ? WDIOF_CARDRESET : 0;
 
@@ -315,32 +324,7 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
         */
        regmap_write(wdev->regmap, IMX2_WDT_WMCR, 0);
 
-       ret = watchdog_register_device(wdog);
-       if (ret)
-               goto disable_clk;
-
-       dev_info(dev, "timeout %d sec (nowayout=%d)\n",
-                wdog->timeout, nowayout);
-
-       return 0;
-
-disable_clk:
-       clk_disable_unprepare(wdev->clk);
-       return ret;
-}
-
-static int __exit imx2_wdt_remove(struct platform_device *pdev)
-{
-       struct watchdog_device *wdog = platform_get_drvdata(pdev);
-       struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
-
-       watchdog_unregister_device(wdog);
-
-       if (imx2_wdt_is_running(wdev)) {
-               imx2_wdt_ping(wdog);
-               dev_crit(&pdev->dev, "Device removed: Expect reboot!\n");
-       }
-       return 0;
+       return devm_watchdog_register_device(dev, wdog);
 }
 
 static void imx2_wdt_shutdown(struct platform_device *pdev)
@@ -417,7 +401,6 @@ static const struct of_device_id imx2_wdt_dt_ids[] = {
 MODULE_DEVICE_TABLE(of, imx2_wdt_dt_ids);
 
 static struct platform_driver imx2_wdt_driver = {
-       .remove         = __exit_p(imx2_wdt_remove),
        .shutdown       = imx2_wdt_shutdown,
        .driver         = {
                .name   = DRIVER_NAME,
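
The imx2_wdt change above drops the explicit error label and the remove() callback by leaning on device-managed cleanup: clk_disable_unprepare() is queued with devm_add_action_or_reset(), and the watchdog is registered with devm_watchdog_register_device(), so both are undone automatically when a later probe step fails or the device is unbound. A minimal sketch of that pattern follows; the my_* names are hypothetical and the watchdog_device setup is abbreviated.

#include <linux/clk.h>
#include <linux/platform_device.h>
#include <linux/watchdog.h>

/* Sketch only, hypothetical my_* names: devm-managed cleanup as used above. */
static void my_clk_disable(void *data)
{
        clk_disable_unprepare(data);
}

static int my_wdt_probe(struct platform_device *pdev)
{
        struct device *dev = &pdev->dev;
        struct watchdog_device *wdd;
        struct clk *clk;
        int ret;

        wdd = devm_kzalloc(dev, sizeof(*wdd), GFP_KERNEL);
        if (!wdd)
                return -ENOMEM;
        /* wdd->info, wdd->ops and the timeouts would be filled in here */

        clk = devm_clk_get(dev, NULL);
        if (IS_ERR(clk))
                return PTR_ERR(clk);

        ret = clk_prepare_enable(clk);
        if (ret)
                return ret;

        /* undoes clk_prepare_enable() on any later failure and on unbind */
        ret = devm_add_action_or_reset(dev, my_clk_disable, clk);
        if (ret)
                return ret;

        /* unregistered automatically on driver detach; no remove() needed */
        return devm_watchdog_register_device(dev, wdd);
}
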
index 11b9e7c..7993c8c 100644 (file)
@@ -4,7 +4,6 @@
  */
 
 #include <linux/clk.h>
-#include <linux/init.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
index 8ed89f0..60a3246 100644 (file)
@@ -6,13 +6,11 @@
 #include <linux/arm-smccc.h>
 #include <linux/firmware/imx/sci.h>
 #include <linux/io.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/reboot.h>
 #include <linux/watchdog.h>
 
 #define DEFAULT_TIMEOUT 60
index 9c773c3..765577f 100644 (file)
@@ -103,30 +103,29 @@ static int npcm_wdt_stop(struct watchdog_device *wdd)
        return 0;
 }
 
-
 static int npcm_wdt_set_timeout(struct watchdog_device *wdd,
                                unsigned int timeout)
 {
        if (timeout < 2)
                wdd->timeout = 1;
        else if (timeout < 3)
-             wdd->timeout = 2;
+               wdd->timeout = 2;
        else if (timeout < 6)
-             wdd->timeout = 5;
+               wdd->timeout = 5;
        else if (timeout < 11)
-             wdd->timeout = 10;
+               wdd->timeout = 10;
        else if (timeout < 22)
-             wdd->timeout = 21;
+               wdd->timeout = 21;
        else if (timeout < 44)
-             wdd->timeout = 43;
+               wdd->timeout = 43;
        else if (timeout < 87)
-             wdd->timeout = 86;
+               wdd->timeout = 86;
        else if (timeout < 173)
-             wdd->timeout = 172;
+               wdd->timeout = 172;
        else if (timeout < 688)
-             wdd->timeout = 687;
+               wdd->timeout = 687;
        else
-             wdd->timeout = 2750;
+               wdd->timeout = 2750;
 
        if (watchdog_active(wdd))
                npcm_wdt_start(wdd);
index 8e6dfe7..4ddb4ea 100644 (file)
@@ -52,7 +52,7 @@
 #define WDT_A370_RATIO         (1 << WDT_A370_RATIO_SHIFT)
 
 static bool nowayout = WATCHDOG_NOWAYOUT;
-static int heartbeat = -1;             /* module parameter (seconds) */
+static int heartbeat;          /* module parameter (seconds) */
 
 struct orion_watchdog;
 
index 1213179..0937b8d 100644 (file)
@@ -192,6 +192,7 @@ static int pm8916_wdt_probe(struct platform_device *pdev)
        wdt->wdev.timeout = PM8916_WDT_DEFAULT_TIMEOUT;
        wdt->wdev.pretimeout = 0;
        watchdog_set_drvdata(&wdt->wdev, wdt);
+       platform_set_drvdata(pdev, wdt);
 
        watchdog_init_timeout(&wdt->wdev, 0, dev);
        pm8916_wdt_configure_timers(&wdt->wdev);
@@ -199,6 +200,29 @@ static int pm8916_wdt_probe(struct platform_device *pdev)
        return devm_watchdog_register_device(dev, &wdt->wdev);
 }
 
+static int __maybe_unused pm8916_wdt_suspend(struct device *dev)
+{
+       struct pm8916_wdt *wdt = dev_get_drvdata(dev);
+
+       if (watchdog_active(&wdt->wdev))
+               return pm8916_wdt_stop(&wdt->wdev);
+
+       return 0;
+}
+
+static int __maybe_unused pm8916_wdt_resume(struct device *dev)
+{
+       struct pm8916_wdt *wdt = dev_get_drvdata(dev);
+
+       if (watchdog_active(&wdt->wdev))
+               return pm8916_wdt_start(&wdt->wdev);
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(pm8916_wdt_pm_ops, pm8916_wdt_suspend,
+                        pm8916_wdt_resume);
+
 static const struct of_device_id pm8916_wdt_id_table[] = {
        { .compatible = "qcom,pm8916-wdt" },
        { }
@@ -210,6 +234,7 @@ static struct platform_driver pm8916_wdt_driver = {
        .driver = {
                .name = "pm8916-wdt",
                .of_match_table = of_match_ptr(pm8916_wdt_id_table),
+               .pm = &pm8916_wdt_pm_ops,
        },
 };
 module_platform_driver(pm8916_wdt_driver);
index eb47fe5..ab7465d 100644 (file)
@@ -40,6 +40,11 @@ static const u32 reg_offset_data_kpss[] = {
        [WDT_BITE_TIME] = 0x14,
 };
 
+struct qcom_wdt_match_data {
+       const u32 *offset;
+       bool pretimeout;
+};
+
 struct qcom_wdt {
        struct watchdog_device  wdd;
        unsigned long           rate;
@@ -179,19 +184,29 @@ static void qcom_clk_disable_unprepare(void *data)
        clk_disable_unprepare(data);
 }
 
+static const struct qcom_wdt_match_data match_data_apcs_tmr = {
+       .offset = reg_offset_data_apcs_tmr,
+       .pretimeout = false,
+};
+
+static const struct qcom_wdt_match_data match_data_kpss = {
+       .offset = reg_offset_data_kpss,
+       .pretimeout = true,
+};
+
 static int qcom_wdt_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        struct qcom_wdt *wdt;
        struct resource *res;
        struct device_node *np = dev->of_node;
-       const u32 *regs;
+       const struct qcom_wdt_match_data *data;
        u32 percpu_offset;
        int irq, ret;
        struct clk *clk;
 
-       regs = of_device_get_match_data(dev);
-       if (!regs) {
+       data = of_device_get_match_data(dev);
+       if (!data) {
                dev_err(dev, "Unsupported QCOM WDT module\n");
                return -ENODEV;
        }
@@ -247,9 +262,8 @@ static int qcom_wdt_probe(struct platform_device *pdev)
 
        /* check if there is pretimeout support */
        irq = platform_get_irq_optional(pdev, 0);
-       if (irq > 0) {
-               ret = devm_request_irq(dev, irq, qcom_wdt_isr,
-                                      IRQF_TRIGGER_RISING,
+       if (data->pretimeout && irq > 0) {
+               ret = devm_request_irq(dev, irq, qcom_wdt_isr, 0,
                                       "wdt_bark", &wdt->wdd);
                if (ret)
                        return ret;
@@ -267,7 +281,7 @@ static int qcom_wdt_probe(struct platform_device *pdev)
        wdt->wdd.min_timeout = 1;
        wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
        wdt->wdd.parent = dev;
-       wdt->layout = regs;
+       wdt->layout = data->offset;
 
        if (readl(wdt_addr(wdt, WDT_STS)) & 1)
                wdt->wdd.bootstatus = WDIOF_CARDRESET;
@@ -311,9 +325,9 @@ static int __maybe_unused qcom_wdt_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(qcom_wdt_pm_ops, qcom_wdt_suspend, qcom_wdt_resume);
 
 static const struct of_device_id qcom_wdt_of_table[] = {
-       { .compatible = "qcom,kpss-timer", .data = reg_offset_data_apcs_tmr },
-       { .compatible = "qcom,scss-timer", .data = reg_offset_data_apcs_tmr },
-       { .compatible = "qcom,kpss-wdt", .data = reg_offset_data_kpss },
+       { .compatible = "qcom,kpss-timer", .data = &match_data_apcs_tmr },
+       { .compatible = "qcom,scss-timer", .data = &match_data_apcs_tmr },
+       { .compatible = "qcom,kpss-wdt", .data = &match_data_kpss },
        { },
 };
 MODULE_DEVICE_TABLE(of, qcom_wdt_of_table);
diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c
new file mode 100644 (file)
index 0000000..d456dd7
--- /dev/null
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Watchdog driver for the K3 RTI module
+ *
+ * (c) Copyright 2019-2020 Texas Instruments Inc.
+ * All rights reserved.
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+#define DEFAULT_HEARTBEAT 60
+
+/* Max heartbeat is calculated at 32kHz source clock */
+#define MAX_HEARTBEAT  1000
+
+/* Timer register set definition */
+#define RTIDWDCTRL     0x90
+#define RTIDWDPRLD     0x94
+#define RTIWDSTATUS    0x98
+#define RTIWDKEY       0x9c
+#define RTIDWDCNTR     0xa0
+#define RTIWWDRXCTRL   0xa4
+#define RTIWWDSIZECTRL 0xa8
+
+#define RTIWWDRX_NMI   0xa
+
+#define RTIWWDSIZE_50P 0x50
+
+#define WDENABLE_KEY   0xa98559da
+
+#define WDKEY_SEQ0             0xe51a
+#define WDKEY_SEQ1             0xa35c
+
+#define WDT_PRELOAD_SHIFT      13
+
+#define WDT_PRELOAD_MAX                0xfff
+
+#define DWDST                  BIT(1)
+
+static int heartbeat;
+
+/*
+ * struct to hold data for each WDT device
+ * @base - base io address of the WD device
+ * @freq - source clock frequency of the WDT
+ * @wdd  - watchdog device as registered with the WDT core
+ */
+struct rti_wdt_device {
+       void __iomem            *base;
+       unsigned long           freq;
+       struct watchdog_device  wdd;
+};
+
+static int rti_wdt_start(struct watchdog_device *wdd)
+{
+       u32 timer_margin;
+       struct rti_wdt_device *wdt = watchdog_get_drvdata(wdd);
+
+       /* set timeout period */
+       timer_margin = (u64)wdd->timeout * wdt->freq;
+       timer_margin >>= WDT_PRELOAD_SHIFT;
+       if (timer_margin > WDT_PRELOAD_MAX)
+               timer_margin = WDT_PRELOAD_MAX;
+       writel_relaxed(timer_margin, wdt->base + RTIDWDPRLD);
+
+       /*
+        * RTI only supports a windowed mode, where the watchdog can only
+        * be petted during the open window; neither too early nor too late.
+        * The HW configuration options only allow the open window to be at
+        * most 50% of the period; we obviously want to configure the open
+        * window as large as possible, so we select the 50% option. To
+        * avoid glitches we also accommodate a 5% safety margin, so we set
+        * min_hw_heartbeat to 55% of the timeout period.
+        */
+       wdd->min_hw_heartbeat_ms = 11 * wdd->timeout * 1000 / 20;
+
+       /* Generate NMI when wdt expires */
+       writel_relaxed(RTIWWDRX_NMI, wdt->base + RTIWWDRXCTRL);
+
+       /* Open window size 50%; this is the largest window size available */
+       writel_relaxed(RTIWWDSIZE_50P, wdt->base + RTIWWDSIZECTRL);
+
+       readl_relaxed(wdt->base + RTIWWDSIZECTRL);
+
+       /* enable watchdog */
+       writel_relaxed(WDENABLE_KEY, wdt->base + RTIDWDCTRL);
+       return 0;
+}
+
+static int rti_wdt_ping(struct watchdog_device *wdd)
+{
+       struct rti_wdt_device *wdt = watchdog_get_drvdata(wdd);
+
+       /* put watchdog in service state */
+       writel_relaxed(WDKEY_SEQ0, wdt->base + RTIWDKEY);
+       /* put watchdog in active state */
+       writel_relaxed(WDKEY_SEQ1, wdt->base + RTIWDKEY);
+
+       return 0;
+}
+
+static unsigned int rti_wdt_get_timeleft(struct watchdog_device *wdd)
+{
+       u64 timer_counter;
+       u32 val;
+       struct rti_wdt_device *wdt = watchdog_get_drvdata(wdd);
+
+       /* if timeout has occurred then return 0 */
+       val = readl_relaxed(wdt->base + RTIWDSTATUS);
+       if (val & DWDST)
+               return 0;
+
+       timer_counter = readl_relaxed(wdt->base + RTIDWDCNTR);
+
+       do_div(timer_counter, wdt->freq);
+
+       return timer_counter;
+}
+
+static const struct watchdog_info rti_wdt_info = {
+       .options = WDIOF_KEEPALIVEPING,
+       .identity = "K3 RTI Watchdog",
+};
+
+static const struct watchdog_ops rti_wdt_ops = {
+       .owner          = THIS_MODULE,
+       .start          = rti_wdt_start,
+       .ping           = rti_wdt_ping,
+       .get_timeleft   = rti_wdt_get_timeleft,
+};
+
+static int rti_wdt_probe(struct platform_device *pdev)
+{
+       int ret = 0;
+       struct device *dev = &pdev->dev;
+       struct resource *wdt_mem;
+       struct watchdog_device *wdd;
+       struct rti_wdt_device *wdt;
+       struct clk *clk;
+
+       wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL);
+       if (!wdt)
+               return -ENOMEM;
+
+       clk = clk_get(dev, NULL);
+       if (IS_ERR(clk)) {
+               if (PTR_ERR(clk) != -EPROBE_DEFER)
+                       dev_err(dev, "failed to get clock\n");
+               return PTR_ERR(clk);
+       }
+
+       wdt->freq = clk_get_rate(clk);
+
+       clk_put(clk);
+
+       if (!wdt->freq) {
+               dev_err(dev, "Failed to get fck rate.\n");
+               return -EINVAL;
+       }
+
+       pm_runtime_enable(dev);
+       ret = pm_runtime_get_sync(dev);
+       if (ret) {
+               if (ret != -EPROBE_DEFER)
+                       dev_err(&pdev->dev, "runtime pm failed\n");
+               return ret;
+       }
+
+       platform_set_drvdata(pdev, wdt);
+
+       wdd = &wdt->wdd;
+       wdd->info = &rti_wdt_info;
+       wdd->ops = &rti_wdt_ops;
+       wdd->min_timeout = 1;
+       wdd->max_hw_heartbeat_ms = (WDT_PRELOAD_MAX << WDT_PRELOAD_SHIFT) /
+               wdt->freq * 1000;
+       wdd->timeout = DEFAULT_HEARTBEAT;
+       wdd->parent = dev;
+
+       watchdog_init_timeout(wdd, heartbeat, dev);
+
+       watchdog_set_drvdata(wdd, wdt);
+       watchdog_set_nowayout(wdd, 1);
+       watchdog_set_restart_priority(wdd, 128);
+
+       wdt_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       wdt->base = devm_ioremap_resource(dev, wdt_mem);
+       if (IS_ERR(wdt->base)) {
+               ret = PTR_ERR(wdt->base);
+               goto err_iomap;
+       }
+
+       ret = watchdog_register_device(wdd);
+       if (ret) {
+               dev_err(dev, "cannot register watchdog device\n");
+               goto err_iomap;
+       }
+
+       return 0;
+
+err_iomap:
+       pm_runtime_put_sync(&pdev->dev);
+
+       return ret;
+}
+
+static int rti_wdt_remove(struct platform_device *pdev)
+{
+       struct rti_wdt_device *wdt = platform_get_drvdata(pdev);
+
+       watchdog_unregister_device(&wdt->wdd);
+       pm_runtime_put(&pdev->dev);
+
+       return 0;
+}
+
+static const struct of_device_id rti_wdt_of_match[] = {
+       { .compatible = "ti,j7-rti-wdt", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, rti_wdt_of_match);
+
+static struct platform_driver rti_wdt_driver = {
+       .driver = {
+               .name = "rti-wdt",
+               .of_match_table = rti_wdt_of_match,
+       },
+       .probe = rti_wdt_probe,
+       .remove = rti_wdt_remove,
+};
+
+module_platform_driver(rti_wdt_driver);
+
+MODULE_AUTHOR("Tero Kristo <t-kristo@ti.com>");
+MODULE_DESCRIPTION("K3 RTI Watchdog Driver");
+
+module_param(heartbeat, int, 0);
+MODULE_PARM_DESC(heartbeat,
+                "Watchdog heartbeat period in seconds from 1 to "
+                __MODULE_STRING(MAX_HEARTBEAT) ", default "
+                __MODULE_STRING(DEFAULT_HEARTBEAT));
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:rti-wdt");
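
The timing behaviour of the new rti_wdt driver follows from two pieces of arithmetic: the preload register is the timeout scaled by the functional clock and shifted right by WDT_PRELOAD_SHIFT, and hardware pings are only accepted once 55% of the period has elapsed (the 50% open window plus the 5% margin described in the comment above). The standalone sketch below works those numbers for an assumed 32.768 kHz clock and the 60 s default heartbeat; the figures are illustrative, not taken from the patch.

#include <stdio.h>

#define WDT_PRELOAD_SHIFT       13
#define WDT_PRELOAD_MAX         0xfff

int main(void)
{
        unsigned long freq = 32768;     /* assumed RTI functional clock, Hz */
        unsigned int timeout = 60;      /* DEFAULT_HEARTBEAT, seconds */

        unsigned long long preload =
                ((unsigned long long)timeout * freq) >> WDT_PRELOAD_SHIFT;
        if (preload > WDT_PRELOAD_MAX)
                preload = WDT_PRELOAD_MAX;

        /* earliest allowed ping: 50% open window + 5% safety margin = 55% */
        unsigned long min_hw_heartbeat_ms = 11UL * timeout * 1000 / 20;

        /* largest timeout representable in the 12-bit preload field */
        unsigned long max_hw_heartbeat_ms =
                ((unsigned long)WDT_PRELOAD_MAX << WDT_PRELOAD_SHIFT) / freq * 1000;

        printf("preload register value : %llu\n", preload);              /* 240 */
        printf("earliest ping after    : %lu ms\n", min_hw_heartbeat_ms); /* 33000 */
        printf("max hardware timeout   : %lu ms\n", max_hw_heartbeat_ms); /* ~1023000 */
        return 0;
}
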
index 861daf4..4238447 100644 (file)
 
 static DEFINE_IDA(watchdog_ida);
 
+static int stop_on_reboot = -1;
+module_param(stop_on_reboot, int, 0444);
+MODULE_PARM_DESC(stop_on_reboot, "Stop watchdogs on reboot (0=keep watching, 1=stop)");
+
 /*
  * Deferred Registration infrastructure.
  *
@@ -254,6 +258,14 @@ static int __watchdog_register_device(struct watchdog_device *wdd)
                }
        }
 
+       /* Module parameter to force watchdog policy on reboot. */
+       if (stop_on_reboot != -1) {
+               if (stop_on_reboot)
+                       set_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
+               else
+                       clear_bit(WDOG_STOP_ON_REBOOT, &wdd->status);
+       }
+
        if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
                wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
 
index 8b5c742..7e4cd34 100644 (file)
@@ -282,6 +282,7 @@ static int watchdog_start(struct watchdog_device *wdd)
        if (err == 0) {
                set_bit(WDOG_ACTIVE, &wdd->status);
                wd_data->last_keepalive = started_at;
+               wd_data->last_hw_keepalive = started_at;
                watchdog_update_worker(wdd);
        }
 
index 030ce24..d96ad8f 100644 (file)
@@ -13,7 +13,6 @@
 #include <linux/platform_device.h>
 #include <linux/watchdog.h>
 #include <linux/uaccess.h>
-#include <linux/gpio.h>
 
 #include <linux/mfd/wm831x/core.h>
 #include <linux/mfd/wm831x/pdata.h>
@@ -29,7 +28,6 @@ struct wm831x_wdt_drvdata {
        struct watchdog_device wdt;
        struct wm831x *wm831x;
        struct mutex lock;
-       int update_gpio;
        int update_state;
 };
 
@@ -103,14 +101,6 @@ static int wm831x_wdt_ping(struct watchdog_device *wdt_dev)
 
        mutex_lock(&driver_data->lock);
 
-       if (driver_data->update_gpio) {
-               gpio_set_value_cansleep(driver_data->update_gpio,
-                                       driver_data->update_state);
-               driver_data->update_state = !driver_data->update_state;
-               ret = 0;
-               goto out;
-       }
-
        reg = wm831x_reg_read(wm831x, WM831X_WATCHDOG);
 
        if (!(reg & WM831X_WDOG_RST_SRC)) {
@@ -239,23 +229,6 @@ static int wm831x_wdt_probe(struct platform_device *pdev)
                reg |= pdata->secondary << WM831X_WDOG_SECACT_SHIFT;
                reg |= pdata->software << WM831X_WDOG_RST_SRC_SHIFT;
 
-               if (pdata->update_gpio) {
-                       ret = devm_gpio_request_one(dev, pdata->update_gpio,
-                                                   GPIOF_OUT_INIT_LOW,
-                                                   "Watchdog update");
-                       if (ret < 0) {
-                               dev_err(wm831x->dev,
-                                       "Failed to request update GPIO: %d\n",
-                                       ret);
-                               return ret;
-                       }
-
-                       driver_data->update_gpio = pdata->update_gpio;
-
-                       /* Make sure the watchdog takes hardware updates */
-                       reg |= WM831X_WDOG_RST_SRC;
-               }
-
                ret = wm831x_reg_unlock(wm831x);
                if (ret == 0) {
                        ret = wm831x_reg_write(wm831x, WM831X_WATCHDOG, reg);
index 4a363a8..cab86a0 100644 (file)
@@ -422,7 +422,7 @@ static int ziirave_firm_upload(struct watchdog_device *wdd,
 
 static const struct watchdog_info ziirave_wdt_info = {
        .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
-       .identity = "Zodiac RAVE Watchdog",
+       .identity = "RAVE Switch Watchdog",
 };
 
 static const struct watchdog_ops ziirave_wdt_ops = {
index f4713ea..13f25e2 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/highuid.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
@@ -698,19 +699,11 @@ static int load_elf_binary(struct linux_binprm *bprm)
        unsigned long reloc_func_desc __maybe_unused = 0;
        int executable_stack = EXSTACK_DEFAULT;
        struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
-       struct {
-               struct elfhdr interp_elf_ex;
-       } *loc;
+       struct elfhdr *interp_elf_ex = NULL;
        struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
        struct mm_struct *mm;
        struct pt_regs *regs;
 
-       loc = kmalloc(sizeof(*loc), GFP_KERNEL);
-       if (!loc) {
-               retval = -ENOMEM;
-               goto out_ret;
-       }
-
        retval = -ENOEXEC;
        /* First of all, some simple consistency checks */
        if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
@@ -770,9 +763,15 @@ static int load_elf_binary(struct linux_binprm *bprm)
                 */
                would_dump(bprm, interpreter);
 
+               interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
+               if (!interp_elf_ex) {
+                       retval = -ENOMEM;
+                       goto out_free_ph;
+               }
+
                /* Get the exec headers */
-               retval = elf_read(interpreter, &loc->interp_elf_ex,
-                                 sizeof(loc->interp_elf_ex), 0);
+               retval = elf_read(interpreter, interp_elf_ex,
+                                 sizeof(*interp_elf_ex), 0);
                if (retval < 0)
                        goto out_free_dentry;
 
@@ -806,25 +805,25 @@ out_free_interp:
        if (interpreter) {
                retval = -ELIBBAD;
                /* Not an ELF interpreter */
-               if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+               if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
                        goto out_free_dentry;
                /* Verify the interpreter has a valid arch */
-               if (!elf_check_arch(&loc->interp_elf_ex) ||
-                   elf_check_fdpic(&loc->interp_elf_ex))
+               if (!elf_check_arch(interp_elf_ex) ||
+                   elf_check_fdpic(interp_elf_ex))
                        goto out_free_dentry;
 
                /* Load the interpreter program headers */
-               interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
+               interp_elf_phdata = load_elf_phdrs(interp_elf_ex,
                                                   interpreter);
                if (!interp_elf_phdata)
                        goto out_free_dentry;
 
                /* Pass PT_LOPROC..PT_HIPROC headers to arch code */
                elf_ppnt = interp_elf_phdata;
-               for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
+               for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
                        switch (elf_ppnt->p_type) {
                        case PT_LOPROC ... PT_HIPROC:
-                               retval = arch_elf_pt_proc(&loc->interp_elf_ex,
+                               retval = arch_elf_pt_proc(interp_elf_ex,
                                                          elf_ppnt, interpreter,
                                                          true, &arch_state);
                                if (retval)
@@ -839,7 +838,7 @@ out_free_interp:
         * the exec syscall.
         */
        retval = arch_check_elf(elf_ex,
-                               !!interpreter, &loc->interp_elf_ex,
+                               !!interpreter, interp_elf_ex,
                                &arch_state);
        if (retval)
                goto out_free_dentry;
@@ -1055,7 +1054,7 @@ out_free_interp:
        }
 
        if (interpreter) {
-               elf_entry = load_elf_interp(&loc->interp_elf_ex,
+               elf_entry = load_elf_interp(interp_elf_ex,
                                            interpreter,
                                            load_bias, interp_elf_phdata);
                if (!IS_ERR((void *)elf_entry)) {
@@ -1064,7 +1063,7 @@ out_free_interp:
                         * adjustment
                         */
                        interp_load_addr = elf_entry;
-                       elf_entry += loc->interp_elf_ex.e_entry;
+                       elf_entry += interp_elf_ex->e_entry;
                }
                if (BAD_ADDR(elf_entry)) {
                        retval = IS_ERR((void *)elf_entry) ?
@@ -1075,6 +1074,9 @@ out_free_interp:
 
                allow_write_access(interpreter);
                fput(interpreter);
+
+               kfree(interp_elf_ex);
+               kfree(interp_elf_phdata);
        } else {
                elf_entry = e_entry;
                if (BAD_ADDR(elf_entry)) {
@@ -1083,7 +1085,6 @@ out_free_interp:
                }
        }
 
-       kfree(interp_elf_phdata);
        kfree(elf_phdata);
 
        set_binfmt(&elf_format);
@@ -1153,12 +1154,11 @@ out_free_interp:
        start_thread(regs, elf_entry, bprm->p);
        retval = 0;
 out:
-       kfree(loc);
-out_ret:
        return retval;
 
        /* error cleanup */
 out_free_dentry:
+       kfree(interp_elf_ex);
        kfree(interp_elf_phdata);
        allow_write_access(interpreter);
        if (interpreter)
@@ -1317,7 +1317,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
        }
 
        /* Hugetlb memory check */
-       if (vma->vm_flags & VM_HUGETLB) {
+       if (is_vm_hugetlb_page(vma)) {
                if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
                        goto whole;
                if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
index 7ab6166..6f4678d 100644 (file)
@@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
        if (!PagePrivate(page))
                return;
 
-       ClearPageChecked(page);
-
        dout("%p invalidatepage %p idx %lu full dirty page\n",
             inode, page, page->index);
 
@@ -183,6 +181,47 @@ static int ceph_releasepage(struct page *page, gfp_t g)
 }
 
 /*
+ * Read some contiguous pages.  If we cross a stripe boundary, shorten
+ * *plen.  Return number of bytes read, or error.
+ */
+static int ceph_sync_readpages(struct ceph_fs_client *fsc,
+                              struct ceph_vino vino,
+                              struct ceph_file_layout *layout,
+                              u64 off, u64 *plen,
+                              u32 truncate_seq, u64 truncate_size,
+                              struct page **pages, int num_pages,
+                              int page_align)
+{
+       struct ceph_osd_client *osdc = &fsc->client->osdc;
+       struct ceph_osd_request *req;
+       int rc = 0;
+
+       dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+            vino.snap, off, *plen);
+       req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
+                                   CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+                                   NULL, truncate_seq, truncate_size,
+                                   false);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       /* it may be a short read due to an object boundary */
+       osd_req_op_extent_osd_data_pages(req, 0,
+                               pages, *plen, page_align, false, false);
+
+       dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
+            off, *plen, *plen, page_align);
+
+       rc = ceph_osdc_start_request(osdc, req, false);
+       if (!rc)
+               rc = ceph_osdc_wait_request(osdc, req);
+
+       ceph_osdc_put_request(req);
+       dout("readpages result %d\n", rc);
+       return rc;
+}
+
+/*
  * read a single page, without unlocking it.
  */
 static int ceph_do_readpage(struct file *filp, struct page *page)
@@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
 
        dout("readpage inode %p file %p page %p index %lu\n",
             inode, filp, page, page->index);
-       err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+       err = ceph_sync_readpages(fsc, ceph_vino(inode),
                                  &ci->i_layout, off, &len,
                                  ci->i_truncate_seq, ci->i_truncate_size,
                                  &page, 1, 0);
@@ -571,6 +610,47 @@ static u64 get_writepages_data_length(struct inode *inode,
 }
 
 /*
+ * do a synchronous write on N pages
+ */
+static int ceph_sync_writepages(struct ceph_fs_client *fsc,
+                               struct ceph_vino vino,
+                               struct ceph_file_layout *layout,
+                               struct ceph_snap_context *snapc,
+                               u64 off, u64 len,
+                               u32 truncate_seq, u64 truncate_size,
+                               struct timespec64 *mtime,
+                               struct page **pages, int num_pages)
+{
+       struct ceph_osd_client *osdc = &fsc->client->osdc;
+       struct ceph_osd_request *req;
+       int rc = 0;
+       int page_align = off & ~PAGE_MASK;
+
+       req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
+                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+                                   snapc, truncate_seq, truncate_size,
+                                   true);
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+
+       /* it may be a short write due to an object boundary */
+       osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+                               false, false);
+       dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
+
+       req->r_mtime = *mtime;
+       rc = ceph_osdc_start_request(osdc, req, true);
+       if (!rc)
+               rc = ceph_osdc_wait_request(osdc, req);
+
+       ceph_osdc_put_request(req);
+       if (rc == 0)
+               rc = len;
+       dout("writepages result %d\n", rc);
+       return rc;
+}
+
+/*
  * Write a single page, but leave the page locked.
  *
  * If we get a write error, mark the mapping for error, but still adjust the
@@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
        set_page_writeback(page);
-       err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
+       err = ceph_sync_writepages(fsc, ceph_vino(inode),
                                   &ci->i_layout, snapc, page_off, len,
                                   ceph_wbc.truncate_seq,
                                   ceph_wbc.truncate_size,
@@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
        do {
                lock_page(page);
 
-               if ((off > size) || (page->mapping != inode->i_mapping)) {
+               if (page_mkwrite_check_truncate(page, inode) < 0) {
                        unlock_page(page);
                        ret = VM_FAULT_NOPAGE;
                        break;
index 270b769..2f5cb6b 100644 (file)
@@ -32,7 +32,7 @@ struct ceph_fscache_entry {
        size_t uniq_len;
        /* The following members must be last */
        struct ceph_fsid fsid;
-       char uniquifier[0];
+       char uniquifier[];
 };
 
 static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
index 28ae0c1..185db76 100644 (file)
@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
                               struct ceph_inode_info *ci)
 {
        struct ceph_mount_options *opt = mdsc->fsc->mount_options;
-
-       ci->i_hold_caps_min = round_jiffies(jiffies +
-                                           opt->caps_wanted_delay_min * HZ);
        ci->i_hold_caps_max = round_jiffies(jiffies +
                                            opt->caps_wanted_delay_max * HZ);
-       dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
-            ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+       dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+            ci->i_hold_caps_max - jiffies);
 }
 
 /*
@@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
  *    -> we take mdsc->cap_delay_lock
  */
 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
-                               struct ceph_inode_info *ci,
-                               bool set_timeout)
+                               struct ceph_inode_info *ci)
 {
-       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
             ci->i_ceph_flags, ci->i_hold_caps_max);
        if (!mdsc->stopping) {
                spin_lock(&mdsc->cap_delay_lock);
@@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
                                goto no_change;
                        list_del_init(&ci->i_cap_delay_list);
                }
-               if (set_timeout)
-                       __cap_set_timeouts(mdsc, ci);
+               __cap_set_timeouts(mdsc, ci);
                list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 no_change:
                spin_unlock(&mdsc->cap_delay_lock);
@@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
        spin_unlock(&mdsc->cap_delay_lock);
 }
 
-/*
- * Common issue checks for add_cap, handle_cap_grant.
- */
+/* Common issue checks for add_cap, handle_cap_grant. */
 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
                              unsigned issued)
 {
        unsigned had = __ceph_caps_issued(ci, NULL);
 
+       lockdep_assert_held(&ci->i_ceph_lock);
+
        /*
         * Each time we receive FILE_CACHE anew, we increment
         * i_rdcache_gen.
         */
-       if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+       if (S_ISREG(ci->vfs_inode.i_mode) &&
+           (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
            (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
                ci->i_rdcache_gen++;
        }
@@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
                        __ceph_dir_clear_complete(ci);
                }
        }
+
+       /* Wipe saved layout if we're losing DIR_CREATE caps */
+       if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+               !(issued & CEPH_CAP_DIR_CREATE)) {
+            ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+            memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+       }
 }
 
 /*
@@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
  */
 void ceph_add_cap(struct inode *inode,
                  struct ceph_mds_session *session, u64 cap_id,
-                 int fmode, unsigned issued, unsigned wanted,
+                 unsigned issued, unsigned wanted,
                  unsigned seq, unsigned mseq, u64 realmino, int flags,
                  struct ceph_cap **new_cap)
 {
@@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode,
        dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
             session->s_mds, cap_id, ceph_cap_string(issued), seq);
 
-       /*
-        * If we are opening the file, include file mode wanted bits
-        * in wanted.
-        */
-       if (fmode >= 0)
-               wanted |= ceph_caps_for_mode(fmode);
-
        spin_lock(&session->s_gen_ttl_lock);
        gen = session->s_cap_gen;
        spin_unlock(&session->s_gen_ttl_lock);
@@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode,
                dout(" issued %s, mds wanted %s, actual %s, queueing\n",
                     ceph_cap_string(issued), ceph_cap_string(wanted),
                     ceph_cap_string(actual_wanted));
-               __cap_delay_requeue(mdsc, ci, true);
+               __cap_delay_requeue(mdsc, ci);
        }
 
        if (flags & CEPH_CAP_FLAG_AUTH) {
@@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode,
        cap->issue_seq = seq;
        cap->mseq = mseq;
        cap->cap_gen = gen;
-
-       if (fmode >= 0)
-               __ceph_get_fmode(ci, fmode);
 }
 
 /*
@@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
        if (ci->i_rd_ref)
                used |= CEPH_CAP_FILE_RD;
        if (ci->i_rdcache_ref ||
-           (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+           (S_ISREG(ci->vfs_inode.i_mode) &&
             ci->vfs_inode.i_data.nrpages))
                used |= CEPH_CAP_FILE_CACHE;
        if (ci->i_wr_ref)
                used |= CEPH_CAP_FILE_WR;
        if (ci->i_wb_ref || ci->i_wrbuffer_ref)
                used |= CEPH_CAP_FILE_BUFFER;
+       if (ci->i_fx_ref)
+               used |= CEPH_CAP_FILE_EXCL;
        return used;
 }
 
+#define FMODE_WAIT_BIAS 1000
+
 /*
  * wanted, by virtue of open file modes
  */
 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
-       int i, bits = 0;
-       for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
-               if (ci->i_nr_by_mode[i])
-                       bits |= 1 << i;
+       const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
+       const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
+       const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
+       const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
+       struct ceph_mount_options *opt =
+               ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+       unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
+       unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
+
+       if (S_ISDIR(ci->vfs_inode.i_mode)) {
+               int want = 0;
+
+               /* use used_cutoff here, to keep dir's wanted caps longer */
+               if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
+                   time_after(ci->i_last_rd, used_cutoff))
+                       want |= CEPH_CAP_ANY_SHARED;
+
+               if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
+                   time_after(ci->i_last_wr, used_cutoff)) {
+                       want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+                       if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+                               want |= CEPH_CAP_ANY_DIR_OPS;
+               }
+
+               if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
+                       want |= CEPH_CAP_PIN;
+
+               return want;
+       } else {
+               int bits = 0;
+
+               if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
+                       if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
+                           time_after(ci->i_last_rd, used_cutoff))
+                               bits |= 1 << RD_SHIFT;
+               } else if (time_after(ci->i_last_rd, idle_cutoff)) {
+                       bits |= 1 << RD_SHIFT;
+               }
+
+               if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
+                       if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
+                           time_after(ci->i_last_wr, used_cutoff))
+                               bits |= 1 << WR_SHIFT;
+               } else if (time_after(ci->i_last_wr, idle_cutoff)) {
+                       bits |= 1 << WR_SHIFT;
+               }
+
+               /* check lazyio only when read/write is wanted */
+               if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
+                   ci->i_nr_by_mode[LAZY_SHIFT] > 0)
+                       bits |= 1 << LAZY_SHIFT;
+
+               return bits ? ceph_caps_for_mode(bits >> 1) : 0;
        }
-       if (bits == 0)
-               return 0;
-       return ceph_caps_for_mode(bits >> 1);
+}
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+       int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+       if (S_ISDIR(ci->vfs_inode.i_mode)) {
+               /* we want EXCL if holding caps of dir ops */
+               if (w & CEPH_CAP_ANY_DIR_OPS)
+                       w |= CEPH_CAP_FILE_EXCL;
+       } else {
+               /* we want EXCL if dirty data */
+               if (w & CEPH_CAP_FILE_BUFFER)
+                       w |= CEPH_CAP_FILE_EXCL;
+       }
+       return w;
 }
 
 /*
@@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
        return mds_wanted;
 }
 
-/*
- * called under i_ceph_lock
- */
-static int __ceph_is_single_caps(struct ceph_inode_info *ci)
-{
-       return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
-}
-
 int ceph_is_any_caps(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        struct cap_msg_args arg;
        int held, revoking;
        int wake = 0;
-       int delayed = 0;
        int ret;
 
+       /* Don't send anything if it's still being created. Return delayed */
+       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+               spin_unlock(&ci->i_ceph_lock);
+               dout("%s async create in flight for %p\n", __func__, inode);
+               return 1;
+       }
+
        held = cap->issued | cap->implemented;
        revoking = cap->implemented & ~cap->issued;
        retain &= ~revoking;
@@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
             ceph_cap_string(revoking));
        BUG_ON((retain & CEPH_CAP_PIN) == 0);
 
-       arg.session = cap->session;
-
-       /* don't release wanted unless we've waited a bit. */
-       if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
-           time_before(jiffies, ci->i_hold_caps_min)) {
-               dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
-                    ceph_cap_string(cap->issued),
-                    ceph_cap_string(cap->issued & retain),
-                    ceph_cap_string(cap->mds_wanted),
-                    ceph_cap_string(want));
-               want |= cap->mds_wanted;
-               retain |= cap->issued;
-               delayed = 1;
-       }
-       ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
-       if (want & ~cap->mds_wanted) {
-               /* user space may open/close single file frequently.
-                * This avoids droping mds_wanted immediately after
-                * requesting new mds_wanted.
-                */
-               __cap_set_timeouts(mdsc, ci);
-       }
+       ci->i_ceph_flags &= ~CEPH_I_FLUSH;
 
        cap->issued &= retain;  /* drop bits we don't want */
        if (cap->implemented & ~cap->issued) {
@@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        cap->implemented &= cap->issued | used;
        cap->mds_wanted = want;
 
+       arg.session = cap->session;
        arg.ino = ceph_vino(inode).ino;
        arg.cid = cap->cap_id;
        arg.follows = flushing ? ci->i_head_snapc->seq : 0;
@@ -1332,7 +1371,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        arg.size = inode->i_size;
        ci->i_reported_size = arg.size;
        arg.max_size = ci->i_wanted_max_size;
-       ci->i_requested_max_size = arg.max_size;
+       if (cap == ci->i_auth_cap)
+               ci->i_requested_max_size = arg.max_size;
 
        if (flushing & CEPH_CAP_XATTR_EXCL) {
                old_blob = __ceph_build_xattrs_blob(ci);
@@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 
        ret = send_cap_msg(&arg);
        if (ret < 0) {
-               dout("error sending cap msg, must requeue %p\n", inode);
-               delayed = 1;
+               pr_err("error sending cap msg, ino (%llx.%llx) "
+                      "flushing %s tid %llu, requeue\n",
+                      ceph_vinop(inode), ceph_cap_string(flushing),
+                      flush_tid);
+               spin_lock(&ci->i_ceph_lock);
+               __cap_delay_requeue(mdsc, ci);
+               spin_unlock(&ci->i_ceph_lock);
        }
 
        if (wake)
                wake_up_all(&ci->i_cap_wq);
 
-       return delayed;
+       return ret;
 }
 
 static inline int __send_flush_snap(struct inode *inode,
@@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
        int was = ci->i_dirty_caps;
        int dirty = 0;
 
+       lockdep_assert_held(&ci->i_ceph_lock);
+
        if (!ci->i_auth_cap) {
                pr_warn("__mark_dirty_caps %p %llx mask %s, "
                        "but no auth cap (session was closed?)\n",
@@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
        if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
            (mask & CEPH_CAP_FILE_BUFFER))
                dirty |= I_DIRTY_DATASYNC;
-       __cap_delay_requeue(mdsc, ci, true);
+       __cap_delay_requeue(mdsc, ci);
        return dirty;
 }
 
@@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
        struct ceph_cap_flush *cf = NULL;
        int flushing;
 
+       lockdep_assert_held(&ci->i_ceph_lock);
        BUG_ON(ci->i_dirty_caps == 0);
        BUG_ON(list_empty(&ci->i_dirty_item));
        BUG_ON(!ci->i_prealloc_cap_flush);
@@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
  * versus held caps.  Release, flush, ack revoked caps to mds as
  * appropriate.
  *
- *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
- *    cap release further.
  *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
  *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
  *    further delay.
@@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        int mds = -1;   /* keep track of how far we've gone through i_caps list
                           to avoid an infinite loop on retry */
        struct rb_node *p;
-       int delayed = 0, sent = 0;
-       bool no_delay = flags & CHECK_CAPS_NODELAY;
        bool queue_invalidate = false;
        bool tried_invalidate = false;
 
-       /* if we are unmounting, flush any unused caps immediately. */
-       if (mdsc->stopping)
-               no_delay = true;
-
        spin_lock(&ci->i_ceph_lock);
-
        if (ci->i_ceph_flags & CEPH_I_FLUSH)
                flags |= CHECK_CAPS_FLUSH;
 
-       if (!(flags & CHECK_CAPS_AUTHONLY) ||
-           (ci->i_auth_cap && __ceph_is_single_caps(ci)))
-               __cap_delay_cancel(mdsc, ci);
-
        goto retry_locked;
 retry:
        spin_lock(&ci->i_ceph_lock);
@@ -1866,10 +1901,11 @@ retry_locked:
                         * revoking the shared cap on every create/unlink
                         * operation.
                         */
-                       if (IS_RDONLY(inode))
+                       if (IS_RDONLY(inode)) {
                                want = CEPH_CAP_ANY_SHARED;
-                       else
-                               want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+                       } else {
+                               want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+                       }
                        retain |= want;
                } else {
 
@@ -1885,14 +1921,13 @@ retry_locked:
        }
 
        dout("check_caps %p file_want %s used %s dirty %s flushing %s"
-            " issued %s revoking %s retain %s %s%s%s\n", inode,
+            " issued %s revoking %s retain %s %s%s\n", inode,
             ceph_cap_string(file_wanted),
             ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
             ceph_cap_string(ci->i_flushing_caps),
             ceph_cap_string(issued), ceph_cap_string(revoking),
             ceph_cap_string(retain),
             (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
-            (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
             (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
 
        /*
@@ -1900,8 +1935,8 @@ retry_locked:
         * have cached pages, but don't want them, then try to invalidate.
         * If we fail, it's because pages are locked.... try again later.
         */
-       if ((!no_delay || mdsc->stopping) &&
-           !S_ISDIR(inode->i_mode) &&          /* ignore readdir cache */
+       if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
+           S_ISREG(inode->i_mode) &&
            !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
            inode->i_data.nrpages &&            /* have cached pages */
            (revoking & (CEPH_CAP_FILE_CACHE|
@@ -1973,28 +2008,17 @@ retry_locked:
                }
 
                /* want more caps from mds? */
-               if (want & ~(cap->mds_wanted | cap->issued))
-                       goto ack;
+               if (want & ~cap->mds_wanted) {
+                       if (want & ~(cap->mds_wanted | cap->issued))
+                               goto ack;
+                       if (!__cap_is_valid(cap))
+                               goto ack;
+               }
 
                /* things we might delay */
                if ((cap->issued & ~retain) == 0)
                        continue;     /* nope, all good */
 
-               if (no_delay)
-                       goto ack;
-
-               /* delay? */
-               if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
-                   time_before(jiffies, ci->i_hold_caps_max)) {
-                       dout(" delaying issued %s -> %s, wanted %s -> %s\n",
-                            ceph_cap_string(cap->issued),
-                            ceph_cap_string(cap->issued & retain),
-                            ceph_cap_string(cap->mds_wanted),
-                            ceph_cap_string(want));
-                       delayed++;
-                       continue;
-               }
-
 ack:
                if (session && session != cap->session) {
                        dout("oops, wrong session %p mutex\n", session);
@@ -2055,18 +2079,20 @@ ack:
                }
 
                mds = cap->mds;  /* remember mds, so we don't repeat */
-               sent++;
 
                /* __send_cap drops i_ceph_lock */
-               delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
-                               cap_used, want, retain, flushing,
-                               flush_tid, oldest_flush_tid);
+               __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
+                          retain, flushing, flush_tid, oldest_flush_tid);
                goto retry; /* retake i_ceph_lock and restart our cap scan. */
        }
 
-       /* Reschedule delayed caps release if we delayed anything */
-       if (delayed)
-               __cap_delay_requeue(mdsc, ci, false);
+       /* periodically re-calculate caps wanted by open files */
+       if (__ceph_is_any_real_caps(ci) &&
+           list_empty(&ci->i_cap_delay_list) &&
+           (file_wanted & ~CEPH_CAP_PIN) &&
+           !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+               __cap_delay_requeue(mdsc, ci);
+       }
 
        spin_unlock(&ci->i_ceph_lock);
 
@@ -2095,7 +2121,6 @@ retry:
 retry_locked:
        if (ci->i_dirty_caps && ci->i_auth_cap) {
                struct ceph_cap *cap = ci->i_auth_cap;
-               int delayed;
 
                if (session != cap->session) {
                        spin_unlock(&ci->i_ceph_lock);
@@ -2124,18 +2149,10 @@ retry_locked:
                                                 &oldest_flush_tid);
 
                /* __send_cap drops i_ceph_lock */
-               delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-                                    CEPH_CLIENT_CAPS_SYNC,
-                                    __ceph_caps_used(ci),
-                                    __ceph_caps_wanted(ci),
-                                    (cap->issued | cap->implemented),
-                                    flushing, flush_tid, oldest_flush_tid);
-
-               if (delayed) {
-                       spin_lock(&ci->i_ceph_lock);
-                       __cap_delay_requeue(mdsc, ci, true);
-                       spin_unlock(&ci->i_ceph_lock);
-               }
+               __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
+                          __ceph_caps_used(ci), __ceph_caps_wanted(ci),
+                          (cap->issued | cap->implemented),
+                          flushing, flush_tid, oldest_flush_tid);
        } else {
                if (!list_empty(&ci->i_cap_flush_list)) {
                        struct ceph_cap_flush *cf =
@@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        if (datasync)
                goto out;
 
+       ret = ceph_wait_on_async_create(inode);
+       if (ret)
+               goto out;
+
        dirty = try_flush_caps(inode, &flush_tid);
        dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
@@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
                if (cf->caps) {
                        dout("kick_flushing_caps %p cap %p tid %llu %s\n",
                             inode, cap, cf->tid, ceph_cap_string(cf->caps));
-                       ci->i_ceph_flags |= CEPH_I_NODELAY;
-
-                       ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                       __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
                                         (cf->tid < last_snap_flush ?
                                          CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
                                          __ceph_caps_used(ci),
                                          __ceph_caps_wanted(ci),
                                          (cap->issued | cap->implemented),
                                          cf->caps, cf->tid, oldest_flush_tid);
-                       if (ret) {
-                               pr_err("kick_flushing_caps: error sending "
-                                       "cap flush, ino (%llx.%llx) "
-                                       "tid %llu flushing %s\n",
-                                       ceph_vinop(inode), cf->tid,
-                                       ceph_cap_string(cf->caps));
-                       }
                } else {
                        struct ceph_cap_snap *capsnap =
                                        container_of(cf, struct ceph_cap_snap,
@@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
        }
 }
 
-static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
-                                    struct ceph_mds_session *session,
-                                    struct inode *inode)
-       __releases(ci->i_ceph_lock)
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+                                  struct ceph_inode_info *ci)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_cap *cap;
+       struct ceph_mds_client *mdsc = session->s_mdsc;
+       struct ceph_cap *cap = ci->i_auth_cap;
+
+       lockdep_assert_held(&ci->i_ceph_lock);
 
-       cap = ci->i_auth_cap;
-       dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+       dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
             ceph_cap_string(ci->i_flushing_caps));
 
        if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
                spin_unlock(&mdsc->cap_dirty_lock);
 
                __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
-               spin_unlock(&ci->i_ceph_lock);
-       } else {
-               spin_unlock(&ci->i_ceph_lock);
        }
 }
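
ceph_kick_flushing_inode_caps() no longer drops i_ceph_lock itself; the lockdep_assert_held() above moves that responsibility to the caller. A minimal illustration of the new calling convention (this mirrors what the handle_cap_grant() hunk further below now does):

    spin_lock(&ci->i_ceph_lock);
    ceph_kick_flushing_inode_caps(session, ci);   /* runs with the lock held */
    spin_unlock(&ci->i_ceph_lock);
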
 
@@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 /*
  * Take references to capabilities we hold, so that we don't release
  * them to the MDS prematurely.
- *
- * Protected by i_ceph_lock.
  */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
                            bool snap_rwsem_locked)
 {
+       lockdep_assert_held(&ci->i_ceph_lock);
+
        if (got & CEPH_CAP_PIN)
                ci->i_pin_ref++;
        if (got & CEPH_CAP_FILE_RD)
                ci->i_rd_ref++;
        if (got & CEPH_CAP_FILE_CACHE)
                ci->i_rdcache_ref++;
+       if (got & CEPH_CAP_FILE_EXCL)
+               ci->i_fx_ref++;
        if (got & CEPH_CAP_FILE_WR) {
                if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
                        BUG_ON(!snap_rwsem_locked);
@@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
                if (ci->i_wb_ref == 0)
                        ihold(&ci->vfs_inode);
                ci->i_wb_ref++;
-               dout("__take_cap_refs %p wb %d -> %d (?)\n",
+               dout("%s %p wb %d -> %d (?)\n", __func__,
                     &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
        }
 }
@@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
  * Note that caller is responsible for ensuring max_size increases are
  * requested from the MDS.
  *
- * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
- * or a negative error code.
- *
- * FIXME: how does a 0 return differ from -EAGAIN?
+ * Returns 0 if caps were not able to be acquired (yet), 1 if they were,
+ * or a negative error code. There are 3 special error codes:
+ *  -EAGAIN: we need to sleep but non-blocking was specified
+ *  -EFBIG:  ask the caller to call check_max_size() and try again.
+ *  -ESTALE: ask the caller to call ceph_renew_caps() and try again.
  */
 enum {
-       NON_BLOCKING    = 1,
-       CHECK_FILELOCK  = 2,
+       /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
+       NON_BLOCKING    = (1 << 8),
+       CHECK_FILELOCK  = (1 << 9),
 };
 
 static int try_get_cap_refs(struct inode *inode, int need, int want,
@@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        int ret = 0;
        int have, implemented;
-       int file_wanted;
        bool snap_rwsem_locked = false;
 
        dout("get_cap_refs %p need %s want %s\n", inode,
@@ -2557,15 +2568,6 @@ again:
                goto out_unlock;
        }
 
-       /* make sure file is actually open */
-       file_wanted = __ceph_caps_file_wanted(ci);
-       if ((file_wanted & need) != need) {
-               dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
-                    ceph_cap_string(need), ceph_cap_string(file_wanted));
-               ret = -EBADF;
-               goto out_unlock;
-       }
-
        /* finish pending truncate */
        while (ci->i_truncate_pending) {
                spin_unlock(&ci->i_ceph_lock);
@@ -2584,7 +2586,7 @@ again:
                        dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
                             inode, endoff, ci->i_max_size);
                        if (endoff > ci->i_requested_max_size)
-                               ret = -EAGAIN;
+                               ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
                        goto out_unlock;
                }
                /*
@@ -2630,51 +2632,55 @@ again:
                                }
                                snap_rwsem_locked = true;
                        }
-                       *got = need | (have & want);
-                       if ((need & CEPH_CAP_FILE_RD) &&
+                       if ((have & want) == want)
+                               *got = need | want;
+                       else
+                               *got = need;
+                       if (S_ISREG(inode->i_mode) &&
+                           (need & CEPH_CAP_FILE_RD) &&
                            !(*got & CEPH_CAP_FILE_CACHE))
                                ceph_disable_fscache_readpage(ci);
-                       __take_cap_refs(ci, *got, true);
+                       ceph_take_cap_refs(ci, *got, true);
                        ret = 1;
                }
        } else {
                int session_readonly = false;
-               if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+               int mds_wanted;
+               if (ci->i_auth_cap &&
+                   (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
                        struct ceph_mds_session *s = ci->i_auth_cap->session;
                        spin_lock(&s->s_cap_lock);
                        session_readonly = s->s_readonly;
                        spin_unlock(&s->s_cap_lock);
                }
                if (session_readonly) {
-                       dout("get_cap_refs %p needed %s but mds%d readonly\n",
+                       dout("get_cap_refs %p need %s but mds%d readonly\n",
                             inode, ceph_cap_string(need), ci->i_auth_cap->mds);
                        ret = -EROFS;
                        goto out_unlock;
                }
 
-               if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
-                       int mds_wanted;
-                       if (READ_ONCE(mdsc->fsc->mount_state) ==
-                           CEPH_MOUNT_SHUTDOWN) {
-                               dout("get_cap_refs %p forced umount\n", inode);
-                               ret = -EIO;
-                               goto out_unlock;
-                       }
-                       mds_wanted = __ceph_caps_mds_wanted(ci, false);
-                       if (need & ~(mds_wanted & need)) {
-                               dout("get_cap_refs %p caps were dropped"
-                                    " (session killed?)\n", inode);
-                               ret = -ESTALE;
-                               goto out_unlock;
-                       }
-                       if (!(file_wanted & ~mds_wanted))
-                               ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
+               if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+                       dout("get_cap_refs %p forced umount\n", inode);
+                       ret = -EIO;
+                       goto out_unlock;
+               }
+               mds_wanted = __ceph_caps_mds_wanted(ci, false);
+               if (need & ~mds_wanted) {
+                       dout("get_cap_refs %p need %s > mds_wanted %s\n",
+                            inode, ceph_cap_string(need),
+                            ceph_cap_string(mds_wanted));
+                       ret = -ESTALE;
+                       goto out_unlock;
                }
 
-               dout("get_cap_refs %p have %s needed %s\n", inode,
+               dout("get_cap_refs %p have %s need %s\n", inode,
                     ceph_cap_string(have), ceph_cap_string(need));
        }
 out_unlock:
+
+       __ceph_touch_fmode(ci, mdsc, flags);
+
        spin_unlock(&ci->i_ceph_lock);
        if (snap_rwsem_locked)
                up_read(&mdsc->snap_rwsem);
@@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff)
                ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 }
 
+static inline int get_used_fmode(int caps)
+{
+       int fmode = 0;
+       if (caps & CEPH_CAP_FILE_RD)
+               fmode |= CEPH_FILE_MODE_RD;
+       if (caps & CEPH_CAP_FILE_WR)
+               fmode |= CEPH_FILE_MODE_WR;
+       return fmode;
+}
+
 int ceph_try_get_caps(struct inode *inode, int need, int want,
                      bool nonblock, int *got)
 {
-       int ret;
+       int ret, flags;
 
        BUG_ON(need & ~CEPH_CAP_FILE_RD);
-       BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
-       ret = ceph_pool_perm_check(inode, need);
-       if (ret < 0)
-               return ret;
+       BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
+                       CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+                       CEPH_CAP_ANY_DIR_OPS));
+       if (need) {
+               ret = ceph_pool_perm_check(inode, need);
+               if (ret < 0)
+                       return ret;
+       }
+
+       flags = get_used_fmode(need | want);
+       if (nonblock)
+               flags |= NON_BLOCKING;
 
-       ret = try_get_cap_refs(inode, need, want, 0,
-                              (nonblock ? NON_BLOCKING : 0), got);
-       return ret == -EAGAIN ? 0 : ret;
+       ret = try_get_cap_refs(inode, need, want, 0, flags, got);
+       /* three special error codes */
+       if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
+               ret = 0;
+       return ret;
 }
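
Two things changed here: the flags argument now carries the CEPH_FILE_MODE_* bits being exercised in its low 8 bits (NON_BLOCKING/CHECK_FILELOCK move up to bits 8 and 9), and ceph_try_get_caps() folds the three special try_get_cap_refs() results back to 0. A self-contained sketch of both the flag packing and the dispatch a blocking caller performs; the constants and helpers below are illustrative stand-ins, not the real ceph definitions:

    #include <errno.h>

    #define FILE_MODE_RD    1               /* stand-ins for CEPH_FILE_MODE_* */
    #define FILE_MODE_WR    2
    #define FILE_MODE_MASK  ((1 << 8) - 1)  /* low 8 bits carry the mode */
    #define NON_BLOCKING    (1 << 8)
    #define CHECK_FILELOCK  (1 << 9)

    static void request_larger_max_size(void) { }            /* ~ check_max_size()   */
    static int  renew_caps(void)               { return 0; }  /* ~ ceph_renew_caps()  */

    /* 0 = got caps, 1 = retry/wait, <0 = hard error */
    static int dispatch_cap_result(int ret)
    {
            if (ret > 0)
                    return 0;                /* caps acquired */
            if (ret == 0 || ret == -EAGAIN)
                    return 1;                /* not yet; sleep unless NON_BLOCKING */
            if (ret == -EFBIG) {
                    request_larger_max_size();
                    return 1;                /* retry once max_size grows */
            }
            if (ret == -ESTALE) {
                    int err = renew_caps();
                    return err ? err : 1;    /* retry if renewal succeeded */
            }
            return ret;                      /* anything else is fatal */
    }

A caller builds the word as, e.g., FILE_MODE_RD | NON_BLOCKING, and the mode bits can be recovered with flags & FILE_MODE_MASK, which is what the CEPH_FILE_MODE_MASK masking in ceph_get_caps() below relies on.
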
 
 /*
@@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want,
            fi->filp_gen != READ_ONCE(fsc->filp_gen))
                return -EBADF;
 
-       while (true) {
-               if (endoff > 0)
-                       check_max_size(inode, endoff);
+       flags = get_used_fmode(need | want);
 
-               flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
+       while (true) {
+               flags &= CEPH_FILE_MODE_MASK;
+               if (atomic_read(&fi->num_locks))
+                       flags |= CHECK_FILELOCK;
                _got = 0;
                ret = try_get_cap_refs(inode, need, want, endoff,
                                       flags, &_got);
-               if (ret == -EAGAIN)
-                       continue;
+               WARN_ON_ONCE(ret == -EAGAIN);
                if (!ret) {
                        struct ceph_mds_client *mdsc = fsc->mdsc;
                        struct cap_wait cw;
@@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
                        list_add(&cw.list, &mdsc->cap_wait_list);
                        spin_unlock(&mdsc->caps_list_lock);
 
+                       /* make sure the used fmode doesn't time out */
+                       ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
                        add_wait_queue(&ci->i_cap_wq, &wait);
 
                        flags |= NON_BLOCKING;
@@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
                        }
 
                        remove_wait_queue(&ci->i_cap_wq, &wait);
+                       ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
 
                        spin_lock(&mdsc->caps_list_lock);
                        list_del(&cw.list);
@@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want,
                }
 
                if (ret < 0) {
+                       if (ret == -EFBIG || ret == -ESTALE) {
+                               int ret2 = ceph_wait_on_async_create(inode);
+                               if (ret2 < 0)
+                                       return ret2;
+                       }
+                       if (ret == -EFBIG) {
+                               check_max_size(inode, endoff);
+                               continue;
+                       }
                        if (ret == -ESTALE) {
                                /* session was killed, try renew caps */
-                               ret = ceph_renew_caps(inode);
+                               ret = ceph_renew_caps(inode, flags);
                                if (ret == 0)
                                        continue;
                        }
                        return ret;
                }
 
-               if (ci->i_inline_version != CEPH_INLINE_NONE &&
+               if (S_ISREG(ci->vfs_inode.i_mode) &&
+                   ci->i_inline_version != CEPH_INLINE_NONE &&
                    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
                    i_size_read(inode) > 0) {
                        struct page *page =
@@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
                break;
        }
 
-       if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
+       if (S_ISREG(ci->vfs_inode.i_mode) &&
+           (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
                ceph_fscache_revalidate_cookie(ci);
 
        *got = _got;
@@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
 {
        spin_lock(&ci->i_ceph_lock);
-       __take_cap_refs(ci, caps, false);
+       ceph_take_cap_refs(ci, caps, false);
        spin_unlock(&ci->i_ceph_lock);
 }
 
@@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        if (had & CEPH_CAP_FILE_CACHE)
                if (--ci->i_rdcache_ref == 0)
                        last++;
+       if (had & CEPH_CAP_FILE_EXCL)
+               if (--ci->i_fx_ref == 0)
+                       last++;
        if (had & CEPH_CAP_FILE_BUFFER) {
                if (--ci->i_wb_ref == 0) {
                        last++;
@@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
             last ? " last" : "", put ? " put" : "");
 
-       if (last && !flushsnaps)
+       if (last)
                ceph_check_caps(ci, 0, NULL);
        else if (flushsnaps)
                ceph_flush_snaps(ci, NULL);
@@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
        spin_unlock(&ci->i_ceph_lock);
 
        if (last) {
-               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+               ceph_check_caps(ci, 0, NULL);
        } else if (flush_snaps) {
                ceph_flush_snaps(ci, NULL);
        }
@@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode,
         * try to invalidate (once).  (If there are dirty buffers, we
         * will invalidate _after_ writeback.)
         */
-       if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+       if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
            ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
            (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
            !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
@@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode,
                     ceph_cap_string(cap->issued),
                     ceph_cap_string(newcaps),
                     ceph_cap_string(revoking));
-               if (revoking & used & CEPH_CAP_FILE_BUFFER)
+               if (S_ISREG(inode->i_mode) &&
+                   (revoking & used & CEPH_CAP_FILE_BUFFER))
                        writeback = true;  /* initiate writeback; will delay ack */
-               else if (revoking == CEPH_CAP_FILE_CACHE &&
-                        (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-                        queue_invalidate)
+               else if (queue_invalidate &&
+                        revoking == CEPH_CAP_FILE_CACHE &&
+                        (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
                        ; /* do nothing yet, invalidation will be queued */
                else if (cap == ci->i_auth_cap)
                        check_caps = 1; /* check auth cap only */
@@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode,
        if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
                if (newcaps & ~extra_info->issued)
                        wake = true;
-               kick_flushing_inode_caps(session->s_mdsc, session, inode);
+               ceph_kick_flushing_inode_caps(session, ci);
+               spin_unlock(&ci->i_ceph_lock);
                up_read(&session->s_mdsc->snap_rwsem);
        } else {
                spin_unlock(&ci->i_ceph_lock);
@@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode,
                wake_up_all(&ci->i_cap_wq);
 
        if (check_caps == 1)
-               ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
                                session);
        else if (check_caps == 2)
-               ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+               ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
        else
                mutex_unlock(&session->s_mutex);
 }
@@ -3619,8 +3664,6 @@ retry:
                goto out_unlock;
 
        if (target < 0) {
-               if (cap->mds_wanted | cap->issued)
-                       ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
                __ceph_remove_cap(cap, false);
                goto out_unlock;
        }
@@ -3668,7 +3711,7 @@ retry:
                /* add placeholder for the export target */
                int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
                tcap = new_cap;
-               ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+               ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
                             t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
 
                if (!list_empty(&ci->i_cap_flush_list) &&
@@ -3773,7 +3816,7 @@ retry:
        __ceph_caps_issued(ci, &issued);
        issued |= __ceph_caps_dirty(ci);
 
-       ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+       ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
                     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
 
        ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
@@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
        struct inode *inode;
        struct ceph_inode_info *ci;
-       int flags = CHECK_CAPS_NODELAY;
 
        dout("check_delayed_caps\n");
        while (1) {
@@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 
                if (inode) {
                        dout("check_delayed_caps on %p\n", inode);
-                       ceph_check_caps(ci, flags, NULL);
+                       ceph_check_caps(ci, 0, NULL);
                        /* avoid calling iput_final() in tick thread */
                        ceph_async_iput(inode);
                }
@@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
                ihold(inode);
                dout("flush_dirty_caps %p\n", inode);
                spin_unlock(&mdsc->cap_dirty_lock);
-               ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
+               ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
                iput(inode);
                spin_lock(&mdsc->cap_dirty_lock);
        }
@@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
        dout("flush_dirty_caps done\n");
 }
 
-void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+void __ceph_touch_fmode(struct ceph_inode_info *ci,
+                       struct ceph_mds_client *mdsc, int fmode)
+{
+       unsigned long now = jiffies;
+       if (fmode & CEPH_FILE_MODE_RD)
+               ci->i_last_rd = now;
+       if (fmode & CEPH_FILE_MODE_WR)
+               ci->i_last_wr = now;
+       /* queue periodic check */
+       if (fmode &&
+           __ceph_is_any_real_caps(ci) &&
+           list_empty(&ci->i_cap_delay_list))
+               __cap_delay_requeue(mdsc, ci);
+}
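
__ceph_touch_fmode() stamps i_last_rd/i_last_wr with the current jiffies and puts the inode back on the delayed-cap list, so the periodic worker can later decide whether an open mode has gone idle and its wanted caps can be dropped. The core of that decision is just a timestamp comparison; a standalone sketch (the timeout and helper name are hypothetical):

    #include <stdbool.h>
    #include <stdint.h>

    /* true when the mode saw no activity within the timeout window */
    static bool fmode_is_idle(uint64_t now, uint64_t last_used, uint64_t timeout)
    {
            return (now - last_used) > timeout;
    }
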
+
+void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
 {
        int i;
        int bits = (fmode << 1) | 1;
+       spin_lock(&ci->i_ceph_lock);
        for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
                if (bits & (1 << i))
-                       ci->i_nr_by_mode[i]++;
+                       ci->i_nr_by_mode[i] += count;
        }
+       spin_unlock(&ci->i_ceph_lock);
 }
 
 /*
@@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
  * we may need to release capabilities to the MDS (or schedule
  * their delayed release).
  */
-void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
 {
-       int i, last = 0;
+       int i;
        int bits = (fmode << 1) | 1;
        spin_lock(&ci->i_ceph_lock);
        for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
                if (bits & (1 << i)) {
-                       BUG_ON(ci->i_nr_by_mode[i] == 0);
-                       if (--ci->i_nr_by_mode[i] == 0)
-                               last++;
+                       BUG_ON(ci->i_nr_by_mode[i] < count);
+                       ci->i_nr_by_mode[i] -= count;
                }
        }
-       dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
-            &ci->vfs_inode, fmode,
-            ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
-            ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
        spin_unlock(&ci->i_ceph_lock);
-
-       if (last && ci->i_vino.snap == CEPH_NOSNAP)
-               ceph_check_caps(ci, 0, NULL);
 }
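
ceph_get_fmode()/ceph_put_fmode() now take an explicit count. Later hunks use this so a task sleeping in ceph_get_caps() can pin its open mode with a single FMODE_WAIT_BIAS and release it in one call when the wait ends. A rough sketch of the counted-reference idea; the structure, value, and names are illustrative only:

    #define WAIT_BIAS 100000                /* illustrative value */

    struct mode_refs {
            long nr_by_mode[4];             /* one counter per file mode */
    };

    static void mode_get(struct mode_refs *r, int mode, long count)
    {
            r->nr_by_mode[mode] += count;   /* count is 1 for opens, WAIT_BIAS for waiters */
    }

    static void mode_put(struct mode_refs *r, int mode, long count)
    {
            r->nr_by_mode[mode] -= count;   /* caller guarantees no underflow */
    }
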
 
 /*
@@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
        if (inode->i_nlink == 1) {
                drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
 
-               ci->i_ceph_flags |= CEPH_I_NODELAY;
                if (__ceph_caps_dirty(ci)) {
                        struct ceph_mds_client *mdsc =
                                ceph_inode_to_client(inode)->mdsc;
@@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
                if (force || (cap->issued & drop)) {
                        if (cap->issued & drop) {
                                int wanted = __ceph_caps_wanted(ci);
-                               if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
-                                       wanted |= cap->mds_wanted;
                                dout("encode_inode_release %p cap %p "
                                     "%s -> %s, wanted %s -> %s\n", inode, cap,
                                     ceph_cap_string(cap->issued),
index fb7cabd..481ac97 100644 (file)
@@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
        return 0;
 }
 
-CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
-CEPH_DEFINE_SHOW_FUNC(mdsc_show)
-CEPH_DEFINE_SHOW_FUNC(caps_show)
-CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
+DEFINE_SHOW_ATTRIBUTE(mdsmap);
+DEFINE_SHOW_ATTRIBUTE(mdsc);
+DEFINE_SHOW_ATTRIBUTE(caps);
+DEFINE_SHOW_ATTRIBUTE(mds_sessions);
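
DEFINE_SHOW_ATTRIBUTE() is the generic seq_file helper (include/linux/seq_file.h): given a <name>_show() function it emits a single_open()-based open routine plus a <name>_fops, which is why the debugfs_create_file() calls below now take mdsmap_fops and friends instead of the old hand-rolled *_show_fops. Roughly, the expansion for one of them looks like this (a sketch, not copied from the header):

    static int mdsc_open(struct inode *inode, struct file *file)
    {
            return single_open(file, mdsc_show, inode->i_private);
    }

    static const struct file_operations mdsc_fops = {
            .owner   = THIS_MODULE,
            .open    = mdsc_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };
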
 
 
 /*
@@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
                                        0400,
                                        fsc->client->debugfs_dir,
                                        fsc,
-                                       &mdsmap_show_fops);
+                                       &mdsmap_fops);
 
        fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
                                        0400,
                                        fsc->client->debugfs_dir,
                                        fsc,
-                                       &mds_sessions_show_fops);
+                                       &mds_sessions_fops);
 
        fsc->debugfs_mdsc = debugfs_create_file("mdsc",
                                                0400,
                                                fsc->client->debugfs_dir,
                                                fsc,
-                                               &mdsc_show_fops);
+                                               &mdsc_fops);
 
        fsc->debugfs_caps = debugfs_create_file("caps",
                                                   0400,
                                                   fsc->client->debugfs_dir,
                                                   fsc,
-                                                  &caps_show_fops);
+                                                  &caps_fops);
 }
 
 
index d0cd0ab..d594c26 100644 (file)
@@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                ctx->pos = 2;
        }
 
-       /* can we use the dcache? */
        spin_lock(&ci->i_ceph_lock);
+       /* Request the Fx cap. If we have Fx, we don't need to release the Fs
+        * cap for a later create/unlink. */
+       __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
+       /* can we use the dcache? */
        if (ceph_test_mount_opt(fsc, DCACHE) &&
            !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
            ceph_snap(inode) != CEPH_SNAPDIR &&
@@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                struct ceph_dentry_info *di = ceph_dentry(dentry);
 
                spin_lock(&ci->i_ceph_lock);
-               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
                if (strncmp(dentry->d_name.name,
                            fsc->mount_options->snapdir_name,
                            dentry->d_name.len) &&
@@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                    ceph_test_mount_opt(fsc, DCACHE) &&
                    __ceph_dir_is_complete(ci) &&
                    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+                       __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
                        spin_unlock(&ci->i_ceph_lock);
                        dout(" dir %p complete, -ENOENT\n", dir);
                        d_add(dentry, NULL);
@@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
        return err;
 }
 
+static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
+                                struct ceph_mds_request *req)
+{
+       int result = req->r_err ? req->r_err :
+                       le32_to_cpu(req->r_reply_info.head->result);
+
+       if (result == -EJUKEBOX)
+               goto out;
+
+       /* If op failed, mark everyone involved for errors */
+       if (result) {
+               int pathlen;
+               u64 base;
+               char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+                                                 &base, 0);
+
+               /* mark error on parent + clear complete */
+               mapping_set_error(req->r_parent->i_mapping, result);
+               ceph_dir_clear_complete(req->r_parent);
+
+               /* drop the dentry -- we don't know its status */
+               if (!d_unhashed(req->r_dentry))
+                       d_drop(req->r_dentry);
+
+               /* mark inode itself for an error (since metadata is bogus) */
+               mapping_set_error(req->r_old_inode->i_mapping, result);
+
+               pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+                       base, IS_ERR(path) ? "<<bad>>" : path, result);
+               ceph_mdsc_free_path(path, pathlen);
+       }
+out:
+       iput(req->r_old_inode);
+       ceph_mdsc_release_dir_caps(req);
+}
+
+static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
+{
+       struct ceph_inode_info *ci = ceph_inode(dir);
+       struct ceph_dentry_info *di;
+       int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
+
+       spin_lock(&ci->i_ceph_lock);
+       if ((__ceph_caps_issued(ci, NULL) & want) == want) {
+               ceph_take_cap_refs(ci, want, false);
+               got = want;
+       }
+       spin_unlock(&ci->i_ceph_lock);
+
+       /* If we didn't get anything, return 0 */
+       if (!got)
+               return 0;
+
+        spin_lock(&dentry->d_lock);
+        di = ceph_dentry(dentry);
+       /*
+        * - We are holding Fx, which implies Fs caps.
+        * - Only support async unlink for primary linkage
+        */
+       if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
+           !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
+               want = 0;
+        spin_unlock(&dentry->d_lock);
+
+       /* Do we still want what we've got? */
+       if (want == got)
+               return got;
+
+       ceph_put_cap_refs(ci, got);
+       return 0;
+}
+
 /*
  * rmdir and unlink differ only by the metadata op code
  */
@@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = d_inode(dentry);
        struct ceph_mds_request *req;
+       bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
        int err = -EROFS;
        int op;
 
@@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
                        CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
        } else
                goto out;
+retry:
        req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        req->r_parent = dir;
-       set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
        req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
        req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
-       err = ceph_mdsc_do_request(mdsc, dir, req);
-       if (!err && !req->r_reply_info.head->is_dentry)
-               d_delete(dentry);
+
+       if (try_async && op == CEPH_MDS_OP_UNLINK &&
+           (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
+               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+                    dentry->d_name.len, dentry->d_name.name,
+                    ceph_cap_string(req->r_dir_caps));
+               set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+               req->r_callback = ceph_async_unlink_cb;
+               req->r_old_inode = d_inode(dentry);
+               ihold(req->r_old_inode);
+               err = ceph_mdsc_submit_request(mdsc, dir, req);
+               if (!err) {
+                       /*
+                        * We have enough caps, so we assume that the unlink
+                        * will succeed. Fix up the target inode and dcache.
+                        */
+                       drop_nlink(inode);
+                       d_delete(dentry);
+               } else if (err == -EJUKEBOX) {
+                       try_async = false;
+                       ceph_mdsc_put_request(req);
+                       goto retry;
+               }
+       } else {
+               set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+               err = ceph_mdsc_do_request(mdsc, dir, req);
+               if (!err && !req->r_reply_info.head->is_dentry)
+                       d_delete(dentry);
+       }
+
        ceph_mdsc_put_request(req);
 out:
        return err;
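
ceph_unlink() now fires an asynchronous MDS request whenever get_caps_for_async_unlink() handed back Fx plus the DIR_UNLINK cap, and drops back to the ordinary synchronous request if the MDS answers -EJUKEBOX ("retry this synchronously"). A self-contained sketch of that fallback shape; do_async()/do_sync() are hypothetical stand-ins:

    #ifndef EJUKEBOX
    #define EJUKEBOX 528                    /* kernel-internal errno */
    #endif

    static int do_async(void) { return -EJUKEBOX; }  /* pretend the MDS punts */
    static int do_sync(void)  { return 0; }

    static int unlink_with_fallback(int have_dir_caps)
    {
            int try_async = have_dir_caps;

            for (;;) {
                    if (!try_async)
                            return do_sync();
                    int err = do_async();
                    if (err != -EJUKEBOX)
                            return err;
                    try_async = 0;          /* MDS asked for a synchronous retry */
            }
    }
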
@@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
        spin_lock(&dentry->d_lock);
        di->time = jiffies;
        di->lease_shared_gen = 0;
+       di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
        __dentry_lease_unlist(di);
        spin_unlock(&dentry->d_lock);
 }
@@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry)
 /*
  * Check if directory-wide content lease/cap is valid.
  */
-static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
+                             struct ceph_mds_client *mdsc)
 {
        struct ceph_inode_info *ci = ceph_inode(dir);
        int valid;
@@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
 
        spin_lock(&ci->i_ceph_lock);
        valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
-       shared_gen = atomic_read(&ci->i_shared_gen);
+       if (valid) {
+               __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
+               shared_gen = atomic_read(&ci->i_shared_gen);
+       }
        spin_unlock(&ci->i_ceph_lock);
        if (valid) {
                struct ceph_dentry_info *di;
@@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        int valid = 0;
        struct dentry *parent;
        struct inode *dir, *inode;
+       struct ceph_mds_client *mdsc;
 
        if (flags & LOOKUP_RCU) {
                parent = READ_ONCE(dentry->d_parent);
@@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
             dentry, inode, ceph_dentry(dentry)->offset);
 
+       mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
+
        /* always trust cached snapped dentries, snapdir dentry */
        if (ceph_snap(dir) != CEPH_NOSNAP) {
                dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
                valid = dentry_lease_is_valid(dentry, flags);
                if (valid == -ECHILD)
                        return valid;
-               if (valid || dir_lease_is_valid(dir, dentry)) {
+               if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
                        if (inode)
                                valid = ceph_is_any_caps(inode);
                        else
@@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
        }
 
        if (!valid) {
-               struct ceph_mds_client *mdsc =
-                       ceph_sb_to_client(dir->i_sb)->mdsc;
                struct ceph_mds_request *req;
                int op, err;
                u32 mask;
index b6bfa94..79dc068 100644 (file)
@@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb,
 
        req->r_num_caps = 1;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
+       if (err) {
+               ceph_mdsc_put_request(req);
+               return ERR_PTR(err);
+       }
+
        inode = req->r_target_inode;
        if (inode)
                ihold(inode);
index 5a478cd..4a5ccbb 100644 (file)
@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
        if (isdir) {
                struct ceph_dir_file_info *dfi =
                        kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
-               if (!dfi) {
-                       ceph_put_fmode(ci, fmode); /* clean up */
+               if (!dfi)
                        return -ENOMEM;
-               }
 
                file->private_data = dfi;
                fi = &dfi->file_info;
@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
                dfi->readdir_cache_idx = -1;
        } else {
                fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
-               if (!fi) {
-                       ceph_put_fmode(ci, fmode); /* clean up */
+               if (!fi)
                        return -ENOMEM;
-               }
 
                file->private_data = fi;
        }
 
+       ceph_get_fmode(ci, fmode, 1);
        fi->fmode = fmode;
+
        spin_lock_init(&fi->rw_contexts_lock);
        INIT_LIST_HEAD(&fi->rw_contexts);
        fi->meta_err = errseq_sample(&ci->i_meta_err);
@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
        case S_IFLNK:
                dout("init_file %p %p 0%o (symlink)\n", inode, file,
                     inode->i_mode);
-               ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                break;
 
        default:
@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
                 * we need to drop the open ref now, since we don't
                 * have .release set to ceph_release.
                 */
-               ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                BUG_ON(inode->i_fop->release == ceph_release);
 
                /* call the proper open fop */
@@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 /*
  * try renew caps after session gets killed.
  */
-int ceph_renew_caps(struct inode *inode)
+int ceph_renew_caps(struct inode *inode, int fmode)
 {
-       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_request *req;
        int err, flags, wanted;
 
        spin_lock(&ci->i_ceph_lock);
+       __ceph_touch_fmode(ci, mdsc, fmode);
        wanted = __ceph_caps_file_wanted(ci);
        if (__ceph_is_any_real_caps(ci) &&
            (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
@@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode)
        req->r_inode = inode;
        ihold(inode);
        req->r_num_caps = 1;
-       req->r_fmode = -1;
 
        err = ceph_mdsc_do_request(mdsc, NULL, req);
        ceph_mdsc_put_request(req);
@@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
 
        /* trivially open snapdir */
        if (ceph_snap(inode) == CEPH_SNAPDIR) {
-               spin_lock(&ci->i_ceph_lock);
-               __ceph_get_fmode(ci, fmode);
-               spin_unlock(&ci->i_ceph_lock);
                return ceph_init_file(inode, file, fmode);
        }
 
@@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
                dout("open %p fmode %d want %s issued %s using existing\n",
                     inode, fmode, ceph_cap_string(wanted),
                     ceph_cap_string(issued));
-               __ceph_get_fmode(ci, fmode);
+               __ceph_touch_fmode(ci, mdsc, fmode);
                spin_unlock(&ci->i_ceph_lock);
 
                /* adjust wanted? */
@@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file)
                return ceph_init_file(inode, file, fmode);
        } else if (ceph_snap(inode) != CEPH_NOSNAP &&
                   (ci->i_snap_caps & wanted) == wanted) {
-               __ceph_get_fmode(ci, fmode);
+               __ceph_touch_fmode(ci, mdsc, fmode);
                spin_unlock(&ci->i_ceph_lock);
                return ceph_init_file(inode, file, fmode);
        }
@@ -430,6 +423,236 @@ out:
        return err;
 }
 
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+cache_file_layout(struct inode *dst, struct inode *src)
+{
+       struct ceph_inode_info *cdst = ceph_inode(dst);
+       struct ceph_inode_info *csrc = ceph_inode(src);
+
+       spin_lock(&cdst->i_ceph_lock);
+       if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+           !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
+               memcpy(&cdst->i_cached_layout, &csrc->i_layout,
+                       sizeof(cdst->i_cached_layout));
+               rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
+                                  ceph_try_get_string(csrc->i_layout.pool_ns));
+       }
+       spin_unlock(&cdst->i_ceph_lock);
+}
+
+/*
+ * Try to set up an async create. We need caps, a file layout, an inode
+ * number, and either a lease on the dentry or complete dir info. If any of
+ * those criteria are not satisfied, return 0 and the caller can go
+ * synchronous.
+ */
+static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
+                                struct ceph_file_layout *lo, u64 *pino)
+{
+       struct ceph_inode_info *ci = ceph_inode(dir);
+       struct ceph_dentry_info *di = ceph_dentry(dentry);
+       int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
+       u64 ino;
+
+       spin_lock(&ci->i_ceph_lock);
+       /* No auth cap means no chance for Dc caps */
+       if (!ci->i_auth_cap)
+               goto no_async;
+
+       /* Any delegated inos? */
+       if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
+               goto no_async;
+
+       if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
+               goto no_async;
+
+       if ((__ceph_caps_issued(ci, NULL) & want) != want)
+               goto no_async;
+
+       if (d_in_lookup(dentry)) {
+               if (!__ceph_dir_is_complete(ci))
+                       goto no_async;
+               spin_lock(&dentry->d_lock);
+               di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
+               spin_unlock(&dentry->d_lock);
+       } else if (atomic_read(&ci->i_shared_gen) !=
+                  READ_ONCE(di->lease_shared_gen)) {
+               goto no_async;
+       }
+
+       ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
+       if (!ino)
+               goto no_async;
+
+       *pino = ino;
+       ceph_take_cap_refs(ci, want, false);
+       memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
+       rcu_assign_pointer(lo->pool_ns,
+                          ceph_try_get_string(ci->i_cached_layout.pool_ns));
+       got = want;
+no_async:
+       spin_unlock(&ci->i_ceph_lock);
+       return got;
+}
+
+static void restore_deleg_ino(struct inode *dir, u64 ino)
+{
+       struct ceph_inode_info *ci = ceph_inode(dir);
+       struct ceph_mds_session *s = NULL;
+
+       spin_lock(&ci->i_ceph_lock);
+       if (ci->i_auth_cap)
+               s = ceph_get_mds_session(ci->i_auth_cap->session);
+       spin_unlock(&ci->i_ceph_lock);
+       if (s) {
+               int err = ceph_restore_deleg_ino(s, ino);
+               if (err)
+                       pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
+                               ino, err);
+               ceph_put_mds_session(s);
+       }
+}
+
+static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
+                                 struct ceph_mds_request *req)
+{
+       int result = req->r_err ? req->r_err :
+                       le32_to_cpu(req->r_reply_info.head->result);
+
+       if (result == -EJUKEBOX)
+               goto out;
+
+       mapping_set_error(req->r_parent->i_mapping, result);
+
+       if (result) {
+               struct dentry *dentry = req->r_dentry;
+               int pathlen;
+               u64 base;
+               char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+                                                 &base, 0);
+
+               ceph_dir_clear_complete(req->r_parent);
+               if (!d_unhashed(dentry))
+                       d_drop(dentry);
+
+               /* FIXME: start returning I/O errors on all accesses? */
+               pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+                       base, IS_ERR(path) ? "<<bad>>" : path, result);
+               ceph_mdsc_free_path(path, pathlen);
+       }
+
+       if (req->r_target_inode) {
+               struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+               u64 ino = ceph_vino(req->r_target_inode).ino;
+
+               if (req->r_deleg_ino != ino)
+                       pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
+                               __func__, req->r_err, req->r_deleg_ino, ino);
+               mapping_set_error(req->r_target_inode->i_mapping, result);
+
+               spin_lock(&ci->i_ceph_lock);
+               if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+                       ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+                       wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+               }
+               ceph_kick_flushing_inode_caps(req->r_session, ci);
+               spin_unlock(&ci->i_ceph_lock);
+       } else {
+               pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
+                       req->r_deleg_ino);
+       }
+out:
+       ceph_mdsc_release_dir_caps(req);
+}
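
The wake_up_bit() above pairs with the ceph_wait_on_async_create() calls added earlier in this series (ceph_fsync() and the -EFBIG/-ESTALE path of ceph_get_caps()): anything that needs the MDS-backed inode state first waits for CEPH_I_ASYNC_CREATE to clear. The waiter side is not shown in these hunks; presumably it is a plain bit-wait along these lines (an assumption, sketched in kernel style to match the surrounding code):

    /* assumed shape of the waiter that pairs with the wake_up_bit() above */
    static int wait_on_async_create(struct ceph_inode_info *ci)
    {
            return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
                               TASK_INTERRUPTIBLE);
    }
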
+
+static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+                                   struct file *file, umode_t mode,
+                                   struct ceph_mds_request *req,
+                                   struct ceph_acl_sec_ctx *as_ctx,
+                                   struct ceph_file_layout *lo)
+{
+       int ret;
+       char xattr_buf[4];
+       struct ceph_mds_reply_inode in = { };
+       struct ceph_mds_reply_info_in iinfo = { .in = &in };
+       struct ceph_inode_info *ci = ceph_inode(dir);
+       struct inode *inode;
+       struct timespec64 now;
+       struct ceph_vino vino = { .ino = req->r_deleg_ino,
+                                 .snap = CEPH_NOSNAP };
+
+       ktime_get_real_ts64(&now);
+
+       inode = ceph_get_inode(dentry->d_sb, vino);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       iinfo.inline_version = CEPH_INLINE_NONE;
+       iinfo.change_attr = 1;
+       ceph_encode_timespec64(&iinfo.btime, &now);
+
+       iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+       iinfo.xattr_data = xattr_buf;
+       memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+
+       in.ino = cpu_to_le64(vino.ino);
+       in.snapid = cpu_to_le64(CEPH_NOSNAP);
+       in.version = cpu_to_le64(1);    // ???
+       in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
+       in.cap.cap_id = cpu_to_le64(1);
+       in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
+       in.cap.flags = CEPH_CAP_FLAG_AUTH;
+       in.ctime = in.mtime = in.atime = iinfo.btime;
+       in.mode = cpu_to_le32((u32)mode);
+       in.truncate_seq = cpu_to_le32(1);
+       in.truncate_size = cpu_to_le64(-1ULL);
+       in.xattr_version = cpu_to_le64(1);
+       in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
+       in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
+                               dir->i_gid : current_fsgid()));
+       in.nlink = cpu_to_le32(1);
+       in.max_size = cpu_to_le64(lo->stripe_unit);
+
+       ceph_file_layout_to_legacy(lo, &in.layout);
+
+       ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
+                             req->r_fmode, NULL);
+       if (ret) {
+               dout("%s failed to fill inode: %d\n", __func__, ret);
+               ceph_dir_clear_complete(dir);
+               if (!d_unhashed(dentry))
+                       d_drop(dentry);
+               if (inode->i_state & I_NEW)
+                       discard_new_inode(inode);
+       } else {
+               struct dentry *dn;
+
+               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
+                       vino.ino, dir->i_ino, dentry->d_name.name);
+               ceph_dir_clear_ordered(dir);
+               ceph_init_inode_acls(inode, as_ctx);
+               if (inode->i_state & I_NEW) {
+                       /*
+                        * If it's not I_NEW, then someone created this before
+                        * we got here. Assume the server is aware of it at
+                        * that point and don't worry about setting
+                        * CEPH_I_ASYNC_CREATE.
+                        */
+                       ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
+                       unlock_new_inode(inode);
+               }
+               if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
+                       if (!d_unhashed(dentry))
+                               d_drop(dentry);
+                       dn = d_splice_alias(inode, dentry);
+                       WARN_ON_ONCE(dn && dn != dentry);
+               }
+               file->f_mode |= FMODE_CREATED;
+               ret = finish_open(file, dentry, ceph_open);
+       }
+       return ret;
+}
 
 /*
  * Do a lookup + open with a single request.  If we get a non-existent
@@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
        struct ceph_mds_request *req;
        struct dentry *dn;
        struct ceph_acl_sec_ctx as_ctx = {};
+       bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
        int mask;
        int err;
 
@@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                /* If it's not being looked up, it's negative */
                return -ENOENT;
        }
-
+retry:
        /* do the open */
        req = prepare_open_request(dir->i_sb, flags, mode);
        if (IS_ERR(req)) {
@@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
        }
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
+       mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+       if (ceph_security_xattr_wanted(dir))
+               mask |= CEPH_CAP_XATTR_SHARED;
+       req->r_args.open.mask = cpu_to_le32(mask);
+       req->r_parent = dir;
+
        if (flags & O_CREAT) {
+               struct ceph_file_layout lo;
+
                req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
                req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
                if (as_ctx.pagelist) {
                        req->r_pagelist = as_ctx.pagelist;
                        as_ctx.pagelist = NULL;
                }
+               if (try_async &&
+                   (req->r_dir_caps =
+                     try_prep_async_create(dir, dentry, &lo,
+                                           &req->r_deleg_ino))) {
+                       set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+                       req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
+                       req->r_callback = ceph_async_create_cb;
+                       err = ceph_mdsc_submit_request(mdsc, dir, req);
+                       if (!err) {
+                               err = ceph_finish_async_create(dir, dentry,
+                                                       file, mode, req,
+                                                       &as_ctx, &lo);
+                       } else if (err == -EJUKEBOX) {
+                               restore_deleg_ino(dir, req->r_deleg_ino);
+                               ceph_mdsc_put_request(req);
+                               try_async = false;
+                               goto retry;
+                       }
+                       goto out_req;
+               }
        }
 
-       mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
-       if (ceph_security_xattr_wanted(dir))
-               mask |= CEPH_CAP_XATTR_SHARED;
-       req->r_args.open.mask = cpu_to_le32(mask);
-
-       req->r_parent = dir;
        set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
        err = ceph_mdsc_do_request(mdsc,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
        } else {
                dout("atomic_open finish_open on dn %p\n", dn);
                if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
-                       ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+                       struct inode *newino = d_inode(dentry);
+
+                       cache_file_layout(dir, newino);
+                       ceph_init_inode_acls(newino, &as_ctx);
                        file->f_mode |= FMODE_CREATED;
                }
                err = finish_open(file, dentry, ceph_open);
        }
 out_req:
-       if (!req->r_err && req->r_target_inode)
-               ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
        ceph_mdsc_put_request(req);
 out_ctx:
        ceph_release_acl_sec_ctx(&as_ctx);
@@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file)
                dout("release inode %p dir file %p\n", inode, file);
                WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
 
-               ceph_put_fmode(ci, dfi->file_info.fmode);
+               ceph_put_fmode(ci, dfi->file_info.fmode, 1);
 
                if (dfi->last_readdir)
                        ceph_mdsc_put_request(dfi->last_readdir);
@@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file)
                dout("release inode %p regular file %p\n", inode, file);
                WARN_ON(!list_empty(&fi->rw_contexts));
 
-               ceph_put_fmode(ci, fi->fmode);
+               ceph_put_fmode(ci, fi->fmode, 1);
+
                kmem_cache_free(ceph_file_cachep, fi);
        }
 
@@ -1567,7 +1815,7 @@ retry_snap:
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
                if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
-                       ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
+                       ceph_check_caps(ci, 0, NULL);
        }
 
        dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
@@ -1944,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
        return 0;
 }
 
+static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
+                                   struct ceph_inode_info *dst_ci, u64 *dst_off,
+                                   struct ceph_fs_client *fsc,
+                                   size_t len, unsigned int flags)
+{
+       struct ceph_object_locator src_oloc, dst_oloc;
+       struct ceph_object_id src_oid, dst_oid;
+       size_t bytes = 0;
+       u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
+       u32 src_objlen, dst_objlen;
+       u32 object_size = src_ci->i_layout.object_size;
+       int ret;
+
+       src_oloc.pool = src_ci->i_layout.pool_id;
+       src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
+       dst_oloc.pool = dst_ci->i_layout.pool_id;
+       dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
+
+       while (len >= object_size) {
+               ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
+                                             object_size, &src_objnum,
+                                             &src_objoff, &src_objlen);
+               ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
+                                             object_size, &dst_objnum,
+                                             &dst_objoff, &dst_objlen);
+               ceph_oid_init(&src_oid);
+               ceph_oid_printf(&src_oid, "%llx.%08llx",
+                               src_ci->i_vino.ino, src_objnum);
+               ceph_oid_init(&dst_oid);
+               ceph_oid_printf(&dst_oid, "%llx.%08llx",
+                               dst_ci->i_vino.ino, dst_objnum);
+               /* Do an object remote copy */
+               ret = ceph_osdc_copy_from(&fsc->client->osdc,
+                                         src_ci->i_vino.snap, 0,
+                                         &src_oid, &src_oloc,
+                                         CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+                                         CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
+                                         &dst_oid, &dst_oloc,
+                                         CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+                                         CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+                                         dst_ci->i_truncate_seq,
+                                         dst_ci->i_truncate_size,
+                                         CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
+               if (ret) {
+                       if (ret == -EOPNOTSUPP) {
+                               fsc->have_copy_from2 = false;
+                               pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+                       }
+                       dout("ceph_osdc_copy_from returned %d\n", ret);
+                       if (!bytes)
+                               bytes = ret;
+                       goto out;
+               }
+               len -= object_size;
+               bytes += object_size;
+               *src_off += object_size;
+               *dst_off += object_size;
+       }
+
+out:
+       ceph_oloc_destroy(&src_oloc);
+       ceph_oloc_destroy(&dst_oloc);
+       return bytes;
+}
+
 static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
                                      struct file *dst_file, loff_t dst_off,
                                      size_t len, unsigned int flags)
@@ -1954,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
        struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
        struct ceph_cap_flush *prealloc_cf;
        struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
-       struct ceph_object_locator src_oloc, dst_oloc;
-       struct ceph_object_id src_oid, dst_oid;
-       loff_t endoff = 0, size;
-       ssize_t ret = -EIO;
+       loff_t size;
+       ssize_t ret = -EIO, bytes;
        u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
-       u32 src_objlen, dst_objlen, object_size;
+       u32 src_objlen, dst_objlen;
        int src_got = 0, dst_got = 0, err, dirty;
-       bool do_final_copy = false;
 
        if (src_inode->i_sb != dst_inode->i_sb) {
                struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
@@ -2039,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
        if (ret < 0)
                goto out_caps;
 
-       size = i_size_read(dst_inode);
-       endoff = dst_off + len;
-
        /* Drop dst file cached pages */
        ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
                                            dst_off >> PAGE_SHIFT,
-                                           endoff >> PAGE_SHIFT);
+                                           (dst_off + len) >> PAGE_SHIFT);
        if (ret < 0) {
                dout("Failed to invalidate inode pages (%zd)\n", ret);
                ret = 0; /* XXX */
        }
-       src_oloc.pool = src_ci->i_layout.pool_id;
-       src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
-       dst_oloc.pool = dst_ci->i_layout.pool_id;
-       dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
-
        ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
                                      src_ci->i_layout.object_size,
                                      &src_objnum, &src_objoff, &src_objlen);
@@ -2073,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
         * starting at the src_off
         */
        if (src_objoff) {
+               dout("Initial partial copy of %u bytes\n", src_objlen);
+
                /*
                 * we need to temporarily drop all caps as we'll be calling
                 * {read,write}_iter, which will get caps again.
@@ -2080,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
                put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
                ret = do_splice_direct(src_file, &src_off, dst_file,
                                       &dst_off, src_objlen, flags);
-               if (ret < 0) {
-                       dout("do_splice_direct returned %d\n", err);
+               /* Abort on short copies or on error */
+               if (ret < src_objlen) {
+                       dout("Failed partial copy (%zd)\n", ret);
                        goto out;
                }
                len -= ret;
@@ -2094,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
                if (err < 0)
                        goto out_caps;
        }
-       object_size = src_ci->i_layout.object_size;
-       while (len >= object_size) {
-               ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
-                                             object_size, &src_objnum,
-                                             &src_objoff, &src_objlen);
-               ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
-                                             object_size, &dst_objnum,
-                                             &dst_objoff, &dst_objlen);
-               ceph_oid_init(&src_oid);
-               ceph_oid_printf(&src_oid, "%llx.%08llx",
-                               src_ci->i_vino.ino, src_objnum);
-               ceph_oid_init(&dst_oid);
-               ceph_oid_printf(&dst_oid, "%llx.%08llx",
-                               dst_ci->i_vino.ino, dst_objnum);
-               /* Do an object remote copy */
-               err = ceph_osdc_copy_from(
-                       &src_fsc->client->osdc,
-                       src_ci->i_vino.snap, 0,
-                       &src_oid, &src_oloc,
-                       CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
-                       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
-                       &dst_oid, &dst_oloc,
-                       CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
-                       CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
-                       dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
-                       CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
-               if (err) {
-                       if (err == -EOPNOTSUPP) {
-                               src_fsc->have_copy_from2 = false;
-                               pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
-                       }
-                       dout("ceph_osdc_copy_from returned %d\n", err);
-                       if (!ret)
-                               ret = err;
-                       goto out_caps;
-               }
-               len -= object_size;
-               src_off += object_size;
-               dst_off += object_size;
-               ret += object_size;
-       }
 
-       if (len)
-               /* We still need one final local copy */
-               do_final_copy = true;
+       size = i_size_read(dst_inode);
+       bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
+                                    src_fsc, len, flags);
+       if (bytes <= 0) {
+               if (!ret)
+                       ret = bytes;
+               goto out_caps;
+       }
+       dout("Copied %zu bytes out of %zu\n", bytes, len);
+       len -= bytes;
+       ret += bytes;
 
        file_update_time(dst_file);
        inode_inc_iversion_raw(dst_inode);
 
-       if (endoff > size) {
-               int caps_flags = 0;
-
+       if (dst_off > size) {
                /* Let the MDS know about dst file size change */
-               if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
-                       caps_flags |= CHECK_CAPS_NODELAY;
-               if (ceph_inode_set_size(dst_inode, endoff))
-                       caps_flags |= CHECK_CAPS_AUTHONLY;
-               if (caps_flags)
-                       ceph_check_caps(dst_ci, caps_flags, NULL);
+               if (ceph_inode_set_size(dst_inode, dst_off) ||
+                   ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
+                       ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
        }
        /* Mark Fw dirty */
        spin_lock(&dst_ci->i_ceph_lock);
@@ -2165,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
 out_caps:
        put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
 
-       if (do_final_copy) {
-               err = do_splice_direct(src_file, &src_off, dst_file,
-                                      &dst_off, len, flags);
-               if (err < 0) {
-                       dout("do_splice_direct returned %d\n", err);
-                       goto out;
-               }
-               len -= err;
-               ret += err;
+       /*
+        * Do the final manual copy if we still have some bytes left, unless
+        * there were errors in remote object copies (len >= object_size).
+        */
+       if (len && (len < src_ci->i_layout.object_size)) {
+               dout("Final partial copy of %zu bytes\n", len);
+               bytes = do_splice_direct(src_file, &src_off, dst_file,
+                                        &dst_off, len, flags);
+               if (bytes > 0)
+                       ret += bytes;
+               else
+                       dout("Failed partial copy (%zd)\n", bytes);
        }
 
 out:
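
    The copy_file_range rework above splits a request into an unaligned head
    (copied with do_splice_direct), a run of whole-object copies handled by the
    new ceph_do_objects_copy(), and an unaligned tail. A minimal userspace
    sketch of that split, assuming a fixed 4 MiB object size and made-up
    offsets (the real code derives the object size from the file layout):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t object_size = 4ULL << 20;   /* assumed layout object size */
            uint64_t src_off = 1ULL << 20;       /* made-up copy source offset */
            uint64_t len = 10ULL << 20;          /* made-up copy length */

            /* bytes up to the next object boundary, copied via do_splice_direct() */
            uint64_t head = (object_size - (src_off % object_size)) % object_size;
            if (head > len)
                    head = len;
            /* whole objects, copied remotely by the OSDs */
            uint64_t whole = ((len - head) / object_size) * object_size;
            /* remainder, again copied via do_splice_direct() */
            uint64_t tail = len - head - whole;

            printf("head %llu, whole-object bytes %llu, tail %llu\n",
                   (unsigned long long)head, (unsigned long long)whole,
                   (unsigned long long)tail);
            return 0;
    }
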
index d01710a..7fef94f 100644 (file)
@@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent)
        inode->i_mode = parent->i_mode;
        inode->i_uid = parent->i_uid;
        inode->i_gid = parent->i_gid;
+       inode->i_mtime = parent->i_mtime;
+       inode->i_ctime = parent->i_ctime;
+       inode->i_atime = parent->i_atime;
        inode->i_op = &ceph_snapdir_iops;
        inode->i_fop = &ceph_snapdir_fops;
        ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
        ci->i_rbytes = 0;
+       ci->i_btime = ceph_inode(parent)->i_btime;
 
        if (inode->i_state & I_NEW)
                unlock_new_inode(inode);
@@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_max_files = 0;
 
        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+       memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
        RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
 
        ci->i_fragtree = RB_ROOT;
@@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_prealloc_cap_flush = NULL;
        INIT_LIST_HEAD(&ci->i_cap_flush_list);
        init_waitqueue_head(&ci->i_cap_wq);
-       ci->i_hold_caps_min = 0;
        ci->i_hold_caps_max = 0;
        INIT_LIST_HEAD(&ci->i_cap_delay_list);
        INIT_LIST_HEAD(&ci->i_cap_snaps);
        ci->i_head_snapc = NULL;
        ci->i_snap_caps = 0;
 
+       ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
        for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
                ci->i_nr_by_mode[i] = 0;
 
@@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_rdcache_ref = 0;
        ci->i_wr_ref = 0;
        ci->i_wb_ref = 0;
+       ci->i_fx_ref = 0;
        ci->i_wrbuffer_ref = 0;
        ci->i_wrbuffer_ref_head = 0;
        atomic_set(&ci->i_filelock_ref, 0);
@@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode)
                ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 
        ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+       ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
 }
 
 static inline blkcnt_t calc_inode_blocks(u64 size)
@@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
                        if ((issued & (CEPH_CAP_FILE_CACHE|
                                       CEPH_CAP_FILE_BUFFER)) ||
                            mapping_mapped(inode->i_mapping) ||
-                           __ceph_caps_file_wanted(ci)) {
+                           __ceph_is_file_opened(ci)) {
                                ci->i_truncate_pending++;
                                queue_trunc = 1;
                        }
@@ -727,11 +734,11 @@ void ceph_fill_file_time(struct inode *inode, int issued,
  * Populate an inode based on info from mds.  May be called on new or
  * existing inodes.
  */
-static int fill_inode(struct inode *inode, struct page *locked_page,
-                     struct ceph_mds_reply_info_in *iinfo,
-                     struct ceph_mds_reply_dirfrag *dirinfo,
-                     struct ceph_mds_session *session, int cap_fmode,
-                     struct ceph_cap_reservation *caps_reservation)
+int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+                   struct ceph_mds_reply_info_in *iinfo,
+                   struct ceph_mds_reply_dirfrag *dirinfo,
+                   struct ceph_mds_session *session, int cap_fmode,
+                   struct ceph_cap_reservation *caps_reservation)
 {
        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        struct ceph_mds_reply_inode *info = iinfo->in;
@@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
        bool new_version = false;
        bool fill_inline = false;
 
-       dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+       dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
             inode, ceph_vinop(inode), le64_to_cpu(info->version),
             ci->i_version);
 
@@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
        if (iinfo->xattr_len > 4) {
                xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
                if (!xattr_blob)
-                       pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+                       pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
                               iinfo->xattr_len);
        }
 
@@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                        spin_unlock(&ci->i_ceph_lock);
 
                        if (symlen != i_size_read(inode)) {
-                               pr_err("fill_inode %llx.%llx BAD symlink "
-                                       "size %lld\n", ceph_vinop(inode),
+                               pr_err("%s %llx.%llx BAD symlink "
+                                       "size %lld\n", __func__,
+                                       ceph_vinop(inode),
                                        i_size_read(inode));
                                i_size_write(inode, symlen);
                                inode->i_blocks = calc_inode_blocks(symlen);
@@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                inode->i_fop = &ceph_dir_fops;
                break;
        default:
-               pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+               pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
                       ceph_vinop(inode), inode->i_mode);
        }
 
@@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                if (ceph_snap(inode) == CEPH_NOSNAP) {
                        ceph_add_cap(inode, session,
                                     le64_to_cpu(info->cap.cap_id),
-                                    cap_fmode, info_caps,
+                                    info_caps,
                                     le32_to_cpu(info->cap.wanted),
                                     le32_to_cpu(info->cap.seq),
                                     le32_to_cpu(info->cap.mseq),
@@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                        dout(" %p got snap_caps %s\n", inode,
                             ceph_cap_string(info_caps));
                        ci->i_snap_caps |= info_caps;
-                       if (cap_fmode >= 0)
-                               __ceph_get_fmode(ci, cap_fmode);
                }
-       } else if (cap_fmode >= 0) {
-               pr_warn("mds issued no caps on %llx.%llx\n",
-                          ceph_vinop(inode));
-               __ceph_get_fmode(ci, cap_fmode);
        }
 
        if (iinfo->inline_version > 0 &&
@@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                        fill_inline = true;
        }
 
+       if (cap_fmode >= 0) {
+               if (!info_caps)
+                       pr_warn("mds issued no caps on %llx.%llx\n",
+                               ceph_vinop(inode));
+               __ceph_touch_fmode(ci, mdsc, cap_fmode);
+       }
+
        spin_unlock(&ci->i_ceph_lock);
 
        if (fill_inline)
@@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
                                  struct ceph_mds_session **old_lease_session)
 {
        struct ceph_dentry_info *di = ceph_dentry(dentry);
+       unsigned mask = le16_to_cpu(lease->mask);
        long unsigned duration = le32_to_cpu(lease->duration_ms);
        long unsigned ttl = from_time + (duration * HZ) / 1000;
        long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
@@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
        if (ceph_snap(dir) != CEPH_NOSNAP)
                return;
 
+       if (mask & CEPH_LEASE_PRIMARY_LINK)
+               di->flags |= CEPH_DENTRY_PRIMARY_LINK;
+       else
+               di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
+
        di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
-       if (duration == 0) {
+       if (!(mask & CEPH_LEASE_VALID)) {
                __ceph_dentry_dir_lease_touch(di);
                return;
        }
@@ -1239,10 +1254,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
                struct inode *dir = req->r_parent;
 
                if (dir) {
-                       err = fill_inode(dir, NULL,
-                                        &rinfo->diri, rinfo->dirfrag,
-                                        session, -1,
-                                        &req->r_caps_reservation);
+                       err = ceph_fill_inode(dir, NULL, &rinfo->diri,
+                                             rinfo->dirfrag, session, -1,
+                                             &req->r_caps_reservation);
                        if (err < 0)
                                goto done;
                } else {
@@ -1307,13 +1321,14 @@ retry_lookup:
                        goto done;
                }
 
-               err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
-                               session,
+               err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
+                               NULL, session,
                                (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
+                                !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
                                 rinfo->head->result == 0) ?  req->r_fmode : -1,
                                &req->r_caps_reservation);
                if (err < 0) {
-                       pr_err("fill_inode badness %p %llx.%llx\n",
+                       pr_err("ceph_fill_inode badness %p %llx.%llx\n",
                                in, ceph_vinop(in));
                        if (in->i_state & I_NEW)
                                discard_new_inode(in);
@@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
                        dout("new_inode badness got %d\n", err);
                        continue;
                }
-               rc = fill_inode(in, NULL, &rde->inode, NULL, session,
-                               -1, &req->r_caps_reservation);
+               rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+                                    -1, &req->r_caps_reservation);
                if (rc < 0) {
-                       pr_err("fill_inode badness on %p got %d\n", in, rc);
+                       pr_err("ceph_fill_inode badness on %p got %d\n",
+                              in, rc);
                        err = rc;
                        if (in->i_state & I_NEW) {
                                ihold(in);
@@ -1707,10 +1723,10 @@ retry_lookup:
                        }
                }
 
-               ret = fill_inode(in, NULL, &rde->inode, NULL, session,
-                                -1, &req->r_caps_reservation);
+               ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
+                                     -1, &req->r_caps_reservation);
                if (ret < 0) {
-                       pr_err("fill_inode badness on %p\n", in);
+                       pr_err("ceph_fill_inode badness on %p\n", in);
                        if (d_really_is_negative(dn)) {
                                /* avoid calling iput_final() in mds
                                 * dispatch threads */
@@ -1972,7 +1988,7 @@ retry:
        mutex_unlock(&ci->i_truncate_mutex);
 
        if (wrbuffer_refs == 0)
-               ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+               ceph_check_caps(ci, 0, NULL);
 
        wake_up_all(&ci->i_cap_wq);
 }
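
    The new i_last_rd/i_last_wr fields above are initialised an hour in the
    past (jiffies - 3600 * HZ), presumably so the very first "recently used"
    check already sees both modes as idle. A rough userspace illustration of
    that idea, using time(2) instead of jiffies (the 60s window is made up):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            time_t last_used = time(NULL) - 3600;   /* pretend: last use an hour ago */
            int idle = (time(NULL) - last_used) >= 60;

            printf("considered idle: %s\n", idle ? "yes" : "no");
            return 0;
    }
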
index c90f03b..6e061bf 100644 (file)
@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 
        if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
                spin_lock(&ci->i_ceph_lock);
                fi->fmode |= CEPH_FILE_MODE_LAZY;
                ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
+               __ceph_touch_fmode(ci, mdsc, fi->fmode);
                spin_unlock(&ci->i_ceph_lock);
                dout("ioctl_layzio: file %p marked lazy\n", file);
 
index 544e9e8..d6b9166 100644 (file)
@@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
        return 0;
 }
 
+static int try_unlock_file(struct file *file, struct file_lock *fl)
+{
+       int err;
+       unsigned int orig_flags = fl->fl_flags;
+       fl->fl_flags |= FL_EXISTS;
+       err = locks_lock_file_wait(file, fl);
+       fl->fl_flags = orig_flags;
+       if (err == -ENOENT) {
+               if (!(orig_flags & FL_EXISTS))
+                       err = 0;
+               return err;
+       }
+       return 1;
+}
+
 /**
  * Attempt to set an fcntl lock.
  * For now, this just goes away to the server. Later it may be more awesome.
@@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
 
+       if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+               err = try_unlock_file(file, fl);
+               if (err <= 0)
+                       return err;
+       }
+
        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
        if (!err) {
-               if (op == CEPH_MDS_OP_SETFILELOCK) {
+               if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
                        dout("mds locked, locking locally\n");
                        err = posix_lock_file(file, fl, NULL);
                        if (err) {
@@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
        else
                lock_cmd = CEPH_LOCK_UNLOCK;
 
+       if (F_UNLCK == fl->fl_type) {
+               err = try_unlock_file(file, fl);
+               if (err <= 0)
+                       return err;
+       }
+
        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
                                inode, lock_cmd, wait, fl);
-       if (!err) {
+       if (!err && F_UNLCK != fl->fl_type) {
                err = locks_lock_file_wait(file, fl);
                if (err) {
                        ceph_lock_message(CEPH_LOCK_FLOCK,
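
    With try_unlock_file() above, an F_UNLCK request is now applied to the
    local VFS lock state first (FL_EXISTS makes locks_lock_file_wait() report
    -ENOENT when there is nothing to unlock), and only then forwarded to the
    MDS. From userspace the sequence is just an ordinary fcntl lock/unlock; a
    hedged sketch with a made-up path:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
            int fd = open("/mnt/cephfs/testfile", O_RDWR);   /* made-up path */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (fcntl(fd, F_SETLKW, &fl) == -1)              /* take the lock */
                    perror("lock");
            fl.l_type = F_UNLCK;
            if (fcntl(fd, F_SETLK, &fl) == -1)               /* release: exercises the new unlock path */
                    perror("unlock");
            close(fd);
            return 0;
    }
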
index bbbbddf..486f91f 100644 (file)
@@ -415,21 +415,121 @@ bad:
        return -EIO;
 }
 
+
+#if BITS_PER_LONG == 64
+
+#define DELEGATED_INO_AVAILABLE                xa_mk_value(1)
+
+static int ceph_parse_deleg_inos(void **p, void *end,
+                                struct ceph_mds_session *s)
+{
+       u32 sets;
+
+       ceph_decode_32_safe(p, end, sets, bad);
+       dout("got %u sets of delegated inodes\n", sets);
+       while (sets--) {
+               u64 start, len, ino;
+
+               ceph_decode_64_safe(p, end, start, bad);
+               ceph_decode_64_safe(p, end, len, bad);
+               while (len--) {
+                       int err = xa_insert(&s->s_delegated_inos, ino = start++,
+                                           DELEGATED_INO_AVAILABLE,
+                                           GFP_KERNEL);
+                       if (!err) {
+                               dout("added delegated inode 0x%llx\n",
+                                    start - 1);
+                       } else if (err == -EBUSY) {
+                               pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
+                                       start - 1);
+                       } else {
+                               return err;
+                       }
+               }
+       }
+       return 0;
+bad:
+       return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+       unsigned long ino;
+       void *val;
+
+       xa_for_each(&s->s_delegated_inos, ino, val) {
+               val = xa_erase(&s->s_delegated_inos, ino);
+               if (val == DELEGATED_INO_AVAILABLE)
+                       return ino;
+       }
+       return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+       return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
+                        GFP_KERNEL);
+}
+#else /* BITS_PER_LONG == 64 */
+/*
+ * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
+ * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
+ * and bottom words?
+ */
+static int ceph_parse_deleg_inos(void **p, void *end,
+                                struct ceph_mds_session *s)
+{
+       u32 sets;
+
+       ceph_decode_32_safe(p, end, sets, bad);
+       if (sets)
+               ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
+       return 0;
+bad:
+       return -EIO;
+}
+
+u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
+{
+       return 0;
+}
+
+int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
+{
+       return 0;
+}
+#endif /* BITS_PER_LONG == 64 */
+
 /*
  * parse create results
  */
 static int parse_reply_info_create(void **p, void *end,
                                  struct ceph_mds_reply_info_parsed *info,
-                                 u64 features)
+                                 u64 features, struct ceph_mds_session *s)
 {
+       int ret;
+
        if (features == (u64)-1 ||
            (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
-               /* Malformed reply? */
                if (*p == end) {
+                       /* Malformed reply? */
                        info->has_create_ino = false;
-               } else {
+               } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
+                       u8 struct_v, struct_compat;
+                       u32 len;
+
                        info->has_create_ino = true;
+                       ceph_decode_8_safe(p, end, struct_v, bad);
+                       ceph_decode_8_safe(p, end, struct_compat, bad);
+                       ceph_decode_32_safe(p, end, len, bad);
+                       ceph_decode_64_safe(p, end, info->ino, bad);
+                       ret = ceph_parse_deleg_inos(p, end, s);
+                       if (ret)
+                               return ret;
+               } else {
+                       /* legacy */
                        ceph_decode_64_safe(p, end, info->ino, bad);
+                       info->has_create_ino = true;
                }
        } else {
                if (*p != end)
@@ -448,7 +548,7 @@ bad:
  */
 static int parse_reply_info_extra(void **p, void *end,
                                  struct ceph_mds_reply_info_parsed *info,
-                                 u64 features)
+                                 u64 features, struct ceph_mds_session *s)
 {
        u32 op = le32_to_cpu(info->head->op);
 
@@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
        else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
                return parse_reply_info_readdir(p, end, info, features);
        else if (op == CEPH_MDS_OP_CREATE)
-               return parse_reply_info_create(p, end, info, features);
+               return parse_reply_info_create(p, end, info, features, s);
        else
                return -EIO;
 }
@@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
 /*
  * parse entire mds reply
  */
-static int parse_reply_info(struct ceph_msg *msg,
+static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
                            struct ceph_mds_reply_info_parsed *info,
                            u64 features)
 {
@@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
                ceph_decode_need(&p, end, len, bad);
-               err = parse_reply_info_extra(&p, p+len, info, features);
+               err = parse_reply_info_extra(&p, p+len, info, features, s);
                if (err < 0)
                        goto out_bad;
        }
@@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
        if (refcount_dec_and_test(&s->s_ref)) {
                if (s->s_auth.authorizer)
                        ceph_auth_destroy_authorizer(s->s_auth.authorizer);
+               xa_destroy(&s->s_delegated_inos);
                kfree(s);
        }
 }
@@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
        refcount_set(&s->s_ref, 1);
        INIT_LIST_HEAD(&s->s_waiting);
        INIT_LIST_HEAD(&s->s_unsafe);
+       xa_init(&s->s_delegated_inos);
        s->s_num_cap_releases = 0;
        s->s_cap_reconnect = 0;
        s->s_cap_iterator = NULL;
@@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref)
        struct ceph_mds_request *req = container_of(kref,
                                                    struct ceph_mds_request,
                                                    r_kref);
+       ceph_mdsc_release_dir_caps(req);
        destroy_reply_info(&req->r_reply_info);
        if (req->r_request)
                ceph_msg_put(req->r_request);
@@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref)
        put_request_session(req);
        ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
        WARN_ON_ONCE(!list_empty(&req->r_wait));
-       kfree(req);
+       kmem_cache_free(ceph_mds_request_cachep, req);
 }
 
 DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
@@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc,
                mdsc->oldest_tid = req->r_tid;
 
        if (dir) {
+               struct ceph_inode_info *ci = ceph_inode(dir);
+
                ihold(dir);
                req->r_unsafe_dir = dir;
+               spin_lock(&ci->i_unsafe_lock);
+               list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+               spin_unlock(&ci->i_unsafe_lock);
        }
 }
 
@@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 
        erase_request(&mdsc->request_tree, req);
 
-       if (req->r_unsafe_dir  &&
-           test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
+       if (req->r_unsafe_dir) {
                struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
                spin_lock(&ci->i_unsafe_lock);
                list_del_init(&req->r_unsafe_dir_item);
@@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
        dout("removing cap %p, ci is %p, inode is %p\n",
             cap, ci, &ci->vfs_inode);
        spin_lock(&ci->i_ceph_lock);
-       if (cap->mds_wanted | cap->issued)
-               ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
        __ceph_remove_cap(cap, false);
        if (!ci->i_auth_cap) {
                struct ceph_cap_flush *cf;
@@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
                        /* mds did not re-issue stale cap */
                        spin_lock(&ci->i_ceph_lock);
                        cap->issued = cap->implemented = CEPH_CAP_PIN;
-                       /* make sure mds knows what we want */
-                       if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
-                               ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
                        spin_unlock(&ci->i_ceph_lock);
                }
        } else if (ev == FORCE_RO) {
@@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
        }
        /* The inode has cached pages, but it's no longer used.
         * we can safely drop it */
-       if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+       if (S_ISREG(inode->i_mode) &&
+           wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
            !(oissued & CEPH_CAP_FILE_CACHE)) {
          used = 0;
          oissued = 0;
@@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 {
-       struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+       struct ceph_mds_request *req;
 
+       req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
        if (!req)
                return ERR_PTR(-ENOMEM);
 
@@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        head->op = cpu_to_le32(req->r_op);
        head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
        head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
-       head->ino = 0;
+       head->ino = cpu_to_le64(req->r_deleg_ino);
        head->args = req->r_args;
 
        ceph_encode_filepath(&p, end, ino1, path1);
@@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        if (req->r_inode_drop)
                releases += ceph_encode_inode_release(&p,
                      req->r_inode ? req->r_inode : d_inode(req->r_dentry),
-                     mds, req->r_inode_drop, req->r_inode_unless, 0);
+                     mds, req->r_inode_drop, req->r_inode_unless,
+                     req->r_op == CEPH_MDS_OP_READDIR);
        if (req->r_dentry_drop)
                releases += ceph_encode_dentry_release(&p, req->r_dentry,
                                req->r_parent, mds, req->r_dentry_drop,
@@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
        if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
                flags |= CEPH_MDS_FLAG_REPLAY;
+       if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
+               flags |= CEPH_MDS_FLAG_ASYNC;
        if (req->r_parent)
                flags |= CEPH_MDS_FLAG_WANT_DENTRY;
        rhead->flags = cpu_to_le32(flags);
        rhead->num_fwd = req->r_num_fwd;
        rhead->num_retry = req->r_attempts - 1;
-       rhead->ino = 0;
 
        dout(" r_parent = %p\n", req->r_parent);
        return 0;
@@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
        if (req->r_timeout &&
            time_after_eq(jiffies, req->r_started + req->r_timeout)) {
                dout("do_request timed out\n");
-               err = -EIO;
+               err = -ETIMEDOUT;
                goto finish;
        }
        if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
@@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
        mds = __choose_mds(mdsc, req, &random);
        if (mds < 0 ||
            ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+               if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+                       err = -EJUKEBOX;
+                       goto finish;
+               }
                dout("do_request no mds or not active, waiting for map\n");
                list_add(&req->r_wait, &mdsc->waiting_for_map);
                return;
@@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc,
                        err = -EACCES;
                        goto out_session;
                }
+               /*
+                * We cannot queue async requests since the caps and delegated
+                * inodes are bound to the session. Just return -EJUKEBOX and
+                * let the caller retry a sync request in that case.
+                */
+               if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
+                       err = -EJUKEBOX;
+                       goto out_session;
+               }
                if (session->s_state == CEPH_MDS_SESSION_NEW ||
                    session->s_state == CEPH_MDS_SESSION_CLOSING) {
                        __open_session(mdsc, session);
@@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
                              struct ceph_mds_request *req)
 {
-       int err;
+       int err = 0;
 
        /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
        if (req->r_inode)
                ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
        if (req->r_parent) {
-               ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+               struct ceph_inode_info *ci = ceph_inode(req->r_parent);
+               int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
+                           CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
+               spin_lock(&ci->i_ceph_lock);
+               ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
+               __ceph_touch_fmode(ci, mdsc, fmode);
+               spin_unlock(&ci->i_ceph_lock);
                ihold(req->r_parent);
        }
        if (req->r_old_dentry_dir)
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
 
+       if (req->r_inode) {
+               err = ceph_wait_on_async_create(req->r_inode);
+               if (err) {
+                       dout("%s: wait for async create returned: %d\n",
+                            __func__, err);
+                       return err;
+               }
+       }
+
+       if (!err && req->r_old_inode) {
+               err = ceph_wait_on_async_create(req->r_old_inode);
+               if (err) {
+                       dout("%s: wait for async create returned: %d\n",
+                            __func__, err);
+                       return err;
+               }
+       }
+
        dout("submit_request on %p for inode %p\n", req, dir);
        mutex_lock(&mdsc->mutex);
        __register_request(mdsc, req, dir);
@@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
                if (timeleft > 0)
                        err = 0;
                else if (!timeleft)
-                       err = -EIO;  /* timed out */
+                       err = -ETIMEDOUT;  /* timed out */
                else
                        err = timeleft;  /* killed */
        }
@@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        } else {
                set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
                list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
-               if (req->r_unsafe_dir) {
-                       struct ceph_inode_info *ci =
-                                       ceph_inode(req->r_unsafe_dir);
-                       spin_lock(&ci->i_unsafe_lock);
-                       list_add_tail(&req->r_unsafe_dir_item,
-                                     &ci->i_unsafe_dirops);
-                       spin_unlock(&ci->i_unsafe_lock);
-               }
        }
 
        dout("handle_reply tid %lld result %d\n", tid, result);
        rinfo = &req->r_reply_info;
        if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
-               err = parse_reply_info(msg, rinfo, (u64)-1);
+               err = parse_reply_info(session, msg, rinfo, (u64)-1);
        else
-               err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
+               err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
        mutex_unlock(&mdsc->mutex);
 
        mutex_lock(&session->s_mutex);
@@ -3249,6 +3384,17 @@ bad:
        return;
 }
 
+void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
+{
+       int dcaps;
+
+       dcaps = xchg(&req->r_dir_caps, 0);
+       if (dcaps) {
+               dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
+               ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+       }
+}
+
 /*
  * called under session->mutex.
  */
@@ -3276,9 +3422,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
                        continue;
                if (req->r_attempts == 0)
                        continue; /* only old requests */
-               if (req->r_session &&
-                   req->r_session->s_mds == session->s_mds)
-                       __send_request(mdsc, session, req, true);
+               if (!req->r_session)
+                       continue;
+               if (req->r_session->s_mds != session->s_mds)
+                       continue;
+
+               ceph_mdsc_release_dir_caps(req);
+
+               __send_request(mdsc, session, req, true);
        }
        mutex_unlock(&mdsc->mutex);
 }
@@ -3362,7 +3513,7 @@ fail_msg:
 /*
  * Encode information about a cap for a reconnect with the MDS.
  */
-static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
                          void *arg)
 {
        union {
@@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        cap->mseq = 0;       /* and migrate_seq */
        cap->cap_gen = cap->session->s_cap_gen;
 
+       /* These are lost when the session goes away */
+       if (S_ISDIR(inode->i_mode)) {
+               if (cap->issued & CEPH_CAP_DIR_CREATE) {
+                       ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
+                       memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
+               }
+               cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
+       }
+
        if (recon_state->msg_version >= 2) {
                rec.v2.cap_id = cpu_to_le64(cap->cap_id);
                rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
        if (!reply)
                goto fail_nomsg;
 
+       xa_destroy(&session->s_delegated_inos);
+
        mutex_lock(&session->s_mutex);
        session->s_state = CEPH_MDS_SESSION_RECONNECTING;
        session->s_seq = 0;
@@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
                recon_state.msg_version = 2;
        }
        /* traverse this session's caps */
-       err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
+       err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
 
        spin_lock(&session->s_cap_lock);
        session->s_cap_reconnect = 0;
index 27a7446..4e5be79 100644 (file)
@@ -23,8 +23,9 @@ enum ceph_feature_type {
        CEPHFS_FEATURE_RECLAIM_CLIENT,
        CEPHFS_FEATURE_LAZY_CAP_WANTED,
        CEPHFS_FEATURE_MULTI_RECONNECT,
+       CEPHFS_FEATURE_DELEG_INO,
 
-       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
 };
 
 /*
@@ -37,6 +38,7 @@ enum ceph_feature_type {
        CEPHFS_FEATURE_REPLY_ENCODING,          \
        CEPHFS_FEATURE_LAZY_CAP_WANTED,         \
        CEPHFS_FEATURE_MULTI_RECONNECT,         \
+       CEPHFS_FEATURE_DELEG_INO,               \
                                                \
        CEPHFS_FEATURE_MAX,                     \
 }
@@ -201,6 +203,7 @@ struct ceph_mds_session {
 
        struct list_head  s_waiting;  /* waiting requests */
        struct list_head  s_unsafe;   /* unsafe requests */
+       struct xarray     s_delegated_inos;
 };
 
 /*
@@ -255,6 +258,7 @@ struct ceph_mds_request {
 #define CEPH_MDS_R_GOT_RESULT          (5) /* got a result */
 #define CEPH_MDS_R_DID_PREPOPULATE     (6) /* prepopulated readdir */
 #define CEPH_MDS_R_PARENT_LOCKED       (7) /* is r_parent->i_rwsem wlocked? */
+#define CEPH_MDS_R_ASYNC               (8) /* async request */
        unsigned long   r_req_flags;
 
        struct mutex r_fill_mutex;
@@ -263,6 +267,7 @@ struct ceph_mds_request {
        int r_fmode;        /* file mode, if expecting cap */
        kuid_t r_uid;
        kgid_t r_gid;
+       int r_request_release_offset;
        struct timespec64 r_stamp;
 
        /* for choosing which mds to send this request to */
@@ -280,12 +285,16 @@ struct ceph_mds_request {
        int r_old_inode_drop, r_old_inode_unless;
 
        struct ceph_msg  *r_request;  /* original request */
-       int r_request_release_offset;
        struct ceph_msg  *r_reply;
        struct ceph_mds_reply_info_parsed r_reply_info;
-       struct page *r_locked_page;
        int r_err;
 
+
+       struct page *r_locked_page;
+       int r_dir_caps;
+       int r_num_caps;
+       u32               r_readdir_offset;
+
        unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
        unsigned long r_started;  /* start time to measure timeout against */
        unsigned long r_request_started; /* start time for mds request only,
@@ -304,6 +313,7 @@ struct ceph_mds_request {
        int               r_num_fwd;    /* number of forward attempts */
        int               r_resend_mds; /* mds to resend to next, if any*/
        u32               r_sent_on_mseq; /* cap mseq request was sent at*/
+       u64               r_deleg_ino;
 
        struct list_head  r_wait;
        struct completion r_completion;
@@ -315,10 +325,8 @@ struct ceph_mds_request {
        long long         r_dir_release_cnt;
        long long         r_dir_ordered_cnt;
        int               r_readdir_cache_idx;
-       u32               r_readdir_offset;
 
        struct ceph_cap_reservation r_caps_reservation;
-       int r_num_caps;
 };
 
 struct ceph_pool_perm {
@@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                                struct inode *dir,
                                struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
 {
        kref_get(&req->r_kref);
@@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
 extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
                          struct ceph_mds_session *session,
                          int max_caps);
+
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+
+       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+                          TASK_INTERRUPTIBLE);
+}
+
+extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
+extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
 #endif
index c7f1506..c9784eb 100644 (file)
@@ -155,6 +155,7 @@ enum {
        Opt_acl,
        Opt_quotadf,
        Opt_copyfrom,
+       Opt_wsync,
 };
 
 enum ceph_recover_session_mode {
@@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
        fsparam_string  ("snapdirname",                 Opt_snapdirname),
        fsparam_string  ("source",                      Opt_source),
        fsparam_u32     ("wsize",                       Opt_wsize),
+       fsparam_flag_no ("wsync",                       Opt_wsync),
        {}
 };
 
@@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
                        fc->sb_flags &= ~SB_POSIXACL;
                }
                break;
+       case Opt_wsync:
+               if (!result.negated)
+                       fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
+               else
+                       fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
+               break;
        default:
                BUG();
        }
@@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
        if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
                seq_show_option(m, "recover_session", "clean");
 
+       if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+               seq_puts(m, ",nowsync");
+
        if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
                seq_printf(m, ",wsize=%u", fsopt->wsize);
        if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep;
 struct kmem_cache *ceph_dentry_cachep;
 struct kmem_cache *ceph_file_cachep;
 struct kmem_cache *ceph_dir_file_cachep;
+struct kmem_cache *ceph_mds_request_cachep;
 
 static void ceph_inode_init_once(void *foo)
 {
@@ -769,6 +781,10 @@ static int __init init_caches(void)
        if (!ceph_dir_file_cachep)
                goto bad_dir_file;
 
+       ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+       if (!ceph_mds_request_cachep)
+               goto bad_mds_req;
+
        error = ceph_fscache_register();
        if (error)
                goto bad_fscache;
@@ -776,6 +792,8 @@ static int __init init_caches(void)
        return 0;
 
 bad_fscache:
+       kmem_cache_destroy(ceph_mds_request_cachep);
+bad_mds_req:
        kmem_cache_destroy(ceph_dir_file_cachep);
 bad_dir_file:
        kmem_cache_destroy(ceph_file_cachep);
@@ -804,6 +822,7 @@ static void destroy_caches(void)
        kmem_cache_destroy(ceph_dentry_cachep);
        kmem_cache_destroy(ceph_file_cachep);
        kmem_cache_destroy(ceph_dir_file_cachep);
+       kmem_cache_destroy(ceph_mds_request_cachep);
 
        ceph_fscache_unregister();
 }
@@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc)
 
 static int ceph_reconfigure_fc(struct fs_context *fc)
 {
+       struct ceph_parse_opts_ctx *pctx = fc->fs_private;
+       struct ceph_mount_options *fsopt = pctx->opts;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
+
+       if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
+               ceph_set_mount_opt(fsc, ASYNC_DIROPS);
+       else
+               ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+
        sync_filesystem(fc->root->d_sb);
        return 0;
 }
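
    The new wsync/nowsync mount option above (fsparam_flag_no plus the
    ASYNC_DIROPS handling in ceph_reconfigure_fc) is what userspace toggles to
    allow asynchronous directory operations. A hedged usage sketch via
    mount(2); the monitor address, mount point and auth options are made up:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* "nowsync" sets CEPH_MOUNT_OPT_ASYNC_DIROPS; "wsync" (the default)
             * clears it, and a remount goes through ceph_reconfigure_fc(). */
            if (mount("192.168.0.1:6789:/", "/mnt/cephfs", "ceph", 0,
                      "name=admin,secretfile=/etc/ceph/admin.secret,nowsync") != 0)
                    perror("mount");
            return 0;
    }
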
index 037cdfb..60aac3a 100644 (file)
 #define CEPH_MOUNT_OPT_MOUNTWAIT       (1<<12) /* mount waits if no mds is up */
 #define CEPH_MOUNT_OPT_NOQUOTADF       (1<<13) /* no root dir quota in statfs */
 #define CEPH_MOUNT_OPT_NOCOPYFROM      (1<<14) /* don't use RADOS 'copy-from' op */
+#define CEPH_MOUNT_OPT_ASYNC_DIROPS    (1<<15) /* allow async directory ops */
 
 #define CEPH_MOUNT_OPT_DEFAULT                 \
        (CEPH_MOUNT_OPT_DCACHE |                \
         CEPH_MOUNT_OPT_NOCOPYFROM)
 
 #define ceph_set_mount_opt(fsc, opt) \
-       (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+       (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
+#define ceph_clear_mount_opt(fsc, opt) \
+       (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
 #define ceph_test_mount_opt(fsc, opt) \
        (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
@@ -170,9 +173,9 @@ struct ceph_cap {
        struct list_head caps_item;
 };
 
-#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
-#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
-#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
+#define CHECK_CAPS_AUTHONLY   1  /* only check auth cap */
+#define CHECK_CAPS_FLUSH      2  /* flush any dirty caps */
+#define CHECK_CAPS_NOINVAL    4  /* don't invalidate pagecache */
 
 struct ceph_cap_flush {
        u64 tid;
@@ -284,6 +287,7 @@ struct ceph_dentry_info {
 #define CEPH_DENTRY_REFERENCED         1
 #define CEPH_DENTRY_LEASE_LIST         2
 #define CEPH_DENTRY_SHRINK_LIST                4
+#define CEPH_DENTRY_PRIMARY_LINK       8
 
 struct ceph_inode_xattrs_info {
        /*
@@ -315,13 +319,14 @@ struct ceph_inode_info {
        u64 i_inline_version;
        u32 i_time_warp_seq;
 
-       unsigned i_ceph_flags;
+       unsigned long i_ceph_flags;
        atomic64_t i_release_count;
        atomic64_t i_ordered_count;
        atomic64_t i_complete_seq[2];
 
        struct ceph_dir_layout i_dir_layout;
        struct ceph_file_layout i_layout;
+       struct ceph_file_layout i_cached_layout;        // for async creates
        char *i_symlink;
 
        /* for dirs */
@@ -352,7 +357,6 @@ struct ceph_inode_info {
        struct ceph_cap_flush *i_prealloc_cap_flush;
        struct list_head i_cap_flush_list;
        wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
-       unsigned long i_hold_caps_min; /* jiffies */
        unsigned long i_hold_caps_max; /* jiffies */
        struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
        struct ceph_cap_reservation i_cap_migration_resv;
@@ -361,6 +365,8 @@ struct ceph_inode_info {
                                                    dirty|flushing caps */
        unsigned i_snap_caps;           /* cap bits for snapped files */
 
+       unsigned long i_last_rd;
+       unsigned long i_last_wr;
        int i_nr_by_mode[CEPH_FILE_MODE_BITS];  /* open file counts */
 
        struct mutex i_truncate_mutex;
@@ -375,7 +381,7 @@ struct ceph_inode_info {
 
        /* held references to caps */
        int i_pin_ref;
-       int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
+       int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
        int i_wrbuffer_ref, i_wrbuffer_ref_head;
        atomic_t i_filelock_ref;
        atomic_t i_shared_gen;       /* increment each time we get FILE_SHARED */
@@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
  * Ceph inode.
  */
 #define CEPH_I_DIR_ORDERED     (1 << 0)  /* dentries in dir are ordered */
-#define CEPH_I_NODELAY         (1 << 1)  /* do not delay cap release */
 #define CEPH_I_FLUSH           (1 << 2)  /* do not delay flush of dirty metadata */
 #define CEPH_I_POOL_PERM       (1 << 3)  /* pool rd/wr bits are valid */
 #define CEPH_I_POOL_RD         (1 << 4)  /* can read from pool */
 #define CEPH_I_POOL_WR         (1 << 5)  /* can write to pool */
 #define CEPH_I_SEC_INITED      (1 << 6)  /* security initialized */
-#define CEPH_I_CAP_DROPPED     (1 << 7)  /* caps were forcibly dropped */
-#define CEPH_I_KICK_FLUSH      (1 << 8)  /* kick flushing caps */
-#define CEPH_I_FLUSH_SNAPS     (1 << 9)  /* need flush snapss */
-#define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
-#define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
-#define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
+#define CEPH_I_KICK_FLUSH      (1 << 7)  /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS     (1 << 8)  /* need flush snapss */
+#define CEPH_I_ERROR_WRITE     (1 << 9) /* have seen write errors */
+#define CEPH_I_ERROR_FILELOCK  (1 << 10) /* have seen file lock errors */
+#define CEPH_I_ODIRECT         (1 << 11) /* inode in direct I/O mode */
+#define CEPH_ASYNC_CREATE_BIT  (12)      /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
 
 /*
  * Masks of ceph inode work.
@@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
 extern int __ceph_caps_used(struct ceph_inode_info *ci);
 
-extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
-
-/*
- * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
- */
-static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
 {
-       int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
-       if (w & CEPH_CAP_FILE_BUFFER)
-               w |= CEPH_CAP_FILE_EXCL;  /* we want EXCL if dirty data */
-       return w;
+       return ci->i_nr_by_mode[0];
 }
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
 
 /* what the mds thinks we want */
 extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
@@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
 }
 
 /* inode.c */
+struct ceph_mds_reply_info_in;
+struct ceph_mds_reply_dirfrag;
+
 extern const struct inode_operations ceph_file_iops;
 
 extern struct inode *ceph_alloc_inode(struct super_block *sb);
@@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
                                u64 time_warp_seq, struct timespec64 *ctime,
                                struct timespec64 *mtime,
                                struct timespec64 *atime);
+extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
+                   struct ceph_mds_reply_info_in *iinfo,
+                   struct ceph_mds_reply_dirfrag *dirinfo,
+                   struct ceph_mds_session *session, int cap_fmode,
+                   struct ceph_cap_reservation *caps_reservation);
 extern int ceph_fill_trace(struct super_block *sb,
                           struct ceph_mds_request *req);
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
@@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
                                     struct ceph_cap_reservation *ctx);
 extern void ceph_add_cap(struct inode *inode,
                         struct ceph_mds_session *session, u64 cap_id,
-                        int fmode, unsigned issued, unsigned wanted,
+                        unsigned issued, unsigned wanted,
                         unsigned cap, unsigned seq, u64 realmino, int flags,
                         struct ceph_cap **new_cap);
 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
@@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                          struct ceph_mds_session *session);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                    struct ceph_mds_session *session);
+void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
+                                  struct ceph_inode_info *ci);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
                                             int mds);
+extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
+                               bool snap_rwsem_locked);
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode,
                             int need, int want, bool nonblock, int *got);
 
 /* for counting open files by mode */
-extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
-extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
+extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
+                              struct ceph_mds_client *mdsc, int fmode);
 
 /* addr.c */
 extern const struct address_space_operations ceph_aops;
@@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 
-extern int ceph_renew_caps(struct inode *inode);
+extern int ceph_renew_caps(struct inode *inode, int fmode);
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                            struct file *file, unsigned flags, umode_t mode);
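
A side note on the i_ceph_flags change above: widening it to unsigned long matches what the kernel's generic bit helpers (set_bit(), test_bit(), wait_on_bit(), ...) expect, presumably so the new CEPH_ASYNC_CREATE_BIT can be manipulated and waited on that way. A toy user-space sketch of the flag-word pattern (names here are invented for illustration, not taken from the patch):

    #include <stdbool.h>
    #include <stdio.h>

    #define ASYNC_CREATE_BIT 12UL

    /* stand-ins for the kernel's set_bit()/clear_bit()/test_bit() */
    static void flag_set(unsigned long *word, unsigned long bit)   { *word |= 1UL << bit; }
    static void flag_clear(unsigned long *word, unsigned long bit) { *word &= ~(1UL << bit); }
    static bool flag_test(const unsigned long *word, unsigned long bit) { return *word & (1UL << bit); }

    int main(void)
    {
        unsigned long i_ceph_flags = 0;

        flag_set(&i_ceph_flags, ASYNC_CREATE_BIT);    /* async create submitted */
        printf("pending: %d\n", flag_test(&i_ceph_flags, ASYNC_CREATE_BIT));
        flag_clear(&i_ceph_flags, ASYNC_CREATE_BIT);  /* reply arrived */
        printf("pending: %d\n", flag_test(&i_ceph_flags, ASYNC_CREATE_BIT));
        return 0;
    }
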
index 35da144..11b1672 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
        return ret;
 }
 
-static bool dax_range_is_aligned(struct block_device *bdev,
-                                unsigned int offset, unsigned int length)
+int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
+                  struct iomap *iomap)
 {
-       unsigned short sector_size = bdev_logical_block_size(bdev);
+       sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
+       pgoff_t pgoff;
+       long rc, id;
+       void *kaddr;
+       bool page_aligned = false;
 
-       if (!IS_ALIGNED(offset, sector_size))
-               return false;
-       if (!IS_ALIGNED(length, sector_size))
-               return false;
 
-       return true;
-}
+       if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
+           IS_ALIGNED(size, PAGE_SIZE))
+               page_aligned = true;
 
-int __dax_zero_page_range(struct block_device *bdev,
-               struct dax_device *dax_dev, sector_t sector,
-               unsigned int offset, unsigned int size)
-{
-       if (dax_range_is_aligned(bdev, offset, size)) {
-               sector_t start_sector = sector + (offset >> 9);
+       rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
+       if (rc)
+               return rc;
 
-               return blkdev_issue_zeroout(bdev, start_sector,
-                               size >> 9, GFP_NOFS, 0);
-       } else {
-               pgoff_t pgoff;
-               long rc, id;
-               void *kaddr;
+       id = dax_read_lock();
 
-               rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
-               if (rc)
-                       return rc;
+       if (page_aligned)
+               rc = dax_zero_page_range(iomap->dax_dev, pgoff,
+                                        size >> PAGE_SHIFT);
+       else
+               rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
+       if (rc < 0) {
+               dax_read_unlock(id);
+               return rc;
+       }
 
-               id = dax_read_lock();
-               rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
-               if (rc < 0) {
-                       dax_read_unlock(id);
-                       return rc;
-               }
+       if (!page_aligned) {
                memset(kaddr + offset, 0, size);
-               dax_flush(dax_dev, kaddr + offset, size);
-               dax_read_unlock(id);
+               dax_flush(iomap->dax_dev, kaddr + offset, size);
        }
+       dax_read_unlock(id);
        return 0;
 }
-EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
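
In the rewritten dax_iomap_zero() above, the dax_zero_page_range() fast path is taken only when the zeroed range starts on a page boundary and spans whole pages; anything smaller falls back to dax_direct_access() plus memset() and dax_flush(). A stand-alone sketch of just that alignment test, assuming 4 KiB pages and 512-byte sectors:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE    4096u
    #define SECTOR_SHIFT 9
    #define IS_ALIGNED(x, a) (((x) & ((uint64_t)(a) - 1)) == 0)

    /* mirrors the page_aligned test in dax_iomap_zero(): the range must
     * start on a page boundary (in bytes) and cover a whole number of pages */
    static bool zero_range_is_page_aligned(uint64_t sector, unsigned int size)
    {
        return IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
               IS_ALIGNED(size, PAGE_SIZE);
    }

    int main(void)
    {
        printf("%d\n", zero_range_is_page_aligned(8, 4096));  /* 1: sector 8 is a 4 KiB offset */
        printf("%d\n", zero_range_is_page_aligned(8, 512));   /* 0: sub-page length */
        return 0;
    }
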
index eee3c92..8c59664 100644 (file)
@@ -218,13 +218,18 @@ struct eventpoll {
        struct file *file;
 
        /* used to optimize loop detection check */
-       int visited;
        struct list_head visited_list_link;
+       int visited;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
        /* used to track busy poll napi_id */
        unsigned int napi_id;
 #endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       /* tracks wakeup nests for lockdep validation */
+       u8 nests;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -545,30 +550,47 @@ out_unlock:
  */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 {
+       struct eventpoll *ep_src;
        unsigned long flags;
-       int subclass;
+       u8 nests = 0;
 
-       local_irq_save(flags);
-       preempt_disable();
-       subclass = __this_cpu_read(wakeup_nest);
-       spin_lock_nested(&wq->lock, subclass + 1);
-       __this_cpu_inc(wakeup_nest);
-       wake_up_locked_poll(wq, POLLIN);
-       __this_cpu_dec(wakeup_nest);
-       spin_unlock(&wq->lock);
-       local_irq_restore(flags);
-       preempt_enable();
+       /*
+        * To set the subclass or nesting level for spin_lock_irqsave_nested()
+        * it might be natural to create a per-cpu nest count. However, since
+        * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+        * schedule() in the -rt kernel, the per-cpu variable is no longer
+        * protected. Thus, we are introducing a per eventpoll nest field.
+        * If we are not being called from ep_poll_callback(), epi is NULL and
+        * we are at the first level of nesting, 0. Otherwise, we are being
+        * called from ep_poll_callback() and if a previous wakeup source is
+        * not an epoll file itself, we are at depth 1 since the wakeup source
+        * is depth 0. If the wakeup source is a previous epoll file in the
+        * wakeup chain then we use its nests value and record ours as
+        * nests + 1. The previous epoll file nests value is stable since it's
+        * already holding its own poll_wait.lock.
+        */
+       if (epi) {
+               if ((is_file_epoll(epi->ffd.file))) {
+                       ep_src = epi->ffd.file->private_data;
+                       nests = ep_src->nests;
+               } else {
+                       nests = 1;
+               }
+       }
+       spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+       ep->nests = nests + 1;
+       wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+       ep->nests = 0;
+       spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
 }
 
 #else
 
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
 {
-       wake_up_poll(wq, EPOLLIN);
+       wake_up_poll(&ep->poll_wait, EPOLLIN);
 }
 
 #endif
@@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)
 
        /* We need to release all tasks waiting for these file */
        if (waitqueue_active(&ep->poll_wait))
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, NULL);
 
        /*
         * We need to lock this because we could be hit by
@@ -1258,7 +1280,7 @@ out_unlock:
 
        /* We have to call this outside the lock */
        if (pwake)
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, epi);
 
        if (!(epi->event.events & EPOLLEXCLUSIVE))
                ewake = 1;
@@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
        /* We have to call this outside the lock */
        if (pwake)
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, NULL);
 
        return 0;
 
@@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 
        /* We have to call this outside the lock */
        if (pwake)
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, NULL);
 
        return 0;
 }
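
The key idea in the reworked ep_poll_safewake() above is that the lockdep subclass for poll_wait.lock is now derived from the wakeup source rather than from a per-CPU counter, which is no longer safe once the lock may sleep under PREEMPT_RT. A reduced, user-space sketch of how the nesting level is picked (the logic only, not the kernel code itself):

    #include <stdio.h>

    struct eventpoll { unsigned char nests; };

    /* source is the wakeup source when it is itself an epoll file, else NULL */
    static unsigned pick_nests(const struct eventpoll *source, int from_callback)
    {
        if (!from_callback)
            return 0;               /* not called from ep_poll_callback(): top level */
        if (source)
            return source->nests;   /* wakeup source is another epoll file: continue its depth */
        return 1;                   /* ordinary file woke us: it sits at depth 0, we are at 1 */
    }

    int main(void)
    {
        struct eventpoll outer = { .nests = 1 };
        printf("%u\n", pick_nests(NULL, 0));    /* 0 */
        printf("%u\n", pick_nests(NULL, 1));    /* 1 */
        printf("%u\n", pick_nests(&outer, 1));  /* 1, then recorded as nests + 1 = 2 */
        return 0;
    }
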
index f0faada..bb68d21 100644 (file)
@@ -118,3 +118,12 @@ config F2FS_FS_LZ4
        default y
        help
          Support LZ4 compress algorithm, if unsure, say Y.
+
+config F2FS_FS_ZSTD
+       bool "ZSTD compression support"
+       depends on F2FS_FS_COMPRESSION
+       select ZSTD_COMPRESS
+       select ZSTD_DECOMPRESS
+       default y
+       help
+         Support ZSTD compress algorithm, if unsure, say Y.
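
With the new option, turning ZSTD support on for f2fs is a two-line config change; the fragment below assumes the compression infrastructure itself is already enabled, and the algorithm should then be selectable per mount via compress_algorithm=zstd:

    CONFIG_F2FS_FS_COMPRESSION=y
    CONFIG_F2FS_FS_ZSTD=y
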
index 44e84ac..852890b 100644 (file)
@@ -50,9 +50,6 @@ repeat:
        return page;
 }
 
-/*
- * We guarantee no failure on the returned page.
- */
 static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
                                                        bool is_meta)
 {
@@ -206,7 +203,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 }
 
 /*
- * Readahead CP/NAT/SIT/SSA pages
+ * Readahead CP/NAT/SIT/SSA/POR pages
  */
 int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
                                                        int type, bool sync)
@@ -898,7 +895,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
                return -ENOMEM;
        /*
         * Finding out valid cp block involves read both
-        * sets( cp pack1 and cp pack 2)
+        * sets( cp pack 1 and cp pack 2)
         */
        cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
        cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
@@ -1250,20 +1247,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
        f2fs_unlock_all(sbi);
 }
 
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
 {
        DEFINE_WAIT(wait);
 
        for (;;) {
                prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
 
-               if (!get_pages(sbi, F2FS_WB_CP_DATA))
+               if (!get_pages(sbi, type))
                        break;
 
                if (unlikely(f2fs_cp_error(sbi)))
                        break;
 
-               io_schedule_timeout(5*HZ);
+               io_schedule_timeout(DEFAULT_IO_TIMEOUT);
        }
        finish_wait(&sbi->cp_wait, &wait);
 }
@@ -1301,10 +1298,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        else
                __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 
-       if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
-               is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+       if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
                __set_ckpt_flags(ckpt, CP_FSCK_FLAG);
 
+       if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+               __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+       else
+               __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+
        if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
                __set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
        else
@@ -1384,13 +1385,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        /* Flush all the NAT/SIT pages */
        f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
-       f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
-                                       !f2fs_cp_error(sbi));
 
-       /*
-        * modify checkpoint
-        * version number is already updated
-        */
+       /* start to update checkpoint, cp ver is already updated previously */
        ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
        ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
        for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
@@ -1493,11 +1489,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        /* Here, we have one bio having CP pack except cp pack 2 page */
        f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
-       f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
-                                       !f2fs_cp_error(sbi));
+       /* Wait for all dirty meta pages to be submitted for IO */
+       f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
 
        /* wait for previous submitted meta pages writeback */
-       f2fs_wait_on_all_pages_writeback(sbi);
+       f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
 
        /* flush all device cache */
        err = f2fs_flush_device_cache(sbi);
@@ -1506,7 +1502,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        /* barrier and flush checkpoint cp pack 2 page if it can */
        commit_checkpoint(sbi, ckpt, start_blk);
-       f2fs_wait_on_all_pages_writeback(sbi);
+       f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
 
        /*
         * invalidate intermediate page cache borrowed from meta inode which are
@@ -1543,9 +1539,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
 }
 
-/*
- * We guarantee that this checkpoint procedure will not fail.
- */
 int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1613,7 +1606,6 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
        f2fs_flush_sit_entries(sbi, cpc);
 
-       /* unlock all the fs_lock[] in do_checkpoint() */
        err = do_checkpoint(sbi, cpc);
        if (err)
                f2fs_release_discard_addrs(sbi);
@@ -1626,7 +1618,7 @@ stop:
        if (cpc->reason & CP_RECOVERY)
                f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
 
-       /* do checkpoint periodically */
+       /* update CP_TIME to trigger checkpoint periodically */
        f2fs_update_time(sbi, CP_TIME);
        trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
 out:
index d8a64be..df7b2d1 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/backing-dev.h>
 #include <linux/lzo.h>
 #include <linux/lz4.h>
+#include <linux/zstd.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -20,6 +21,8 @@ struct f2fs_compress_ops {
        int (*init_compress_ctx)(struct compress_ctx *cc);
        void (*destroy_compress_ctx)(struct compress_ctx *cc);
        int (*compress_pages)(struct compress_ctx *cc);
+       int (*init_decompress_ctx)(struct decompress_io_ctx *dic);
+       void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic);
        int (*decompress_pages)(struct decompress_io_ctx *dic);
 };
 
@@ -52,7 +55,7 @@ bool f2fs_is_compressed_page(struct page *page)
 }
 
 static void f2fs_set_compressed_page(struct page *page,
-               struct inode *inode, pgoff_t index, void *data, refcount_t *r)
+               struct inode *inode, pgoff_t index, void *data)
 {
        SetPagePrivate(page);
        set_page_private(page, (unsigned long)data);
@@ -60,8 +63,6 @@ static void f2fs_set_compressed_page(struct page *page,
        /* i_crypto_info and iv index */
        page->index = index;
        page->mapping = inode->i_mapping;
-       if (r)
-               refcount_inc(r);
 }
 
 static void f2fs_put_compressed_page(struct page *page)
@@ -291,6 +292,165 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
 };
 #endif
 
+#ifdef CONFIG_F2FS_FS_ZSTD
+#define F2FS_ZSTD_DEFAULT_CLEVEL       1
+
+static int zstd_init_compress_ctx(struct compress_ctx *cc)
+{
+       ZSTD_parameters params;
+       ZSTD_CStream *stream;
+       void *workspace;
+       unsigned int workspace_size;
+
+       params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+       workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
+
+       workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+                                       workspace_size, GFP_NOFS);
+       if (!workspace)
+               return -ENOMEM;
+
+       stream = ZSTD_initCStream(params, 0, workspace, workspace_size);
+       if (!stream) {
+               printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n",
+                               KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+                               __func__);
+               kvfree(workspace);
+               return -EIO;
+       }
+
+       cc->private = workspace;
+       cc->private2 = stream;
+
+       cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+       return 0;
+}
+
+static void zstd_destroy_compress_ctx(struct compress_ctx *cc)
+{
+       kvfree(cc->private);
+       cc->private = NULL;
+       cc->private2 = NULL;
+}
+
+static int zstd_compress_pages(struct compress_ctx *cc)
+{
+       ZSTD_CStream *stream = cc->private2;
+       ZSTD_inBuffer inbuf;
+       ZSTD_outBuffer outbuf;
+       int src_size = cc->rlen;
+       int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+       int ret;
+
+       inbuf.pos = 0;
+       inbuf.src = cc->rbuf;
+       inbuf.size = src_size;
+
+       outbuf.pos = 0;
+       outbuf.dst = cc->cbuf->cdata;
+       outbuf.size = dst_size;
+
+       ret = ZSTD_compressStream(stream, &outbuf, &inbuf);
+       if (ZSTD_isError(ret)) {
+               printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+                               KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+                               __func__, ZSTD_getErrorCode(ret));
+               return -EIO;
+       }
+
+       ret = ZSTD_endStream(stream, &outbuf);
+       if (ZSTD_isError(ret)) {
+               printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n",
+                               KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+                               __func__, ZSTD_getErrorCode(ret));
+               return -EIO;
+       }
+
+       cc->clen = outbuf.pos;
+       return 0;
+}
+
+static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
+{
+       ZSTD_DStream *stream;
+       void *workspace;
+       unsigned int workspace_size;
+
+       workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE);
+
+       workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
+                                       workspace_size, GFP_NOFS);
+       if (!workspace)
+               return -ENOMEM;
+
+       stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE,
+                                       workspace, workspace_size);
+       if (!stream) {
+               printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n",
+                               KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+                               __func__);
+               kvfree(workspace);
+               return -EIO;
+       }
+
+       dic->private = workspace;
+       dic->private2 = stream;
+
+       return 0;
+}
+
+static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic)
+{
+       kvfree(dic->private);
+       dic->private = NULL;
+       dic->private2 = NULL;
+}
+
+static int zstd_decompress_pages(struct decompress_io_ctx *dic)
+{
+       ZSTD_DStream *stream = dic->private2;
+       ZSTD_inBuffer inbuf;
+       ZSTD_outBuffer outbuf;
+       int ret;
+
+       inbuf.pos = 0;
+       inbuf.src = dic->cbuf->cdata;
+       inbuf.size = dic->clen;
+
+       outbuf.pos = 0;
+       outbuf.dst = dic->rbuf;
+       outbuf.size = dic->rlen;
+
+       ret = ZSTD_decompressStream(stream, &outbuf, &inbuf);
+       if (ZSTD_isError(ret)) {
+               printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_decompressStream failed, ret: %d\n",
+                               KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+                               __func__, ZSTD_getErrorCode(ret));
+               return -EIO;
+       }
+
+       if (dic->rlen != outbuf.pos) {
+               printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, "
+                               "expected:%lu\n", KERN_ERR,
+                               F2FS_I_SB(dic->inode)->sb->s_id,
+                               __func__, dic->rlen,
+                               PAGE_SIZE << dic->log_cluster_size);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_zstd_ops = {
+       .init_compress_ctx      = zstd_init_compress_ctx,
+       .destroy_compress_ctx   = zstd_destroy_compress_ctx,
+       .compress_pages         = zstd_compress_pages,
+       .init_decompress_ctx    = zstd_init_decompress_ctx,
+       .destroy_decompress_ctx = zstd_destroy_decompress_ctx,
+       .decompress_pages       = zstd_decompress_pages,
+};
+#endif
+
 static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
 #ifdef CONFIG_F2FS_FS_LZO
        &f2fs_lzo_ops,
@@ -302,6 +462,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
 #else
        NULL,
 #endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+       &f2fs_zstd_ops,
+#else
+       NULL,
+#endif
 };
 
 bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -334,9 +499,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
        trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
                                cc->cluster_size, fi->i_compress_algorithm);
 
-       ret = cops->init_compress_ctx(cc);
-       if (ret)
-               goto out;
+       if (cops->init_compress_ctx) {
+               ret = cops->init_compress_ctx(cc);
+               if (ret)
+                       goto out;
+       }
 
        max_len = COMPRESS_HEADER_SIZE + cc->clen;
        cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
@@ -380,21 +547,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
        }
 
        cc->cbuf->clen = cpu_to_le32(cc->clen);
-       cc->cbuf->chksum = cpu_to_le32(0);
 
        for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
                cc->cbuf->reserved[i] = cpu_to_le32(0);
 
+       nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+
+       /* zero out any unused part of the last page */
+       memset(&cc->cbuf->cdata[cc->clen], 0,
+              (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE));
+
        vunmap(cc->cbuf);
        vunmap(cc->rbuf);
 
-       nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
-
        for (i = nr_cpages; i < cc->nr_cpages; i++) {
                f2fs_put_compressed_page(cc->cpages[i]);
                cc->cpages[i] = NULL;
        }
 
+       if (cops->destroy_compress_ctx)
+               cops->destroy_compress_ctx(cc);
+
        cc->nr_cpages = nr_cpages;
 
        trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
@@ -413,7 +586,8 @@ out_free_cpages:
        kfree(cc->cpages);
        cc->cpages = NULL;
 destroy_compress_ctx:
-       cops->destroy_compress_ctx(cc);
+       if (cops->destroy_compress_ctx)
+               cops->destroy_compress_ctx(cc);
 out:
        trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
                                                        cc->clen, ret);
@@ -447,10 +621,16 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
                goto out_free_dic;
        }
 
+       if (cops->init_decompress_ctx) {
+               ret = cops->init_decompress_ctx(dic);
+               if (ret)
+                       goto out_free_dic;
+       }
+
        dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
        if (!dic->rbuf) {
                ret = -ENOMEM;
-               goto out_free_dic;
+               goto destroy_decompress_ctx;
        }
 
        dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
@@ -473,7 +653,12 @@ out_vunmap_cbuf:
        vunmap(dic->cbuf);
 out_vunmap_rbuf:
        vunmap(dic->rbuf);
+destroy_decompress_ctx:
+       if (cops->destroy_decompress_ctx)
+               cops->destroy_decompress_ctx(dic);
 out_free_dic:
+       if (verity)
+               refcount_set(&dic->ref, dic->nr_cpages);
        if (!verity)
                f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
                                                                ret, false);
@@ -532,8 +717,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
        return true;
 }
 
-/* return # of compressed block addresses */
-static int f2fs_compressed_blocks(struct compress_ctx *cc)
+static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
 {
        struct dnode_of_data dn;
        int ret;
@@ -554,10 +738,15 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
                for (i = 1; i < cc->cluster_size; i++) {
                        block_t blkaddr;
 
-                       blkaddr = datablock_addr(dn.inode,
+                       blkaddr = data_blkaddr(dn.inode,
                                        dn.node_page, dn.ofs_in_node + i);
-                       if (blkaddr != NULL_ADDR)
-                               ret++;
+                       if (compr) {
+                               if (__is_valid_data_blkaddr(blkaddr))
+                                       ret++;
+                       } else {
+                               if (blkaddr != NULL_ADDR)
+                                       ret++;
+                       }
                }
        }
 fail:
@@ -565,6 +754,18 @@ fail:
        return ret;
 }
 
+/* return # of compressed blocks in compressed cluster */
+static int f2fs_compressed_blocks(struct compress_ctx *cc)
+{
+       return __f2fs_cluster_blocks(cc, true);
+}
+
+/* return # of valid blocks in compressed cluster */
+static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
+{
+       return __f2fs_cluster_blocks(cc, false);
+}
+
 int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
 {
        struct compress_ctx cc = {
@@ -574,7 +775,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
                .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
        };
 
-       return f2fs_compressed_blocks(&cc);
+       return f2fs_cluster_blocks(&cc, false);
 }
 
 static bool cluster_may_compress(struct compress_ctx *cc)
@@ -623,7 +824,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
        bool prealloc;
 
 retry:
-       ret = f2fs_compressed_blocks(cc);
+       ret = f2fs_cluster_blocks(cc, false);
        if (ret <= 0)
                return ret;
 
@@ -653,7 +854,7 @@ retry:
                struct bio *bio = NULL;
 
                ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
-                                               &last_block_in_bio, false);
+                                       &last_block_in_bio, false, true);
                f2fs_destroy_compress_ctx(cc);
                if (ret)
                        goto release_pages;
@@ -772,7 +973,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
                .encrypted_page = NULL,
                .compressed_page = NULL,
                .submitted = false,
-               .need_lock = LOCK_RETRY,
                .io_type = io_type,
                .io_wbc = wbc,
                .encrypted = f2fs_encrypted_file(cc->inode),
@@ -785,16 +985,17 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
        loff_t psize;
        int i, err;
 
-       set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+       if (!f2fs_trylock_op(sbi))
+               return -EAGAIN;
 
-       f2fs_lock_op(sbi);
+       set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
 
        err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
        if (err)
                goto out_unlock_op;
 
        for (i = 0; i < cc->cluster_size; i++) {
-               if (datablock_addr(dn.inode, dn.node_page,
+               if (data_blkaddr(dn.inode, dn.node_page,
                                        dn.ofs_in_node + i) == NULL_ADDR)
                        goto out_put_dnode;
        }
@@ -813,7 +1014,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 
        cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
        cic->inode = inode;
-       refcount_set(&cic->ref, 1);
+       refcount_set(&cic->ref, cc->nr_cpages);
        cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
                        cc->log_cluster_size, GFP_NOFS);
        if (!cic->rpages)
@@ -823,8 +1024,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
 
        for (i = 0; i < cc->nr_cpages; i++) {
                f2fs_set_compressed_page(cc->cpages[i], inode,
-                                       cc->rpages[i + 1]->index,
-                                       cic, i ? &cic->ref : NULL);
+                                       cc->rpages[i + 1]->index, cic);
                fio.compressed_page = cc->cpages[i];
                if (fio.encrypted) {
                        fio.page = cc->rpages[i + 1];
@@ -843,9 +1043,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
        for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
                block_t blkaddr;
 
-               blkaddr = datablock_addr(dn.inode, dn.node_page,
-                                                       dn.ofs_in_node);
-               fio.page = cic->rpages[i];
+               blkaddr = f2fs_data_blkaddr(&dn);
+               fio.page = cc->rpages[i];
                fio.old_blkaddr = blkaddr;
 
                /* cluster header */
@@ -895,10 +1094,10 @@ unlock_continue:
        f2fs_put_dnode(&dn);
        f2fs_unlock_op(sbi);
 
-       down_write(&fi->i_sem);
+       spin_lock(&fi->i_size_lock);
        if (fi->last_disk_size < psize)
                fi->last_disk_size = psize;
-       up_write(&fi->i_sem);
+       spin_unlock(&fi->i_size_lock);
 
        f2fs_put_rpages(cc);
        f2fs_destroy_compress_ctx(cc);
@@ -984,24 +1183,30 @@ retry_write:
                                unlock_page(cc->rpages[i]);
                                ret = 0;
                        } else if (ret == -EAGAIN) {
+                               /*
+                                * for quota file, just redirty left pages to
+                                * avoid deadlock caused by cluster update race
+                                * from foreground operation.
+                                */
+                               if (IS_NOQUOTA(cc->inode)) {
+                                       err = 0;
+                                       goto out_err;
+                               }
                                ret = 0;
                                cond_resched();
-                               congestion_wait(BLK_RW_ASYNC, HZ/50);
+                               congestion_wait(BLK_RW_ASYNC,
+                                               DEFAULT_IO_TIMEOUT);
                                lock_page(cc->rpages[i]);
                                clear_page_dirty_for_io(cc->rpages[i]);
                                goto retry_write;
                        }
                        err = ret;
-                       goto out_fail;
+                       goto out_err;
                }
 
                *submitted += _submitted;
        }
        return 0;
-
-out_fail:
-       /* TODO: revoke partially updated block addresses */
-       BUG_ON(compr_blocks);
 out_err:
        for (++i; i < cc->cluster_size; i++) {
                if (!cc->rpages[i])
@@ -1069,7 +1274,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
 
        dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
        dic->inode = cc->inode;
-       refcount_set(&dic->ref, 1);
+       refcount_set(&dic->ref, cc->nr_cpages);
        dic->cluster_idx = cc->cluster_idx;
        dic->cluster_size = cc->cluster_size;
        dic->log_cluster_size = cc->log_cluster_size;
@@ -1093,8 +1298,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
                        goto out_free;
 
                f2fs_set_compressed_page(page, cc->inode,
-                                       start_idx + i + 1,
-                                       dic, i ? &dic->ref : NULL);
+                                       start_idx + i + 1, dic);
                dic->cpages[i] = page;
        }
 
@@ -1104,20 +1308,16 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
                goto out_free;
 
        for (i = 0; i < dic->cluster_size; i++) {
-               if (cc->rpages[i])
+               if (cc->rpages[i]) {
+                       dic->tpages[i] = cc->rpages[i];
                        continue;
+               }
 
                dic->tpages[i] = f2fs_grab_page();
                if (!dic->tpages[i])
                        goto out_free;
        }
 
-       for (i = 0; i < dic->cluster_size; i++) {
-               if (dic->tpages[i])
-                       continue;
-               dic->tpages[i] = cc->rpages[i];
-       }
-
        return dic;
 
 out_free:
@@ -1133,7 +1333,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
                for (i = 0; i < dic->cluster_size; i++) {
                        if (dic->rpages[i])
                                continue;
-                       f2fs_put_page(dic->tpages[i], 1);
+                       if (!dic->tpages[i])
+                               continue;
+                       unlock_page(dic->tpages[i]);
+                       put_page(dic->tpages[i]);
                }
                kfree(dic->tpages);
        }
@@ -1162,15 +1365,17 @@ void f2fs_decompress_end_io(struct page **rpages,
                if (!rpage)
                        continue;
 
-               if (err || PageError(rpage)) {
-                       ClearPageUptodate(rpage);
-                       ClearPageError(rpage);
-               } else {
-                       if (!verity || fsverity_verify_page(rpage))
-                               SetPageUptodate(rpage);
-                       else
-                               SetPageError(rpage);
+               if (err || PageError(rpage))
+                       goto clear_uptodate;
+
+               if (!verity || fsverity_verify_page(rpage)) {
+                       SetPageUptodate(rpage);
+                       goto unlock;
                }
+clear_uptodate:
+               ClearPageUptodate(rpage);
+               ClearPageError(rpage);
+unlock:
                unlock_page(rpage);
        }
 }
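
A pattern worth noting in the compress.c changes above: f2fs_compress_ops grows optional init/destroy hooks for both directions, LZO and LZ4 leave them NULL, ZSTD uses them to size and free its stream workspace, and every call site tests the pointer before invoking it. A generic user-space sketch of that optional-hook style (names are illustrative, not the f2fs ones):

    #include <stdio.h>

    struct codec_ops {
        int  (*init)(void **ctx);    /* optional: may be NULL */
        void (*destroy)(void *ctx);  /* optional: may be NULL */
        int  (*run)(void *ctx);      /* mandatory */
    };

    static int run_codec(const struct codec_ops *ops)
    {
        void *ctx = NULL;
        int ret = 0;

        if (ops->init) {             /* only algorithms that need state pay for it */
            ret = ops->init(&ctx);
            if (ret)
                return ret;
        }
        ret = ops->run(ctx);
        if (ops->destroy)
            ops->destroy(ctx);
        return ret;
    }

    static int noop_run(void *ctx) { (void)ctx; puts("run"); return 0; }

    int main(void)
    {
        struct codec_ops stateless = { .run = noop_run };  /* like LZO/LZ4: no init/destroy */
        return run_codec(&stateless);
    }
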
index b27b721..cdf2f62 100644 (file)
@@ -54,17 +54,13 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
        return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
 }
 
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail)
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
 {
-       struct bio *bio;
-
-       if (no_fail) {
+       if (noio) {
                /* No failure on bio allocation */
-               bio = __f2fs_bio_alloc(GFP_NOIO, npages);
-               if (!bio)
-                       bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
-               return bio;
+               return __f2fs_bio_alloc(GFP_NOIO, npages);
        }
+
        if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
                f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
                return NULL;
@@ -143,6 +139,8 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
                        f2fs_decompress_pages(bio, page, verity);
                        continue;
                }
+               if (verity)
+                       continue;
 #endif
 
                /* PG_error was set if any post_read step failed */
@@ -191,12 +189,38 @@ static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
 
 static void f2fs_verify_bio(struct bio *bio)
 {
-       struct page *page = bio_first_page_all(bio);
-       struct decompress_io_ctx *dic =
-                       (struct decompress_io_ctx *)page_private(page);
+       struct bio_vec *bv;
+       struct bvec_iter_all iter_all;
 
-       f2fs_verify_pages(dic->rpages, dic->cluster_size);
-       f2fs_free_dic(dic);
+       bio_for_each_segment_all(bv, bio, iter_all) {
+               struct page *page = bv->bv_page;
+               struct decompress_io_ctx *dic;
+
+               dic = (struct decompress_io_ctx *)page_private(page);
+
+               if (dic) {
+                       if (refcount_dec_not_one(&dic->ref))
+                               continue;
+                       f2fs_verify_pages(dic->rpages,
+                                               dic->cluster_size);
+                       f2fs_free_dic(dic);
+                       continue;
+               }
+
+               if (bio->bi_status || PageError(page))
+                       goto clear_uptodate;
+
+               if (fsverity_verify_page(page)) {
+                       SetPageUptodate(page);
+                       goto unlock;
+               }
+clear_uptodate:
+               ClearPageUptodate(page);
+               ClearPageError(page);
+unlock:
+               dec_page_count(F2FS_P_SB(page), __read_io_type(page));
+               unlock_page(page);
+       }
 }
 #endif
 
@@ -364,9 +388,6 @@ static void f2fs_write_end_io(struct bio *bio)
        bio_put(bio);
 }
 
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
 struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
                                block_t blk_addr, struct bio *bio)
 {
@@ -403,6 +424,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
        return 0;
 }
 
+/*
+ * Return true, if pre_bio's bdev is same as its target device.
+ */
 static bool __same_bdev(struct f2fs_sb_info *sbi,
                                block_t blk_addr, struct bio *bio)
 {
@@ -410,9 +434,6 @@ static bool __same_bdev(struct f2fs_sb_info *sbi,
        return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
 }
 
-/*
- * Low-level block read/write IO operations.
- */
 static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
 {
        struct f2fs_sb_info *sbi = fio->sbi;
@@ -445,7 +466,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
                if (type != DATA && type != NODE)
                        goto submit_io;
 
-               if (test_opt(sbi, LFS) && current->plug)
+               if (f2fs_lfs_mode(sbi) && current->plug)
                        blk_finish_plug(current->plug);
 
                if (F2FS_IO_ALIGNED(sbi))
@@ -928,14 +949,15 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
 
 static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
                                      unsigned nr_pages, unsigned op_flag,
-                                     pgoff_t first_idx)
+                                     pgoff_t first_idx, bool for_write)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct bio *bio;
        struct bio_post_read_ctx *ctx;
        unsigned int post_read_steps = 0;
 
-       bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
+       bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
+                                                               for_write);
        if (!bio)
                return ERR_PTR(-ENOMEM);
        f2fs_target_device(sbi, blkaddr, bio);
@@ -970,12 +992,12 @@ static void f2fs_release_read_bio(struct bio *bio)
 
 /* This can handle encryption stuffs */
 static int f2fs_submit_page_read(struct inode *inode, struct page *page,
-                                                       block_t blkaddr)
+                                               block_t blkaddr, bool for_write)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct bio *bio;
 
-       bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
+       bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write);
        if (IS_ERR(bio))
                return PTR_ERR(bio);
 
@@ -1047,8 +1069,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
        f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
 
        for (; count > 0; dn->ofs_in_node++) {
-               block_t blkaddr = datablock_addr(dn->inode,
-                                       dn->node_page, dn->ofs_in_node);
+               block_t blkaddr = f2fs_data_blkaddr(dn);
                if (blkaddr == NULL_ADDR) {
                        dn->data_blkaddr = NEW_ADDR;
                        __set_data_blkaddr(dn);
@@ -1162,7 +1183,7 @@ got_it:
                return page;
        }
 
-       err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
+       err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write);
        if (err)
                goto put_err;
        return page;
@@ -1300,8 +1321,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
        if (err)
                return err;
 
-       dn->data_blkaddr = datablock_addr(dn->inode,
-                               dn->node_page, dn->ofs_in_node);
+       dn->data_blkaddr = f2fs_data_blkaddr(dn);
        if (dn->data_blkaddr != NULL_ADDR)
                goto alloc;
 
@@ -1388,13 +1408,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
 }
 
 /*
- * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
- * f2fs_map_blocks structure.
- * If original data blocks are allocated, then give them to blockdev.
- * Otherwise,
- *     a. preallocate requested block addresses
- *     b. do not use extent cache for better performance
- *     c. give the block addresses to blockdev
+ * f2fs_map_blocks() tries to find or build a mapping relationship which
+ * maps contiguous logical blocks to physical blocks, and returns such
+ * info via the f2fs_map_blocks structure.
  */
 int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
                                                int create, int flag)
@@ -1422,7 +1438,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
        end = pgofs + maxblocks;
 
        if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
-               if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+               if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
                                                        map->m_may_create)
                        goto next_dnode;
 
@@ -1467,7 +1483,7 @@ next_dnode:
        end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
 
 next_block:
-       blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
+       blkaddr = f2fs_data_blkaddr(&dn);
 
        if (__is_valid_data_blkaddr(blkaddr) &&
                !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
@@ -1477,7 +1493,7 @@ next_block:
 
        if (__is_valid_data_blkaddr(blkaddr)) {
        /* use out-place-update for direct IO under LFS mode */
-               if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+               if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
                                                        map->m_may_create) {
                        err = __allocate_data_block(&dn, map->m_seg_type);
                        if (err)
@@ -1980,7 +1996,8 @@ submit_and_realloc:
        }
        if (bio == NULL) {
                bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
-                               is_readahead ? REQ_RAHEAD : 0, page->index);
+                               is_readahead ? REQ_RAHEAD : 0, page->index,
+                               false);
                if (IS_ERR(bio)) {
                        ret = PTR_ERR(bio);
                        bio = NULL;
@@ -2015,7 +2032,7 @@ out:
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
                                unsigned nr_pages, sector_t *last_block_in_bio,
-                               bool is_readahead)
+                               bool is_readahead, bool for_write)
 {
        struct dnode_of_data dn;
        struct inode *inode = cc->inode;
@@ -2031,7 +2048,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 
        f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
 
-       last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+       last_block_in_file = (f2fs_readpage_limit(inode) +
+                                       blocksize - 1) >> blkbits;
 
        /* get rid of pages beyond EOF */
        for (i = 0; i < cc->cluster_size; i++) {
@@ -2067,7 +2085,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
        for (i = 1; i < cc->cluster_size; i++) {
                block_t blkaddr;
 
-               blkaddr = datablock_addr(dn.inode, dn.node_page,
+               blkaddr = data_blkaddr(dn.inode, dn.node_page,
                                                dn.ofs_in_node + i);
 
                if (!__is_valid_data_blkaddr(blkaddr))
@@ -2096,7 +2114,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
                struct page *page = dic->cpages[i];
                block_t blkaddr;
 
-               blkaddr = datablock_addr(dn.inode, dn.node_page,
+               blkaddr = data_blkaddr(dn.inode, dn.node_page,
                                                dn.ofs_in_node + i + 1);
 
                if (bio && !page_is_mergeable(sbi, bio,
@@ -2109,7 +2127,7 @@ submit_and_realloc:
                if (!bio) {
                        bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
                                        is_readahead ? REQ_RAHEAD : 0,
-                                       page->index);
+                                       page->index, for_write);
                        if (IS_ERR(bio)) {
                                ret = PTR_ERR(bio);
                                bio = NULL;
@@ -2210,7 +2228,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
                                ret = f2fs_read_multi_pages(&cc, &bio,
                                                        max_nr_pages,
                                                        &last_block_in_bio,
-                                                       is_readahead);
+                                                       is_readahead, false);
                                f2fs_destroy_compress_ctx(&cc);
                                if (ret)
                                        goto set_error_page;
@@ -2253,7 +2271,7 @@ next_page:
                                ret = f2fs_read_multi_pages(&cc, &bio,
                                                        max_nr_pages,
                                                        &last_block_in_bio,
-                                                       is_readahead);
+                                                       is_readahead, false);
                                f2fs_destroy_compress_ctx(&cc);
                        }
                }
@@ -2326,7 +2344,7 @@ retry_encrypt:
                /* flush pending IOs and wait for a while in the ENOMEM case */
                if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
                        f2fs_flush_merged_writes(fio->sbi);
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
                        gfp_flags |= __GFP_NOFAIL;
                        goto retry_encrypt;
                }
@@ -2397,7 +2415,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-       if (test_opt(sbi, LFS))
+       if (f2fs_lfs_mode(sbi))
                return true;
        if (S_ISDIR(inode->i_mode))
                return true;
@@ -2647,10 +2665,10 @@ write:
        if (err) {
                file_set_keep_isize(inode);
        } else {
-               down_write(&F2FS_I(inode)->i_sem);
+               spin_lock(&F2FS_I(inode)->i_size_lock);
                if (F2FS_I(inode)->last_disk_size < psize)
                        F2FS_I(inode)->last_disk_size = psize;
-               up_write(&F2FS_I(inode)->i_sem);
+               spin_unlock(&F2FS_I(inode)->i_size_lock);
        }
 
 done:
@@ -2917,7 +2935,7 @@ result:
                                        if (wbc->sync_mode == WB_SYNC_ALL) {
                                                cond_resched();
                                                congestion_wait(BLK_RW_ASYNC,
-                                                               HZ/50);
+                                                       DEFAULT_IO_TIMEOUT);
                                                goto retry_write;
                                        }
                                        goto next;
@@ -2973,15 +2991,17 @@ next:
 static inline bool __should_serialize_io(struct inode *inode,
                                        struct writeback_control *wbc)
 {
+       /* to avoid deadlock in path of data flush */
+       if (F2FS_I(inode)->cp_task)
+               return false;
+
        if (!S_ISREG(inode->i_mode))
                return false;
-       if (f2fs_compressed_file(inode))
-               return true;
        if (IS_NOQUOTA(inode))
                return false;
-       /* to avoid deadlock in path of data flush */
-       if (F2FS_I(inode)->cp_task)
-               return false;
+
+       if (f2fs_compressed_file(inode))
+               return true;
        if (wbc->sync_mode != WB_SYNC_ALL)
                return true;
        if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
@@ -3283,7 +3303,7 @@ repeat:
                        err = -EFSCORRUPTED;
                        goto fail;
                }
-               err = f2fs_submit_page_read(inode, page, blkaddr);
+               err = f2fs_submit_page_read(inode, page, blkaddr, true);
                if (err)
                        goto fail;
 
@@ -3464,7 +3484,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
                        iter, rw == WRITE ? get_data_block_dio_write :
                        get_data_block_dio, NULL, f2fs_dio_submit_bio,
-                       DIO_LOCKING | DIO_SKIP_HOLES);
+                       rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
+                       DIO_SKIP_HOLES);
 
        if (do_opu)
                up_read(&fi->i_gc_rwsem[READ]);
@@ -3861,7 +3882,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
 
 int __init f2fs_init_bio_entry_cache(void)
 {
-       bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
+       bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
                        sizeof(struct bio_entry));
        if (!bio_entry_slab)
                return -ENOMEM;
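
The refcount changes threaded through compress.c and data.c above amount to this: cic/dic are now created with their refcount preloaded to nr_cpages, each compressed page's bio completion drops one reference, and whichever completion drops the last one performs verification and frees the context. A user-space sketch of that last-one-out pattern using a plain C11 atomic (the kernel uses refcount_t and refcount_dec_not_one()):

    #include <stdatomic.h>
    #include <stdio.h>

    struct dic { atomic_int ref; };

    /* called once per compressed page as its I/O completes;
     * returns 1 when this caller dropped the final reference */
    static int put_page_ref(struct dic *d)
    {
        return atomic_fetch_sub(&d->ref, 1) == 1;
    }

    int main(void)
    {
        struct dic d;
        int nr_cpages = 3;

        atomic_init(&d.ref, nr_cpages);        /* one reference per compressed page */
        for (int i = 0; i < nr_cpages; i++) {
            if (put_page_ref(&d))
                printf("page %d: last reference, verify and free dic\n", i);
            else
                printf("page %d: still in flight\n", i);
        }
        return 0;
    }
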
index 6b89eae..0dbcb0f 100644 (file)
@@ -301,6 +301,9 @@ static int stat_show(struct seq_file *s, void *v)
                           si->ssa_area_segs, si->main_area_segs);
                seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
                           si->overp_segs, si->rsvd_segs);
+               seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n",
+                                       ktime_get_boottime_seconds(),
+                                       SIT_I(si->sbi)->mounted_time);
                if (test_opt(si->sbi, DISCARD))
                        seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
                                si->utilization, si->valid_count, si->discard_blks);
index 27d0dd7..44bfc46 100644 (file)
@@ -471,7 +471,6 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
                        struct page *dpage)
 {
        struct page *page;
-       int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir));
        int err;
 
        if (is_inode_flag_set(inode, FI_NEW_INODE)) {
@@ -498,8 +497,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
                if (err)
                        goto put_error;
 
-               if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
-                                       f2fs_may_encrypt(inode)) {
+               if (IS_ENCRYPTED(inode)) {
                        err = fscrypt_inherit_context(dir, inode, page, false);
                        if (err)
                                goto put_error;
@@ -850,12 +848,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
                        0);
        set_page_dirty(page);
 
-       dir->i_ctime = dir->i_mtime = current_time(dir);
-       f2fs_mark_inode_dirty_sync(dir, false);
-
-       if (inode)
-               f2fs_drop_nlink(dir, inode);
-
        if (bit_pos == NR_DENTRY_IN_BLOCK &&
                !f2fs_truncate_hole(dir, page->index, page->index + 1)) {
                f2fs_clear_page_cache_dirty_tag(page);
@@ -867,6 +859,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
                f2fs_remove_dirty_inode(dir);
        }
        f2fs_put_page(page, 1);
+
+       dir->i_ctime = dir->i_mtime = current_time(dir);
+       f2fs_mark_inode_dirty_sync(dir, false);
+
+       if (inode)
+               f2fs_drop_nlink(dir, inode);
 }
 
 bool f2fs_empty_dir(struct inode *dir)
index 088c3e7..ba470d5 100644 (file)
@@ -75,7 +75,6 @@ extern const char *f2fs_fault_name[FAULT_MAX];
 /*
  * For mount options
  */
-#define F2FS_MOUNT_BG_GC               0x00000001
 #define F2FS_MOUNT_DISABLE_ROLL_FORWARD        0x00000002
 #define F2FS_MOUNT_DISCARD             0x00000004
 #define F2FS_MOUNT_NOHEAP              0x00000008
@@ -89,11 +88,8 @@ extern const char *f2fs_fault_name[FAULT_MAX];
 #define F2FS_MOUNT_NOBARRIER           0x00000800
 #define F2FS_MOUNT_FASTBOOT            0x00001000
 #define F2FS_MOUNT_EXTENT_CACHE                0x00002000
-#define F2FS_MOUNT_FORCE_FG_GC         0x00004000
 #define F2FS_MOUNT_DATA_FLUSH          0x00008000
 #define F2FS_MOUNT_FAULT_INJECTION     0x00010000
-#define F2FS_MOUNT_ADAPTIVE            0x00020000
-#define F2FS_MOUNT_LFS                 0x00040000
 #define F2FS_MOUNT_USRQUOTA            0x00080000
 #define F2FS_MOUNT_GRPQUOTA            0x00100000
 #define F2FS_MOUNT_PRJQUOTA            0x00200000
@@ -101,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
 #define F2FS_MOUNT_INLINE_XATTR_SIZE   0x00800000
 #define F2FS_MOUNT_RESERVE_ROOT                0x01000000
 #define F2FS_MOUNT_DISABLE_CHECKPOINT  0x02000000
+#define F2FS_MOUNT_NORECOVERY          0x04000000
 
 #define F2FS_OPTION(sbi)       ((sbi)->mount_opt)
 #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -139,6 +136,8 @@ struct f2fs_mount_info {
        int whint_mode;
        int alloc_mode;                 /* segment allocation policy */
        int fsync_mode;                 /* fsync policy */
+       int fs_mode;                    /* fs mode: LFS or ADAPTIVE */
+       int bggc_mode;                  /* bggc mode: off, on or sync */
        bool test_dummy_encryption;     /* test dummy encryption */
        block_t unusable_cap;           /* Amount of space allowed to be
                                         * unusable when disabling checkpoint
@@ -332,8 +331,8 @@ struct discard_policy {
        bool io_aware;                  /* issue discard in idle time */
        bool sync;                      /* submit discard with REQ_SYNC flag */
        bool ordered;                   /* issue discard by lba order */
+       bool timeout;                   /* discard timeout for put_super */
        unsigned int granularity;       /* discard granularity */
-       int timeout;                    /* discard timeout for put_super */
 };
 
 struct discard_cmd_control {
@@ -428,6 +427,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
 #define F2FS_IOC_GET_PIN_FILE          _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
 #define F2FS_IOC_PRECACHE_EXTENTS      _IO(F2FS_IOCTL_MAGIC, 15)
 #define F2FS_IOC_RESIZE_FS             _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
+#define F2FS_IOC_GET_COMPRESS_BLOCKS   _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
 
 #define F2FS_IOC_GET_VOLUME_NAME       FS_IOC_GETFSLABEL
 #define F2FS_IOC_SET_VOLUME_NAME       FS_IOC_SETFSLABEL
@@ -560,6 +560,9 @@ enum {
 
 #define DEFAULT_RETRY_IO_COUNT 8       /* maximum retry read IO count */
 
+/* congestion wait timeout value, default: 20ms */
+#define        DEFAULT_IO_TIMEOUT      (msecs_to_jiffies(20))
+
 /* maximum retry quota flush count */
 #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT                8
 
@@ -676,6 +679,44 @@ enum {
        MAX_GC_FAILURE
 };
 
+/* used for f2fs_inode_info->flags */
+enum {
+       FI_NEW_INODE,           /* indicate newly allocated inode */
+       FI_DIRTY_INODE,         /* indicate inode is dirty or not */
+       FI_AUTO_RECOVER,        /* indicate inode is recoverable */
+       FI_DIRTY_DIR,           /* indicate directory has dirty pages */
+       FI_INC_LINK,            /* need to increment i_nlink */
+       FI_ACL_MODE,            /* indicate acl mode */
+       FI_NO_ALLOC,            /* should not allocate any blocks */
+       FI_FREE_NID,            /* free allocated nid */
+       FI_NO_EXTENT,           /* not to use the extent cache */
+       FI_INLINE_XATTR,        /* used for inline xattr */
+       FI_INLINE_DATA,         /* used for inline data*/
+       FI_INLINE_DENTRY,       /* used for inline dentry */
+       FI_APPEND_WRITE,        /* inode has appended data */
+       FI_UPDATE_WRITE,        /* inode has in-place-update data */
+       FI_NEED_IPU,            /* used for ipu per file */
+       FI_ATOMIC_FILE,         /* indicate atomic file */
+       FI_ATOMIC_COMMIT,       /* indicate the state of atomical committing */
+       FI_VOLATILE_FILE,       /* indicate volatile file */
+       FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
+       FI_DROP_CACHE,          /* drop dirty page cache */
+       FI_DATA_EXIST,          /* indicate data exists */
+       FI_INLINE_DOTS,         /* indicate inline dot dentries */
+       FI_DO_DEFRAG,           /* indicate defragment is running */
+       FI_DIRTY_FILE,          /* indicate regular/symlink has dirty pages */
+       FI_NO_PREALLOC,         /* indicate skipped preallocated blocks */
+       FI_HOT_DATA,            /* indicate file is hot */
+       FI_EXTRA_ATTR,          /* indicate file has extra attribute */
+       FI_PROJ_INHERIT,        /* indicate file inherits projectid */
+       FI_PIN_FILE,            /* indicate file should not be gced */
+       FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+       FI_VERITY_IN_PROGRESS,  /* building fs-verity Merkle tree */
+       FI_COMPRESSED_FILE,     /* indicate file's data can be compressed */
+       FI_MMAP_FILE,           /* indicate file was mmapped */
+       FI_MAX,                 /* max flag, never be used */
+};
+
 struct f2fs_inode_info {
        struct inode vfs_inode;         /* serve a vfs inode */
        unsigned long i_flags;          /* keep an inode flags for ioctl */
@@ -688,7 +729,7 @@ struct f2fs_inode_info {
        umode_t i_acl_mode;             /* keep file acl mode temporarily */
 
        /* Use below internally in f2fs*/
-       unsigned long flags;            /* use to pass per-file flags */
+       unsigned long flags[BITS_TO_LONGS(FI_MAX)];     /* use to pass per-file flags */
        struct rw_semaphore i_sem;      /* protect fi info */
        atomic_t dirty_pages;           /* # of dirty pages */
        f2fs_hash_t chash;              /* hash value of given file name */
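With FI_MAX now closing the flag enum above, the per-inode flag storage becomes a multi-word bitmap, which is why later hunks pass F2FS_I(inode)->flags to the bit helpers directly instead of taking the address of a single word. A rough userspace sketch of the same pattern, using hypothetical names (DEMO_FLAG_*, set_flag, test_flag) that are not part of the patch:

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG           (sizeof(unsigned long) * CHAR_BIT)
#define BITS_TO_LONGS(nr)       (((nr) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* more flags than fit in one machine word, like the FI_* enum */
enum { DEMO_FLAG_A, DEMO_FLAG_B = 70, DEMO_FLAG_MAX };

static unsigned long flags[BITS_TO_LONGS(DEMO_FLAG_MAX)];

/* set bit nr in a multi-word bitmap */
static void set_flag(unsigned long *map, int nr)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* test bit nr in a multi-word bitmap */
static int test_flag(const unsigned long *map, int nr)
{
        return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
        set_flag(flags, DEMO_FLAG_B);
        printf("flag B set: %d\n", test_flag(flags, DEMO_FLAG_B));
        return 0;
}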
@@ -697,6 +738,7 @@ struct f2fs_inode_info {
        struct task_struct *cp_task;    /* separate cp/wb IO stats*/
        nid_t i_xattr_nid;              /* node id that contains xattrs */
        loff_t  last_disk_size;         /* lastly written file size */
+       spinlock_t i_size_lock;         /* protect last_disk_size */
 
 #ifdef CONFIG_QUOTA
        struct dquot *i_dquot[MAXQUOTAS];
@@ -1173,6 +1215,20 @@ enum {
 };
 
 enum {
+       BGGC_MODE_ON,           /* background gc is on */
+       BGGC_MODE_OFF,          /* background gc is off */
+       BGGC_MODE_SYNC,         /*
+                                * background gc is on, migrating blocks
+                                * like foreground gc
+                                */
+};
+
+enum {
+       FS_MODE_ADAPTIVE,       /* use both lfs/ssr allocation */
+       FS_MODE_LFS,            /* use lfs allocation only */
+};
+
+enum {
        WHINT_MODE_OFF,         /* not pass down write hints */
        WHINT_MODE_USER,        /* try to pass down hints given by users */
        WHINT_MODE_FS,          /* pass down hints with F2FS policy */
@@ -1212,13 +1268,13 @@ enum fsync_mode {
 enum compress_algorithm_type {
        COMPRESS_LZO,
        COMPRESS_LZ4,
+       COMPRESS_ZSTD,
        COMPRESS_MAX,
 };
 
-#define COMPRESS_DATA_RESERVED_SIZE            4
+#define COMPRESS_DATA_RESERVED_SIZE            5
 struct compress_data {
        __le32 clen;                    /* compressed data size */
-       __le32 chksum;                  /* checksum of compressed data */
        __le32 reserved[COMPRESS_DATA_RESERVED_SIZE];   /* reserved */
        u8 cdata[];                     /* compressed data */
 };
@@ -1242,6 +1298,7 @@ struct compress_ctx {
        size_t rlen;                    /* valid data length in rbuf */
        size_t clen;                    /* valid data length in cbuf */
        void *private;                  /* payload buffer for specified compression algorithm */
+       void *private2;                 /* extra payload buffer */
 };
 
 /* compress context for write IO path */
@@ -1271,11 +1328,14 @@ struct decompress_io_ctx {
        size_t clen;                    /* valid data length in cbuf */
        refcount_t ref;                 /* referrence count of compressed page */
        bool failed;                    /* indicate IO error during decompression */
+       void *private;                  /* payload buffer for specified decompression algorithm */
+       void *private2;                 /* extra payload buffer */
 };
 
 #define NULL_CLUSTER                   ((unsigned int)(~0))
 #define MIN_COMPRESS_LOG_SIZE          2
 #define MAX_COMPRESS_LOG_SIZE          8
+#define MAX_COMPRESS_WINDOW_SIZE       ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE)
 
 struct f2fs_sb_info {
        struct super_block *sb;                 /* pointer to VFS super block */
@@ -1471,6 +1531,9 @@ struct f2fs_sb_info {
        __u32 s_chksum_seed;
 
        struct workqueue_struct *post_read_wq;  /* post read workqueue */
+
+       struct kmem_cache *inline_xattr_slab;   /* inline xattr entry */
+       unsigned int inline_xattr_slab_size;    /* default inline xattr slab size */
 };
 
 struct f2fs_private_dio {
@@ -2211,7 +2274,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
                dquot_free_inode(inode);
        } else {
                if (unlikely(inode->i_blocks == 0)) {
-                       f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
+                       f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu",
                                  inode->i_ino,
                                  (unsigned long long)inode->i_blocks);
                        set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -2379,7 +2442,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
 }
 
 static inline int f2fs_has_extra_attr(struct inode *inode);
-static inline block_t datablock_addr(struct inode *inode,
+static inline block_t data_blkaddr(struct inode *inode,
                        struct page *node_page, unsigned int offset)
 {
        struct f2fs_node *raw_node;
@@ -2389,9 +2452,9 @@ static inline block_t datablock_addr(struct inode *inode,
 
        raw_node = F2FS_NODE(node_page);
 
-       /* from GC path only */
        if (is_inode) {
                if (!inode)
+                       /* from GC path only */
                        base = offset_in_addr(&raw_node->i);
                else if (f2fs_has_extra_attr(inode))
                        base = get_extra_isize(inode);
@@ -2401,6 +2464,11 @@ static inline block_t datablock_addr(struct inode *inode,
        return le32_to_cpu(addr_array[base + offset]);
 }
 
+static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn)
+{
+       return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node);
+}
+
 static inline int f2fs_test_bit(unsigned int nr, char *addr)
 {
        int mask;
@@ -2498,43 +2566,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
                return flags & F2FS_OTHER_FLMASK;
 }
 
-/* used for f2fs_inode_info->flags */
-enum {
-       FI_NEW_INODE,           /* indicate newly allocated inode */
-       FI_DIRTY_INODE,         /* indicate inode is dirty or not */
-       FI_AUTO_RECOVER,        /* indicate inode is recoverable */
-       FI_DIRTY_DIR,           /* indicate directory has dirty pages */
-       FI_INC_LINK,            /* need to increment i_nlink */
-       FI_ACL_MODE,            /* indicate acl mode */
-       FI_NO_ALLOC,            /* should not allocate any blocks */
-       FI_FREE_NID,            /* free allocated nide */
-       FI_NO_EXTENT,           /* not to use the extent cache */
-       FI_INLINE_XATTR,        /* used for inline xattr */
-       FI_INLINE_DATA,         /* used for inline data*/
-       FI_INLINE_DENTRY,       /* used for inline dentry */
-       FI_APPEND_WRITE,        /* inode has appended data */
-       FI_UPDATE_WRITE,        /* inode has in-place-update data */
-       FI_NEED_IPU,            /* used for ipu per file */
-       FI_ATOMIC_FILE,         /* indicate atomic file */
-       FI_ATOMIC_COMMIT,       /* indicate the state of atomical committing */
-       FI_VOLATILE_FILE,       /* indicate volatile file */
-       FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
-       FI_DROP_CACHE,          /* drop dirty page cache */
-       FI_DATA_EXIST,          /* indicate data exists */
-       FI_INLINE_DOTS,         /* indicate inline dot dentries */
-       FI_DO_DEFRAG,           /* indicate defragment is running */
-       FI_DIRTY_FILE,          /* indicate regular/symlink has dirty pages */
-       FI_NO_PREALLOC,         /* indicate skipped preallocated blocks */
-       FI_HOT_DATA,            /* indicate file is hot */
-       FI_EXTRA_ATTR,          /* indicate file has extra attribute */
-       FI_PROJ_INHERIT,        /* indicate file inherits projectid */
-       FI_PIN_FILE,            /* indicate file should not be gced */
-       FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
-       FI_VERITY_IN_PROGRESS,  /* building fs-verity Merkle tree */
-       FI_COMPRESSED_FILE,     /* indicate file's data can be compressed */
-       FI_MMAP_FILE,           /* indicate file was mmapped */
-};
-
 static inline void __mark_inode_dirty_flag(struct inode *inode,
                                                int flag, bool set)
 {
@@ -2549,27 +2580,24 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
        case FI_DATA_EXIST:
        case FI_INLINE_DOTS:
        case FI_PIN_FILE:
-       case FI_COMPRESSED_FILE:
                f2fs_mark_inode_dirty_sync(inode, true);
        }
 }
 
 static inline void set_inode_flag(struct inode *inode, int flag)
 {
-       if (!test_bit(flag, &F2FS_I(inode)->flags))
-               set_bit(flag, &F2FS_I(inode)->flags);
+       test_and_set_bit(flag, F2FS_I(inode)->flags);
        __mark_inode_dirty_flag(inode, flag, true);
 }
 
 static inline int is_inode_flag_set(struct inode *inode, int flag)
 {
-       return test_bit(flag, &F2FS_I(inode)->flags);
+       return test_bit(flag, F2FS_I(inode)->flags);
 }
 
 static inline void clear_inode_flag(struct inode *inode, int flag)
 {
-       if (test_bit(flag, &F2FS_I(inode)->flags))
-               clear_bit(flag, &F2FS_I(inode)->flags);
+       test_and_clear_bit(flag, F2FS_I(inode)->flags);
        __mark_inode_dirty_flag(inode, flag, false);
 }
 
@@ -2660,19 +2688,19 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
        struct f2fs_inode_info *fi = F2FS_I(inode);
 
        if (ri->i_inline & F2FS_INLINE_XATTR)
-               set_bit(FI_INLINE_XATTR, &fi->flags);
+               set_bit(FI_INLINE_XATTR, fi->flags);
        if (ri->i_inline & F2FS_INLINE_DATA)
-               set_bit(FI_INLINE_DATA, &fi->flags);
+               set_bit(FI_INLINE_DATA, fi->flags);
        if (ri->i_inline & F2FS_INLINE_DENTRY)
-               set_bit(FI_INLINE_DENTRY, &fi->flags);
+               set_bit(FI_INLINE_DENTRY, fi->flags);
        if (ri->i_inline & F2FS_DATA_EXIST)
-               set_bit(FI_DATA_EXIST, &fi->flags);
+               set_bit(FI_DATA_EXIST, fi->flags);
        if (ri->i_inline & F2FS_INLINE_DOTS)
-               set_bit(FI_INLINE_DOTS, &fi->flags);
+               set_bit(FI_INLINE_DOTS, fi->flags);
        if (ri->i_inline & F2FS_EXTRA_ATTR)
-               set_bit(FI_EXTRA_ATTR, &fi->flags);
+               set_bit(FI_EXTRA_ATTR, fi->flags);
        if (ri->i_inline & F2FS_PIN_FILE)
-               set_bit(FI_PIN_FILE, &fi->flags);
+               set_bit(FI_PIN_FILE, fi->flags);
 }
 
 static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -2857,9 +2885,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
        if (!f2fs_is_time_consistent(inode))
                return false;
 
-       down_read(&F2FS_I(inode)->i_sem);
+       spin_lock(&F2FS_I(inode)->i_size_lock);
        ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
-       up_read(&F2FS_I(inode)->i_sem);
+       spin_unlock(&F2FS_I(inode)->i_size_lock);
 
        return ret;
 }
@@ -3213,7 +3241,7 @@ void f2fs_drop_inmem_pages(struct inode *inode);
 void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
 int f2fs_commit_inmem_pages(struct inode *inode);
 void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg);
 int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
 int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi);
 int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
@@ -3309,7 +3337,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
 void f2fs_update_dirty_page(struct inode *inode, struct page *page);
 void f2fs_remove_dirty_inode(struct inode *inode);
 int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi);
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
 int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
 int __init f2fs_create_checkpoint_caches(void);
@@ -3320,7 +3348,7 @@ void f2fs_destroy_checkpoint_caches(void);
  */
 int __init f2fs_init_bioset(void);
 void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail);
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
 void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@@ -3776,7 +3804,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
 int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
 int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
                                unsigned nr_pages, sector_t *last_block_in_bio,
-                               bool is_readahead);
+                               bool is_readahead, bool for_write);
 struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
 void f2fs_free_dic(struct decompress_io_ctx *dic);
 void f2fs_decompress_end_io(struct page **rpages,
@@ -3813,6 +3841,7 @@ static inline void set_compress_context(struct inode *inode)
        F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
        set_inode_flag(inode, FI_COMPRESSED_FILE);
        stat_inc_compr_inode(inode);
+       f2fs_mark_inode_dirty_sync(inode, true);
 }
 
 static inline u64 f2fs_disable_compressed_file(struct inode *inode)
@@ -3821,12 +3850,17 @@ static inline u64 f2fs_disable_compressed_file(struct inode *inode)
 
        if (!f2fs_compressed_file(inode))
                return 0;
-       if (fi->i_compr_blocks)
-               return fi->i_compr_blocks;
+       if (S_ISREG(inode->i_mode)) {
+               if (get_dirty_pages(inode))
+                       return 1;
+               if (fi->i_compr_blocks)
+                       return fi->i_compr_blocks;
+       }
 
        fi->i_flags &= ~F2FS_COMPR_FL;
-       clear_inode_flag(inode, FI_COMPRESSED_FILE);
        stat_dec_compr_inode(inode);
+       clear_inode_flag(inode, FI_COMPRESSED_FILE);
+       f2fs_mark_inode_dirty_sync(inode, true);
        return 0;
 }
 
@@ -3903,31 +3937,25 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi)
        return false;
 }
 
-
-static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
 {
-       clear_opt(sbi, ADAPTIVE);
-       clear_opt(sbi, LFS);
-
-       switch (mt) {
-       case F2FS_MOUNT_ADAPTIVE:
-               set_opt(sbi, ADAPTIVE);
-               break;
-       case F2FS_MOUNT_LFS:
-               set_opt(sbi, LFS);
-               break;
-       }
+       return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
 }
 
-static inline bool f2fs_may_encrypt(struct inode *inode)
+static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
 {
 #ifdef CONFIG_FS_ENCRYPTION
+       struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
        umode_t mode = inode->i_mode;
 
-       return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#else
-       return false;
+       /*
+        * If the directory is encrypted or dummy encryption is enabled,
+        * then we should encrypt the inode.
+        */
+       if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
+               return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
 #endif
+       return false;
 }
 
 static inline bool f2fs_may_compress(struct inode *inode)
@@ -3971,7 +3999,7 @@ static inline int allow_outplace_dio(struct inode *inode,
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        int rw = iov_iter_rw(iter);
 
-       return (test_opt(sbi, LFS) && (rw == WRITE) &&
+       return (f2fs_lfs_mode(sbi) && (rw == WRITE) &&
                                !block_unaligned_IO(inode, iocb, iter));
 }
 
@@ -3993,7 +4021,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
         */
        if (f2fs_sb_has_blkzoned(sbi))
                return true;
-       if (test_opt(sbi, LFS) && (rw == WRITE)) {
+       if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
                if (block_unaligned_IO(inode, iocb, iter))
                        return true;
                if (F2FS_IO_ALIGNED(sbi))
index 351762f..6ab8f62 100644 (file)
@@ -106,13 +106,20 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
                err = f2fs_get_block(&dn, page->index);
                f2fs_put_dnode(&dn);
                __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
-               if (err) {
-                       unlock_page(page);
-                       goto out_sem;
-               }
        }
 
-       /* fill the page */
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+       if (!need_alloc) {
+               set_new_dnode(&dn, inode, NULL, NULL, 0);
+               err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+               f2fs_put_dnode(&dn);
+       }
+#endif
+       if (err) {
+               unlock_page(page);
+               goto out_sem;
+       }
+
        f2fs_wait_on_page_writeback(page, DATA, false, true);
 
        /* wait for GCed page writeback via META_MAPPING */
@@ -448,8 +455,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
                                data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
                        block_t blkaddr;
 
-                       blkaddr = datablock_addr(dn.inode,
-                                       dn.node_page, dn.ofs_in_node);
+                       blkaddr = f2fs_data_blkaddr(&dn);
 
                        if (__is_valid_data_blkaddr(blkaddr) &&
                                !f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
@@ -793,6 +799,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
        }
 
        flags = fi->i_flags;
+       if (flags & F2FS_COMPR_FL)
+               stat->attributes |= STATX_ATTR_COMPRESSED;
        if (flags & F2FS_APPEND_FL)
                stat->attributes |= STATX_ATTR_APPEND;
        if (IS_ENCRYPTED(inode))
@@ -804,7 +812,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
        if (IS_VERITY(inode))
                stat->attributes |= STATX_ATTR_VERITY;
 
-       stat->attributes_mask |= (STATX_ATTR_APPEND |
+       stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+                                 STATX_ATTR_APPEND |
                                  STATX_ATTR_ENCRYPTED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP |
@@ -929,10 +938,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
                if (err)
                        return err;
 
-               down_write(&F2FS_I(inode)->i_sem);
+               spin_lock(&F2FS_I(inode)->i_size_lock);
                inode->i_mtime = inode->i_ctime = current_time(inode);
                F2FS_I(inode)->last_disk_size = i_size_read(inode);
-               up_write(&F2FS_I(inode)->i_sem);
+               spin_unlock(&F2FS_I(inode)->i_size_lock);
        }
 
        __setattr_copy(inode, attr);
@@ -1109,8 +1118,7 @@ next_dnode:
        done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
                                                        dn.ofs_in_node, len);
        for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
-               *blkaddr = datablock_addr(dn.inode,
-                                       dn.node_page, dn.ofs_in_node);
+               *blkaddr = f2fs_data_blkaddr(&dn);
 
                if (__is_valid_data_blkaddr(*blkaddr) &&
                        !f2fs_is_valid_blkaddr(sbi, *blkaddr,
@@ -1121,7 +1129,7 @@ next_dnode:
 
                if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) {
 
-                       if (test_opt(sbi, LFS)) {
+                       if (f2fs_lfs_mode(sbi)) {
                                f2fs_put_dnode(&dn);
                                return -EOPNOTSUPP;
                        }
@@ -1199,8 +1207,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
                                ADDRS_PER_PAGE(dn.node_page, dst_inode) -
                                                dn.ofs_in_node, len - i);
                        do {
-                               dn.data_blkaddr = datablock_addr(dn.inode,
-                                               dn.node_page, dn.ofs_in_node);
+                               dn.data_blkaddr = f2fs_data_blkaddr(&dn);
                                f2fs_truncate_data_blocks_range(&dn, 1);
 
                                if (do_replace[i]) {
@@ -1376,8 +1383,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
        int ret;
 
        for (; index < end; index++, dn->ofs_in_node++) {
-               if (datablock_addr(dn->inode, dn->node_page,
-                                       dn->ofs_in_node) == NULL_ADDR)
+               if (f2fs_data_blkaddr(dn) == NULL_ADDR)
                        count++;
        }
 
@@ -1388,8 +1394,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
 
        dn->ofs_in_node = ofs_in_node;
        for (index = start; index < end; index++, dn->ofs_in_node++) {
-               dn->data_blkaddr = datablock_addr(dn->inode,
-                                       dn->node_page, dn->ofs_in_node);
+               dn->data_blkaddr = f2fs_data_blkaddr(dn);
                /*
                 * f2fs_reserve_new_blocks will not guarantee entire block
                 * allocation.
@@ -1787,12 +1792,15 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
 static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
 {
        struct f2fs_inode_info *fi = F2FS_I(inode);
+       u32 masked_flags = fi->i_flags & mask;
+
+       f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask));
 
        /* Is it quota file? Do not allow user to mess with it */
        if (IS_NOQUOTA(inode))
                return -EPERM;
 
-       if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) {
+       if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) {
                if (!f2fs_sb_has_casefold(F2FS_I_SB(inode)))
                        return -EOPNOTSUPP;
                if (!f2fs_empty_dir(inode))
@@ -1806,27 +1814,22 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
                        return -EINVAL;
        }
 
-       if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) {
-               if (S_ISREG(inode->i_mode) &&
-                       (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) ||
-                                               F2FS_HAS_BLOCKS(inode)))
-                       return -EINVAL;
+       if ((iflags ^ masked_flags) & F2FS_COMPR_FL) {
+               if (masked_flags & F2FS_COMPR_FL) {
+                       if (f2fs_disable_compressed_file(inode))
+                               return -EINVAL;
+               }
                if (iflags & F2FS_NOCOMP_FL)
                        return -EINVAL;
                if (iflags & F2FS_COMPR_FL) {
-                       int err = f2fs_convert_inline_inode(inode);
-
-                       if (err)
-                               return err;
-
                        if (!f2fs_may_compress(inode))
                                return -EINVAL;
 
                        set_compress_context(inode);
                }
        }
-       if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) {
-               if (fi->i_flags & F2FS_COMPR_FL)
+       if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
+               if (masked_flags & F2FS_COMPR_FL)
                        return -EINVAL;
        }
 
@@ -3401,6 +3404,21 @@ out:
        return err;
 }
 
+static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg)
+{
+       struct inode *inode = file_inode(filp);
+       __u64 blocks;
+
+       if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+               return -EOPNOTSUPP;
+
+       if (!f2fs_compressed_file(inode))
+               return -EINVAL;
+
+       blocks = F2FS_I(inode)->i_compr_blocks;
+       return put_user(blocks, (u64 __user *)arg);
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
        if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3481,6 +3499,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                return f2fs_get_volume_name(filp, arg);
        case F2FS_IOC_SET_VOLUME_NAME:
                return f2fs_set_volume_name(filp, arg);
+       case F2FS_IOC_GET_COMPRESS_BLOCKS:
+               return f2fs_get_compress_blocks(filp, arg);
        default:
                return -ENOTTY;
        }
@@ -3508,8 +3528,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                goto out;
        }
 
-       if (!f2fs_is_compress_backend_ready(inode))
-               return -EOPNOTSUPP;
+       if (!f2fs_is_compress_backend_ready(inode)) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
 
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock(inode)) {
@@ -3639,6 +3661,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case FS_IOC_MEASURE_VERITY:
        case F2FS_IOC_GET_VOLUME_NAME:
        case F2FS_IOC_SET_VOLUME_NAME:
+       case F2FS_IOC_GET_COMPRESS_BLOCKS:
                break;
        default:
                return -ENOIOCTLCMD;
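As a rough illustration (not part of the patch) of how the new F2FS_IOC_GET_COMPRESS_BLOCKS ioctl might be exercised from userspace, assuming F2FS_IOCTL_MAGIC is 0xf5 as defined in f2fs.h and that the opened path is a compressed file on an f2fs mount:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* assumed to match the definitions in fs/f2fs/f2fs.h */
#define F2FS_IOCTL_MAGIC                0xf5
#define F2FS_IOC_GET_COMPRESS_BLOCKS    _IOR(F2FS_IOCTL_MAGIC, 17, __u64)

int main(int argc, char **argv)
{
        __u64 blocks = 0;
        int fd;

        if (argc != 2)
                return 1;

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* kernel side copies i_compr_blocks back via put_user() */
        if (ioctl(fd, F2FS_IOC_GET_COMPRESS_BLOCKS, &blocks) < 0) {
                perror("ioctl");
                close(fd);
                return 1;
        }

        printf("compressed blocks: %llu\n", (unsigned long long)blocks);
        close(fd);
        return 0;
}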
index db8725d..26248c8 100644 (file)
@@ -31,6 +31,8 @@ static int gc_thread_func(void *data)
 
        set_freezable();
        do {
+               bool sync_mode;
+
                wait_event_interruptible_timeout(*wq,
                                kthread_should_stop() || freezing(current) ||
                                gc_th->gc_wake,
@@ -101,15 +103,17 @@ static int gc_thread_func(void *data)
 do_gc:
                stat_inc_bggc_count(sbi->stat_info);
 
+               sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+
                /* if return value is not zero, no victim was selected */
-               if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
+               if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
                        wait_ms = gc_th->no_gc_sleep_time;
 
                trace_f2fs_background_gc(sbi->sb, wait_ms,
                                prefree_segments(sbi), free_segments(sbi));
 
                /* balancing f2fs's metadata periodically */
-               f2fs_balance_fs_bg(sbi);
+               f2fs_balance_fs_bg(sbi, true);
 next:
                sb_end_write(sbi->sb);
 
@@ -192,7 +196,10 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
                p->ofs_unit = sbi->segs_per_sec;
        }
 
-       /* we need to check every dirty segments in the FG_GC case */
+       /*
+        * adjust candidate range; we should select all dirty segments for
+        * the foreground GC and urgent GC cases.
+        */
        if (gc_type != FG_GC &&
                        (sbi->gc_mode != GC_URGENT) &&
                        p->max_search > sbi->max_victim_search)
@@ -634,7 +641,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
        }
 
        *nofs = ofs_of_node(node_page);
-       source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
+       source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
        f2fs_put_page(node_page, 1);
 
        if (source_blkaddr != blkaddr) {
@@ -762,7 +769,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
        struct page *page, *mpage;
        block_t newaddr;
        int err = 0;
-       bool lfs_mode = test_opt(fio.sbi, LFS);
+       bool lfs_mode = f2fs_lfs_mode(fio.sbi);
 
        /* do not read out */
        page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
@@ -970,7 +977,8 @@ retry:
                if (err) {
                        clear_cold_data(page);
                        if (err == -ENOMEM) {
-                               congestion_wait(BLK_RW_ASYNC, HZ/50);
+                               congestion_wait(BLK_RW_ASYNC,
+                                               DEFAULT_IO_TIMEOUT);
                                goto retry;
                        }
                        if (is_dirty)
@@ -1018,8 +1026,8 @@ next_step:
                 * race condition along with SSR block allocation.
                 */
                if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
-                               get_valid_blocks(sbi, segno, false) ==
-                                                       sbi->blocks_per_seg)
+                               get_valid_blocks(sbi, segno, true) ==
+                                                       BLKS_PER_SEC(sbi))
                        return submitted;
 
                if (check_valid_map(sbi, segno, off) == 0)
@@ -1203,7 +1211,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 
                if (get_valid_blocks(sbi, segno, false) == 0)
                        goto freed;
-               if (__is_large_section(sbi) &&
+               if (gc_type == BG_GC && __is_large_section(sbi) &&
                                migrated >= sbi->migration_granularity)
                        goto skip;
                if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
@@ -1233,12 +1241,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
                                                        segno, gc_type);
 
                stat_inc_seg_count(sbi, type, gc_type);
+               migrated++;
 
 freed:
                if (gc_type == FG_GC &&
                                get_valid_blocks(sbi, segno, false) == 0)
                        seg_freed++;
-               migrated++;
 
                if (__is_large_section(sbi) && segno + 1 < end_segno)
                        sbi->next_victim_seg[gc_type] = segno + 1;
@@ -1434,12 +1442,19 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
 static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
 {
        struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
-       int section_count = le32_to_cpu(raw_sb->section_count);
-       int segment_count = le32_to_cpu(raw_sb->segment_count);
-       int segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
-       long long block_count = le64_to_cpu(raw_sb->block_count);
+       int section_count;
+       int segment_count;
+       int segment_count_main;
+       long long block_count;
        int segs = secs * sbi->segs_per_sec;
 
+       down_write(&sbi->sb_lock);
+
+       section_count = le32_to_cpu(raw_sb->section_count);
+       segment_count = le32_to_cpu(raw_sb->segment_count);
+       segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
+       block_count = le64_to_cpu(raw_sb->block_count);
+
        raw_sb->section_count = cpu_to_le32(section_count + secs);
        raw_sb->segment_count = cpu_to_le32(segment_count + segs);
        raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
@@ -1453,6 +1468,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
                raw_sb->devs[last_dev].total_segments =
                                                cpu_to_le32(dev_segs + segs);
        }
+
+       up_write(&sbi->sb_lock);
 }
 
 static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -1570,11 +1587,17 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
                goto out;
        }
 
+       mutex_lock(&sbi->cp_mutex);
        update_fs_metadata(sbi, -secs);
        clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
+       set_sbi_flag(sbi, SBI_IS_DIRTY);
+       mutex_unlock(&sbi->cp_mutex);
+
        err = f2fs_sync_fs(sbi->sb, 1);
        if (err) {
+               mutex_lock(&sbi->cp_mutex);
                update_fs_metadata(sbi, secs);
+               mutex_unlock(&sbi->cp_mutex);
                update_sb_metadata(sbi, secs);
                f2fs_commit_super(sbi, false);
        }
index 78c3f1d..44582a4 100644 (file)
@@ -291,13 +291,30 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
                        fi->i_flags & F2FS_COMPR_FL &&
                        F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
                                                i_log_cluster_size)) {
-               if (ri->i_compress_algorithm >= COMPRESS_MAX)
+               if (ri->i_compress_algorithm >= COMPRESS_MAX) {
+                       f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+                               "compress algorithm: %u, run fsck to fix",
+                                 __func__, inode->i_ino,
+                                 ri->i_compress_algorithm);
                        return false;
-               if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks)
+               }
+               if (le64_to_cpu(ri->i_compr_blocks) >
+                               SECTOR_TO_BLOCK(inode->i_blocks)) {
+                       f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
+                               "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix",
+                                 __func__, inode->i_ino,
+                                 le64_to_cpu(ri->i_compr_blocks),
+                                 SECTOR_TO_BLOCK(inode->i_blocks));
                        return false;
+               }
                if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
-                       ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE)
+                       ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
+                       f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+                               "log cluster size: %u, run fsck to fix",
+                                 __func__, inode->i_ino,
+                                 ri->i_log_cluster_size);
                        return false;
+               }
        }
 
        return true;
@@ -345,7 +362,7 @@ static int do_read_inode(struct inode *inode)
        fi->i_flags = le32_to_cpu(ri->i_flags);
        if (S_ISREG(inode->i_mode))
                fi->i_flags &= ~F2FS_PROJINHERIT_FL;
-       fi->flags = 0;
+       bitmap_zero(fi->flags, FI_MAX);
        fi->i_advise = ri->i_advise;
        fi->i_pino = le32_to_cpu(ri->i_pino);
        fi->i_dir_level = ri->i_dir_level;
@@ -518,7 +535,7 @@ retry:
        inode = f2fs_iget(sb, ino);
        if (IS_ERR(inode)) {
                if (PTR_ERR(inode) == -ENOMEM) {
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
                        goto retry;
                }
        }
@@ -759,7 +776,7 @@ no_delete:
        else
                f2fs_inode_synced(inode);
 
-       /* ino == 0, if f2fs_new_inode() was failed t*/
+       /* if f2fs_new_inode() failed, i_ino is zero, so skip it */
        if (inode->i_ino)
                invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
                                                        inode->i_ino);
index 2aa0354..f54119d 100644 (file)
@@ -75,9 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 
        set_inode_flag(inode, FI_NEW_INODE);
 
-       /* If the directory encrypted, then we should encrypt the inode. */
-       if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
-                               f2fs_may_encrypt(inode))
+       if (f2fs_may_encrypt(dir, inode))
                f2fs_set_encrypted_inode(inode);
 
        if (f2fs_sb_has_extra_attr(sbi)) {
@@ -177,7 +175,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
 }
 
 /*
- * Set multimedia files as cold files for hot/cold data separation
+ * Set file's temperature for hot/cold data separation
  */
 static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
                const unsigned char *name)
@@ -876,12 +874,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        if (!f2fs_is_checkpoint_ready(sbi))
                return -ENOSPC;
 
-       if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
-               int err = fscrypt_get_encryption_info(dir);
-               if (err)
-                       return err;
-       }
-
        return __f2fs_tmpfile(dir, dentry, mode, NULL);
 }
 
index 9d02cdc..ecbd6bd 100644 (file)
@@ -510,9 +510,6 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
        return nr - nr_shrink;
 }
 
-/*
- * This function always returns success
- */
 int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
                                                struct node_info *ni)
 {
@@ -716,8 +713,7 @@ got:
 /*
  * Caller should call f2fs_put_dnode(dn).
  * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op() only if ro is not set RDONLY_NODE.
- * In the case of RDONLY_NODE, we don't need to care about mutex.
+ * f2fs_unlock_op() only if mode is ALLOC_NODE.
  */
 int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 {
@@ -809,8 +805,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
        dn->nid = nids[level];
        dn->ofs_in_node = offset[level];
        dn->node_page = npage[level];
-       dn->data_blkaddr = datablock_addr(dn->inode,
-                               dn->node_page, dn->ofs_in_node);
+       dn->data_blkaddr = f2fs_data_blkaddr(dn);
        return 0;
 
 release_pages:
@@ -1188,8 +1183,9 @@ int f2fs_remove_inode_page(struct inode *inode)
        }
 
        if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
-               f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
-                         inode->i_ino, (unsigned long long)inode->i_blocks);
+               f2fs_warn(F2FS_I_SB(inode),
+                       "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu",
+                       inode->i_ino, (unsigned long long)inode->i_blocks);
                set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
        }
 
@@ -1562,15 +1558,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
        if (atomic && !test_opt(sbi, NOBARRIER))
                fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
 
-       set_page_writeback(page);
-       ClearPageError(page);
-
+       /* should add to global list before clearing PAGECACHE status */
        if (f2fs_in_warm_node_list(sbi, page)) {
                seq = f2fs_add_fsync_node_entry(sbi, page);
                if (seq_id)
                        *seq_id = seq;
        }
 
+       set_page_writeback(page);
+       ClearPageError(page);
+
        fio.old_blkaddr = ni.blk_addr;
        f2fs_do_write_node_page(nid, &fio);
        set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -1979,7 +1976,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
                goto skip_write;
 
        /* balancing f2fs's metadata in background */
-       f2fs_balance_fs_bg(sbi);
+       f2fs_balance_fs_bg(sbi, true);
 
        /* collect a number of dirty node pages and write together */
        if (wbc->sync_mode != WB_SYNC_ALL &&
@@ -2602,7 +2599,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
 retry:
        ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
        if (!ipage) {
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
                goto retry;
        }
 
@@ -3193,22 +3190,22 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
 
 int __init f2fs_create_node_manager_caches(void)
 {
-       nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+       nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry",
                        sizeof(struct nat_entry));
        if (!nat_entry_slab)
                goto fail;
 
-       free_nid_slab = f2fs_kmem_cache_create("free_nid",
+       free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid",
                        sizeof(struct free_nid));
        if (!free_nid_slab)
                goto destroy_nat_entry;
 
-       nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
+       nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set",
                        sizeof(struct nat_entry_set));
        if (!nat_entry_set_slab)
                goto destroy_free_nid;
 
-       fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+       fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry",
                        sizeof(struct fsync_node_entry));
        if (!fsync_node_entry_slab)
                goto destroy_nat_entry_set;
index 763d5c0..dd804c0 100644 (file)
@@ -496,8 +496,7 @@ out:
        return 0;
 
 truncate_out:
-       if (datablock_addr(tdn.inode, tdn.node_page,
-                                       tdn.ofs_in_node) == blkaddr)
+       if (f2fs_data_blkaddr(&tdn) == blkaddr)
                f2fs_truncate_data_blocks_range(&tdn, 1);
        if (dn->inode->i_ino == nid && !dn->inode_page_locked)
                unlock_page(dn->inode_page);
@@ -535,7 +534,7 @@ retry_dn:
        err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
        if (err) {
                if (err == -ENOMEM) {
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
                        goto retry_dn;
                }
                goto out;
@@ -560,8 +559,8 @@ retry_dn:
        for (; start < end; start++, dn.ofs_in_node++) {
                block_t src, dest;
 
-               src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
-               dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
+               src = f2fs_data_blkaddr(&dn);
+               dest = data_blkaddr(dn.inode, page, dn.ofs_in_node);
 
                if (__is_valid_data_blkaddr(src) &&
                        !f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
@@ -618,7 +617,8 @@ retry_prev:
                        err = check_index_in_prev_nodes(sbi, dest, &dn);
                        if (err) {
                                if (err == -ENOMEM) {
-                                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                                       congestion_wait(BLK_RW_ASYNC,
+                                                       DEFAULT_IO_TIMEOUT);
                                        goto retry_prev;
                                }
                                goto err;
index cf0eb00..b7a9421 100644 (file)
@@ -172,7 +172,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
        int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
        int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
 
-       if (test_opt(sbi, LFS))
+       if (f2fs_lfs_mode(sbi))
                return false;
        if (sbi->gc_mode == GC_URGENT)
                return true;
@@ -245,7 +245,8 @@ retry:
                                                                LOOKUP_NODE);
                        if (err) {
                                if (err == -ENOMEM) {
-                                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                                       congestion_wait(BLK_RW_ASYNC,
+                                                       DEFAULT_IO_TIMEOUT);
                                        cond_resched();
                                        goto retry;
                                }
@@ -312,7 +313,7 @@ next:
 skip:
                iput(inode);
        }
-       congestion_wait(BLK_RW_ASYNC, HZ/50);
+       congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
        cond_resched();
        if (gc_failure) {
                if (++looped >= count)
@@ -415,7 +416,8 @@ retry:
                        err = f2fs_do_write_data_page(&fio);
                        if (err) {
                                if (err == -ENOMEM) {
-                                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                                       congestion_wait(BLK_RW_ASYNC,
+                                                       DEFAULT_IO_TIMEOUT);
                                        cond_resched();
                                        goto retry;
                                }
@@ -494,7 +496,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
 
        /* balance_fs_bg is able to be pending */
        if (need && excess_cached_nats(sbi))
-               f2fs_balance_fs_bg(sbi);
+               f2fs_balance_fs_bg(sbi, false);
 
        if (!f2fs_is_checkpoint_ready(sbi))
                return;
@@ -509,7 +511,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
        }
 }
 
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
 {
        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
                return;
@@ -538,7 +540,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
                        excess_dirty_nats(sbi) ||
                        excess_dirty_nodes(sbi) ||
                        f2fs_time_over(sbi, CP_TIME)) {
-               if (test_opt(sbi, DATA_FLUSH)) {
+               if (test_opt(sbi, DATA_FLUSH) && from_bg) {
                        struct blk_plug plug;
 
                        mutex_lock(&sbi->flush_lock);
@@ -1078,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
 
        dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
        dpolicy->io_aware_gran = MAX_PLIST_NUM;
-       dpolicy->timeout = 0;
+       dpolicy->timeout = false;
 
        if (discard_type == DPOLICY_BG) {
                dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1103,6 +1105,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
                dpolicy->io_aware = false;
                /* we need to issue all to keep CP_TRIMMED_FLAG */
                dpolicy->granularity = 1;
+               dpolicy->timeout = true;
        }
 }
 
@@ -1471,12 +1474,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
        int i, issued = 0;
        bool io_interrupted = false;
 
-       if (dpolicy->timeout != 0)
-               f2fs_update_time(sbi, dpolicy->timeout);
+       if (dpolicy->timeout)
+               f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
 
        for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
-               if (dpolicy->timeout != 0 &&
-                               f2fs_time_over(sbi, dpolicy->timeout))
+               if (dpolicy->timeout &&
+                               f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
                        break;
 
                if (i + 1 < dpolicy->granularity)
@@ -1497,8 +1500,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
                list_for_each_entry_safe(dc, tmp, pend_list, list) {
                        f2fs_bug_on(sbi, dc->state != D_PREP);
 
-                       if (dpolicy->timeout != 0 &&
-                               f2fs_time_over(sbi, dpolicy->timeout))
+                       if (dpolicy->timeout &&
+                               f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
                                break;
 
                        if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
@@ -1677,7 +1680,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
 
        __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
                                        dcc->discard_granularity);
-       dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
        __issue_discard_cmd(sbi, &dpolicy);
        dropped = __drop_discard_cmd(sbi);
 
@@ -1940,7 +1942,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
        unsigned int start = 0, end = -1;
        unsigned int secno, start_segno;
        bool force = (cpc->reason & CP_DISCARD);
-       bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+       bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
 
        mutex_lock(&dirty_i->seglist_lock);
 
@@ -1972,7 +1974,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
                                        (end - 1) <= cpc->trim_end)
                                continue;
 
-               if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
+               if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) {
                        f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
                                (end - start) << sbi->log_blocks_per_seg);
                        continue;
@@ -2801,7 +2803,7 @@ next:
                        blk_finish_plug(&plug);
                        mutex_unlock(&dcc->cmd_lock);
                        trimmed += __wait_all_discard_cmd(sbi, NULL);
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
                        goto next;
                }
 skip:
@@ -2830,7 +2832,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
        struct discard_policy dpolicy;
        unsigned long long trimmed = 0;
        int err = 0;
-       bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+       bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
 
        if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
                return -EINVAL;
@@ -3193,7 +3195,7 @@ static void update_device_state(struct f2fs_io_info *fio)
 static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
 {
        int type = __get_segment_type(fio);
-       bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA);
+       bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
 
        if (keep_order)
                down_read(&fio->sbi->io_order_lock);
@@ -4071,7 +4073,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
        sit_i->dirty_sentries = 0;
        sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
        sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
-       sit_i->mounted_time = ktime_get_real_seconds();
+       sit_i->mounted_time = ktime_get_boottime_seconds();
        init_rwsem(&sit_i->sentry_lock);
        return 0;
 }
@@ -4678,7 +4680,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
        if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
                sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
 
-       if (!test_opt(sbi, LFS))
+       if (!f2fs_lfs_mode(sbi))
                sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
        sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
        sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
@@ -4830,22 +4832,22 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
 
 int __init f2fs_create_segment_manager_caches(void)
 {
-       discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+       discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry",
                        sizeof(struct discard_entry));
        if (!discard_entry_slab)
                goto fail;
 
-       discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd",
+       discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd",
                        sizeof(struct discard_cmd));
        if (!discard_cmd_slab)
                goto destroy_discard_entry;
 
-       sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+       sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set",
                        sizeof(struct sit_entry_set));
        if (!sit_entry_set_slab)
                goto destroy_discard_cmd;
 
-       inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+       inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
                        sizeof(struct inmem_pages));
        if (!inmem_entry_slab)
                goto destroy_sit_entry_set;
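
Several hunks above (and more in super.c below) replace open-coded test_opt(sbi, LFS) checks with an f2fs_lfs_mode() helper. The helper's definition is not part of the hunks shown here; presumably it is a one-line wrapper over the new fs_mode field, along these lines (a sketch, not the exact f2fs code):

    /* Sketch: assumed shape of the helper used by the hunks above. */
    static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
    {
            return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
    }
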
index 459dc39..7a83bd5 100644 (file)
@@ -756,7 +756,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi,
                                                bool base_time)
 {
        struct sit_info *sit_i = SIT_I(sbi);
-       time64_t diff, now = ktime_get_real_seconds();
+       time64_t diff, now = ktime_get_boottime_seconds();
 
        if (now >= sit_i->mounted_time)
                return sit_i->elapsed_time + now - sit_i->mounted_time;
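
The mount/elapsed-time bookkeeping switches from ktime_get_real_seconds() to ktime_get_boottime_seconds(). Segment age (mtime) is derived from elapsed_time + now - mounted_time, so sampling the wall clock lets NTP steps or settimeofday() distort GC victim selection; the boottime clock only moves forward and keeps counting across suspend. A userspace-only illustration of the two clocks:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec rt, bt;

            clock_gettime(CLOCK_REALTIME, &rt);   /* can be stepped backwards */
            clock_gettime(CLOCK_BOOTTIME, &bt);   /* monotonic, includes suspend */
            printf("realtime: %lld s since the epoch\n", (long long)rt.tv_sec);
            printf("boottime: %lld s since boot\n", (long long)bt.tv_sec);
            return 0;
    }
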
index a467aca..d66de59 100644 (file)
@@ -58,7 +58,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
                /* count extent cache entries */
                count += __count_extent_cache(sbi);
 
-               /* shrink clean nat cache entries */
+               /* count clean nat cache entries */
                count += __count_nat_entries(sbi);
 
                /* count free nids cache entries */
index d398b2d..f2dfc21 100644 (file)
@@ -428,14 +428,11 @@ static int parse_options(struct super_block *sb, char *options)
                        if (!name)
                                return -ENOMEM;
                        if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
-                               set_opt(sbi, BG_GC);
-                               clear_opt(sbi, FORCE_FG_GC);
+                               F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
                        } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
-                               clear_opt(sbi, BG_GC);
-                               clear_opt(sbi, FORCE_FG_GC);
+                               F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
                        } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
-                               set_opt(sbi, BG_GC);
-                               set_opt(sbi, FORCE_FG_GC);
+                               F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
                        } else {
                                kvfree(name);
                                return -EINVAL;
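
The background_gc option no longer toggles the BG_GC/FORCE_FG_GC flag pair; it sets a single three-state bggc_mode, so the ambiguous "sync without on" combination can no longer exist and f2fs_show_options() (further down) can print the mode directly. The enum itself is defined outside the hunks shown here; an assumed sketch of its states:

    /* Sketch: assumed states behind F2FS_OPTION(sbi).bggc_mode. */
    enum {
            BGGC_MODE_ON,   /* background GC enabled (background_gc=on) */
            BGGC_MODE_OFF,  /* background GC disabled (background_gc=off) */
            BGGC_MODE_SYNC, /* GC runs synchronously (background_gc=sync) */
    };
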
@@ -447,7 +444,7 @@ static int parse_options(struct super_block *sb, char *options)
                        break;
                case Opt_norecovery:
                        /* this option mounts f2fs with ro */
-                       set_opt(sbi, DISABLE_ROLL_FORWARD);
+                       set_opt(sbi, NORECOVERY);
                        if (!f2fs_readonly(sb))
                                return -EINVAL;
                        break;
@@ -601,10 +598,10 @@ static int parse_options(struct super_block *sb, char *options)
                                        kvfree(name);
                                        return -EINVAL;
                                }
-                               set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+                               F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
                        } else if (strlen(name) == 3 &&
                                        !strncmp(name, "lfs", 3)) {
-                               set_opt_mode(sbi, F2FS_MOUNT_LFS);
+                               F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
                        } else {
                                kvfree(name);
                                return -EINVAL;
@@ -833,6 +830,10 @@ static int parse_options(struct super_block *sb, char *options)
                                        !strcmp(name, "lz4")) {
                                F2FS_OPTION(sbi).compress_algorithm =
                                                                COMPRESS_LZ4;
+                       } else if (strlen(name) == 4 &&
+                                       !strcmp(name, "zstd")) {
+                               F2FS_OPTION(sbi).compress_algorithm =
+                                                               COMPRESS_ZSTD;
                        } else {
                                kfree(name);
                                return -EINVAL;
@@ -905,7 +906,7 @@ static int parse_options(struct super_block *sb, char *options)
        }
 #endif
 
-       if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
+       if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
                f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
                         F2FS_IO_SIZE_KB(sbi));
                return -EINVAL;
@@ -935,7 +936,7 @@ static int parse_options(struct super_block *sb, char *options)
                }
        }
 
-       if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) {
+       if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
                f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n");
                return -EINVAL;
        }
@@ -961,6 +962,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
        /* Initialize f2fs-specific inode info */
        atomic_set(&fi->dirty_pages, 0);
        init_rwsem(&fi->i_sem);
+       spin_lock_init(&fi->i_size_lock);
        INIT_LIST_HEAD(&fi->dirty_list);
        INIT_LIST_HEAD(&fi->gdirty_list);
        INIT_LIST_HEAD(&fi->inmem_ilist);
@@ -1173,7 +1175,7 @@ static void f2fs_put_super(struct super_block *sb)
        /* our cp_error case, we can wait for any writeback page */
        f2fs_flush_merged_writes(sbi);
 
-       f2fs_wait_on_all_pages_writeback(sbi);
+       f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
 
        f2fs_bug_on(sbi, sbi->fsync_node_num);
 
@@ -1205,6 +1207,7 @@ static void f2fs_put_super(struct super_block *sb)
        kvfree(sbi->raw_super);
 
        destroy_device_list(sbi);
+       f2fs_destroy_xattr_caches(sbi);
        mempool_destroy(sbi->write_io_dummy);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < MAXQUOTAS; i++)
@@ -1421,6 +1424,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
        case COMPRESS_LZ4:
                algtype = "lz4";
                break;
+       case COMPRESS_ZSTD:
+               algtype = "zstd";
+               break;
        }
        seq_printf(seq, ",compress_algorithm=%s", algtype);
 
@@ -1437,16 +1443,17 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
 
-       if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
-               if (test_opt(sbi, FORCE_FG_GC))
-                       seq_printf(seq, ",background_gc=%s", "sync");
-               else
-                       seq_printf(seq, ",background_gc=%s", "on");
-       } else {
+       if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC)
+               seq_printf(seq, ",background_gc=%s", "sync");
+       else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON)
+               seq_printf(seq, ",background_gc=%s", "on");
+       else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF)
                seq_printf(seq, ",background_gc=%s", "off");
-       }
+
        if (test_opt(sbi, DISABLE_ROLL_FORWARD))
                seq_puts(seq, ",disable_roll_forward");
+       if (test_opt(sbi, NORECOVERY))
+               seq_puts(seq, ",norecovery");
        if (test_opt(sbi, DISCARD))
                seq_puts(seq, ",discard");
        else
@@ -1498,9 +1505,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
                seq_puts(seq, ",data_flush");
 
        seq_puts(seq, ",mode=");
-       if (test_opt(sbi, ADAPTIVE))
+       if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE)
                seq_puts(seq, "adaptive");
-       else if (test_opt(sbi, LFS))
+       else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS)
                seq_puts(seq, "lfs");
        seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
        if (test_opt(sbi, RESERVE_ROOT))
@@ -1571,11 +1578,11 @@ static void default_options(struct f2fs_sb_info *sbi)
        F2FS_OPTION(sbi).test_dummy_encryption = false;
        F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
        F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
-       F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO;
+       F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4;
        F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
        F2FS_OPTION(sbi).compress_ext_cnt = 0;
+       F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
 
-       set_opt(sbi, BG_GC);
        set_opt(sbi, INLINE_XATTR);
        set_opt(sbi, INLINE_DATA);
        set_opt(sbi, INLINE_DENTRY);
@@ -1587,9 +1594,9 @@ static void default_options(struct f2fs_sb_info *sbi)
        set_opt(sbi, FLUSH_MERGE);
        set_opt(sbi, DISCARD);
        if (f2fs_sb_has_blkzoned(sbi))
-               set_opt_mode(sbi, F2FS_MOUNT_LFS);
+               F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
        else
-               set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+               F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
 
 #ifdef CONFIG_F2FS_FS_XATTR
        set_opt(sbi, XATTR_USER);
@@ -1658,7 +1665,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
 out_unlock:
        up_write(&sbi->gc_lock);
 restore_flag:
-       sbi->sb->s_flags = s_flags;     /* Restore MS_RDONLY status */
+       sbi->sb->s_flags = s_flags;     /* Restore SB_RDONLY status */
        return err;
 }
 
@@ -1781,7 +1788,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
         * or if background_gc = off is passed in mount
         * option. Also sync the filesystem.
         */
-       if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) {
+       if ((*flags & SB_RDONLY) ||
+                       F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) {
                if (sbi->gc_thread) {
                        f2fs_stop_gc_thread(sbi);
                        need_restart_gc = true;
@@ -1886,7 +1894,8 @@ repeat:
                page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
                if (IS_ERR(page)) {
                        if (PTR_ERR(page) == -ENOMEM) {
-                               congestion_wait(BLK_RW_ASYNC, HZ/50);
+                               congestion_wait(BLK_RW_ASYNC,
+                                               DEFAULT_IO_TIMEOUT);
                                goto repeat;
                        }
                        set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1928,6 +1937,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
        int offset = off & (sb->s_blocksize - 1);
        size_t towrite = len;
        struct page *page;
+       void *fsdata = NULL;
        char *kaddr;
        int err = 0;
        int tocopy;
@@ -1937,10 +1947,11 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
                                                                towrite);
 retry:
                err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
-                                                       &page, NULL);
+                                                       &page, &fsdata);
                if (unlikely(err)) {
                        if (err == -ENOMEM) {
-                               congestion_wait(BLK_RW_ASYNC, HZ/50);
+                               congestion_wait(BLK_RW_ASYNC,
+                                               DEFAULT_IO_TIMEOUT);
                                goto retry;
                        }
                        set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1953,7 +1964,7 @@ retry:
                flush_dcache_page(page);
 
                a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
-                                               page, NULL);
+                                               page, fsdata);
                offset = 0;
                towrite -= tocopy;
                off += tocopy;
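
f2fs_quota_write() previously passed NULL as the fsdata cookie; it now keeps a local fsdata pointer and hands whatever ->write_begin stored in it back to ->write_end. That matches the address_space_operations contract in this kernel, where write_begin may stash per-write state for write_end to consume. For reference, the relevant ops as declared in this era's struct address_space_operations:

    int (*write_begin)(struct file *, struct address_space *mapping,
                       loff_t pos, unsigned len, unsigned flags,
                       struct page **pagep, void **fsdata);
    int (*write_end)(struct file *, struct address_space *mapping,
                     loff_t pos, unsigned len, unsigned copied,
                     struct page *page, void *fsdata);
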
@@ -3457,12 +3468,17 @@ try_onemore:
                }
        }
 
+       /* init per sbi slab cache */
+       err = f2fs_init_xattr_caches(sbi);
+       if (err)
+               goto free_io_dummy;
+
        /* get an inode for meta space */
        sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
        if (IS_ERR(sbi->meta_inode)) {
                f2fs_err(sbi, "Failed to read F2FS meta data inode");
                err = PTR_ERR(sbi->meta_inode);
-               goto free_io_dummy;
+               goto free_xattr_cache;
        }
 
        err = f2fs_get_valid_checkpoint(sbi);
@@ -3590,7 +3606,7 @@ try_onemore:
                        f2fs_err(sbi, "Cannot turn on quotas: error %d", err);
        }
 #endif
-       /* if there are nt orphan nodes free them */
+       /* if there are any orphan inodes, free them */
        err = f2fs_recover_orphan_inodes(sbi);
        if (err)
                goto free_meta;
@@ -3599,7 +3615,8 @@ try_onemore:
                goto reset_checkpoint;
 
        /* recover fsynced data */
-       if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+       if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
+                       !test_opt(sbi, NORECOVERY)) {
                /*
                 * mount should be failed, when device has readonly mode, and
                 * previous checkpoint was not done by clean system shutdown.
@@ -3665,7 +3682,7 @@ reset_checkpoint:
         * If filesystem is not mounted as read-only then
         * do start the gc_thread.
         */
-       if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
+       if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) {
                /* After POR, we can run background GC thread.*/
                err = f2fs_start_gc_thread(sbi);
                if (err)
@@ -3734,6 +3751,8 @@ free_meta_inode:
        make_bad_inode(sbi->meta_inode);
        iput(sbi->meta_inode);
        sbi->meta_inode = NULL;
+free_xattr_cache:
+       f2fs_destroy_xattr_caches(sbi);
 free_io_dummy:
        mempool_destroy(sbi->write_io_dummy);
 free_percpu:
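
The new f2fs_init_xattr_caches() call sits between the write_io_dummy mempool setup and the meta-inode read, and the matching free_xattr_cache: label is slotted into the unwind chain in the corresponding spot, so a later failure (here, f2fs_iget() on the meta inode) releases the cache again. A generic, self-contained sketch of that stacked-goto idiom, with hypothetical init_a/init_b/init_c helpers standing in for the real setup steps:

    #include <stdio.h>

    /* Hypothetical stand-ins for the real setup/teardown steps. */
    static int init_a(void) { return 0; }
    static int init_b(void) { return 0; }
    static int init_c(void) { return -1; }          /* simulate a late failure */
    static void destroy_b(void) { puts("undo b"); }
    static void destroy_a(void) { puts("undo a"); }

    static int setup(void)
    {
            int err;

            err = init_a();
            if (err)
                    return err;
            err = init_b();
            if (err)
                    goto free_a;
            err = init_c();
            if (err)
                    goto free_b;                    /* unwind in reverse order */
            return 0;
    free_b:
            destroy_b();
    free_a:
            destroy_a();
            return err;
    }

    int main(void)
    {
            printf("setup() = %d\n", setup());
            return 0;
    }
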
index 91d6497..e3bbbef 100644 (file)
@@ -109,47 +109,47 @@ static ssize_t features_show(struct f2fs_attr *a,
                return sprintf(buf, "0\n");
 
        if (f2fs_sb_has_encrypt(sbi))
-               len += snprintf(buf, PAGE_SIZE - len, "%s",
+               len += scnprintf(buf, PAGE_SIZE - len, "%s",
                                                "encryption");
        if (f2fs_sb_has_blkzoned(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "blkzoned");
        if (f2fs_sb_has_extra_attr(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "extra_attr");
        if (f2fs_sb_has_project_quota(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "projquota");
        if (f2fs_sb_has_inode_chksum(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "inode_checksum");
        if (f2fs_sb_has_flexible_inline_xattr(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "flexible_inline_xattr");
        if (f2fs_sb_has_quota_ino(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "quota_ino");
        if (f2fs_sb_has_inode_crtime(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "inode_crtime");
        if (f2fs_sb_has_lost_found(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "lost_found");
        if (f2fs_sb_has_verity(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "verity");
        if (f2fs_sb_has_sb_chksum(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "sb_checksum");
        if (f2fs_sb_has_casefold(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "casefold");
        if (f2fs_sb_has_compression(sbi))
-               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "compression");
-       len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+       len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "pin_file");
-       len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+       len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
        return len;
 }
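
The snprintf()-to-scnprintf() conversion above is about the return value: snprintf() reports the length it would have written, so accumulating it into len can push len past PAGE_SIZE and make the next size argument (PAGE_SIZE - len) wrap around; scnprintf() returns only what it actually stored, so len stays bounded by the buffer. A userspace illustration of the failure mode (scnprintf itself is kernel-only):

    #include <stdio.h>

    int main(void)
    {
            char buf[16];
            size_t len = 0;

            /* snprintf() returns the untruncated length... */
            len += (size_t)snprintf(buf, sizeof(buf) - len, "%s",
                                    "a-feature-name-longer-than-the-buffer");
            /* ...so len > sizeof(buf) and the "remaining space" underflows. */
            printf("len=%zu, sizeof(buf)-len=%zu\n", len, sizeof(buf) - len);
            return 0;
    }
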
 
@@ -185,6 +185,12 @@ static ssize_t encoding_show(struct f2fs_attr *a,
        return sprintf(buf, "(none)");
 }
 
+static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
+               struct f2fs_sb_info *sbi, char *buf)
+{
+       return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+}
+
 #ifdef CONFIG_F2FS_STAT_FS
 static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
                                struct f2fs_sb_info *sbi, char *buf)
@@ -233,16 +239,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
                int hot_count = sbi->raw_super->hot_ext_count;
                int len = 0, i;
 
-               len += snprintf(buf + len, PAGE_SIZE - len,
+               len += scnprintf(buf + len, PAGE_SIZE - len,
                                                "cold file extension:\n");
                for (i = 0; i < cold_count; i++)
-                       len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+                       len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
                                                                extlist[i]);
 
-               len += snprintf(buf + len, PAGE_SIZE - len,
+               len += scnprintf(buf + len, PAGE_SIZE - len,
                                                "hot file extension:\n");
                for (i = cold_count; i < cold_count + hot_count; i++)
-                       len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+                       len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
                                                                extlist[i]);
                return len;
        }
@@ -544,6 +550,7 @@ F2FS_GENERAL_RO_ATTR(features);
 F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
 F2FS_GENERAL_RO_ATTR(unusable);
 F2FS_GENERAL_RO_ATTR(encoding);
+F2FS_GENERAL_RO_ATTR(mounted_time_sec);
 #ifdef CONFIG_F2FS_STAT_FS
 F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
 F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -573,7 +580,9 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
 #endif
 F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
 F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
 F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+#endif
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -621,6 +630,7 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(reserved_blocks),
        ATTR_LIST(current_reserved_blocks),
        ATTR_LIST(encoding),
+       ATTR_LIST(mounted_time_sec),
 #ifdef CONFIG_F2FS_STAT_FS
        ATTR_LIST(cp_foreground_calls),
        ATTR_LIST(cp_background_calls),
@@ -654,7 +664,9 @@ static struct attribute *f2fs_feat_attrs[] = {
 #endif
        ATTR_LIST(sb_checksum),
        ATTR_LIST(casefold),
+#ifdef CONFIG_F2FS_FS_COMPRESSION
        ATTR_LIST(compression),
+#endif
        NULL,
 };
 ATTRIBUTE_GROUPS(f2fs_feat);
index 296b318..4f6582e 100644 (file)
 #include "xattr.h"
 #include "segment.h"
 
+static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
+{
+       if (likely(size == sbi->inline_xattr_slab_size)) {
+               *is_inline = true;
+               return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
+       }
+       *is_inline = false;
+       return f2fs_kzalloc(sbi, size, GFP_NOFS);
+}
+
+static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
+                                                       bool is_inline)
+{
+       if (is_inline)
+               kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
+       else
+               kvfree(xattr_addr);
+}
+
 static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
                struct dentry *unused, struct inode *inode,
                const char *name, void *buffer, size_t size)
@@ -301,7 +320,8 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
 static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
                                unsigned int index, unsigned int len,
                                const char *name, struct f2fs_xattr_entry **xe,
-                               void **base_addr, int *base_size)
+                               void **base_addr, int *base_size,
+                               bool *is_inline)
 {
        void *cur_addr, *txattr_addr, *last_txattr_addr;
        void *last_addr = NULL;
@@ -312,12 +332,12 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
        if (!xnid && !inline_size)
                return -ENODATA;
 
-       *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE;
-       txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
+       *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE;
+       txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline);
        if (!txattr_addr)
                return -ENOMEM;
 
-       last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode);
+       last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode);
 
        /* read from inline xattr */
        if (inline_size) {
@@ -362,7 +382,7 @@ check:
        *base_addr = txattr_addr;
        return 0;
 out:
-       kvfree(txattr_addr);
+       xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline);
        return err;
 }
 
@@ -499,6 +519,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
        unsigned int size, len;
        void *base_addr = NULL;
        int base_size;
+       bool is_inline;
 
        if (name == NULL)
                return -EINVAL;
@@ -509,7 +530,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
 
        down_read(&F2FS_I(inode)->i_xattr_sem);
        error = lookup_all_xattrs(inode, ipage, index, len, name,
-                               &entry, &base_addr, &base_size);
+                               &entry, &base_addr, &base_size, &is_inline);
        up_read(&F2FS_I(inode)->i_xattr_sem);
        if (error)
                return error;
@@ -532,14 +553,13 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
        }
        error = size;
 out:
-       kvfree(base_addr);
+       xattr_free(F2FS_I_SB(inode), base_addr, is_inline);
        return error;
 }
 
 ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
        struct inode *inode = d_inode(dentry);
-       nid_t xnid = F2FS_I(inode)->i_xattr_nid;
        struct f2fs_xattr_entry *entry;
        void *base_addr, *last_base_addr;
        int error = 0;
@@ -551,7 +571,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
        if (error)
                return error;
 
-       last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+       last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
 
        list_for_each_xattr(entry, base_addr) {
                const struct xattr_handler *handler =
@@ -609,7 +629,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
 {
        struct f2fs_xattr_entry *here, *last;
        void *base_addr, *last_base_addr;
-       nid_t xnid = F2FS_I(inode)->i_xattr_nid;
        int found, newsize;
        size_t len;
        __u32 new_hsize;
@@ -633,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
        if (error)
                return error;
 
-       last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+       last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
 
        /* find entry with wanted name. */
        here = __find_xattr(base_addr, last_base_addr, index, len, name);
@@ -758,14 +777,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
        f2fs_balance_fs(sbi, true);
 
        f2fs_lock_op(sbi);
-       /* protect xattr_ver */
-       down_write(&F2FS_I(inode)->i_sem);
        down_write(&F2FS_I(inode)->i_xattr_sem);
        err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
        up_write(&F2FS_I(inode)->i_xattr_sem);
-       up_write(&F2FS_I(inode)->i_sem);
        f2fs_unlock_op(sbi);
 
        f2fs_update_time(sbi, REQ_TIME);
        return err;
 }
+
+int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi)
+{
+       dev_t dev = sbi->sb->s_bdev->bd_dev;
+       char slab_name[32];
+
+       sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev));
+
+       sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size *
+                                       sizeof(__le32) + XATTR_PADDING_SIZE;
+
+       sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name,
+                                       sbi->inline_xattr_slab_size);
+       if (!sbi->inline_xattr_slab)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi)
+{
+       kmem_cache_destroy(sbi->inline_xattr_slab);
+}
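
f2fs_init_xattr_caches() sizes the slab from the mount's inline_xattr_size and names it after the backing device's major:minor numbers, so each mounted instance shows up as its own cache in /proc/slabinfo. A userspace sketch of the resulting name, with glibc's major()/minor() standing in for the kernel's MAJOR()/MINOR() macros and made-up device numbers:

    #include <stdio.h>
    #include <sys/sysmacros.h>
    #include <sys/types.h>

    int main(void)
    {
            dev_t dev = makedev(259, 2);            /* hypothetical NVMe partition */
            char slab_name[32];

            snprintf(slab_name, sizeof(slab_name), "f2fs_xattr_entry-%u:%u",
                     major(dev), minor(dev));
            puts(slab_name);                        /* "f2fs_xattr_entry-259:2" */
            return 0;
    }
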
index de0c600..938fcd2 100644 (file)
@@ -49,7 +49,7 @@ struct f2fs_xattr_entry {
        __u8    e_name_index;
        __u8    e_name_len;
        __le16  e_value_size;   /* size of attribute value */
-       char    e_name[0];      /* attribute name */
+       char    e_name[];      /* attribute name */
 };
 
 #define XATTR_HDR(ptr)         ((struct f2fs_xattr_header *)(ptr))
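
Replacing char e_name[0] with char e_name[] swaps the old GNU zero-length-array idiom for a C99 flexible array member. The layout is unchanged; the standard form is what compiler bounds and fortify checks understand. A minimal self-contained example of allocating such a structure:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct xentry {
            unsigned char name_len;
            char name[];            /* C99 flexible array member */
    };

    int main(void)
    {
            const char *n = "user.comment";
            struct xentry *e = malloc(sizeof(*e) + strlen(n) + 1);

            if (!e)
                    return 1;
            e->name_len = (unsigned char)strlen(n);
            memcpy(e->name, n, e->name_len + 1);
            printf("%u bytes: %s\n", (unsigned)e->name_len, e->name);
            free(e);
            return 0;
    }
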
@@ -73,7 +73,8 @@ struct f2fs_xattr_entry {
                                entry = XATTR_NEXT_ENTRY(entry))
 #define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
 #define XATTR_PADDING_SIZE     (sizeof(__u32))
-#define XATTR_SIZE(x,i)                (((x) ? VALID_XATTR_BLOCK_SIZE : 0) +   \
+#define XATTR_SIZE(i)          ((F2FS_I(i)->i_xattr_nid ?              \
+                                       VALID_XATTR_BLOCK_SIZE : 0) +   \
                                                (inline_xattr_size(i)))
 #define MIN_OFFSET(i)          XATTR_ALIGN(inline_xattr_size(i) +      \
                                                VALID_XATTR_BLOCK_SIZE)
@@ -130,6 +131,8 @@ extern int f2fs_setxattr(struct inode *, int, const char *,
 extern int f2fs_getxattr(struct inode *, int, const char *, void *,
                                                size_t, struct page *);
 extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
+extern int f2fs_init_xattr_caches(struct f2fs_sb_info *);
+extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
 #else
 
 #define f2fs_xattr_handlers    NULL
@@ -150,6 +153,8 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
 {
        return -EOPNOTSUPP;
 }
+static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
 #endif
 
 #ifdef CONFIG_F2FS_FS_SECURITY
index cc5cf22..4023c98 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/kthread.h>
 #include <linux/rculist_nulls.h>
 #include <linux/fs_struct.h>
+#include <linux/task_work.h>
 
 #include "io-wq.h"
 
@@ -716,6 +717,9 @@ static int io_wq_manager(void *data)
        complete(&wq->done);
 
        while (!kthread_should_stop()) {
+               if (current->task_works)
+                       task_work_run();
+
                for_each_node(node) {
                        struct io_wqe *wqe = wq->wqes[node];
                        bool fork_worker[2] = { false, false };
@@ -738,6 +742,9 @@ static int io_wq_manager(void *data)
                schedule_timeout(HZ);
        }
 
+       if (current->task_works)
+               task_work_run();
+
        return 0;
 err:
        set_bit(IO_WQ_BIT_ERROR, &wq->state);
@@ -1124,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq)
        if (refcount_dec_and_test(&wq->use_refs))
                __io_wq_destroy(wq);
 }
+
+struct task_struct *io_wq_get_task(struct io_wq *wq)
+{
+       return wq->manager;
+}
index 3ee7356..5ba12de 100644 (file)
@@ -136,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                        void *data);
 
+struct task_struct *io_wq_get_task(struct io_wq *wq);
+
 #if defined(CONFIG_IO_WQ)
 extern void io_wq_worker_sleeping(struct task_struct *);
 extern void io_wq_worker_running(struct task_struct *);
index 358f97b..5190bfb 100644 (file)
@@ -186,14 +186,23 @@ struct fixed_file_table {
        struct file             **files;
 };
 
+struct fixed_file_ref_node {
+       struct percpu_ref               refs;
+       struct list_head                node;
+       struct list_head                file_list;
+       struct fixed_file_data          *file_data;
+       struct work_struct              work;
+};
+
 struct fixed_file_data {
        struct fixed_file_table         *table;
        struct io_ring_ctx              *ctx;
 
+       struct percpu_ref               *cur_refs;
        struct percpu_ref               refs;
-       struct llist_head               put_llist;
-       struct work_struct              ref_work;
        struct completion               done;
+       struct list_head                ref_list;
+       spinlock_t                      lock;
 };
 
 struct io_buffer {
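
fixed_file_ref_node replaces the single context-wide ref (and its percpu/atomic switching) with one node per generation of the registered-file table: requests pin the node that was current (cur_refs) when they looked up their fixed file, and once a superseded node's count drains, its work item puts the files that generation dropped. A condensed, kernel-context sketch of that lifecycle (not the full io_uring code; the real alloc_fixed_file_ref_node() and io_file_data_ref_zero() appear in later hunks):

    /* Sketch of the per-generation ref-node lifecycle. */
    static void node_release(struct percpu_ref *ref)
    {
            struct fixed_file_ref_node *node =
                    container_of(ref, struct fixed_file_ref_node, refs);

            /* last user of this generation is gone: defer the fput()s to a work item */
            queue_work(system_wq, &node->work);
    }

    static int node_setup(struct fixed_file_ref_node *node)
    {
            /* requests percpu_ref_get()/put() this; percpu_ref_kill() retires
             * the generation when a files-update installs a newer node. */
            return percpu_ref_init(&node->refs, node_release, 0, GFP_KERNEL);
    }
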
@@ -317,6 +326,8 @@ struct io_ring_ctx {
                spinlock_t              inflight_lock;
                struct list_head        inflight_list;
        } ____cacheline_aligned_in_smp;
+
+       struct work_struct              exit_work;
 };
 
 /*
@@ -599,6 +610,7 @@ struct io_kiocb {
        };
 
        struct io_async_ctx             *io;
+       int                             cflags;
        bool                            needs_fixed_file;
        u8                              opcode;
 
@@ -606,10 +618,8 @@ struct io_kiocb {
        struct list_head        list;
        unsigned int            flags;
        refcount_t              refs;
-       union {
-               struct task_struct      *task;
-               unsigned long           fsize;
-       };
+       struct task_struct      *task;
+       unsigned long           fsize;
        u64                     user_data;
        u32                     result;
        u32                     sequence;
@@ -618,6 +628,8 @@ struct io_kiocb {
 
        struct list_head        inflight_entry;
 
+       struct percpu_ref       *fixed_file_refs;
+
        union {
                /*
                 * Only commands that never go async can use the below fields,
@@ -629,7 +641,6 @@ struct io_kiocb {
                        struct callback_head    task_work;
                        struct hlist_node       hash_node;
                        struct async_poll       *apoll;
-                       int                     cflags;
                };
                struct io_wq_work       work;
        };
@@ -848,7 +859,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_files_update *ip,
                                 unsigned nr_args);
 static int io_grab_files(struct io_kiocb *req);
-static void io_ring_file_ref_flush(struct fixed_file_data *data);
 static void io_cleanup_req(struct io_kiocb *req);
 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
                       int fd, struct file **out_file, bool fixed);
@@ -1285,8 +1295,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
        return NULL;
 }
 
-static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
-                                  struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
+                                    struct io_submit_state *state)
 {
        gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
        struct io_kiocb *req;
@@ -1319,41 +1329,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
                req = state->reqs[state->free_reqs];
        }
 
-got_it:
-       req->io = NULL;
-       req->file = NULL;
-       req->ctx = ctx;
-       req->flags = 0;
-       /* one is dropped after submission, the other at completion */
-       refcount_set(&req->refs, 2);
-       req->result = 0;
-       INIT_IO_WORK(&req->work, io_wq_submit_work);
        return req;
 fallback:
-       req = io_get_fallback_req(ctx);
-       if (req)
-               goto got_it;
-       percpu_ref_put(&ctx->refs);
-       return NULL;
+       return io_get_fallback_req(ctx);
 }
 
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
                          bool fixed)
 {
        if (fixed)
-               percpu_ref_put(&req->ctx->file_data->refs);
+               percpu_ref_put(req->fixed_file_refs);
        else
                fput(file);
 }
 
-static void __io_req_do_free(struct io_kiocb *req)
-{
-       if (likely(!io_is_fallback_req(req)))
-               kmem_cache_free(req_cachep, req);
-       else
-               clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
-}
-
 static void __io_req_aux_free(struct io_kiocb *req)
 {
        if (req->flags & REQ_F_NEED_CLEANUP)
@@ -1362,6 +1351,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
        kfree(req->io);
        if (req->file)
                io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+       if (req->task)
+               put_task_struct(req->task);
 
        io_req_work_drop_env(req);
 }
@@ -1382,7 +1373,10 @@ static void __io_free_req(struct io_kiocb *req)
        }
 
        percpu_ref_put(&req->ctx->refs);
-       __io_req_do_free(req);
+       if (likely(!io_is_fallback_req(req)))
+               kmem_cache_free(req_cachep, req);
+       else
+               clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
 }
 
 struct req_batch {
@@ -1393,21 +1387,18 @@ struct req_batch {
 
 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 {
-       int fixed_refs = rb->to_free;
-
        if (!rb->to_free)
                return;
        if (rb->need_iter) {
                int i, inflight = 0;
                unsigned long flags;
 
-               fixed_refs = 0;
                for (i = 0; i < rb->to_free; i++) {
                        struct io_kiocb *req = rb->reqs[i];
 
                        if (req->flags & REQ_F_FIXED_FILE) {
                                req->file = NULL;
-                               fixed_refs++;
+                               percpu_ref_put(req->fixed_file_refs);
                        }
                        if (req->flags & REQ_F_INFLIGHT)
                                inflight++;
@@ -1433,8 +1424,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
        }
 do_free:
        kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
-       if (fixed_refs)
-               percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
        percpu_ref_put_many(&ctx->refs, rb->to_free);
        rb->to_free = rb->need_iter = 0;
 }
@@ -1738,11 +1727,24 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
        io_free_req_many(ctx, &rb);
 }
 
+static void io_iopoll_queue(struct list_head *again)
+{
+       struct io_kiocb *req;
+
+       do {
+               req = list_first_entry(again, struct io_kiocb, list);
+               list_del(&req->list);
+               refcount_inc(&req->refs);
+               io_queue_async_work(req);
+       } while (!list_empty(again));
+}
+
 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                        long min)
 {
        struct io_kiocb *req, *tmp;
        LIST_HEAD(done);
+       LIST_HEAD(again);
        bool spin;
        int ret;
 
@@ -1757,9 +1759,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                struct kiocb *kiocb = &req->rw.kiocb;
 
                /*
-                * Move completed entries to our local list. If we find a
-                * request that requires polling, break out and complete
-                * the done list first, if we have entries there.
+                * Move completed and retryable entries to our local lists.
+                * If we find a request that requires polling, break out
+                * and complete those lists first, if we have entries there.
                 */
                if (req->flags & REQ_F_IOPOLL_COMPLETED) {
                        list_move_tail(&req->list, &done);
@@ -1768,6 +1770,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                if (!list_empty(&done))
                        break;
 
+               if (req->result == -EAGAIN) {
+                       list_move_tail(&req->list, &again);
+                       continue;
+               }
+               if (!list_empty(&again))
+                       break;
+
                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
                if (ret < 0)
                        break;
@@ -1780,6 +1789,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
        if (!list_empty(&done))
                io_iopoll_complete(ctx, nr_events, &done);
 
+       if (!list_empty(&again))
+               io_iopoll_queue(&again);
+
        return ret;
 }
 
@@ -2465,8 +2477,9 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
        req->io->rw.iov = iovec;
        if (!req->io->rw.iov) {
                req->io->rw.iov = req->io->rw.fast_iov;
-               memcpy(req->io->rw.iov, fast_iov,
-                       sizeof(struct iovec) * iter->nr_segs);
+               if (req->io->rw.iov != fast_iov)
+                       memcpy(req->io->rw.iov, fast_iov,
+                              sizeof(struct iovec) * iter->nr_segs);
        } else {
                req->flags |= REQ_F_NEED_CLEANUP;
        }
@@ -2920,7 +2933,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
        if (sqe->ioprio || sqe->buf_index)
                return -EINVAL;
-       if (sqe->flags & IOSQE_FIXED_FILE)
+       if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;
        if (req->flags & REQ_F_NEED_CLEANUP)
                return 0;
@@ -2929,6 +2942,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        req->open.how.mode = READ_ONCE(sqe->len);
        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->open.how.flags = READ_ONCE(sqe->open_flags);
+       if (force_o_largefile())
+               req->open.how.flags |= O_LARGEFILE;
 
        req->open.filename = getname(fname);
        if (IS_ERR(req->open.filename)) {
@@ -2951,7 +2966,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
        if (sqe->ioprio || sqe->buf_index)
                return -EINVAL;
-       if (sqe->flags & IOSQE_FIXED_FILE)
+       if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;
        if (req->flags & REQ_F_NEED_CLEANUP)
                return 0;
@@ -3305,7 +3320,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
        if (sqe->ioprio || sqe->buf_index)
                return -EINVAL;
-       if (sqe->flags & IOSQE_FIXED_FILE)
+       if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;
        if (req->flags & REQ_F_NEED_CLEANUP)
                return 0;
@@ -3382,7 +3397,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
                return -EINVAL;
-       if (sqe->flags & IOSQE_FIXED_FILE)
+       if (req->flags & REQ_F_FIXED_FILE)
                return -EBADF;
 
        req->close.fd = READ_ONCE(sqe->fd);
@@ -3481,14 +3496,11 @@ static void __io_sync_file_range(struct io_kiocb *req)
 static void io_sync_file_range_finish(struct io_wq_work **workptr)
 {
        struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *nxt = NULL;
 
        if (io_req_cancelled(req))
                return;
        __io_sync_file_range(req);
        io_put_req(req); /* put submission ref */
-       if (nxt)
-               io_wq_assign_next(workptr, nxt);
 }
 
 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
@@ -4114,6 +4126,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
                           __poll_t mask, task_work_func_t func)
 {
        struct task_struct *tsk;
+       int ret;
 
        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
@@ -4127,11 +4140,15 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
        req->result = mask;
        init_task_work(&req->task_work, func);
        /*
-        * If this fails, then the task is exiting. If that is the case, then
-        * the exit check will ultimately cancel these work items. Hence we
-        * don't need to check here and handle it specifically.
+        * If this fails, then the task is exiting. Punt to one of the io-wq
+        * threads to ensure the work gets run, we can't always rely on exit
+        * cancelation taking care of this.
         */
-       task_work_add(tsk, &req->task_work, true);
+       ret = task_work_add(tsk, &req->task_work, true);
+       if (unlikely(ret)) {
+               tsk = io_wq_get_task(req->ctx->io_wq);
+               task_work_add(tsk, &req->task_work, true);
+       }
        wake_up_process(tsk);
        return 1;
 }
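
task_work_add() fails once the target task has begun exiting (its task_work list is sealed at that point, typically returning -ESRCH), and previously the poll completion was simply left to exit-time cancelation. The hunk above instead falls back to the io-wq manager thread, obtained via the new io_wq_get_task() helper, so the callback is guaranteed to run somewhere. Condensed shape of the pattern:

    /* Condensed from the hunk above: punt to a long-lived kthread on failure. */
    init_task_work(&req->task_work, func);
    if (unlikely(task_work_add(tsk, &req->task_work, true))) {
            tsk = io_wq_get_task(req->ctx->io_wq);  /* io-wq manager thread */
            task_work_add(tsk, &req->task_work, true);
    }
    wake_up_process(tsk);
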
@@ -4251,10 +4268,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
        req->flags |= REQ_F_POLLED;
        memcpy(&apoll->work, &req->work, sizeof(req->work));
 
-       /*
-        * Don't need a reference here, as we're adding it to the task
-        * task_works list. If the task exits, the list is pruned.
-        */
+       get_task_struct(current);
        req->task = current;
        req->apoll = apoll;
        INIT_HLIST_NODE(&req->hash_node);
@@ -4407,8 +4421,20 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
 {
        struct io_ring_ctx *ctx = req->ctx;
+       struct io_poll_iocb *poll = &req->poll;
+
+       if (!req->result && !READ_ONCE(poll->canceled)) {
+               struct poll_table_struct pt = { ._key = poll->events };
+
+               req->result = vfs_poll(req->file, &pt) & poll->events;
+       }
 
        spin_lock_irq(&ctx->completion_lock);
+       if (!req->result && !READ_ONCE(poll->canceled)) {
+               add_wait_queue(poll->head, &poll->wait);
+               spin_unlock_irq(&ctx->completion_lock);
+               return;
+       }
        hash_del(&req->hash_node);
        io_poll_complete(req, req->result, 0);
        req->flags |= REQ_F_COMP_LOCKED;
@@ -4465,10 +4491,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
        events = READ_ONCE(sqe->poll_events);
        poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-       /*
-        * Don't need a reference here, as we're adding it to the task
-        * task_works list. If the task exits, the list is pruned.
-        */
+       get_task_struct(current);
        req->task = current;
        return 0;
 }
@@ -5331,7 +5354,8 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
                file = io_file_from_index(ctx, fd);
                if (!file)
                        return -EBADF;
-               percpu_ref_get(&ctx->file_data->refs);
+               req->fixed_file_refs = ctx->file_data->cur_refs;
+               percpu_ref_get(req->fixed_file_refs);
        } else {
                trace_io_uring_file_get(ctx, fd);
                file = __io_file_get(state, fd);
@@ -5344,15 +5368,10 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 }
 
 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
-                          const struct io_uring_sqe *sqe)
+                          int fd, unsigned int flags)
 {
-       unsigned flags;
-       int fd;
        bool fixed;
 
-       flags = READ_ONCE(sqe->flags);
-       fd = READ_ONCE(sqe->fd);
-
        if (!io_req_needs_file(req, fd))
                return 0;
 
@@ -5594,7 +5613,7 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 {
        struct io_ring_ctx *ctx = req->ctx;
        unsigned int sqe_flags;
-       int ret, id;
+       int ret, id, fd;
 
        sqe_flags = READ_ONCE(sqe->flags);
 
@@ -5625,7 +5644,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                                        IOSQE_ASYNC | IOSQE_FIXED_FILE |
                                        IOSQE_BUFFER_SELECT);
 
-       ret = io_req_set_file(state, req, sqe);
+       fd = READ_ONCE(sqe->fd);
+       ret = io_req_set_file(state, req, fd, sqe_flags);
        if (unlikely(ret)) {
 err_req:
                io_cqring_add_event(req, ret);
@@ -5741,8 +5761,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  * used, it's important that those reads are done through READ_ONCE() to
  * prevent a re-load down the line.
  */
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
-                         const struct io_uring_sqe **sqe_ptr)
+static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 {
        u32 *sq_array = ctx->sq_array;
        unsigned head;
@@ -5756,25 +5775,40 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
         *    though the application is the one updating it.
         */
        head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
-       if (likely(head < ctx->sq_entries)) {
-               /*
-                * All io need record the previous position, if LINK vs DARIN,
-                * it can be used to mark the position of the first IO in the
-                * link list.
-                */
-               req->sequence = ctx->cached_sq_head;
-               *sqe_ptr = &ctx->sq_sqes[head];
-               req->opcode = READ_ONCE((*sqe_ptr)->opcode);
-               req->user_data = READ_ONCE((*sqe_ptr)->user_data);
-               ctx->cached_sq_head++;
-               return true;
-       }
+       if (likely(head < ctx->sq_entries))
+               return &ctx->sq_sqes[head];
 
        /* drop invalid entries */
-       ctx->cached_sq_head++;
        ctx->cached_sq_dropped++;
        WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
-       return false;
+       return NULL;
+}
+
+static inline void io_consume_sqe(struct io_ring_ctx *ctx)
+{
+       ctx->cached_sq_head++;
+}
+
+static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+                       const struct io_uring_sqe *sqe)
+{
+       /*
+        * All io need record the previous position, if LINK vs DARIN,
+        * it can be used to mark the position of the first IO in the
+        * link list.
+        */
+       req->sequence = ctx->cached_sq_head;
+       req->opcode = READ_ONCE(sqe->opcode);
+       req->user_data = READ_ONCE(sqe->user_data);
+       req->io = NULL;
+       req->file = NULL;
+       req->ctx = ctx;
+       req->flags = 0;
+       /* one is dropped after submission, the other at completion */
+       refcount_set(&req->refs, 2);
+       req->task = NULL;
+       req->result = 0;
+       INIT_IO_WORK(&req->work, io_wq_submit_work);
 }
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
@@ -5812,17 +5846,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
                struct io_kiocb *req;
                int err;
 
-               req = io_get_req(ctx, statep);
+               sqe = io_get_sqe(ctx);
+               if (unlikely(!sqe)) {
+                       io_consume_sqe(ctx);
+                       break;
+               }
+               req = io_alloc_req(ctx, statep);
                if (unlikely(!req)) {
                        if (!submitted)
                                submitted = -EAGAIN;
                        break;
                }
-               if (!io_get_sqring(ctx, req, &sqe)) {
-                       __io_req_do_free(req);
-                       break;
-               }
 
+               io_init_req(ctx, req, sqe);
+               io_consume_sqe(ctx);
                /* will complete beyond this point, count as submitted */
                submitted++;
 
@@ -5962,6 +5999,7 @@ static int io_sq_thread(void *data)
                                }
                                if (current->task_works) {
                                        task_work_run();
+                                       finish_wait(&ctx->sqo_wait, &wait);
                                        continue;
                                }
                                if (signal_pending(current))
@@ -6124,43 +6162,36 @@ static void io_file_ref_kill(struct percpu_ref *ref)
        complete(&data->done);
 }
 
-static void io_file_ref_exit_and_free(struct work_struct *work)
-{
-       struct fixed_file_data *data;
-
-       data = container_of(work, struct fixed_file_data, ref_work);
-
-       /*
-        * Ensure any percpu-ref atomic switch callback has run, it could have
-        * been in progress when the files were being unregistered. Once
-        * that's done, we can safely exit and free the ref and containing
-        * data structure.
-        */
-       rcu_barrier();
-       percpu_ref_exit(&data->refs);
-       kfree(data);
-}
-
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
        struct fixed_file_data *data = ctx->file_data;
+       struct fixed_file_ref_node *ref_node = NULL;
        unsigned nr_tables, i;
+       unsigned long flags;
 
        if (!data)
                return -ENXIO;
 
-       percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
-       flush_work(&data->ref_work);
+       spin_lock_irqsave(&data->lock, flags);
+       if (!list_empty(&data->ref_list))
+               ref_node = list_first_entry(&data->ref_list,
+                               struct fixed_file_ref_node, node);
+       spin_unlock_irqrestore(&data->lock, flags);
+       if (ref_node)
+               percpu_ref_kill(&ref_node->refs);
+
+       percpu_ref_kill(&data->refs);
+
+       /* wait for all refs nodes to complete */
        wait_for_completion(&data->done);
-       io_ring_file_ref_flush(data);
 
        __io_sqe_files_unregister(ctx);
        nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
        for (i = 0; i < nr_tables; i++)
                kfree(data->table[i].files);
        kfree(data->table);
-       INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
-       queue_work(system_wq, &data->ref_work);
+       percpu_ref_exit(&data->refs);
+       kfree(data);
        ctx->file_data = NULL;
        ctx->nr_user_files = 0;
        return 0;
@@ -6204,13 +6235,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
        struct sk_buff *skb;
        int i, nr_files;
 
-       if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
-               unsigned long inflight = ctx->user->unix_inflight + nr;
-
-               if (inflight > task_rlimit(current, RLIMIT_NOFILE))
-                       return -EMFILE;
-       }
-
        fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
        if (!fpl)
                return -ENOMEM;
@@ -6385,46 +6409,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
 }
 
 struct io_file_put {
-       struct llist_node llist;
+       struct list_head list;
        struct file *file;
 };
 
-static void io_ring_file_ref_flush(struct fixed_file_data *data)
+static void io_file_put_work(struct work_struct *work)
 {
+       struct fixed_file_ref_node *ref_node;
+       struct fixed_file_data *file_data;
+       struct io_ring_ctx *ctx;
        struct io_file_put *pfile, *tmp;
-       struct llist_node *node;
+       unsigned long flags;
 
-       while ((node = llist_del_all(&data->put_llist)) != NULL) {
-               llist_for_each_entry_safe(pfile, tmp, node, llist) {
-                       io_ring_file_put(data->ctx, pfile->file);
-                       kfree(pfile);
-               }
+       ref_node = container_of(work, struct fixed_file_ref_node, work);
+       file_data = ref_node->file_data;
+       ctx = file_data->ctx;
+
+       list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
+               list_del_init(&pfile->list);
+               io_ring_file_put(ctx, pfile->file);
+               kfree(pfile);
        }
+
+       spin_lock_irqsave(&file_data->lock, flags);
+       list_del_init(&ref_node->node);
+       spin_unlock_irqrestore(&file_data->lock, flags);
+
+       percpu_ref_exit(&ref_node->refs);
+       kfree(ref_node);
+       percpu_ref_put(&file_data->refs);
 }
 
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_file_data_ref_zero(struct percpu_ref *ref)
 {
-       struct fixed_file_data *data;
+       struct fixed_file_ref_node *ref_node;
+
+       ref_node = container_of(ref, struct fixed_file_ref_node, refs);
 
-       data = container_of(work, struct fixed_file_data, ref_work);
-       io_ring_file_ref_flush(data);
-       percpu_ref_switch_to_percpu(&data->refs);
+       queue_work(system_wq, &ref_node->work);
 }
 
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+                       struct io_ring_ctx *ctx)
 {
-       struct fixed_file_data *data;
+       struct fixed_file_ref_node *ref_node;
 
-       data = container_of(ref, struct fixed_file_data, refs);
+       ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+       if (!ref_node)
+               return ERR_PTR(-ENOMEM);
+
+       if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+                           0, GFP_KERNEL)) {
+               kfree(ref_node);
+               return ERR_PTR(-ENOMEM);
+       }
+       INIT_LIST_HEAD(&ref_node->node);
+       INIT_LIST_HEAD(&ref_node->file_list);
+       INIT_WORK(&ref_node->work, io_file_put_work);
+       ref_node->file_data = ctx->file_data;
+       return ref_node;
 
-       /*
-        * We can't safely switch from inside this context, punt to wq. If
-        * the table ref is going away, the table is being unregistered.
-        * Don't queue up the async work for that case, the caller will
-        * handle it.
-        */
-       if (!percpu_ref_is_dying(&data->refs))
-               queue_work(system_wq, &data->ref_work);
+}
+
+static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+{
+       percpu_ref_exit(&ref_node->refs);
+       kfree(ref_node);
 }
 
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
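
The hunks above and below replace the single table-wide percpu ref (and its llist flush) with per-update "ref nodes": each node owns its own percpu_ref, a list of files queued for dropping, and a work item that runs once that ref hits zero. A minimal, self-contained sketch of the pattern, using hypothetical names rather than the io_uring ones:

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/list.h>

/* Sketch only: a "ref node" owns a percpu_ref plus a list of resources
 * to drop once the node has quiesced; the zero callback merely defers
 * to a work item because it may run in atomic context.
 */
struct ref_node {
	struct percpu_ref refs;
	struct list_head pending;
	struct work_struct work;
};

static void ref_node_work(struct work_struct *work)
{
	struct ref_node *node = container_of(work, struct ref_node, work);

	/* ... drop everything queued on node->pending here ... */
	percpu_ref_exit(&node->refs);
	kfree(node);
}

static void ref_node_zero(struct percpu_ref *ref)
{
	struct ref_node *node = container_of(ref, struct ref_node, refs);

	queue_work(system_wq, &node->work);
}

static struct ref_node *ref_node_alloc(void)
{
	struct ref_node *node = kzalloc(sizeof(*node), GFP_KERNEL);

	if (!node)
		return NULL;
	if (percpu_ref_init(&node->refs, ref_node_zero, 0, GFP_KERNEL)) {
		kfree(node);
		return NULL;
	}
	INIT_LIST_HEAD(&node->pending);
	INIT_WORK(&node->work, ref_node_work);
	return node;
}

When the current node needs to be retired, percpu_ref_kill() is called on it and a freshly allocated node becomes data->cur_refs (see __io_sqe_files_update() further down), so in-flight requests keep the old node alive until they complete.
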
@@ -6435,6 +6485,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
        struct file *file;
        int fd, ret = 0;
        unsigned i;
+       struct fixed_file_ref_node *ref_node;
+       unsigned long flags;
 
        if (ctx->file_data)
                return -EBUSY;
@@ -6448,6 +6500,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                return -ENOMEM;
        ctx->file_data->ctx = ctx;
        init_completion(&ctx->file_data->done);
+       INIT_LIST_HEAD(&ctx->file_data->ref_list);
+       spin_lock_init(&ctx->file_data->lock);
 
        nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
        ctx->file_data->table = kcalloc(nr_tables,
@@ -6459,15 +6513,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                return -ENOMEM;
        }
 
-       if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+       if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
                                PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
                kfree(ctx->file_data->table);
                kfree(ctx->file_data);
                ctx->file_data = NULL;
                return -ENOMEM;
        }
-       ctx->file_data->put_llist.first = NULL;
-       INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
 
        if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
                percpu_ref_exit(&ctx->file_data->refs);
@@ -6530,9 +6582,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
        }
 
        ret = io_sqe_files_scm(ctx);
-       if (ret)
+       if (ret) {
                io_sqe_files_unregister(ctx);
+               return ret;
+       }
+
+       ref_node = alloc_fixed_file_ref_node(ctx);
+       if (IS_ERR(ref_node)) {
+               io_sqe_files_unregister(ctx);
+               return PTR_ERR(ref_node);
+       }
 
+       ctx->file_data->cur_refs = &ref_node->refs;
+       spin_lock_irqsave(&ctx->file_data->lock, flags);
+       list_add(&ref_node->node, &ctx->file_data->ref_list);
+       spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+       percpu_ref_get(&ctx->file_data->refs);
        return ret;
 }
 
@@ -6579,30 +6644,21 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 #endif
 }
 
-static void io_atomic_switch(struct percpu_ref *ref)
-{
-       struct fixed_file_data *data;
-
-       /*
-        * Juggle reference to ensure we hit zero, if needed, so we can
-        * switch back to percpu mode
-        */
-       data = container_of(ref, struct fixed_file_data, refs);
-       percpu_ref_put(&data->refs);
-       percpu_ref_get(&data->refs);
-}
-
 static int io_queue_file_removal(struct fixed_file_data *data,
-                                 struct file *file)
+                                struct file *file)
 {
        struct io_file_put *pfile;
+       struct percpu_ref *refs = data->cur_refs;
+       struct fixed_file_ref_node *ref_node;
 
        pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
        if (!pfile)
                return -ENOMEM;
 
+       ref_node = container_of(refs, struct fixed_file_ref_node, refs);
        pfile->file = file;
-       llist_add(&pfile->llist, &data->put_llist);
+       list_add(&pfile->list, &ref_node->file_list);
+
        return 0;
 }
 
@@ -6611,17 +6667,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 unsigned nr_args)
 {
        struct fixed_file_data *data = ctx->file_data;
-       bool ref_switch = false;
+       struct fixed_file_ref_node *ref_node;
        struct file *file;
        __s32 __user *fds;
        int fd, i, err;
        __u32 done;
+       unsigned long flags;
+       bool needs_switch = false;
 
        if (check_add_overflow(up->offset, nr_args, &done))
                return -EOVERFLOW;
        if (done > ctx->nr_user_files)
                return -EINVAL;
 
+       ref_node = alloc_fixed_file_ref_node(ctx);
+       if (IS_ERR(ref_node))
+               return PTR_ERR(ref_node);
+
        done = 0;
        fds = u64_to_user_ptr(up->fds);
        while (nr_args) {
@@ -6642,7 +6704,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                        if (err)
                                break;
                        table->files[index] = NULL;
-                       ref_switch = true;
+                       needs_switch = true;
                }
                if (fd != -1) {
                        file = fget(fd);
@@ -6673,11 +6735,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                up->offset++;
        }
 
-       if (ref_switch)
-               percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+       if (needs_switch) {
+               percpu_ref_kill(data->cur_refs);
+               spin_lock_irqsave(&data->lock, flags);
+               list_add(&ref_node->node, &data->ref_list);
+               data->cur_refs = &ref_node->refs;
+               spin_unlock_irqrestore(&data->lock, flags);
+               percpu_ref_get(&ctx->file_data->refs);
+       } else
+               destroy_fixed_file_ref_node(ref_node);
 
        return done ? done : err;
 }
+
 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
                               unsigned nr_args)
 {
@@ -7203,6 +7273,18 @@ static int io_remove_personalities(int id, void *p, void *data)
        return 0;
 }
 
+static void io_ring_exit_work(struct work_struct *work)
+{
+       struct io_ring_ctx *ctx;
+
+       ctx = container_of(work, struct io_ring_ctx, exit_work);
+       if (ctx->rings)
+               io_cqring_overflow_flush(ctx, true);
+
+       wait_for_completion(&ctx->completions[0]);
+       io_ring_ctx_free(ctx);
+}
+
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        mutex_lock(&ctx->uring_lock);
@@ -7230,8 +7312,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
        if (ctx->rings)
                io_cqring_overflow_flush(ctx, true);
        idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
-       wait_for_completion(&ctx->completions[0]);
-       io_ring_ctx_free(ctx);
+       INIT_WORK(&ctx->exit_work, io_ring_exit_work);
+       queue_work(system_wq, &ctx->exit_work);
 }
 
 static int io_uring_release(struct inode *inode, struct file *file)
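
This change defers the blocking part of ring teardown (waiting for the references to drain and freeing the ctx) onto the system workqueue, so the ->release() path no longer sleeps on the completion itself. A small sketch of the general pattern, with hypothetical names:

#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/slab.h>

/* Hypothetical names: defer a blocking teardown off the ->release() path. */
struct my_ctx {
	struct completion users_done;	/* completed by the last reference */
	struct work_struct exit_work;
};

static void my_ctx_exit_work(struct work_struct *work)
{
	struct my_ctx *ctx = container_of(work, struct my_ctx, exit_work);

	/* Sleeping is fine here: we run on a workqueue, not inside fput(). */
	wait_for_completion(&ctx->users_done);
	kfree(ctx);
}

static void my_ctx_kill(struct my_ctx *ctx)
{
	INIT_WORK(&ctx->exit_work, my_ctx_exit_work);
	queue_work(system_wq, &ctx->exit_work);
}

The ctx remains valid until the queued work runs because the work item is embedded in the ctx itself and nothing frees it earlier.
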
index f080f54..89e2196 100644
@@ -302,6 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
        if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) {
                gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
+               gfp_t orig_gfp = gfp;
                int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
                if (ctx->bio)
@@ -310,6 +311,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                if (ctx->is_readahead) /* same as readahead_gfp_mask */
                        gfp |= __GFP_NORETRY | __GFP_NOWARN;
                ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
+               /*
+                * If the bio_alloc fails, try it again for a single page to
+                * avoid having to deal with partial page reads.  This emulates
+                * what do_mpage_readpage does.
+                */
+               if (!ctx->bio)
+                       ctx->bio = bio_alloc(orig_gfp, 1);
                ctx->bio->bi_opf = REQ_OP_READ;
                if (ctx->is_readahead)
                        ctx->bio->bi_opf |= REQ_RAHEAD;
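
The added fallback keeps the caller's original gfp mask around so that, when the opportunistic readahead-sized bio_alloc() (relaxed with __GFP_NORETRY | __GFP_NOWARN) fails, a one-vector bio can still be allocated and the read never has to complete a page partially. A hedged sketch of the same allocation shape, outside the iomap context:

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/gfp.h>

/* Sketch of the fallback: size the bio for readahead when possible, but
 * never fail outright while a single-vector bio is still obtainable.
 */
static struct bio *read_bio_alloc(gfp_t gfp, int nr_vecs, bool readahead)
{
	gfp_t orig_gfp = gfp;
	struct bio *bio;

	if (readahead)
		gfp |= __GFP_NORETRY | __GFP_NOWARN;
	bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs));
	if (!bio)
		bio = bio_alloc(orig_gfp, 1);	/* retry with the strict mask */
	return bio;
}
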
@@ -975,13 +983,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
        return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
 }
 
-static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
-               struct iomap *iomap)
-{
-       return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
-                       iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
-}
-
 static loff_t
 iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
                void *data, struct iomap *iomap, struct iomap *srcmap)
@@ -1001,7 +1002,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
                bytes = min_t(loff_t, PAGE_SIZE - offset, count);
 
                if (IS_DAX(inode))
-                       status = iomap_dax_zero(pos, offset, bytes, iomap);
+                       status = dax_iomap_zero(pos, offset, bytes, iomap);
                else
                        status = iomap_zero(inode, pos, offset, bytes, iomap,
                                        srcmap);
index 6902217..d1a0e2c 100644
@@ -476,7 +476,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
        err = ext_tree_remove(bl, true, 0, LLONG_MAX);
        WARN_ON(err);
 
-       kfree(bl);
+       kfree_rcu(bl, bl_layout.plh_rcu);
 }
 
 static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
index 5493502..6a20331 100644
@@ -127,7 +127,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp,
 #define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX  9
 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
-#define RCA4_TYPE_MASK_ALL 0xf31f
+#define PNFS_FF_RCA4_TYPE_MASK_READ 16
+#define PNFS_FF_RCA4_TYPE_MASK_RW 17
+#define RCA4_TYPE_MASK_ALL 0x3f31f
 
 struct cb_recallanyargs {
        uint32_t        craa_objs_to_keep;
index cd4c6bc..e61dbc9 100644
@@ -121,31 +121,31 @@ out:
  */
 static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
                const nfs4_stateid *stateid)
+       __must_hold(RCU)
 {
        struct nfs_server *server;
        struct inode *inode;
        struct pnfs_layout_hdr *lo;
 
+       rcu_read_lock();
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-               list_for_each_entry(lo, &server->layouts, plh_layouts) {
+               list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+                       if (!pnfs_layout_is_valid(lo))
+                               continue;
                        if (stateid != NULL &&
                            !nfs4_stateid_match_other(stateid, &lo->plh_stateid))
                                continue;
+                       if (!nfs_sb_active(server->super))
+                               continue;
                        inode = igrab(lo->plh_inode);
-                       if (!inode)
-                               return ERR_PTR(-EAGAIN);
-                       if (!nfs_sb_active(inode->i_sb)) {
-                               rcu_read_unlock();
-                               spin_unlock(&clp->cl_lock);
-                               iput(inode);
-                               spin_lock(&clp->cl_lock);
-                               rcu_read_lock();
-                               return ERR_PTR(-EAGAIN);
-                       }
-                       return inode;
+                       rcu_read_unlock();
+                       if (inode)
+                               return inode;
+                       nfs_sb_deactive(server->super);
+                       return ERR_PTR(-EAGAIN);
                }
        }
-
+       rcu_read_unlock();
        return ERR_PTR(-ENOENT);
 }
 
@@ -163,28 +163,25 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
        struct inode *inode;
        struct pnfs_layout_hdr *lo;
 
+       rcu_read_lock();
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-               list_for_each_entry(lo, &server->layouts, plh_layouts) {
+               list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
                        nfsi = NFS_I(lo->plh_inode);
                        if (nfs_compare_fh(fh, &nfsi->fh))
                                continue;
                        if (nfsi->layout != lo)
                                continue;
+                       if (!nfs_sb_active(server->super))
+                               continue;
                        inode = igrab(lo->plh_inode);
-                       if (!inode)
-                               return ERR_PTR(-EAGAIN);
-                       if (!nfs_sb_active(inode->i_sb)) {
-                               rcu_read_unlock();
-                               spin_unlock(&clp->cl_lock);
-                               iput(inode);
-                               spin_lock(&clp->cl_lock);
-                               rcu_read_lock();
-                               return ERR_PTR(-EAGAIN);
-                       }
-                       return inode;
+                       rcu_read_unlock();
+                       if (inode)
+                               return inode;
+                       nfs_sb_deactive(server->super);
+                       return ERR_PTR(-EAGAIN);
                }
        }
-
+       rcu_read_unlock();
        return ERR_PTR(-ENOENT);
 }
 
@@ -194,14 +191,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
 {
        struct inode *inode;
 
-       spin_lock(&clp->cl_lock);
-       rcu_read_lock();
        inode = nfs_layout_find_inode_by_stateid(clp, stateid);
        if (inode == ERR_PTR(-ENOENT))
                inode = nfs_layout_find_inode_by_fh(clp, fh);
-       rcu_read_unlock();
-       spin_unlock(&clp->cl_lock);
-
        return inode;
 }
 
@@ -280,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
                goto unlock;
        }
 
-       pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+       pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
        switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
                                &args->cbl_range,
                                be32_to_cpu(args->cbl_stateid.seqid))) {
@@ -605,6 +597,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
        struct cb_recallanyargs *args = argp;
        __be32 status;
        fmode_t flags = 0;
+       bool schedule_manager = false;
 
        status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
        if (!cps->clp) /* set in cb_sequence */
@@ -627,6 +620,18 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
 
        if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
                pnfs_recall_all_layouts(cps->clp);
+
+       if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
+               set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
+               schedule_manager = true;
+       }
+       if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) {
+               set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state);
+               schedule_manager = true;
+       }
+       if (schedule_manager)
+               nfs4_schedule_state_manager(cps->clp);
+
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
index 1865322..816e142 100644
@@ -378,6 +378,18 @@ nfs_inode_detach_delegation(struct inode *inode)
 }
 
 static void
+nfs_update_delegation_cred(struct nfs_delegation *delegation,
+               const struct cred *cred)
+{
+       const struct cred *old;
+
+       if (cred_fscmp(delegation->cred, cred) != 0) {
+               old = xchg(&delegation->cred, get_cred(cred));
+               put_cred(old);
+       }
+}
+
+static void
 nfs_update_inplace_delegation(struct nfs_delegation *delegation,
                const struct nfs_delegation *update)
 {
@@ -385,8 +397,14 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
                delegation->stateid.seqid = update->stateid.seqid;
                smp_wmb();
                delegation->type = update->type;
-               if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+               delegation->pagemod_limit = update->pagemod_limit;
+               if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+                       delegation->change_attr = update->change_attr;
+                       nfs_update_delegation_cred(delegation, update->cred);
+                       /* smp_mb__before_atomic() is implicit due to xchg() */
+                       clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
                        atomic_long_inc(&nfs_active_delegations);
+               }
        }
 }
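
nfs_update_delegation_cred() publishes the replacement cred with xchg(), so concurrent RCU readers always observe either the old or the new pointer, and only the displaced cred is released. The same lock-free pointer-swap idiom in isolation (a hypothetical helper, not taken from the patch):

#include <linux/atomic.h>
#include <linux/cred.h>

/* Hypothetical helper showing the idiom: swap a cred pointer atomically,
 * then drop the displaced reference once nothing can still pick it up.
 */
static void replace_cred(const struct cred **slot, const struct cred *new)
{
	const struct cred *old;

	old = xchg(slot, get_cred(new));
	put_cred(old);
}
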
 
@@ -545,21 +563,11 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
        return ret;
 }
 
-/**
- * nfs_client_return_marked_delegations - return previously marked delegations
- * @clp: nfs_client to process
- *
- * Note that this function is designed to be called by the state
- * manager thread. For this reason, it cannot flush the dirty data,
- * since that could deadlock in case of a state recovery error.
- *
- * Returns zero on success, or a negative errno value.
- */
-int nfs_client_return_marked_delegations(struct nfs_client *clp)
+static int nfs_server_return_marked_delegations(struct nfs_server *server,
+               void __always_unused *data)
 {
        struct nfs_delegation *delegation;
        struct nfs_delegation *prev;
-       struct nfs_server *server;
        struct inode *inode;
        struct inode *place_holder = NULL;
        struct nfs_delegation *place_holder_deleg = NULL;
@@ -569,78 +577,79 @@ restart:
        /*
         * To avoid quadratic looping we hold a reference
         * to an inode place_holder.  Each time we restart, we
-        * list nfs_servers from the server of that inode, and
-        * delegation in the server from the delegations of that
-        * inode.
+        * resume the walk of this server's delegations from
+        * that inode's delegation.
         * prev is an RCU-protected pointer to a delegation which
         * wasn't marked for return and might be a good choice for
         * the next place_holder.
         */
-       rcu_read_lock();
        prev = NULL;
+       delegation = NULL;
+       rcu_read_lock();
        if (place_holder)
-               server = NFS_SERVER(place_holder);
-       else
-               server = list_entry_rcu(clp->cl_superblocks.next,
-                                       struct nfs_server, client_link);
-       list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) {
-               delegation = NULL;
-               if (place_holder && server == NFS_SERVER(place_holder))
-                       delegation = rcu_dereference(NFS_I(place_holder)->delegation);
-               if (!delegation || delegation != place_holder_deleg)
-                       delegation = list_entry_rcu(server->delegations.next,
-                                                   struct nfs_delegation, super_list);
-               list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
-                       struct inode *to_put = NULL;
-
-                       if (!nfs_delegation_need_return(delegation)) {
+               delegation = rcu_dereference(NFS_I(place_holder)->delegation);
+       if (!delegation || delegation != place_holder_deleg)
+               delegation = list_entry_rcu(server->delegations.next,
+                                           struct nfs_delegation, super_list);
+       list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
+               struct inode *to_put = NULL;
+
+               if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags))
+                       continue;
+               if (!nfs_delegation_need_return(delegation)) {
+                       if (nfs4_is_valid_delegation(delegation, 0))
                                prev = delegation;
-                               continue;
-                       }
-                       if (!nfs_sb_active(server->super))
-                               break; /* continue in outer loop */
-
-                       if (prev) {
-                               struct inode *tmp;
-
-                               tmp = nfs_delegation_grab_inode(prev);
-                               if (tmp) {
-                                       to_put = place_holder;
-                                       place_holder = tmp;
-                                       place_holder_deleg = prev;
-                               }
-                       }
+                       continue;
+               }
 
-                       inode = nfs_delegation_grab_inode(delegation);
-                       if (inode == NULL) {
-                               rcu_read_unlock();
-                               if (to_put)
-                                       iput(to_put);
-                               nfs_sb_deactive(server->super);
-                               goto restart;
+               if (prev) {
+                       struct inode *tmp = nfs_delegation_grab_inode(prev);
+                       if (tmp) {
+                               to_put = place_holder;
+                               place_holder = tmp;
+                               place_holder_deleg = prev;
                        }
-                       delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+               }
+
+               inode = nfs_delegation_grab_inode(delegation);
+               if (inode == NULL) {
                        rcu_read_unlock();
+                       iput(to_put);
+                       goto restart;
+               }
+               delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+               rcu_read_unlock();
 
-                       if (to_put)
-                               iput(to_put);
+               iput(to_put);
 
-                       err = nfs_end_delegation_return(inode, delegation, 0);
-                       iput(inode);
-                       nfs_sb_deactive(server->super);
-                       cond_resched();
-                       if (!err)
-                               goto restart;
-                       set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
-                       if (place_holder)
-                               iput(place_holder);
-                       return err;
-               }
+               err = nfs_end_delegation_return(inode, delegation, 0);
+               iput(inode);
+               cond_resched();
+               if (!err)
+                       goto restart;
+               set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+               goto out;
        }
        rcu_read_unlock();
-       if (place_holder)
-               iput(place_holder);
-       return 0;
+out:
+       iput(place_holder);
+       return err;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+       return nfs_client_for_each_server(clp,
+                       nfs_server_return_marked_delegations, NULL);
 }
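
The per-client loop over cl_superblocks is gone from this function; the iteration now lives in nfs_client_for_each_server(), a helper introduced elsewhere in this series and not shown in this diff. A plausible sketch of such an iterator, assuming the declarations from fs/nfs/internal.h (the real definition may differ):

/* Assumed shape of the iterator used above: walk every nfs_server of a
 * client under RCU, pinning each superblock while the callback runs.
 */
static int nfs_client_for_each_server_sketch(struct nfs_client *clp,
		int (*fn)(struct nfs_server *, void *), void *data)
{
	struct nfs_server *server, *last = NULL;
	int ret = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (!nfs_sb_active(server->super))
			continue;
		rcu_read_unlock();
		if (last)
			nfs_sb_deactive(last->super);
		last = server;
		ret = fn(server, data);
		if (ret)
			goto out;
		rcu_read_lock();
	}
	rcu_read_unlock();
out:
	if (last)
		nfs_sb_deactive(last->super);
	return ret;
}

Each server is pinned with nfs_sb_active() before the RCU lock is dropped and stays pinned until the next element is reached, so callbacks such as nfs_server_return_marked_delegations() may sleep while the list cursor remains valid.
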
 
 /**
@@ -1083,53 +1092,51 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
        rcu_read_unlock();
 }
 
-/**
- * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
- * @clp: nfs_client to process
- *
- */
-void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
+               void __always_unused *data)
 {
        struct nfs_delegation *delegation;
-       struct nfs_server *server;
        struct inode *inode;
-
 restart:
        rcu_read_lock();
-       list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-               list_for_each_entry_rcu(delegation, &server->delegations,
-                                                               super_list) {
-                       if (test_bit(NFS_DELEGATION_INODE_FREEING,
-                                               &delegation->flags) ||
-                           test_bit(NFS_DELEGATION_RETURNING,
-                                               &delegation->flags) ||
-                           test_bit(NFS_DELEGATION_NEED_RECLAIM,
-                                               &delegation->flags) == 0)
-                               continue;
-                       if (!nfs_sb_active(server->super))
-                               break; /* continue in outer loop */
-                       inode = nfs_delegation_grab_inode(delegation);
-                       if (inode == NULL) {
-                               rcu_read_unlock();
-                               nfs_sb_deactive(server->super);
-                               goto restart;
-                       }
-                       delegation = nfs_start_delegation_return_locked(NFS_I(inode));
-                       rcu_read_unlock();
-                       if (delegation != NULL) {
-                               if (nfs_detach_delegation(NFS_I(inode), delegation,
-                                                       server) != NULL)
-                                       nfs_free_delegation(delegation);
-                               /* Match nfs_start_delegation_return_locked */
-                               nfs_put_delegation(delegation);
-                       }
-                       iput(inode);
-                       nfs_sb_deactive(server->super);
-                       cond_resched();
-                       goto restart;
+restart_locked:
+       list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+               if (test_bit(NFS_DELEGATION_INODE_FREEING,
+                                       &delegation->flags) ||
+                   test_bit(NFS_DELEGATION_RETURNING,
+                                       &delegation->flags) ||
+                   test_bit(NFS_DELEGATION_NEED_RECLAIM,
+                                       &delegation->flags) == 0)
+                       continue;
+               inode = nfs_delegation_grab_inode(delegation);
+               if (inode == NULL)
+                       goto restart_locked;
+               delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+               rcu_read_unlock();
+               if (delegation != NULL) {
+                       if (nfs_detach_delegation(NFS_I(inode), delegation,
+                                               server) != NULL)
+                               nfs_free_delegation(delegation);
+                       /* Match nfs_start_delegation_return_locked */
+                       nfs_put_delegation(delegation);
                }
+               iput(inode);
+               cond_resched();
+               goto restart;
        }
        rcu_read_unlock();
+       return 0;
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+       nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations,
+                       NULL);
 }
 
 static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
@@ -1215,62 +1222,61 @@ nfs_delegation_test_free_expired(struct inode *inode,
                nfs_remove_bad_delegation(inode, stateid);
 }
 
-/**
- * nfs_reap_expired_delegations - reap expired delegations
- * @clp: nfs_client to process
- *
- * Iterates through all the delegations associated with this server and
- * checks if they have may have been revoked. This function is usually
- * expected to be called in cases where the server may have lost its
- * lease.
- */
-void nfs_reap_expired_delegations(struct nfs_client *clp)
+static int nfs_server_reap_expired_delegations(struct nfs_server *server,
+               void __always_unused *data)
 {
        struct nfs_delegation *delegation;
-       struct nfs_server *server;
        struct inode *inode;
        const struct cred *cred;
        nfs4_stateid stateid;
-
 restart:
        rcu_read_lock();
-       list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-               list_for_each_entry_rcu(delegation, &server->delegations,
-                                                               super_list) {
-                       if (test_bit(NFS_DELEGATION_INODE_FREEING,
-                                               &delegation->flags) ||
-                           test_bit(NFS_DELEGATION_RETURNING,
-                                               &delegation->flags) ||
-                           test_bit(NFS_DELEGATION_TEST_EXPIRED,
-                                               &delegation->flags) == 0)
-                               continue;
-                       if (!nfs_sb_active(server->super))
-                               break; /* continue in outer loop */
-                       inode = nfs_delegation_grab_inode(delegation);
-                       if (inode == NULL) {
-                               rcu_read_unlock();
-                               nfs_sb_deactive(server->super);
-                               goto restart;
-                       }
-                       cred = get_cred_rcu(delegation->cred);
-                       nfs4_stateid_copy(&stateid, &delegation->stateid);
-                       clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
-                       rcu_read_unlock();
-                       nfs_delegation_test_free_expired(inode, &stateid, cred);
-                       put_cred(cred);
-                       if (nfs4_server_rebooted(clp)) {
-                               nfs_inode_mark_test_expired_delegation(server,inode);
-                               iput(inode);
-                               nfs_sb_deactive(server->super);
-                               return;
-                       }
+restart_locked:
+       list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+               if (test_bit(NFS_DELEGATION_INODE_FREEING,
+                                       &delegation->flags) ||
+                   test_bit(NFS_DELEGATION_RETURNING,
+                                       &delegation->flags) ||
+                   test_bit(NFS_DELEGATION_TEST_EXPIRED,
+                                       &delegation->flags) == 0)
+                       continue;
+               inode = nfs_delegation_grab_inode(delegation);
+               if (inode == NULL)
+                       goto restart_locked;
+               spin_lock(&delegation->lock);
+               cred = get_cred_rcu(delegation->cred);
+               nfs4_stateid_copy(&stateid, &delegation->stateid);
+               spin_unlock(&delegation->lock);
+               clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+               rcu_read_unlock();
+               nfs_delegation_test_free_expired(inode, &stateid, cred);
+               put_cred(cred);
+               if (!nfs4_server_rebooted(server->nfs_client)) {
                        iput(inode);
-                       nfs_sb_deactive(server->super);
                        cond_resched();
                        goto restart;
                }
+               nfs_inode_mark_test_expired_delegation(server, inode);
+               iput(inode);
+               return -EAGAIN;
        }
        rcu_read_unlock();
+       return 0;
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+       nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations,
+                       NULL);
 }
 
 void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
@@ -1359,11 +1365,14 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_delegation *delegation;
-       bool ret;
+       bool ret = false;
 
        flags &= FMODE_READ|FMODE_WRITE;
        rcu_read_lock();
        delegation = rcu_dereference(nfsi->delegation);
+       if (!delegation)
+               goto out;
+       spin_lock(&delegation->lock);
        ret = nfs4_is_valid_delegation(delegation, flags);
        if (ret) {
                nfs4_stateid_copy(dst, &delegation->stateid);
@@ -1371,6 +1380,8 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
                if (cred)
                        *cred = get_cred(delegation->cred);
        }
+       spin_unlock(&delegation->lock);
+out:
        rcu_read_unlock();
        return ret;
 }
index d4b839b..5a331da 100644
@@ -141,10 +141,9 @@ struct nfs_cache_array {
        int size;
        int eof_index;
        u64 last_cookie;
-       struct nfs_cache_array_entry array[0];
+       struct nfs_cache_array_entry array[];
 };
 
-typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
 typedef struct {
        struct file     *file;
        struct page     *page;
@@ -153,7 +152,7 @@ typedef struct {
        u64             *dir_cookie;
        u64             last_cookie;
        loff_t          current_index;
-       decode_dirent_t decode;
+       loff_t          prev_index;
 
        unsigned long   dir_verifier;
        unsigned long   timestamp;
@@ -240,6 +239,25 @@ out:
        return ret;
 }
 
+static inline
+int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+       return in_compat_syscall();
+#else
+       return (BITS_PER_LONG == 32);
+#endif
+}
+
+static
+bool nfs_readdir_use_cookie(const struct file *filp)
+{
+       if ((filp->f_mode & FMODE_32BITHASH) ||
+           (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+               return false;
+       return true;
+}
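
nfs_readdir_use_cookie() decides whether f_pos can carry the raw 64-bit NFS directory cookie (the default) or must stay a small synthetic index, which is required for compat tasks caught by is_32bit_api() and for openers that set FMODE_32BITHASH. The rest of the patch applies that decision wherever the position is advanced; roughly, and purely as illustration:

#include <linux/fs.h>

/* Illustrative only: advance the directory position after emitting an
 * entry, exposing the raw server cookie when the opener can take 64 bits.
 */
static void advance_dir_pos(struct file *file, struct dir_context *ctx,
			    u64 next_cookie)
{
	if (nfs_readdir_use_cookie(file))
		ctx->pos = next_cookie;	/* f_pos is the NFS cookie itself */
	else
		ctx->pos++;		/* f_pos is a dense index */
}
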
+
 static
 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
 {
@@ -289,7 +307,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
                            !nfs_readdir_inode_mapping_valid(nfsi)) {
                                ctx->duped = 0;
                                ctx->attr_gencount = nfsi->attr_gencount;
-                       } else if (new_pos < desc->ctx->pos) {
+                       } else if (new_pos < desc->prev_index) {
                                if (ctx->duped > 0
                                    && ctx->dup_cookie == *desc->dir_cookie) {
                                        if (printk_ratelimit()) {
@@ -305,7 +323,11 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
                                ctx->dup_cookie = *desc->dir_cookie;
                                ctx->duped = -1;
                        }
-                       desc->ctx->pos = new_pos;
+                       if (nfs_readdir_use_cookie(desc->file))
+                               desc->ctx->pos = *desc->dir_cookie;
+                       else
+                               desc->ctx->pos = new_pos;
+                       desc->prev_index = new_pos;
                        desc->cache_entry_index = i;
                        return 0;
                }
@@ -376,9 +398,10 @@ error:
 static int xdr_decode(nfs_readdir_descriptor_t *desc,
                      struct nfs_entry *entry, struct xdr_stream *xdr)
 {
+       struct inode *inode = file_inode(desc->file);
        int error;
 
-       error = desc->decode(xdr, entry, desc->plus);
+       error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus);
        if (error)
                return error;
        entry->fattr->time_start = desc->timestamp;
@@ -756,6 +779,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 
        if (desc->page_index == 0) {
                desc->current_index = 0;
+               desc->prev_index = 0;
                desc->last_cookie = 0;
        }
        do {
@@ -786,11 +810,14 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
                        desc->eof = true;
                        break;
                }
-               desc->ctx->pos++;
                if (i < (array->size-1))
                        *desc->dir_cookie = array->array[i+1].cookie;
                else
                        *desc->dir_cookie = array->last_cookie;
+               if (nfs_readdir_use_cookie(file))
+                       desc->ctx->pos = *desc->dir_cookie;
+               else
+                       desc->ctx->pos++;
                if (ctx->duped != 0)
                        ctx->duped = 1;
        }
@@ -860,9 +887,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 {
        struct dentry   *dentry = file_dentry(file);
        struct inode    *inode = d_inode(dentry);
-       nfs_readdir_descriptor_t my_desc,
-                       *desc = &my_desc;
        struct nfs_open_dir_context *dir_ctx = file->private_data;
+       nfs_readdir_descriptor_t my_desc = {
+               .file = file,
+               .ctx = ctx,
+               .dir_cookie = &dir_ctx->dir_cookie,
+               .plus = nfs_use_readdirplus(inode, ctx),
+       },
+                       *desc = &my_desc;
        int res = 0;
 
        dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -875,14 +907,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
         * to either find the entry with the appropriate number or
         * revalidate the cookie.
         */
-       memset(desc, 0, sizeof(*desc));
-
-       desc->file = file;
-       desc->ctx = ctx;
-       desc->dir_cookie = &dir_ctx->dir_cookie;
-       desc->decode = NFS_PROTO(inode)->decode_dirent;
-       desc->plus = nfs_use_readdirplus(inode, ctx);
-
        if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
                res = nfs_revalidate_mapping(inode, file->f_mapping);
        if (res < 0)
@@ -954,7 +978,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
        }
        if (offset != filp->f_pos) {
                filp->f_pos = offset;
-               dir_ctx->dir_cookie = 0;
+               if (nfs_readdir_use_cookie(filp))
+                       dir_ctx->dir_cookie = offset;
+               else
+                       dir_ctx->dir_cookie = 0;
                dir_ctx->duped = 0;
        }
        inode_unlock(inode);
@@ -2282,7 +2309,7 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
 static LIST_HEAD(nfs_access_lru_list);
 static atomic_long_t nfs_access_nr_entries;
 
-static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+static unsigned long nfs_access_max_cachesize = 4*1024*1024;
 module_param(nfs_access_max_cachesize, ulong, 0644);
 MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
 
@@ -2642,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
        status = NFS_PROTO(inode)->access(inode, &cache);
        if (status != 0) {
                if (status == -ESTALE) {
-                       nfs_zap_caches(inode);
                        if (!S_ISDIR(inode->i_mode))
-                               set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+                               nfs_set_inode_stale(inode);
+                       else
+                               nfs_zap_caches(inode);
                }
                goto out;
        }
@@ -2732,14 +2760,7 @@ force_lookup:
        if (!NFS_PROTO(inode)->access)
                goto out_notsup;
 
-       /* Always try fast lookups first */
-       rcu_read_lock();
-       res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
-       rcu_read_unlock();
-       if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
-               /* Fast lookup failed, try the slow way */
-               res = nfs_do_access(inode, cred, mask);
-       }
+       res = nfs_do_access(inode, cred, mask);
 out:
        if (!res && (mask & MAY_EXEC))
                res = nfs_execute_ok(inode, mask);
index b768a0b..a57e7c7 100644
@@ -94,7 +94,7 @@ struct nfs_direct_req {
 #define NFS_ODIRECT_RESCHED_WRITES     (2)     /* write verification failed */
        /* for read */
 #define NFS_ODIRECT_SHOULD_DIRTY       (3)     /* dirty user-space page after read */
-       struct nfs_writeverf    verf;           /* unstable write verifier */
+#define NFS_ODIRECT_DONE               INT_MAX /* fatal error, stop rescheduling writes */
 };
 
 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
@@ -151,106 +151,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
                dreq->count = dreq_len;
 }
 
-/*
- * nfs_direct_select_verf - select the right verifier
- * @dreq - direct request possibly spanning multiple servers
- * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
- * @commit_idx - commit bucket index for the DS
- *
- * returns the correct verifier to use given the role of the server
- */
-static struct nfs_writeverf *
-nfs_direct_select_verf(struct nfs_direct_req *dreq,
-                      struct nfs_client *ds_clp,
-                      int commit_idx)
-{
-       struct nfs_writeverf *verfp = &dreq->verf;
-
-#ifdef CONFIG_NFS_V4_1
-       /*
-        * pNFS is in use, use the DS verf except commit_through_mds is set
-        * for layout segment where nbuckets is zero.
-        */
-       if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
-               if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
-                       verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
-               else
-                       WARN_ON_ONCE(1);
-       }
-#endif
-       return verfp;
-}
-
-
-/*
- * nfs_direct_set_hdr_verf - set the write/commit verifier
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verfs
- *
- * Set the server's (MDS or DS) "seen" verifier
- */
-static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
-                                   struct nfs_pgio_header *hdr)
-{
-       struct nfs_writeverf *verfp;
-
-       verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
-       WARN_ON_ONCE(verfp->committed >= 0);
-       memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
-       WARN_ON_ONCE(verfp->committed < 0);
-}
-
-static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
-               const struct nfs_writeverf *v2)
-{
-       return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
-}
-
-/*
- * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verf
- *
- * set the server's "seen" verf if not initialized.
- * returns result of comparison between @hdr->verf and the "seen"
- * verf of the server used by @hdr (DS or MDS)
- */
-static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
-                                         struct nfs_pgio_header *hdr)
-{
-       struct nfs_writeverf *verfp;
-
-       verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
-       if (verfp->committed < 0) {
-               nfs_direct_set_hdr_verf(dreq, hdr);
-               return 0;
-       }
-       return nfs_direct_cmp_verf(verfp, &hdr->verf);
-}
-
-/*
- * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
- * @dreq - direct request possibly spanning multiple servers
- * @data - commit data to validate against previously seen verf
- *
- * returns result of comparison between @data->verf and the verf of
- * the server used by @data (DS or MDS)
- */
-static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
-                                          struct nfs_commit_data *data)
-{
-       struct nfs_writeverf *verfp;
-
-       verfp = nfs_direct_select_verf(dreq, data->ds_clp,
-                                        data->ds_commit_index);
-
-       /* verifier not set so always fail */
-       if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE)
-               return 1;
-
-       return nfs_direct_cmp_verf(verfp, data->res.verf);
-}
-
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @iocb: target I/O control block
@@ -305,7 +205,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
        kref_get(&dreq->kref);
        init_completion(&dreq->completion);
        INIT_LIST_HEAD(&dreq->mds_cinfo.list);
-       dreq->verf.committed = NFS_INVALID_STABLE_HOW;  /* not set yet */
+       pnfs_init_ds_commit_info(&dreq->ds_cinfo);
        INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
        spin_lock_init(&dreq->lock);
 
@@ -316,7 +216,7 @@ static void nfs_direct_req_free(struct kref *kref)
 {
        struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
-       nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
+       pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
        if (dreq->l_ctx != NULL)
                nfs_put_lock_context(dreq->l_ctx);
        if (dreq->ctx != NULL)
@@ -571,6 +471,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
        l_ctx = nfs_get_lock_context(dreq->ctx);
        if (IS_ERR(l_ctx)) {
                result = PTR_ERR(l_ctx);
+               nfs_direct_req_release(dreq);
                goto out_release;
        }
        dreq->l_ctx = l_ctx;
@@ -605,15 +506,30 @@ out:
 }
 
 static void
+nfs_direct_join_group(struct list_head *list, struct inode *inode)
+{
+       struct nfs_page *req, *next;
+
+       list_for_each_entry(req, list, wb_list) {
+               if (req->wb_head != req || req->wb_this_page == req)
+                       continue;
+               for (next = req->wb_this_page;
+                               next != req->wb_head;
+                               next = next->wb_this_page) {
+                       nfs_list_remove_request(next);
+                       nfs_release_request(next);
+               }
+               nfs_join_page_group(req, inode);
+       }
+}
+
+static void
 nfs_direct_write_scan_commit_list(struct inode *inode,
                                  struct list_head *list,
                                  struct nfs_commit_info *cinfo)
 {
        mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-#ifdef CONFIG_NFS_V4_1
-       if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
-               NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
-#endif
+       pnfs_recover_commit_reqs(list, cinfo);
        nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
        mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 }
@@ -629,11 +545,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
        nfs_init_cinfo_from_dreq(&cinfo, dreq);
        nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 
+       nfs_direct_join_group(&reqs, dreq->inode);
+
        dreq->count = 0;
        dreq->max_count = 0;
        list_for_each_entry(req, &reqs, wb_list)
                dreq->max_count += req->wb_bytes;
-       dreq->verf.committed = NFS_INVALID_STABLE_HOW;
        nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
        get_dreq(dreq);
 
@@ -670,27 +587,35 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 {
+       const struct nfs_writeverf *verf = data->res.verf;
        struct nfs_direct_req *dreq = data->dreq;
        struct nfs_commit_info cinfo;
        struct nfs_page *req;
        int status = data->task.tk_status;
 
+       if (status < 0) {
+               /* Errors in commit are fatal */
+               dreq->error = status;
+               dreq->max_count = 0;
+               dreq->count = 0;
+               dreq->flags = NFS_ODIRECT_DONE;
+       } else if (dreq->flags == NFS_ODIRECT_DONE)
+               status = dreq->error;
+
        nfs_init_cinfo_from_dreq(&cinfo, dreq);
-       if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
-               dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 
        while (!list_empty(&data->pages)) {
                req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
-               if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+               if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+                       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
                        /*
                         * Despite the reboot, the write was successful,
                         * so reset wb_nio.
                         */
                        req->wb_nio = 0;
-                       /* Note the rewrite will go through mds */
                        nfs_mark_request_commit(req, NULL, &cinfo, 0);
-               } else
+               } else /* Error or match */
                        nfs_release_request(req);
                nfs_unlock_and_release_request(req);
        }
@@ -705,7 +630,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
        struct nfs_direct_req *dreq = cinfo->dreq;
 
        spin_lock(&dreq->lock);
-       dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+       if (dreq->flags != NFS_ODIRECT_DONE)
+               dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
        spin_unlock(&dreq->lock);
        nfs_mark_request_commit(req, NULL, cinfo, 0);
 }
@@ -728,6 +654,23 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
                nfs_direct_write_reschedule(dreq);
 }
 
+static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
+{
+       struct nfs_commit_info cinfo;
+       struct nfs_page *req;
+       LIST_HEAD(reqs);
+
+       nfs_init_cinfo_from_dreq(&cinfo, dreq);
+       nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+       while (!list_empty(&reqs)) {
+               req = nfs_list_entry(reqs.next);
+               nfs_list_remove_request(req);
+               nfs_release_request(req);
+               nfs_unlock_and_release_request(req);
+       }
+}
+
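
With the per-bucket verifier tracking removed, commit completion now checks each request individually against the verifier in the commit reply; the patch uses nfs_write_match_verf() for this, a helper added elsewhere in the series and not shown here. Its assumed shape (field names such as wb_verf are taken on trust from the rest of the series):

/* Assumed shape of the helper used in nfs_direct_commit_complete() above:
 * true only when the commit reply reports stable data and carries the
 * same write verifier the request was written under, i.e. the server
 * did not reboot in between.
 */
static bool nfs_write_match_verf_sketch(const struct nfs_writeverf *verf,
					struct nfs_page *req)
{
	return verf->committed > NFS_UNSTABLE &&
	       !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
}
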
 static void nfs_direct_write_schedule_work(struct work_struct *work)
 {
        struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
@@ -742,6 +685,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
                        nfs_direct_write_reschedule(dreq);
                        break;
                default:
+                       nfs_direct_write_clear_reqs(dreq);
                        nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
                        nfs_direct_complete(dreq);
        }
@@ -768,20 +712,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
        }
 
        nfs_direct_count_bytes(dreq, hdr);
-       if (hdr->good_bytes != 0) {
-               if (nfs_write_need_commit(hdr)) {
-                       if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
-                               request_commit = true;
-                       else if (dreq->flags == 0) {
-                               nfs_direct_set_hdr_verf(dreq, hdr);
-                               request_commit = true;
-                               dreq->flags = NFS_ODIRECT_DO_COMMIT;
-                       } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
-                               request_commit = true;
-                               if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
-                                       dreq->flags =
-                                               NFS_ODIRECT_RESCHED_WRITES;
-                       }
+       if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+               switch (dreq->flags) {
+               case 0:
+                       dreq->flags = NFS_ODIRECT_DO_COMMIT;
+                       request_commit = true;
+                       break;
+               case NFS_ODIRECT_RESCHED_WRITES:
+               case NFS_ODIRECT_DO_COMMIT:
+                       request_commit = true;
                }
        }
        spin_unlock(&dreq->lock);
@@ -990,11 +929,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
        l_ctx = nfs_get_lock_context(dreq->ctx);
        if (IS_ERR(l_ctx)) {
                result = PTR_ERR(l_ctx);
+               nfs_direct_req_release(dreq);
                goto out_release;
        }
        dreq->l_ctx = l_ctx;
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
+       pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
 
        nfs_start_io_direct(inode);
 
index c9b605f..a13e690 100644
@@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
 MODULE_DESCRIPTION("The NFSv4 file layout driver");
 
 #define FILELAYOUT_POLL_RETRY_MAX     (15*HZ)
+static const struct pnfs_commit_ops filelayout_commit_ops;
 
 static loff_t
 filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
@@ -750,72 +751,17 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
        /* This assumes a single RW lseg */
        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_filelayout *flo;
+               struct inode *inode;
 
                flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
-               flo->commit_info.nbuckets = 0;
-               kfree(flo->commit_info.buckets);
-               flo->commit_info.buckets = NULL;
+               inode = flo->generic_hdr.plh_inode;
+               spin_lock(&inode->i_lock);
+               pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg);
+               spin_unlock(&inode->i_lock);
        }
        _filelayout_free_lseg(fl);
 }
 
-static int
-filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
-                            struct nfs_commit_info *cinfo,
-                            gfp_t gfp_flags)
-{
-       struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
-       struct pnfs_commit_bucket *buckets;
-       int size, i;
-
-       if (fl->commit_through_mds)
-               return 0;
-
-       size = (fl->stripe_type == STRIPE_SPARSE) ?
-               fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
-       if (cinfo->ds->nbuckets >= size) {
-               /* This assumes there is only one IOMODE_RW lseg.  What
-                * we really want to do is have a layout_hdr level
-                * dictionary of <multipath_list4, fh> keys, each
-                * associated with a struct list_head, populated by calls
-                * to filelayout_write_pagelist().
-                * */
-               return 0;
-       }
-
-       buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
-                         gfp_flags);
-       if (!buckets)
-               return -ENOMEM;
-       for (i = 0; i < size; i++) {
-               INIT_LIST_HEAD(&buckets[i].written);
-               INIT_LIST_HEAD(&buckets[i].committing);
-               /* mark direct verifier as unset */
-               buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
-       }
-
-       spin_lock(&cinfo->inode->i_lock);
-       if (cinfo->ds->nbuckets >= size)
-               goto out;
-       for (i = 0; i < cinfo->ds->nbuckets; i++) {
-               list_splice(&cinfo->ds->buckets[i].written,
-                           &buckets[i].written);
-               list_splice(&cinfo->ds->buckets[i].committing,
-                           &buckets[i].committing);
-               buckets[i].direct_verf.committed =
-                       cinfo->ds->buckets[i].direct_verf.committed;
-               buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
-               buckets[i].clseg = cinfo->ds->buckets[i].clseg;
-       }
-       swap(cinfo->ds->buckets, buckets);
-       cinfo->ds->nbuckets = size;
-out:
-       spin_unlock(&cinfo->inode->i_lock);
-       kfree(buckets);
-       return 0;
-}
-
 static struct pnfs_layout_segment *
 filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
                      struct nfs4_layoutget_res *lgr,
@@ -938,9 +884,6 @@ static void
 filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                         struct nfs_page *req)
 {
-       struct nfs_commit_info cinfo;
-       int status;
-
        pnfs_generic_pg_check_layout(pgio);
        if (!pgio->pg_lseg) {
                pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
@@ -959,17 +902,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
-               goto out_mds;
-       nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
-       status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
-       if (status < 0) {
-               pnfs_put_lseg(pgio->pg_lseg);
-               pgio->pg_lseg = NULL;
-               goto out_mds;
-       }
-       return;
-out_mds:
-       nfs_pageio_reset_write_mds(pgio);
+               nfs_pageio_reset_write_mds(pgio);
 }
 
 static const struct nfs_pageio_ops filelayout_pg_read_ops = {
@@ -1078,36 +1011,6 @@ out_err:
        return -EAGAIN;
 }
 
-/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
- *                                for @page
- * @cinfo - commit info for current inode
- * @page - page to search for matching head request
- *
- * Returns a the head request if one is found, otherwise returns NULL.
- */
-static struct nfs_page *
-filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
-{
-       struct nfs_page *freq, *t;
-       struct pnfs_commit_bucket *b;
-       int i;
-
-       /* Linearly search the commit lists for each bucket until a matching
-        * request is found */
-       for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
-               list_for_each_entry_safe(freq, t, &b->written, wb_list) {
-                       if (freq->wb_page == page)
-                               return freq->wb_head;
-               }
-               list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
-                       if (freq->wb_page == page)
-                               return freq->wb_head;
-               }
-       }
-
-       return NULL;
-}
-
 static int
 filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                           int how, struct nfs_commit_info *cinfo)
@@ -1140,13 +1043,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
        struct nfs4_filelayout *flo;
 
        flo = kzalloc(sizeof(*flo), gfp_flags);
-       return flo != NULL ? &flo->generic_hdr : NULL;
+       if (flo == NULL)
+               return NULL;
+       pnfs_init_ds_commit_info(&flo->commit_info);
+       flo->commit_info.ops = &filelayout_commit_ops;
+       return &flo->generic_hdr;
 }
 
 static void
 filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
-       kfree(FILELAYOUT_FROM_HDR(lo));
+       kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu);
 }
 
 static struct pnfs_ds_commit_info *
@@ -1160,6 +1067,46 @@ filelayout_get_ds_info(struct inode *inode)
                return &FILELAYOUT_FROM_HDR(layout)->commit_info;
 }
 
+static void
+filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_layout_segment *lseg)
+{
+       struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+       struct inode *inode = lseg->pls_layout->plh_inode;
+       struct pnfs_commit_array *array, *new;
+       unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
+               fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+       new = pnfs_alloc_commit_array(size, GFP_NOIO);
+       if (new) {
+               spin_lock(&inode->i_lock);
+               array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+               spin_unlock(&inode->i_lock);
+               if (array != new)
+                       pnfs_free_commit_array(new);
+       }
+}
+
+static void
+filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+               struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+       spin_unlock(&inode->i_lock);
+}
+
+static const struct pnfs_commit_ops filelayout_commit_ops = {
+       .setup_ds_info          = filelayout_setup_ds_info,
+       .release_ds_info        = filelayout_release_ds_info,
+       .mark_request_commit    = filelayout_mark_request_commit,
+       .clear_request_commit   = pnfs_generic_clear_request_commit,
+       .scan_commit_lists      = pnfs_generic_scan_commit_lists,
+       .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
+       .search_commit_reqs     = pnfs_generic_search_commit_reqs,
+       .commit_pagelist        = filelayout_commit_pagelist,
+};
+
 static struct pnfs_layoutdriver_type filelayout_type = {
        .id                     = LAYOUT_NFSV4_1_FILES,
        .name                   = "LAYOUT_NFSV4_1_FILES",
@@ -1173,12 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .pg_read_ops            = &filelayout_pg_read_ops,
        .pg_write_ops           = &filelayout_pg_write_ops,
        .get_ds_info            = &filelayout_get_ds_info,
-       .mark_request_commit    = filelayout_mark_request_commit,
-       .clear_request_commit   = pnfs_generic_clear_request_commit,
-       .scan_commit_lists      = pnfs_generic_scan_commit_lists,
-       .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
-       .search_commit_reqs     = filelayout_search_commit_reqs,
-       .commit_pagelist        = filelayout_commit_pagelist,
        .read_pagelist          = filelayout_read_pagelist,
        .write_pagelist         = filelayout_write_pagelist,
        .alloc_deviceid_node    = filelayout_alloc_deviceid_node,
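
The new filelayout_setup_ds_info() above follows a common publish-under-lock pattern: allocate a candidate commit array without holding the inode lock, attach it under i_lock via pnfs_add_commit_array(), and free your copy if another caller attached one first. Below is a stand-alone user-space sketch of that pattern (not part of this patch); a pthread mutex stands in for i_lock and all names are hypothetical.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct commit_array {
        unsigned int nbuckets;
};

static struct commit_array *published;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocate outside the lock, publish under it, discard the losing copy. */
static struct commit_array *setup_commit_array(unsigned int nbuckets)
{
        struct commit_array *new = calloc(1, sizeof(*new));
        struct commit_array *cur;

        if (!new)
                return NULL;
        new->nbuckets = nbuckets;

        pthread_mutex_lock(&lock);
        if (!published)
                published = new;        /* we won the race: publish our copy */
        cur = published;
        pthread_mutex_unlock(&lock);

        if (cur != new)
                free(new);              /* someone else got there first */
        return cur;
}

int main(void)
{
        struct commit_array *a = setup_commit_array(4);
        struct commit_array *b = setup_commit_array(8);

        if (a && b)
                printf("nbuckets=%u, same array: %d\n", a->nbuckets, a == b);
        free(published);
        return 0;
}
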
index bb9148b..7d399f7 100644 (file)
@@ -32,6 +32,7 @@
 
 static unsigned short io_maxretrans;
 
+static const struct pnfs_commit_ops ff_layout_commit_ops;
 static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
                struct nfs_pgio_header *hdr);
 static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
@@ -48,9 +49,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 
        ffl = kzalloc(sizeof(*ffl), gfp_flags);
        if (ffl) {
+               pnfs_init_ds_commit_info(&ffl->commit_info);
                INIT_LIST_HEAD(&ffl->error_list);
                INIT_LIST_HEAD(&ffl->mirrors);
                ffl->last_report_time = ktime_get();
+               ffl->commit_info.ops = &ff_layout_commit_ops;
                return &ffl->generic_hdr;
        } else
                return NULL;
@@ -59,14 +62,14 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
 static void
 ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
 {
+       struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
        struct nfs4_ff_layout_ds_err *err, *n;
 
-       list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
-                                list) {
+       list_for_each_entry_safe(err, n, &ffl->error_list, list) {
                list_del(&err->list);
                kfree(err);
        }
-       kfree(FF_LAYOUT_FROM_HDR(lo));
+       kfree_rcu(ffl, generic_hdr.plh_rcu);
 }
 
 static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
@@ -248,36 +251,10 @@ static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
 
 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 {
-       int i;
-
-       if (fls->mirror_array) {
-               for (i = 0; i < fls->mirror_array_cnt; i++) {
-                       /* normally mirror_ds is freed in
-                        * .free_deviceid_node but we still do it here
-                        * for .alloc_lseg error path */
-                       ff_layout_put_mirror(fls->mirror_array[i]);
-               }
-               kfree(fls->mirror_array);
-               fls->mirror_array = NULL;
-       }
-}
-
-static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
-{
-       int ret = 0;
+       u32 i;
 
-       dprintk("--> %s\n", __func__);
-
-       /* FIXME: remove this check when layout segment support is added */
-       if (lgr->range.offset != 0 ||
-           lgr->range.length != NFS4_MAX_UINT64) {
-               dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
-                       __func__);
-               ret = -EINVAL;
-       }
-
-       dprintk("--> %s returns %d\n", __func__, ret);
-       return ret;
+       for (i = 0; i < fls->mirror_array_cnt; i++)
+               ff_layout_put_mirror(fls->mirror_array[i]);
 }
 
 static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
@@ -289,6 +266,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 }
 
 static bool
+ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
+               struct pnfs_layout_segment *l2)
+{
+       const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
+       const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
+       u32 i;
+
+       if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
+               return false;
+       for (i = 0; i < fl1->mirror_array_cnt; i++) {
+               if (fl1->mirror_array[i] != fl2->mirror_array[i])
+                       return false;
+       }
+       return true;
+}
+
+static bool
 ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
                const struct pnfs_layout_range *l2)
 {
@@ -323,6 +317,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
                        new->pls_range.length);
        if (new_end < old->pls_range.offset)
                return false;
+       if (!ff_lseg_match_mirrors(new, old))
+               return false;
 
        /* Mergeable: copy info from 'old' to 'new' */
        if (new_end < old_end)
@@ -400,16 +396,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                goto out_err_free;
 
        rc = -ENOMEM;
-       fls = kzalloc(sizeof(*fls), gfp_flags);
+       fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
+                       gfp_flags);
        if (!fls)
                goto out_err_free;
 
        fls->mirror_array_cnt = mirror_array_cnt;
        fls->stripe_unit = stripe_unit;
-       fls->mirror_array = kcalloc(fls->mirror_array_cnt,
-                                   sizeof(fls->mirror_array[0]), gfp_flags);
-       if (fls->mirror_array == NULL)
-               goto out_err_free;
 
        for (i = 0; i < fls->mirror_array_cnt; i++) {
                struct nfs4_ff_layout_mirror *mirror;
@@ -545,9 +538,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 
 out_sort_mirrors:
        ff_layout_sort_mirrors(fls);
-       rc = ff_layout_check_layout(lgr);
-       if (rc)
-               goto out_err_free;
        ret = &fls->generic_hdr;
        dprintk("<-- %s (success)\n", __func__);
 out_free_page:
@@ -560,17 +550,6 @@ out_err_free:
        goto out_free_page;
 }
 
-static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
-{
-       struct pnfs_layout_segment *lseg;
-
-       list_for_each_entry(lseg, &layout->plh_segs, pls_list)
-               if (lseg->pls_range.iomode == IOMODE_RW)
-                       return true;
-
-       return false;
-}
-
 static void
 ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -585,23 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
                ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
                inode = ffl->generic_hdr.plh_inode;
                spin_lock(&inode->i_lock);
-               if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
-                       ffl->commit_info.nbuckets = 0;
-                       kfree(ffl->commit_info.buckets);
-                       ffl->commit_info.buckets = NULL;
-               }
+               pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
                spin_unlock(&inode->i_lock);
        }
        _ff_layout_free_lseg(fls);
 }
 
-/* Return 1 until we have multiple lsegs support */
-static int
-ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
-{
-       return 1;
-}
-
 static void
 nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
@@ -746,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
        spin_unlock(&mirror->lock);
 }
 
-static int
-ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
-                           struct nfs_commit_info *cinfo,
-                           gfp_t gfp_flags)
-{
-       struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
-       struct pnfs_commit_bucket *buckets;
-       int size;
-
-       if (cinfo->ds->nbuckets != 0) {
-               /* This assumes there is only one RW lseg per file.
-                * To support multiple lseg per file, we need to
-                * change struct pnfs_commit_bucket to allow dynamic
-                * increasing nbuckets.
-                */
-               return 0;
-       }
-
-       size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
-
-       buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
-                         gfp_flags);
-       if (!buckets)
-               return -ENOMEM;
-       else {
-               int i;
-
-               spin_lock(&cinfo->inode->i_lock);
-               if (cinfo->ds->nbuckets != 0)
-                       kfree(buckets);
-               else {
-                       cinfo->ds->buckets = buckets;
-                       cinfo->ds->nbuckets = size;
-                       for (i = 0; i < size; i++) {
-                               INIT_LIST_HEAD(&buckets[i].written);
-                               INIT_LIST_HEAD(&buckets[i].committing);
-                               /* mark direct verifier as unset */
-                               buckets[i].direct_verf.committed =
-                                       NFS_INVALID_STABLE_HOW;
-                       }
-               }
-               spin_unlock(&cinfo->inode->i_lock);
-               return 0;
-       }
-}
-
 static void
 ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
 {
@@ -876,8 +798,8 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                           nfs_req_openctx(req),
-                                          0,
-                                          NFS4_MAX_UINT64,
+                                          req_offset(req),
+                                          req->wb_bytes,
                                           IOMODE_READ,
                                           strict_iomode,
                                           GFP_KERNEL);
@@ -888,6 +810,14 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
 }
 
 static void
+ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+                         struct nfs_page *req)
+{
+       pnfs_generic_pg_check_layout(pgio);
+       pnfs_generic_pg_check_range(pgio, req);
+}
+
+static void
 ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
 {
@@ -897,7 +827,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        int ds_idx;
 
 retry:
-       pnfs_generic_pg_check_layout(pgio);
+       ff_layout_pg_check_layout(pgio, req);
        /* Use full layout for now */
        if (!pgio->pg_lseg) {
                ff_layout_pg_get_read(pgio, req, false);
@@ -953,18 +883,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 {
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs_pgio_mirror *pgm;
-       struct nfs_commit_info cinfo;
        struct nfs4_pnfs_ds *ds;
        int i;
-       int status;
 
 retry:
-       pnfs_generic_pg_check_layout(pgio);
+       ff_layout_pg_check_layout(pgio, req);
        if (!pgio->pg_lseg) {
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   nfs_req_openctx(req),
-                                                  0,
-                                                  NFS4_MAX_UINT64,
+                                                  req_offset(req),
+                                                  req->wb_bytes,
                                                   IOMODE_RW,
                                                   false,
                                                   GFP_NOFS);
@@ -978,11 +906,6 @@ retry:
        if (pgio->pg_lseg == NULL)
                goto out_mds;
 
-       nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
-       status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
-       if (status < 0)
-               goto out_mds;
-
        /* Use a direct mapping of ds_idx to pgio mirror_idx */
        if (WARN_ON_ONCE(pgio->pg_mirror_count !=
            FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
@@ -1297,21 +1220,23 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
                }
        }
 
+       mirror = FF_LAYOUT_COMP(lseg, idx);
+       err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+                                      mirror, offset, length, status, opnum,
+                                      GFP_NOIO);
+
        switch (status) {
        case NFS4ERR_DELAY:
        case NFS4ERR_GRACE:
-               return;
-       default:
                break;
+       case NFS4ERR_NXIO:
+               ff_layout_mark_ds_unreachable(lseg, idx);
+               /* Fallthrough */
+       default:
+               pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+                                                 lseg);
        }
 
-       mirror = FF_LAYOUT_COMP(lseg, idx);
-       err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
-                                      mirror, offset, length, status, opnum,
-                                      GFP_NOIO);
-       if (status == NFS4ERR_NXIO)
-               ff_layout_mark_ds_unreachable(lseg, idx);
-       pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
        dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 }
 
@@ -2012,6 +1937,33 @@ ff_layout_get_ds_info(struct inode *inode)
 }
 
 static void
+ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_layout_segment *lseg)
+{
+       struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+       struct inode *inode = lseg->pls_layout->plh_inode;
+       struct pnfs_commit_array *array, *new;
+
+       new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+       if (new) {
+               spin_lock(&inode->i_lock);
+               array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+               spin_unlock(&inode->i_lock);
+               if (array != new)
+                       pnfs_free_commit_array(new);
+       }
+}
+
+static void
+ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+               struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+       spin_unlock(&inode->i_lock);
+}
+
+static void
 ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
 {
        nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
@@ -2496,6 +2448,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server,
        return 0;
 }
 
+static const struct pnfs_commit_ops ff_layout_commit_ops = {
+       .setup_ds_info          = ff_layout_setup_ds_info,
+       .release_ds_info        = ff_layout_release_ds_info,
+       .mark_request_commit    = pnfs_layout_mark_request_commit,
+       .clear_request_commit   = pnfs_generic_clear_request_commit,
+       .scan_commit_lists      = pnfs_generic_scan_commit_lists,
+       .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
+       .commit_pagelist        = ff_layout_commit_pagelist,
+};
+
 static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .id                     = LAYOUT_FLEX_FILES,
        .name                   = "LAYOUT_FLEX_FILES",
@@ -2512,11 +2474,6 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .pg_write_ops           = &ff_layout_pg_write_ops,
        .get_ds_info            = ff_layout_get_ds_info,
        .free_deviceid_node     = ff_layout_free_deviceid_node,
-       .mark_request_commit    = pnfs_layout_mark_request_commit,
-       .clear_request_commit   = pnfs_generic_clear_request_commit,
-       .scan_commit_lists      = pnfs_generic_scan_commit_lists,
-       .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
-       .commit_pagelist        = ff_layout_commit_pagelist,
        .read_pagelist          = ff_layout_read_pagelist,
        .write_pagelist         = ff_layout_write_pagelist,
        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
index 2f36996..354a031 100644 (file)
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
        u64                             stripe_unit;
        u32                             flags;
        u32                             mirror_array_cnt;
-       struct nfs4_ff_layout_mirror    **mirror_array;
+       struct nfs4_ff_layout_mirror    *mirror_array[];
 };
 
 struct nfs4_flexfile_layout {
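
The header change above turns mirror_array from a separately allocated pointer array into a C99 flexible array member, which is why ff_layout_alloc_lseg() can now size the whole segment with struct_size() in a single allocation. A minimal stand-alone sketch of that layout, using an offsetof() computation in place of the kernel's struct_size() helper; the structure and function names here are illustrative only.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct mirror;                          /* opaque placeholder */

struct layout_segment {
        unsigned int    mirror_array_cnt;
        struct mirror   *mirror_array[];        /* flexible array member */
};

static struct layout_segment *alloc_segment(unsigned int cnt)
{
        /* roughly what struct_size(fls, mirror_array, cnt) would compute */
        size_t size = offsetof(struct layout_segment, mirror_array) +
                      (size_t)cnt * sizeof(struct mirror *);
        struct layout_segment *fls = calloc(1, size);

        if (fls)
                fls->mirror_array_cnt = cnt;
        return fls;                     /* one allocation, one free later */
}

int main(void)
{
        struct layout_segment *fls = alloc_segment(4);

        if (fls)
                printf("one allocation holds %u mirror slots\n",
                       fls->mirror_array_cnt);
        free(fls);
        return 0;
}
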
index e113fcb..ccc88be 100644 (file)
@@ -190,6 +190,7 @@ static const struct constant_table nfs_vers_tokens[] = {
        { "4.0",        Opt_vers_4_0 },
        { "4.1",        Opt_vers_4_1 },
        { "4.2",        Opt_vers_4_2 },
+       {}
 };
 
 enum {
@@ -202,13 +203,14 @@ enum {
        nr__Opt_xprt
 };
 
-static const struct constant_table nfs_xprt_protocol_tokens[nr__Opt_xprt] = {
+static const struct constant_table nfs_xprt_protocol_tokens[] = {
        { "rdma",       Opt_xprt_rdma },
        { "rdma6",      Opt_xprt_rdma6 },
        { "tcp",        Opt_xprt_tcp },
        { "tcp6",       Opt_xprt_tcp6 },
        { "udp",        Opt_xprt_udp },
        { "udp6",       Opt_xprt_udp6 },
+       {}
 };
 
 enum {
@@ -239,6 +241,7 @@ static const struct constant_table nfs_secflavor_tokens[] = {
        { "spkm3i",     Opt_sec_spkmi },
        { "spkm3p",     Opt_sec_spkmp },
        { "sys",        Opt_sec_sys },
+       {}
 };
 
 /*
@@ -1135,7 +1138,7 @@ out_no_address:
        return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
 
 out_invalid_transport_udp:
-       return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 }
 #endif
 
@@ -1257,7 +1260,7 @@ out_v4_not_compiled:
        nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
        return -EPROTONOSUPPORT;
 out_invalid_transport_udp:
-       return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+       return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
 out_no_address:
        return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
 out_mountproto_mismatch:
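
The empty {} entries added to nfs_vers_tokens, nfs_xprt_protocol_tokens and nfs_secflavor_tokens act as NULL-name sentinels, so a lookup helper can walk the table without knowing its length (which is also why the fixed nr__Opt_xprt bound is dropped). A small user-space illustration of sentinel-terminated lookup follows; it assumes a lookup loop of this shape rather than quoting the kernel's fs_parser implementation, and the token values are arbitrary.

#include <stdio.h>
#include <string.h>

struct constant_table { const char *name; int value; };

static const struct constant_table vers_tokens[] = {
        { "3",   3 },
        { "4.1", 41 },
        { "4.2", 42 },
        {}                      /* sentinel: name == NULL ends the walk */
};

static int lookup_constant(const struct constant_table *tbl,
                           const char *name, int not_found)
{
        for (; tbl->name; tbl++)
                if (strcmp(tbl->name, name) == 0)
                        return tbl->value;
        return not_found;
}

int main(void)
{
        printf("%d\n", lookup_constant(vers_tokens, "4.1", -1));  /* 41 */
        printf("%d\n", lookup_constant(vers_tokens, "udp", -1));  /* -1 */
        return 0;
}
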
index 11bf158..b9d0921 100644 (file)
@@ -62,7 +62,6 @@
 /* Default is to see 64-bit inode numbers */
 static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
 
-static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
 static struct kmem_cache * nfs_inode_cachep;
@@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
  * Invalidate, but do not unhash, the inode.
  * NB: must be called with inode->i_lock held!
  */
-static void nfs_invalidate_inode(struct inode *inode)
+static void nfs_set_inode_stale_locked(struct inode *inode)
 {
        set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
        nfs_zap_caches_locked(inode);
+       trace_nfs_set_inode_stale(inode);
+}
+
+void nfs_set_inode_stale(struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       nfs_set_inode_stale_locked(inode);
+       spin_unlock(&inode->i_lock);
 }
 
 struct nfs_find_desc {
@@ -959,16 +966,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
                                                struct file *filp)
 {
        struct nfs_open_context *ctx;
-       const struct cred *cred = get_current_cred();
 
        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
-       if (!ctx) {
-               put_cred(cred);
+       if (!ctx)
                return ERR_PTR(-ENOMEM);
-       }
        nfs_sb_active(dentry->d_sb);
        ctx->dentry = dget(dentry);
-       ctx->cred = cred;
+       if (filp)
+               ctx->cred = get_cred(filp->f_cred);
+       else
+               ctx->cred = get_current_cred();
        ctx->ll_cred = NULL;
        ctx->state = NULL;
        ctx->mode = f_mode;
@@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
                                status = 0;
                        break;
                case -ESTALE:
-                       nfs_zap_caches(inode);
                        if (!S_ISDIR(inode->i_mode))
-                               set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+                               nfs_set_inode_stale(inode);
+                       else
+                               nfs_zap_caches(inode);
                }
                goto err_out;
        }
@@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
         * lookup validation will know that the inode is bad.
         * (But we fall through to invalidate the caches.)
         */
-       nfs_invalidate_inode(inode);
+       nfs_set_inode_stale_locked(inode);
        return -ESTALE;
 }
 
index f80c47d..1f32a9f 100644 (file)
@@ -274,12 +274,6 @@ void nfs_free_request(struct nfs_page *req);
 struct nfs_pgio_mirror *
 nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
-static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
-{
-       WARN_ON_ONCE(desc->pg_mirror_count < 1);
-       return desc->pg_mirror_count > 1;
-}
-
 static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
                const struct nfs_open_context *ctx2)
 {
@@ -417,7 +411,9 @@ extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
-
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+                                     int (*fn)(struct nfs_server *, void *),
+                                     void *data);
 /* io.c */
 extern void nfs_start_io_read(struct inode *inode);
 extern void nfs_end_io_read(struct inode *inode);
@@ -515,13 +511,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping,
                loff_t lstart, loff_t lend);
 
 #ifdef CONFIG_NFS_V4_1
+static inline void
+pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
+               unsigned int nbuckets)
+{
+       unsigned int i;
+
+       for (i = 0; i < nbuckets; i++)
+               buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
 static inline
 void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
 {
-       int i;
+       struct pnfs_commit_array *array;
 
-       for (i = 0; i < cinfo->nbuckets; i++)
-               cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+       rcu_read_lock();
+       list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
+               pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
+                               array->nbuckets);
+       rcu_read_unlock();
 }
 #else
 static inline
@@ -542,6 +550,14 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
        return memcmp(v1->data, v2->data, sizeof(v1->data));
 }
 
+static inline bool
+nfs_write_match_verf(const struct nfs_writeverf *verf,
+               struct nfs_page *req)
+{
+       return verf->committed > NFS_UNSTABLE &&
+               !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
+}
+
 /* unlink.c */
 extern struct rpc_task *
 nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
index f3ece8e..6b06322 100644 (file)
@@ -145,6 +145,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
        struct vfsmount *mnt = ERR_PTR(-ENOMEM);
        struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
        struct nfs_client *client = server->nfs_client;
+       int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
        int ret;
 
        if (IS_ROOT(path->dentry))
@@ -190,12 +191,12 @@ struct vfsmount *nfs_d_automount(struct path *path)
        if (IS_ERR(mnt))
                goto out_fc;
 
-       if (nfs_mountpoint_expiry_timeout < 0)
+       mntget(mnt); /* prevent immediate expiration */
+       if (timeout <= 0)
                goto out_fc;
 
-       mntget(mnt); /* prevent immediate expiration */
        mnt_set_expiry(mnt, &nfs_automount_list);
-       schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+       schedule_delayed_work(&nfs_automount_task, timeout);
 
 out_fc:
        put_fs_context(fc);
@@ -233,10 +234,11 @@ const struct inode_operations nfs_referral_inode_operations = {
 static void nfs_expire_automounts(struct work_struct *work)
 {
        struct list_head *list = &nfs_automount_list;
+       int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
 
        mark_mounts_for_expiry(list);
-       if (!list_empty(list))
-               schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+       if (!list_empty(list) && timeout > 0)
+               schedule_delayed_work(&nfs_automount_task, timeout);
 }
 
 void nfs_release_automount_timer(void)
@@ -247,10 +249,7 @@ void nfs_release_automount_timer(void)
 
 /**
  * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry: parent directory
- * @fh: filehandle for new root dentry
- * @fattr: attributes for new root inode
- * @authflavor: security flavor to use when performing the mount
+ * @fc: pointer to struct nfs_fs_context
  *
  */
 int nfs_do_submount(struct fs_context *fc)
@@ -312,3 +311,53 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
        return nfs_do_submount(fc);
 }
 EXPORT_SYMBOL_GPL(nfs_submount);
+
+static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
+{
+       long num;
+       int ret;
+
+       if (!val)
+               return -EINVAL;
+       ret = kstrtol(val, 0, &num);
+       if (ret)
+               return -EINVAL;
+       if (num > 0) {
+               if (num >= INT_MAX / HZ)
+                       num = INT_MAX;
+               else
+                       num *= HZ;
+               *((int *)kp->arg) = num;
+               if (!list_empty(&nfs_automount_list))
+                       mod_delayed_work(system_wq, &nfs_automount_task, num);
+       } else {
+               *((int *)kp->arg) = -1*HZ;
+               cancel_delayed_work(&nfs_automount_task);
+       }
+       return 0;
+}
+
+static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
+{
+       long num = *((int *)kp->arg);
+
+       if (num > 0) {
+               if (num >= INT_MAX - (HZ - 1))
+                       num = INT_MAX / HZ;
+               else
+                       num = (num + (HZ - 1)) / HZ;
+       } else
+               num = -1;
+       return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+}
+
+static const struct kernel_param_ops param_ops_nfs_timeout = {
+       .set = param_set_nfs_timeout,
+       .get = param_get_nfs_timeout,
+};
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int);
+
+module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
+MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
+               "Set the NFS automounted mountpoint timeout value (seconds). "
+               "Values <= 0 turn expiration off.");
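
The new param_set/param_get handlers above convert between the user-visible value in seconds and the jiffies value stored in nfs_mountpoint_expiry_timeout, clamping on overflow and treating non-positive values as "expiry disabled". A stand-alone sketch of that round-trip arithmetic, assuming HZ = 100 purely for demonstration:

#include <limits.h>
#include <stdio.h>

#define HZ 100

static int seconds_to_jiffies(long sec)
{
        if (sec <= 0)
                return -1 * HZ;                 /* expiry disabled */
        if (sec >= INT_MAX / HZ)
                return INT_MAX;                 /* clamp instead of overflowing */
        return (int)(sec * HZ);
}

static long jiffies_to_seconds(int jif)
{
        if (jif <= 0)
                return -1;
        if (jif >= INT_MAX - (HZ - 1))
                return INT_MAX / HZ;
        return (jif + (HZ - 1)) / HZ;           /* round up to whole seconds */
}

int main(void)
{
        printf("%d %ld\n", seconds_to_jiffies(600), jiffies_to_seconds(60001));
        return 0;
}
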
index 8be1ba7..2b7f6dc 100644 (file)
@@ -42,7 +42,9 @@ enum nfs4_client_state {
        NFS4CLNT_LEASE_MOVED,
        NFS4CLNT_DELEGATION_EXPIRED,
        NFS4CLNT_RUN_MANAGER,
-       NFS4CLNT_DELEGRETURN_RUNNING,
+       NFS4CLNT_RECALL_RUNNING,
+       NFS4CLNT_RECALL_ANY_LAYOUT_READ,
+       NFS4CLNT_RECALL_ANY_LAYOUT_RW,
 };
 
 #define NFS4_RENEW_TIMEOUT             0x01
index 1297919..8e5d622 100644 (file)
@@ -252,6 +252,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
        if (remap_flags & ~REMAP_FILE_ADVISORY)
                return -EINVAL;
 
+       if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+               return -ETXTBSY;
+
        /* check alignment w.r.t. clone_blksize */
        ret = -EINVAL;
        if (bs) {
index 84026e7..a3ab6e2 100644 (file)
@@ -354,7 +354,7 @@ static int try_location(struct fs_context *fc,
 
 /**
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry: parent directory
+ * @fc: pointer to struct nfs_fs_context
  * @locations: array of NFSv4 server location information
  *
  */
index cb34e84..512afb1 100644 (file)
@@ -2346,7 +2346,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
                .callback_ops = &nfs4_open_confirm_ops,
                .callback_data = data,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
        int status;
 
@@ -2511,7 +2511,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
                .callback_ops = &nfs4_open_ops,
                .callback_data = data,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
        int status;
 
@@ -2790,16 +2790,19 @@ static int nfs41_check_delegation_stateid(struct nfs4_state *state)
                return NFS_OK;
        }
 
+       spin_lock(&delegation->lock);
        nfs4_stateid_copy(&stateid, &delegation->stateid);
 
        if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
                                &delegation->flags)) {
+               spin_unlock(&delegation->lock);
                rcu_read_unlock();
                return NFS_OK;
        }
 
        if (delegation->cred)
                cred = get_cred(delegation->cred);
+       spin_unlock(&delegation->lock);
        rcu_read_unlock();
        status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
        trace_nfs4_test_delegation_stateid(state, NULL, status);
@@ -3651,7 +3654,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
                .rpc_message = &msg,
                .callback_ops = &nfs4_close_ops,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
        int status = -ENOMEM;
 
@@ -5544,7 +5547,7 @@ unwind:
 struct nfs4_cached_acl {
        int cached;
        size_t len;
-       char data[0];
+       char data[];
 };
 
 static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
@@ -6253,6 +6256,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
                /* Fallthrough */
        case -NFS4ERR_BAD_STATEID:
        case -NFS4ERR_STALE_STATEID:
+       case -ETIMEDOUT:
                task->tk_status = 0;
                break;
        case -NFS4ERR_OLD_STATEID:
@@ -6343,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
                .rpc_client = server->client,
                .rpc_message = &msg,
                .callback_ops = &nfs4_delegreturn_ops,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT,
        };
        int status = 0;
 
@@ -6926,7 +6930,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                .rpc_message = &msg,
                .callback_ops = &nfs4_lock_ops,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
        int ret;
 
@@ -9170,7 +9174,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
                .rpc_message = &msg,
                .callback_ops = &nfs4_layoutget_call_ops,
                .callback_data = lgp,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
        struct pnfs_layout_segment *lseg = NULL;
        struct nfs4_exception exception = {
@@ -9287,6 +9291,7 @@ static void nfs4_layoutreturn_release(void *calldata)
                lrp->ld_private.ops->free(&lrp->ld_private);
        pnfs_put_layout_hdr(lrp->args.layout);
        nfs_iput_and_deactive(lrp->inode);
+       put_cred(lrp->cred);
        kfree(calldata);
        dprintk("<-- %s\n", __func__);
 }
index f7723d2..ac93715 100644 (file)
@@ -2524,6 +2524,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
        }
        return 0;
 }
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+       int iomode = 0;
+
+       if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state))
+               iomode += IOMODE_READ;
+       if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state))
+               iomode += IOMODE_RW;
+       /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */
+       if (iomode) {
+               pnfs_layout_return_unused_byclid(clp, iomode);
+               set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+       }
+}
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
 
@@ -2531,6 +2546,10 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
 {
        return 0;
 }
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 static void nfs4_state_manager(struct nfs_client *clp)
@@ -2635,12 +2654,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
                nfs4_end_drain_session(clp);
                nfs4_clear_state_manager_bit(clp);
 
-               if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
+               if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
                        if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
                                nfs_client_return_marked_delegations(clp);
                                set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
                        }
-                       clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state);
+                       nfs4_layoutreturn_any_run(clp);
+                       clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
                }
 
                /* Did we race with an attempt to give us more work? */
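
nfs4_layoutreturn_any_run() above relies on the standard pNFS iomode encoding (IOMODE_READ = 1, IOMODE_RW = 2, IOMODE_ANY = 3), so setting both recall bits naturally asks for layouts of any iomode. A tiny stand-alone illustration of that accumulation:

#include <stdio.h>

enum pnfs_iomode { IOMODE_READ = 1, IOMODE_RW = 2, IOMODE_ANY = 3 };

static int recall_iomode(int recall_read, int recall_rw)
{
        int iomode = 0;

        if (recall_read)
                iomode += IOMODE_READ;
        if (recall_rw)
                iomode += IOMODE_RW;    /* READ + RW == ANY */
        return iomode;
}

int main(void)
{
        printf("%d\n", recall_iomode(1, 1));    /* 3 == IOMODE_ANY */
        return 0;
}
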
index 1e97e5e..5435411 100644 (file)
@@ -584,7 +584,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
 TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
 TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
 TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
-TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
 
 #define show_nfs4_clp_state(state) \
        __print_flags(state, "|", \
@@ -605,7 +607,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
                { NFS4CLNT_LEASE_MOVED,         "LEASE_MOVED" }, \
                { NFS4CLNT_DELEGATION_EXPIRED,  "DELEGATION_EXPIRED" }, \
                { NFS4CLNT_RUN_MANAGER,         "RUN_MANAGER" }, \
-               { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" })
+               { NFS4CLNT_RECALL_RUNNING,      "RECALL_RUNNING" }, \
+               { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \
+               { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" })
 
 TRACE_EVENT(nfs4_state_mgr,
                TP_PROTO(
index effaa42..8d32788 100644 (file)
@@ -88,7 +88,7 @@
 #define NFS_ROOT               "/tftpboot/%s"
 
 /* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS                "vers=2,udp,rsize=4096,wsize=4096"
+#define NFS_DEF_OPTIONS                "vers=2,tcp,rsize=4096,wsize=4096"
 
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
index a9588d1..7e7a97a 100644 (file)
@@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
                                int error \
                        ), \
                        TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
 DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
 DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
index 20b3717..f61f966 100644 (file)
@@ -33,9 +33,7 @@ static const struct rpc_call_ops nfs_pgio_common_ops;
 struct nfs_pgio_mirror *
 nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
 {
-       return nfs_pgio_has_mirroring(desc) ?
-               &desc->pg_mirrors[desc->pg_mirror_idx] :
-               &desc->pg_mirrors[0];
+       return &desc->pg_mirrors[desc->pg_mirror_idx];
 }
 EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
 
@@ -133,47 +131,166 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
 EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
 
 /*
- * nfs_page_group_lock - lock the head of the page group
- * @req - request in group that is to be locked
+ * nfs_page_group_lock_head - page lock the head of the page group
+ * @req: any member of the page group
+ */
+struct nfs_page *
+nfs_page_group_lock_head(struct nfs_page *req)
+{
+       struct nfs_page *head = req->wb_head;
+
+       while (!nfs_lock_request(head)) {
+               int ret = nfs_wait_on_request(head);
+               if (ret < 0)
+                       return ERR_PTR(ret);
+       }
+       if (head != req)
+               kref_get(&head->wb_kref);
+       return head;
+}
+
+/*
+ * nfs_unroll_locks -  unlock all newly locked reqs and wait on @req
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock and needs to wait on the req bit lock
  *
- * this lock must be held when traversing or modifying the page
- * group list
+ * This is a helper function for nfs_lock_and_join_requests; it
+ * unlocks and releases the subrequests locked so far.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+       struct nfs_page *tmp;
+
+       /* relinquish all the locks successfully grabbed this run */
+       for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+               if (!kref_read(&tmp->wb_kref))
+                       continue;
+               nfs_unlock_and_release_request(tmp);
+       }
+}
+
+/*
+ * nfs_page_group_lock_subreq -  try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
  *
- * return 0 on success, < 0 on error
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
  */
-int
-nfs_page_group_lock(struct nfs_page *req)
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
 {
-       struct nfs_page *head = req->wb_head;
+       int ret;
+
+       if (!kref_get_unless_zero(&subreq->wb_kref))
+               return 0;
+       while (!nfs_lock_request(subreq)) {
+               nfs_page_group_unlock(head);
+               ret = nfs_wait_on_request(subreq);
+               if (!ret)
+                       ret = nfs_page_group_lock(head);
+               if (ret < 0) {
+                       nfs_unroll_locks(head, subreq);
+                       nfs_release_request(subreq);
+                       return ret;
+               }
+       }
+       return 0;
+}
+
+/*
+ * nfs_page_group_lock_subrequests -  try to lock the subrequests
+ * @head: head request of page group
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request locked.
+ */
+int nfs_page_group_lock_subrequests(struct nfs_page *head)
+{
+       struct nfs_page *subreq;
+       int ret;
 
-       WARN_ON_ONCE(head != head->wb_head);
+       ret = nfs_page_group_lock(head);
+       if (ret < 0)
+               return ret;
+       /* lock each request in the page group */
+       for (subreq = head->wb_this_page; subreq != head;
+                       subreq = subreq->wb_this_page) {
+               ret = nfs_page_group_lock_subreq(head, subreq);
+               if (ret < 0)
+                       return ret;
+       }
+       nfs_page_group_unlock(head);
+       return 0;
+}
 
-       if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+/*
+ * nfs_page_set_headlock - set the request PG_HEADLOCK
+ * @req: request that is to be locked
+ *
+ * this lock must be held when modifying req->wb_head
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_set_headlock(struct nfs_page *req)
+{
+       if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags))
                return 0;
 
-       set_bit(PG_CONTENDED1, &head->wb_flags);
+       set_bit(PG_CONTENDED1, &req->wb_flags);
        smp_mb__after_atomic();
-       return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+       return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK,
                                TASK_UNINTERRUPTIBLE);
 }
 
 /*
- * nfs_page_group_unlock - unlock the head of the page group
- * @req - request in group that is to be unlocked
+ * nfs_page_clear_headlock - clear the request PG_HEADLOCK
+ * @req: request that is to be unlocked
  */
 void
-nfs_page_group_unlock(struct nfs_page *req)
+nfs_page_clear_headlock(struct nfs_page *req)
 {
-       struct nfs_page *head = req->wb_head;
-
-       WARN_ON_ONCE(head != head->wb_head);
-
        smp_mb__before_atomic();
-       clear_bit(PG_HEADLOCK, &head->wb_flags);
+       clear_bit(PG_HEADLOCK, &req->wb_flags);
        smp_mb__after_atomic();
-       if (!test_bit(PG_CONTENDED1, &head->wb_flags))
+       if (!test_bit(PG_CONTENDED1, &req->wb_flags))
                return;
-       wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+       wake_up_bit(&req->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req: request in group that is to be locked
+ *
+ * this lock must be held when traversing or modifying the page
+ * group list
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_group_lock(struct nfs_page *req)
+{
+       int ret;
+
+       ret = nfs_page_set_headlock(req);
+       if (ret || req->wb_head == req)
+               return ret;
+       return nfs_page_set_headlock(req->wb_head);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req: request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+       if (req != req->wb_head)
+               nfs_page_clear_headlock(req->wb_head);
+       nfs_page_clear_headlock(req);
 }
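
With the rework above, nfs_page_group_lock() takes the PG_HEADLOCK bit on the request itself and, for a subrequest, on its head as well; nfs_page_group_unlock() releases them in the reverse order. A rough user-space model of that layered locking, with mutexes standing in for the wait_on_bit-based headlock and only the fields needed for the sketch:

#include <pthread.h>
#include <stdio.h>

/* Simplified stand-in for struct nfs_page: just the head pointer and lock. */
struct page_req {
        struct page_req         *head;          /* wb_head */
        pthread_mutex_t         headlock;       /* PG_HEADLOCK stand-in */
};

static void page_group_lock(struct page_req *req)
{
        pthread_mutex_lock(&req->headlock);     /* lock the request itself */
        if (req->head != req)
                pthread_mutex_lock(&req->head->headlock);   /* then its head */
}

static void page_group_unlock(struct page_req *req)
{
        if (req->head != req)
                pthread_mutex_unlock(&req->head->headlock); /* head first */
        pthread_mutex_unlock(&req->headlock);
}

int main(void)
{
        struct page_req head = { .head = &head,
                                 .headlock = PTHREAD_MUTEX_INITIALIZER };
        struct page_req sub  = { .head = &head,
                                 .headlock = PTHREAD_MUTEX_INITIALIZER };

        page_group_lock(&sub);
        page_group_unlock(&sub);
        puts("locked and unlocked subrequest + head");
        return 0;
}
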
 
 /*
@@ -359,15 +476,23 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
 }
 
 static struct nfs_page *
-nfs_create_subreq(struct nfs_page *req, struct nfs_page *last,
-                 unsigned int pgbase, unsigned int offset,
+nfs_create_subreq(struct nfs_page *req,
+                 unsigned int pgbase,
+                 unsigned int offset,
                  unsigned int count)
 {
+       struct nfs_page *last;
        struct nfs_page *ret;
 
        ret = __nfs_create_request(req->wb_lock_context, req->wb_page,
                        pgbase, offset, count);
        if (!IS_ERR(ret)) {
+               /* find the last request */
+               for (last = req->wb_head;
+                    last->wb_this_page != req->wb_head;
+                    last = last->wb_this_page)
+                       ;
+
                nfs_lock_request(ret);
                ret->wb_index = req->wb_index;
                nfs_page_group_init(ret, last);
@@ -627,9 +752,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
                .callback_ops = call_ops,
                .callback_data = hdr,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC | flags,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
        };
-       int ret = 0;
 
        hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
 
@@ -641,18 +765,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
                (unsigned long long)hdr->args.offset);
 
        task = rpc_run_task(&task_setup_data);
-       if (IS_ERR(task)) {
-               ret = PTR_ERR(task);
-               goto out;
-       }
-       if (how & FLUSH_SYNC) {
-               ret = rpc_wait_for_completion_task(task);
-               if (ret == 0)
-                       ret = task->tk_status;
-       }
+       if (IS_ERR(task))
+               return PTR_ERR(task);
        rpc_put_task(task);
-out:
-       return ret;
+       return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
 
@@ -886,15 +1002,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
        pgio->pg_mirror_count = mirror_count;
 }
 
-/*
- * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
- */
-void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
-{
-       pgio->pg_mirror_count = 1;
-       pgio->pg_mirror_idx = 0;
-}
-
 static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
 {
        pgio->pg_mirror_count = 1;
@@ -911,7 +1018,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
 }
 
 /**
- * nfs_can_coalesce_requests - test two requests for compatibility
+ * nfs_coalesce_size - test two requests for compatibility
  * @prev: pointer to nfs_page
  * @req: pointer to nfs_page
  * @pgio: pointer to nfs_pagio_descriptor
@@ -920,41 +1027,36 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
  * page data area they describe is contiguous, and that their RPC
  * credentials, NFSv4 open state, and lockowners are the same.
  *
- * Return 'true' if this is the case, else return 'false'.
+ * Returns size of the request that can be coalesced
  */
-static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+static unsigned int nfs_coalesce_size(struct nfs_page *prev,
                                      struct nfs_page *req,
                                      struct nfs_pageio_descriptor *pgio)
 {
-       size_t size;
        struct file_lock_context *flctx;
 
        if (prev) {
                if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
-                       return false;
+                       return 0;
                flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
                if (flctx != NULL &&
                    !(list_empty_careful(&flctx->flc_posix) &&
                      list_empty_careful(&flctx->flc_flock)) &&
                    !nfs_match_lock_context(req->wb_lock_context,
                                            prev->wb_lock_context))
-                       return false;
+                       return 0;
                if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
-                       return false;
+                       return 0;
                if (req->wb_page == prev->wb_page) {
                        if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
-                               return false;
+                               return 0;
                } else {
                        if (req->wb_pgbase != 0 ||
                            prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
-                               return false;
+                               return 0;
                }
        }
-       size = pgio->pg_ops->pg_test(pgio, prev, req);
-       WARN_ON_ONCE(size > req->wb_bytes);
-       if (size && size < req->wb_bytes)
-               req->wb_bytes = size;
-       return size > 0;
+       return pgio->pg_ops->pg_test(pgio, prev, req);
 }
 
 /**
@@ -962,15 +1064,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
  * @desc: destination io descriptor
  * @req: request
  *
- * Returns true if the request 'req' was successfully coalesced into the
- * existing list of pages 'desc'.
+ * If the request 'req' was successfully coalesced into the existing list
+ * of pages 'desc', it returns the size of req.
  */
-static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
-                                    struct nfs_page *req)
+static unsigned int
+nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+               struct nfs_page *req)
 {
        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
        struct nfs_page *prev = NULL;
+       unsigned int size;
 
        if (mirror->pg_count != 0) {
                prev = nfs_list_entry(mirror->pg_list.prev);
@@ -990,11 +1093,12 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
                return 0;
        }
 
-       if (!nfs_can_coalesce_requests(prev, req, desc))
-               return 0;
+       size = nfs_coalesce_size(prev, req, desc);
+       if (size < req->wb_bytes)
+               return size;
        nfs_list_move_request(req, &mirror->pg_list);
        mirror->pg_count += req->wb_bytes;
-       return 1;
+       return req->wb_bytes;
 }
 
 /*
@@ -1034,7 +1138,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
  * @req: request
  *
  * This may split a request into subrequests which are all part of the
- * same page group.
+ * same page group. If so, it will submit @req as the last one, to ensure
+ * the pointer to @req is still valid in case of failure.
  *
  * Returns true if the request 'req' was successfully coalesced into the
  * existing list of pages 'desc'.
@@ -1043,51 +1148,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
                           struct nfs_page *req)
 {
        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
        struct nfs_page *subreq;
-       unsigned int bytes_left = 0;
-       unsigned int offset, pgbase;
+       unsigned int size, subreq_size;
 
        nfs_page_group_lock(req);
 
        subreq = req;
-       bytes_left = subreq->wb_bytes;
-       offset = subreq->wb_offset;
-       pgbase = subreq->wb_pgbase;
-
-       do {
-               if (!nfs_pageio_do_add_request(desc, subreq)) {
-                       /* make sure pg_test call(s) did nothing */
-                       WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
-                       WARN_ON_ONCE(subreq->wb_offset != offset);
-                       WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
-
+       subreq_size = subreq->wb_bytes;
+       for(;;) {
+               size = nfs_pageio_do_add_request(desc, subreq);
+               if (size == subreq_size) {
+                       /* We successfully submitted a request */
+                       if (subreq == req)
+                               break;
+                       req->wb_pgbase += size;
+                       req->wb_bytes -= size;
+                       req->wb_offset += size;
+                       subreq_size = req->wb_bytes;
+                       subreq = req;
+                       continue;
+               }
+               if (WARN_ON_ONCE(subreq != req)) {
+                       nfs_page_group_unlock(req);
+                       nfs_pageio_cleanup_request(desc, subreq);
+                       subreq = req;
+                       subreq_size = req->wb_bytes;
+                       nfs_page_group_lock(req);
+               }
+               if (!size) {
+                       /* Can't coalesce any more, so do I/O */
                        nfs_page_group_unlock(req);
                        desc->pg_moreio = 1;
                        nfs_pageio_doio(desc);
                        if (desc->pg_error < 0 || mirror->pg_recoalesce)
-                               goto out_cleanup_subreq;
+                               return 0;
                        /* retry add_request for this subreq */
                        nfs_page_group_lock(req);
                        continue;
                }
-
-               /* check for buggy pg_test call(s) */
-               WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
-               WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
-               WARN_ON_ONCE(subreq->wb_bytes == 0);
-
-               bytes_left -= subreq->wb_bytes;
-               offset += subreq->wb_bytes;
-               pgbase += subreq->wb_bytes;
-
-               if (bytes_left) {
-                       subreq = nfs_create_subreq(req, subreq, pgbase,
-                                       offset, bytes_left);
-                       if (IS_ERR(subreq))
-                               goto err_ptr;
-               }
-       } while (bytes_left > 0);
+               subreq = nfs_create_subreq(req, req->wb_pgbase,
+                               req->wb_offset, size);
+               if (IS_ERR(subreq))
+                       goto err_ptr;
+               subreq_size = size;
+       }
 
        nfs_page_group_unlock(req);
        return 1;
@@ -1095,10 +1199,6 @@ err_ptr:
        desc->pg_error = PTR_ERR(subreq);
        nfs_page_group_unlock(req);
        return 0;
-out_cleanup_subreq:
-       if (req != subreq)
-               nfs_pageio_cleanup_request(desc, subreq);
-       return 0;
 }
 
 static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1167,7 +1267,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 {
        u32 midx;
        unsigned int pgbase, offset, bytes;
-       struct nfs_page *dupreq, *lastreq;
+       struct nfs_page *dupreq;
 
        pgbase = req->wb_pgbase;
        offset = req->wb_offset;
@@ -1177,38 +1277,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
        if (desc->pg_error < 0)
                goto out_failed;
 
-       for (midx = 0; midx < desc->pg_mirror_count; midx++) {
-               if (midx) {
-                       nfs_page_group_lock(req);
+       /* Create the mirror instances first, and fire them off */
+       for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+               nfs_page_group_lock(req);
 
-                       /* find the last request */
-                       for (lastreq = req->wb_head;
-                            lastreq->wb_this_page != req->wb_head;
-                            lastreq = lastreq->wb_this_page)
-                               ;
+               dupreq = nfs_create_subreq(req,
+                               pgbase, offset, bytes);
 
-                       dupreq = nfs_create_subreq(req, lastreq,
-                                       pgbase, offset, bytes);
-
-                       nfs_page_group_unlock(req);
-                       if (IS_ERR(dupreq)) {
-                               desc->pg_error = PTR_ERR(dupreq);
-                               goto out_failed;
-                       }
-               } else
-                       dupreq = req;
+               nfs_page_group_unlock(req);
+               if (IS_ERR(dupreq)) {
+                       desc->pg_error = PTR_ERR(dupreq);
+                       goto out_failed;
+               }
 
-               if (nfs_pgio_has_mirroring(desc))
-                       desc->pg_mirror_idx = midx;
+               desc->pg_mirror_idx = midx;
                if (!nfs_pageio_add_request_mirror(desc, dupreq))
                        goto out_cleanup_subreq;
        }
 
+       desc->pg_mirror_idx = 0;
+       if (!nfs_pageio_add_request_mirror(desc, req))
+               goto out_failed;
+
        return 1;
 
 out_cleanup_subreq:
-       if (req != dupreq)
-               nfs_pageio_cleanup_request(desc, dupreq);
+       nfs_pageio_cleanup_request(desc, dupreq);
 out_failed:
        nfs_pageio_error_cleanup(desc);
        return 0;
@@ -1226,8 +1320,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
        struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
        u32 restore_idx = desc->pg_mirror_idx;
 
-       if (nfs_pgio_has_mirroring(desc))
-               desc->pg_mirror_idx = mirror_idx;
+       desc->pg_mirror_idx = mirror_idx;
        for (;;) {
                nfs_pageio_doio(desc);
                if (desc->pg_error < 0 || !mirror->pg_recoalesce)
@@ -1320,6 +1413,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
        }
 }
 
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+       nfs_pageio_complete(pgio);
+}
+
 int __init nfs_init_nfspagecache(void)
 {
        nfs_page_cachep = kmem_cache_create("nfs_page",
index 542ea8d..f2dc35c 100644
@@ -268,11 +268,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
        struct nfs_server *server = NFS_SERVER(lo->plh_inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
 
-       if (!list_empty(&lo->plh_layouts)) {
+       if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
                struct nfs_client *clp = server->nfs_client;
 
                spin_lock(&clp->cl_lock);
-               list_del_init(&lo->plh_layouts);
+               list_del_rcu(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        put_cred(lo->plh_lc_cred);
@@ -309,6 +309,16 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
        }
 }
 
+static struct inode *
+pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+       struct inode *inode = igrab(lo->plh_inode);
+       if (inode)
+               return inode;
+       set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+       return NULL;
+}
+
 static void
 pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
                         u32 seq)
@@ -496,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
 {
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
+       INIT_LIST_HEAD(&lseg->pls_commits);
        refcount_set(&lseg->pls_refcount, 1);
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
@@ -782,9 +793,10 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                /* If the sb is being destroyed, just bail */
                if (!nfs_sb_active(server->super))
                        break;
-               inode = igrab(lo->plh_inode);
+               inode = pnfs_grab_inode_layout_hdr(lo);
                if (inode != NULL) {
-                       list_del_init(&lo->plh_layouts);
+                       if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+                               list_del_rcu(&lo->plh_layouts);
                        if (pnfs_layout_add_bulk_destroy_list(inode,
                                                layout_list))
                                continue;
@@ -794,7 +806,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                } else {
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
-                       set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
                }
                nfs_sb_deactive(server->super);
                spin_lock(&clp->cl_lock);
@@ -903,10 +914,21 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
        pnfs_destroy_layouts_byclid(clp, false);
 }
 
+static void
+pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
+{
+       const struct cred *old;
+
+       if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
+               old = xchg(&lo->plh_lc_cred, get_cred(cred));
+               put_cred(old);
+       }
+}
+
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
-                       bool update_barrier)
+                       const struct cred *cred, bool update_barrier)
 {
        u32 oldseq, newseq, new_barrier = 0;
 
@@ -914,6 +936,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
        newseq = be32_to_cpu(new->seqid);
 
        if (!pnfs_layout_is_valid(lo)) {
+               pnfs_set_layout_cred(lo, cred);
                nfs4_stateid_copy(&lo->plh_stateid, new);
                lo->plh_barrier = newseq;
                pnfs_clear_layoutreturn_info(lo);
@@ -1061,7 +1084,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
        lgp->args.ctx = get_nfs_open_context(ctx);
        nfs4_stateid_copy(&lgp->args.stateid, stateid);
        lgp->gfp_flags = gfp_flags;
-       lgp->cred = get_cred(ctx->cred);
+       lgp->cred = ctx->cred;
        return lgp;
 }
 
@@ -1072,7 +1095,6 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
        nfs4_free_pages(lgp->args.layout.pages, max_pages);
        if (lgp->args.inode)
                pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
-       put_cred(lgp->cred);
        put_nfs_open_context(lgp->args.ctx);
        kfree(lgp);
 }
@@ -1109,7 +1131,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
 
                pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
                pnfs_free_returned_lsegs(lo, &freeme, range, seq);
-               pnfs_set_layout_stateid(lo, stateid, true);
+               pnfs_set_layout_stateid(lo, stateid, NULL, true);
        } else
                pnfs_mark_layout_stateid_invalid(lo, &freeme);
 out_unlock:
@@ -1122,6 +1144,7 @@ out_unlock:
 static bool
 pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
                nfs4_stateid *stateid,
+               const struct cred **cred,
                enum pnfs_iomode *iomode)
 {
        /* Serialise LAYOUTGET/LAYOUTRETURN */
@@ -1132,18 +1155,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
        set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
-               if (stateid != NULL) {
-                       nfs4_stateid_copy(stateid, &lo->plh_stateid);
-                       if (lo->plh_return_seq != 0)
-                               stateid->seqid = cpu_to_be32(lo->plh_return_seq);
-               }
+               nfs4_stateid_copy(stateid, &lo->plh_stateid);
+               *cred = get_cred(lo->plh_lc_cred);
+               if (lo->plh_return_seq != 0)
+                       stateid->seqid = cpu_to_be32(lo->plh_return_seq);
                if (iomode != NULL)
                        *iomode = lo->plh_return_iomode;
                pnfs_clear_layoutreturn_info(lo);
                return true;
        }
-       if (stateid != NULL)
-               nfs4_stateid_copy(stateid, &lo->plh_stateid);
+       nfs4_stateid_copy(stateid, &lo->plh_stateid);
+       *cred = get_cred(lo->plh_lc_cred);
        if (iomode != NULL)
                *iomode = IOMODE_ANY;
        return true;
@@ -1167,20 +1189,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
 }
 
 static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
-                      enum pnfs_iomode iomode, bool sync)
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
+                      const nfs4_stateid *stateid,
+                      const struct cred **pcred,
+                      enum pnfs_iomode iomode,
+                      bool sync)
 {
        struct inode *ino = lo->plh_inode;
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        struct nfs4_layoutreturn *lrp;
+       const struct cred *cred = *pcred;
        int status = 0;
 
+       *pcred = NULL;
        lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
                pnfs_clear_layoutreturn_waitbit(lo);
                spin_unlock(&ino->i_lock);
+               put_cred(cred);
                pnfs_put_layout_hdr(lo);
                goto out;
        }
@@ -1188,7 +1216,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
        pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
        lrp->args.ld_private = &lrp->ld_private;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
-       lrp->cred = lo->plh_lc_cred;
+       lrp->cred = cred;
        if (ld->prepare_layoutreturn)
                ld->prepare_layoutreturn(&lrp->args);
 
@@ -1233,15 +1261,16 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
                return;
        spin_lock(&inode->i_lock);
        if (pnfs_layout_need_return(lo)) {
+               const struct cred *cred;
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
                bool send;
 
-               send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+               send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we don't deadlock */
-                       pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+                       pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
@@ -1261,6 +1290,7 @@ _pnfs_return_layout(struct inode *ino)
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
+       const struct cred *cred;
        nfs4_stateid stateid;
        int status = 0;
        bool send, valid_layout;
@@ -1305,10 +1335,10 @@ _pnfs_return_layout(struct inode *ino)
                goto out_put_layout_hdr;
        }
 
-       send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
+       send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
        spin_unlock(&ino->i_lock);
        if (send)
-               status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
+               status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
 out_put_layout_hdr:
        pnfs_free_lseg_list(&tmp_list);
        pnfs_put_layout_hdr(lo);
@@ -1354,6 +1384,7 @@ bool pnfs_roc(struct inode *ino,
        struct nfs4_state *state;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *next;
+       const struct cred *lc_cred;
        nfs4_stateid stateid;
        enum pnfs_iomode iomode = 0;
        bool layoutreturn = false, roc = false;
@@ -1423,16 +1454,20 @@ retry:
         * 2. we don't send layoutreturn
         */
        /* lo ref dropped in pnfs_roc_release() */
-       layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+       layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
        /* If the creds don't match, we can't compound the layoutreturn */
-       if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0)
+       if (!layoutreturn)
                goto out_noroc;
+       if (cred_fscmp(cred, lc_cred) != 0)
+               goto out_noroc_put_cred;
 
        roc = layoutreturn;
        pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
        res->lrs_present = 0;
        layoutreturn = false;
 
+out_noroc_put_cred:
+       put_cred(lc_cred);
 out_noroc:
        spin_unlock(&ino->i_lock);
        rcu_read_unlock();
@@ -1445,7 +1480,7 @@ out_noroc:
                return true;
        }
        if (layoutreturn)
-               pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+               pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
        pnfs_put_layout_hdr(lo);
        return false;
 }
@@ -1859,15 +1894,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
 static void _add_to_server_list(struct pnfs_layout_hdr *lo,
                                struct nfs_server *server)
 {
-       if (list_empty(&lo->plh_layouts)) {
+       if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
                struct nfs_client *clp = server->nfs_client;
 
                /* The lo must be on the clp list if there is any
                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
                 */
                spin_lock(&clp->cl_lock);
-               if (list_empty(&lo->plh_layouts))
-                       list_add_tail(&lo->plh_layouts, &server->layouts);
+               list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
                spin_unlock(&clp->cl_lock);
        }
 }
@@ -2323,14 +2357,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 
        if (!pnfs_layout_is_valid(lo)) {
                /* We have a completely new layout */
-               pnfs_set_layout_stateid(lo, &res->stateid, true);
+               pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
        } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
                /* existing state ID, make sure the sequence number matches. */
                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
                        dprintk("%s forget reply due to sequence\n", __func__);
                        goto out_forget;
                }
-               pnfs_set_layout_stateid(lo, &res->stateid, false);
+               pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
        } else {
                /*
                 * We got an entirely new state ID.  Mark all segments for the
@@ -2423,43 +2457,159 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
        return -ENOENT;
 }
 
-void pnfs_error_mark_layout_for_return(struct inode *inode,
-                                      struct pnfs_layout_segment *lseg)
+static void
+pnfs_mark_layout_for_return(struct inode *inode,
+                           const struct pnfs_layout_range *range)
 {
-       struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
-       struct pnfs_layout_range range = {
-               .iomode = lseg->pls_range.iomode,
-               .offset = 0,
-               .length = NFS4_MAX_UINT64,
-       };
+       struct pnfs_layout_hdr *lo;
        bool return_now = false;
 
        spin_lock(&inode->i_lock);
+       lo = NFS_I(inode)->layout;
        if (!pnfs_layout_is_valid(lo)) {
                spin_unlock(&inode->i_lock);
                return;
        }
-       pnfs_set_plh_return_info(lo, range.iomode, 0);
+       pnfs_set_plh_return_info(lo, range->iomode, 0);
        /*
         * mark all matching lsegs so that we are sure to have no live
         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
         * for how it works.
         */
-       if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) {
+       if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+               const struct cred *cred;
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
 
-               return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+               return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
                spin_unlock(&inode->i_lock);
                if (return_now)
-                       pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+                       pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
        } else {
                spin_unlock(&inode->i_lock);
                nfs_commit_inode(inode, 0);
        }
 }
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+                                      struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_layout_range range = {
+               .iomode = lseg->pls_range.iomode,
+               .offset = 0,
+               .length = NFS4_MAX_UINT64,
+       };
+
+       pnfs_mark_layout_for_return(inode, &range);
+}
 EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
 
+static bool
+pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
+{
+       return pnfs_layout_is_valid(lo) &&
+               !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
+               !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+}
+
+static struct pnfs_layout_segment *
+pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
+                    const struct pnfs_layout_range *range,
+                    enum pnfs_iomode iomode)
+{
+       struct pnfs_layout_segment *lseg;
+
+       list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+               if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+                       continue;
+               if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+                       continue;
+               if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
+                       continue;
+               if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
+                       return lseg;
+       }
+       return NULL;
+}
+
+/* Find open file states whose mode matches that of the range */
+static bool
+pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
+                                const struct pnfs_layout_range *range)
+{
+       struct list_head *head;
+       struct nfs_open_context *ctx;
+       fmode_t mode = 0;
+
+       if (!pnfs_layout_can_be_returned(lo) ||
+           !pnfs_find_first_lseg(lo, range, range->iomode))
+               return false;
+
+       head = &NFS_I(lo->plh_inode)->open_files;
+       list_for_each_entry_rcu(ctx, head, list) {
+               if (ctx->state)
+                       mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
+       }
+
+       switch (range->iomode) {
+       default:
+               break;
+       case IOMODE_READ:
+               mode &= ~FMODE_WRITE;
+               break;
+       case IOMODE_RW:
+               if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
+                       mode &= ~FMODE_READ;
+       }
+       return mode == 0;
+}
+
+static int
+pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+{
+       const struct pnfs_layout_range *range = data;
+       struct pnfs_layout_hdr *lo;
+       struct inode *inode;
+restart:
+       rcu_read_lock();
+       list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+               if (!pnfs_layout_can_be_returned(lo) ||
+                   test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+                       continue;
+               inode = lo->plh_inode;
+               spin_lock(&inode->i_lock);
+               if (!pnfs_should_return_unused_layout(lo, range)) {
+                       spin_unlock(&inode->i_lock);
+                       continue;
+               }
+               spin_unlock(&inode->i_lock);
+               inode = pnfs_grab_inode_layout_hdr(lo);
+               if (!inode)
+                       continue;
+               rcu_read_unlock();
+               pnfs_mark_layout_for_return(inode, range);
+               iput(inode);
+               cond_resched();
+               goto restart;
+       }
+       rcu_read_unlock();
+       return 0;
+}
+
+void
+pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+                                enum pnfs_iomode iomode)
+{
+       struct pnfs_layout_range range = {
+               .iomode = iomode,
+               .offset = 0,
+               .length = NFS4_MAX_UINT64,
+       };
+
+       nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
+                       &range);
+}
+
 void
 pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
 {
@@ -2475,7 +2625,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
  * Check for any intersection between the request and the pgio->pg_lseg,
  * and if none, put this pgio->pg_lseg away.
  */
-static void
+void
 pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 {
        if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
@@ -2483,6 +2633,7 @@ pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page
                pgio->pg_lseg = NULL;
        }
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
 
 void
 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
@@ -3000,10 +3151,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
        end_pos = nfsi->layout->plh_lwb;
 
        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+       data->cred = get_cred(nfsi->layout->plh_lc_cred);
        spin_unlock(&inode->i_lock);
 
        data->args.inode = inode;
-       data->cred = get_cred(nfsi->layout->plh_lc_cred);
        nfs_fattr_init(&data->fattr);
        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
        data->res.fattr = &data->fattr;
index 0fafdad..8e0ada5 100644
@@ -66,6 +66,7 @@ struct nfs4_pnfs_ds {
 struct pnfs_layout_segment {
        struct list_head pls_list;
        struct list_head pls_lc_list;
+       struct list_head pls_commits;
        struct pnfs_layout_range pls_range;
        refcount_t pls_refcount;
        u32 pls_seq;
@@ -105,6 +106,7 @@ enum {
        NFS_LAYOUT_INVALID_STID,        /* layout stateid id is invalid */
        NFS_LAYOUT_FIRST_LAYOUTGET,     /* Serialize first layoutget */
        NFS_LAYOUT_INODE_FREEING,       /* The inode is being freed */
+       NFS_LAYOUT_HASHED,              /* The layout is visible */
 };
 
 enum layoutdriver_policy_flags {
@@ -148,22 +150,6 @@ struct pnfs_layoutdriver_type {
        const struct nfs_pageio_ops *pg_write_ops;
 
        struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
-       void (*mark_request_commit) (struct nfs_page *req,
-                                    struct pnfs_layout_segment *lseg,
-                                    struct nfs_commit_info *cinfo,
-                                    u32 ds_commit_idx);
-       void (*clear_request_commit) (struct nfs_page *req,
-                                     struct nfs_commit_info *cinfo);
-       int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
-                                 int max);
-       void (*recover_commit_reqs) (struct list_head *list,
-                                    struct nfs_commit_info *cinfo);
-       struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
-                                               struct page *page);
-       int (*commit_pagelist)(struct inode *inode,
-                              struct list_head *mds_pages,
-                              int how,
-                              struct nfs_commit_info *cinfo);
 
        int (*sync)(struct inode *inode, bool datasync);
 
@@ -186,6 +172,29 @@ struct pnfs_layoutdriver_type {
        int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
 };
 
+struct pnfs_commit_ops {
+       void (*setup_ds_info)(struct pnfs_ds_commit_info *,
+                             struct pnfs_layout_segment *);
+       void (*release_ds_info)(struct pnfs_ds_commit_info *,
+                               struct inode *inode);
+       int (*commit_pagelist)(struct inode *inode,
+                              struct list_head *mds_pages,
+                              int how,
+                              struct nfs_commit_info *cinfo);
+       void (*mark_request_commit) (struct nfs_page *req,
+                                    struct pnfs_layout_segment *lseg,
+                                    struct nfs_commit_info *cinfo,
+                                    u32 ds_commit_idx);
+       void (*clear_request_commit) (struct nfs_page *req,
+                                     struct nfs_commit_info *cinfo);
+       int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+                                 int max);
+       void (*recover_commit_reqs) (struct list_head *list,
+                                    struct nfs_commit_info *cinfo);
+       struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+                                               struct page *page);
+};
+
 struct pnfs_layout_hdr {
        refcount_t              plh_refcount;
        atomic_t                plh_outstanding; /* number of RPCs out */
@@ -203,6 +212,7 @@ struct pnfs_layout_hdr {
        loff_t                  plh_lwb; /* last write byte for layoutcommit */
        const struct cred       *plh_lc_cred; /* layoutcommit cred */
        struct inode            *plh_inode;
+       struct rcu_head         plh_rcu;
 };
 
 struct pnfs_device {
@@ -242,6 +252,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
+void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
 void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
@@ -267,6 +278,7 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
 void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
                             const nfs4_stateid *new,
+                            const struct cred *cred,
                             bool update_barrier);
 int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
@@ -326,6 +338,9 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 void pnfs_error_mark_layout_for_return(struct inode *inode,
                                       struct pnfs_layout_segment *lseg);
+void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+                                     enum pnfs_iomode iomode);
+
 /* nfs4_deviceid_flags */
 enum {
        NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
@@ -360,6 +375,16 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
 
 /* pnfs_nfs.c */
+struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags);
+void pnfs_free_commit_array(struct pnfs_commit_array *p);
+struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *,
+                                               struct pnfs_commit_array *,
+                                               struct pnfs_layout_segment *);
+
+void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_layout_segment *lseg);
+void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo);
+
 void pnfs_generic_clear_request_commit(struct nfs_page *req,
                                       struct nfs_commit_info *cinfo);
 void pnfs_generic_commit_release(void *calldata);
@@ -367,6 +392,8 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
 void pnfs_generic_rw_release(void *data);
 void pnfs_generic_recover_commit_reqs(struct list_head *dst,
                                      struct nfs_commit_info *cinfo);
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+                                                struct page *page);
 int pnfs_generic_commit_pagelist(struct inode *inode,
                                 struct list_head *mds_pages,
                                 int how,
@@ -438,9 +465,11 @@ static inline int
 pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
                 struct nfs_commit_info *cinfo)
 {
-       if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+       if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0)
                return PNFS_NOT_ATTEMPTED;
-       return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+       return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo);
 }
 
 static inline struct pnfs_ds_commit_info *
@@ -454,6 +483,28 @@ pnfs_get_ds_info(struct inode *inode)
 }
 
 static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+       struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode);
+       if (inode_cinfo != NULL)
+               fl_cinfo->ops = inode_cinfo->ops;
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+       INIT_LIST_HEAD(&fl_cinfo->commits);
+       fl_cinfo->ops = NULL;
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+       if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL)
+               fl_cinfo->ops->release_ds_info(fl_cinfo, inode);
+}
+
+static inline void
 pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
 {
        set_bit(NFS_DEVICEID_INVALID, &node->flags);
@@ -463,24 +514,22 @@ static inline bool
 pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
                         struct nfs_commit_info *cinfo, u32 ds_commit_idx)
 {
-       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
-       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 
-       if (lseg == NULL || ld->mark_request_commit == NULL)
+       if (!lseg || !fl_cinfo->ops->mark_request_commit)
                return false;
-       ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+       fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
        return true;
 }
 
 static inline bool
 pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
-       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
-       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 
-       if (ld == NULL || ld->clear_request_commit == NULL)
+       if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit)
                return false;
-       ld->clear_request_commit(req, cinfo);
+       fl_cinfo->ops->clear_request_commit(req, cinfo);
        return true;
 }
 
@@ -488,21 +537,31 @@ static inline int
 pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
                       int max)
 {
-       if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+       if (!fl_cinfo || fl_cinfo->nwritten == 0)
                return 0;
-       else
-               return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
+       return fl_cinfo->ops->scan_commit_lists(cinfo, max);
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+       if (fl_cinfo && fl_cinfo->nwritten != 0)
+               fl_cinfo->ops->recover_commit_reqs(head, cinfo);
 }
 
 static inline struct nfs_page *
 pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
                        struct page *page)
 {
-       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
 
-       if (ld == NULL || ld->search_commit_reqs == NULL)
+       if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
                return NULL;
-       return ld->search_commit_reqs(cinfo, page);
+       return fl_cinfo->ops->search_commit_reqs(cinfo, page);
 }
 
 /* Should the pNFS client commit and return the layout upon a setattr */
@@ -750,6 +809,21 @@ pnfs_get_ds_info(struct inode *inode)
        return NULL;
 }
 
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
 static inline bool
 pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
                         struct nfs_commit_info *cinfo, u32 ds_commit_idx)
@@ -770,6 +844,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
        return 0;
 }
 
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+}
+
 static inline struct nfs_page *
 pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
                        struct page *page)
index 8b37e7f..25f1355 100644
@@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata)
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
 
+static struct pnfs_layout_segment *
+pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket)
+{
+       if (list_empty(&bucket->committing) && list_empty(&bucket->written)) {
+               struct pnfs_layout_segment *freeme = bucket->lseg;
+               bucket->lseg = NULL;
+               return freeme;
+       }
+       return NULL;
+}
+
 /* The generic layer is about to remove the req from the commit list.
  * If this will make the bucket empty, it will need to put the lseg reference.
  * Note this must be called holding nfsi->commit_mutex
@@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req,
                bucket = list_first_entry(&req->wb_list,
                                          struct pnfs_commit_bucket,
                                          written);
-               freeme = bucket->wlseg;
-               bucket->wlseg = NULL;
+               freeme = pnfs_free_bucket_lseg(bucket);
        }
 out:
        nfs_request_remove_commit_list(req, cinfo);
@@ -87,10 +97,154 @@ out:
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
 
+struct pnfs_commit_array *
+pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags)
+{
+       struct pnfs_commit_array *p;
+       struct pnfs_commit_bucket *b;
+
+       p = kmalloc(struct_size(p, buckets, n), gfp_flags);
+       if (!p)
+               return NULL;
+       p->nbuckets = n;
+       INIT_LIST_HEAD(&p->cinfo_list);
+       INIT_LIST_HEAD(&p->lseg_list);
+       p->lseg = NULL;
+       for (b = &p->buckets[0]; n != 0; b++, n--) {
+               INIT_LIST_HEAD(&b->written);
+               INIT_LIST_HEAD(&b->committing);
+               b->lseg = NULL;
+               b->direct_verf.committed = NFS_INVALID_STABLE_HOW;
+       }
+       return p;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array);
+
+void
+pnfs_free_commit_array(struct pnfs_commit_array *p)
+{
+       kfree_rcu(p, rcu);
+}
+EXPORT_SYMBOL_GPL(pnfs_free_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_commit_array *array;
+
+       list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+               if (array->lseg == lseg)
+                       return array;
+       }
+       return NULL;
+}
+
+struct pnfs_commit_array *
+pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_commit_array *new,
+               struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_commit_array *array;
+
+       array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+       if (array)
+               return array;
+       new->lseg = lseg;
+       refcount_set(&new->refcount, 1);
+       list_add_rcu(&new->cinfo_list, &fl_cinfo->commits);
+       list_add(&new->lseg_list, &lseg->pls_commits);
+       return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_commit_array *array;
+
+       rcu_read_lock();
+       array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+       if (!array) {
+               rcu_read_unlock();
+               fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg);
+               rcu_read_lock();
+               array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+       }
+       rcu_read_unlock();
+       return array;
+}
+
+static void
+pnfs_release_commit_array_locked(struct pnfs_commit_array *array)
+{
+       list_del_rcu(&array->cinfo_list);
+       list_del(&array->lseg_list);
+       pnfs_free_commit_array(array);
+}
+
+static void
+pnfs_put_commit_array_locked(struct pnfs_commit_array *array)
+{
+       if (refcount_dec_and_test(&array->refcount))
+               pnfs_release_commit_array_locked(array);
+}
+
+static void
+pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode)
+{
+       if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) {
+               pnfs_release_commit_array_locked(array);
+               spin_unlock(&inode->i_lock);
+       }
+}
+
+static struct pnfs_commit_array *
+pnfs_get_commit_array(struct pnfs_commit_array *array)
+{
+       if (refcount_inc_not_zero(&array->refcount))
+               return array;
+       return NULL;
+}
+
+static void
+pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array)
+{
+       array->lseg = NULL;
+       list_del_init(&array->lseg_list);
+       pnfs_put_commit_array_locked(array);
+}
+
+void
+pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+               struct pnfs_layout_segment *lseg)
+{
+       struct pnfs_commit_array *array, *tmp;
+
+       list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list)
+               pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg);
+
+void
+pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo)
+{
+       struct pnfs_commit_array *array, *tmp;
+
+       list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list)
+               pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy);
+
+/*
+ * Locks the nfs_page requests for commit and moves them to
+ * @bucket->committing.
+ */
 static int
-pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
-                                struct nfs_commit_info *cinfo,
-                                int max)
+pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+                               struct nfs_commit_info *cinfo,
+                               int max)
 {
        struct list_head *src = &bucket->written;
        struct list_head *dst = &bucket->committing;
@@ -101,158 +255,253 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
        if (ret) {
                cinfo->ds->nwritten -= ret;
                cinfo->ds->ncommitting += ret;
-               if (bucket->clseg == NULL)
-                       bucket->clseg = pnfs_get_lseg(bucket->wlseg);
-               if (list_empty(src)) {
-                       pnfs_put_lseg(bucket->wlseg);
-                       bucket->wlseg = NULL;
-               }
        }
        return ret;
 }
 
+static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo,
+                                 struct pnfs_commit_bucket *buckets,
+                                 unsigned int nbuckets,
+                                 int max)
+{
+       unsigned int i;
+       int rv = 0, cnt;
+
+       for (i = 0; i < nbuckets && max != 0; i++) {
+               cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max);
+               rv += cnt;
+               max -= cnt;
+       }
+       return rv;
+}
+
 /* Move reqs from written to committing lists, returning the number
  * of reqs moved.
  */
-int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
-                                  int max)
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)
 {
-       int i, rv = 0, cnt;
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+       struct pnfs_commit_array *array;
+       int rv = 0, cnt;
 
-       lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
-       for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
-               cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
-                                                      cinfo, max);
-               max -= cnt;
+       rcu_read_lock();
+       list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+               if (!array->lseg || !pnfs_get_commit_array(array))
+                       continue;
+               rcu_read_unlock();
+               cnt = pnfs_bucket_scan_array(cinfo, array->buckets,
+                               array->nbuckets, max);
+               rcu_read_lock();
+               pnfs_put_commit_array(array, cinfo->inode);
                rv += cnt;
+               max -= cnt;
+               if (!max)
+                       break;
        }
+       rcu_read_unlock();
        return rv;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
 
-/* Pull everything off the committing lists and dump into @dst.  */
-void pnfs_generic_recover_commit_reqs(struct list_head *dst,
-                                     struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_bucket_recover_commit_reqs(struct list_head *dst,
+                               struct pnfs_commit_bucket *buckets,
+                               unsigned int nbuckets,
+                               struct nfs_commit_info *cinfo)
 {
        struct pnfs_commit_bucket *b;
        struct pnfs_layout_segment *freeme;
-       int nwritten;
-       int i;
+       unsigned int nwritten, ret = 0;
+       unsigned int i;
 
-       lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
 restart:
-       for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+       for (i = 0, b = buckets; i < nbuckets; i++, b++) {
                nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
                if (!nwritten)
                        continue;
-               cinfo->ds->nwritten -= nwritten;
-               if (list_empty(&b->written)) {
-                       freeme = b->wlseg;
-                       b->wlseg = NULL;
+               ret += nwritten;
+               freeme = pnfs_free_bucket_lseg(b);
+               if (freeme) {
                        pnfs_put_lseg(freeme);
                        goto restart;
                }
        }
+       return ret;
+}
+
+/* Pull everything off the committing lists and dump into @dst.  */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+                                     struct nfs_commit_info *cinfo)
+{
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+       struct pnfs_commit_array *array;
+       unsigned int nwritten;
+
+       lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+       rcu_read_lock();
+       list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+               if (!array->lseg || !pnfs_get_commit_array(array))
+                       continue;
+               rcu_read_unlock();
+               nwritten = pnfs_bucket_recover_commit_reqs(dst,
+                                                          array->buckets,
+                                                          array->nbuckets,
+                                                          cinfo);
+               rcu_read_lock();
+               pnfs_put_commit_array(array, cinfo->inode);
+               fl_cinfo->nwritten -= nwritten;
+       }
+       rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
 
-static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
+static struct nfs_page *
+pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
+               unsigned int nbuckets, struct page *page)
+{
+       struct nfs_page *req;
+       struct pnfs_commit_bucket *b;
+       unsigned int i;
+
+       /* Linearly search the commit lists for each bucket until a matching
+        * request is found */
+       for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+               list_for_each_entry(req, &b->written, wb_list) {
+                       if (req->wb_page == page)
+                               return req->wb_head;
+               }
+               list_for_each_entry(req, &b->committing, wb_list) {
+                       if (req->wb_page == page)
+                               return req->wb_head;
+               }
+       }
+       return NULL;
+}
+
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
+ *                                for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns the head request if one is found, otherwise returns NULL.
+ */
+struct nfs_page *
+pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
 {
        struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+       struct pnfs_commit_array *array;
+       struct nfs_page *req;
+
+       list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
+               req = pnfs_bucket_search_commit_reqs(array->buckets,
+                               array->nbuckets, page);
+               if (req)
+                       return req;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
+
+static struct pnfs_layout_segment *
+pnfs_bucket_get_committing(struct list_head *head,
+                          struct pnfs_commit_bucket *bucket,
+                          struct nfs_commit_info *cinfo)
+{
+       struct list_head *pos;
+
+       list_for_each(pos, &bucket->committing)
+               cinfo->ds->ncommitting--;
+       list_splice_init(&bucket->committing, head);
+       return pnfs_free_bucket_lseg(bucket);
+}
+
+static struct nfs_commit_data *
+pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
+                            struct nfs_commit_info *cinfo)
+{
+       struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+
+       if (!data)
+               return NULL;
+       data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo);
+       if (!data->lseg)
+               data->lseg = pnfs_get_lseg(bucket->lseg);
+       return data;
+}
+
+static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets,
+                                     unsigned int nbuckets,
+                                     struct nfs_commit_info *cinfo,
+                                     unsigned int idx)
+{
        struct pnfs_commit_bucket *bucket;
        struct pnfs_layout_segment *freeme;
-       struct list_head *pos;
        LIST_HEAD(pages);
-       int i;
 
-       mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-       for (i = idx; i < fl_cinfo->nbuckets; i++) {
-               bucket = &fl_cinfo->buckets[i];
+       for (bucket = buckets; idx < nbuckets; bucket++, idx++) {
                if (list_empty(&bucket->committing))
                        continue;
-               freeme = bucket->clseg;
-               bucket->clseg = NULL;
-               list_for_each(pos, &bucket->committing)
-                       cinfo->ds->ncommitting--;
-               list_splice_init(&bucket->committing, &pages);
+               mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+               freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo);
                mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-               nfs_retry_commit(&pages, freeme, cinfo, i);
+               nfs_retry_commit(&pages, freeme, cinfo, idx);
                pnfs_put_lseg(freeme);
-               mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
        }
-       mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
 }
 
 static unsigned int
-pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
-                             struct list_head *list)
+pnfs_bucket_alloc_ds_commits(struct list_head *list,
+                            struct pnfs_commit_bucket *buckets,
+                            unsigned int nbuckets,
+                            struct nfs_commit_info *cinfo)
 {
-       struct pnfs_ds_commit_info *fl_cinfo;
        struct pnfs_commit_bucket *bucket;
        struct nfs_commit_data *data;
-       int i;
+       unsigned int i;
        unsigned int nreq = 0;
 
-       fl_cinfo = cinfo->ds;
-       bucket = fl_cinfo->buckets;
-       for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+       for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) {
                if (list_empty(&bucket->committing))
                        continue;
-               data = nfs_commitdata_alloc(false);
-               if (!data)
-                       break;
-               data->ds_commit_index = i;
-               list_add(&data->pages, list);
-               nreq++;
+               mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+               if (!list_empty(&bucket->committing)) {
+                       data = pnfs_bucket_fetch_commitdata(bucket, cinfo);
+                       if (!data)
+                               goto out_error;
+                       data->ds_commit_index = i;
+                       list_add_tail(&data->list, list);
+                       atomic_inc(&cinfo->mds->rpcs_out);
+                       nreq++;
+               }
+               mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
        }
-
+       return nreq;
+out_error:
+       mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
        /* Clean up on error */
-       pnfs_generic_retry_commit(cinfo, i);
+       pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i);
        return nreq;
 }
 
-static inline
-void pnfs_fetch_commit_bucket_list(struct list_head *pages,
-               struct nfs_commit_data *data,
-               struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_alloc_ds_commits_list(struct list_head *list,
+                          struct pnfs_ds_commit_info *fl_cinfo,
+                          struct nfs_commit_info *cinfo)
 {
-       struct pnfs_commit_bucket *bucket;
-       struct list_head *pos;
-
-       bucket = &cinfo->ds->buckets[data->ds_commit_index];
-       mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-       list_for_each(pos, &bucket->committing)
-               cinfo->ds->ncommitting--;
-       list_splice_init(&bucket->committing, pages);
-       data->lseg = bucket->clseg;
-       bucket->clseg = NULL;
-       mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-
-}
+       struct pnfs_commit_array *array;
+       unsigned int ret = 0;
 
-/* Helper function for pnfs_generic_commit_pagelist to catch an empty
- * page list. This can happen when two commits race.
- *
- * This must be called instead of nfs_init_commit - call one or the other, but
- * not both!
- */
-static bool
-pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
-                                         struct nfs_commit_data *data,
-                                         struct nfs_commit_info *cinfo)
-{
-       if (list_empty(pages)) {
-               if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
-                       wake_up_var(&cinfo->mds->rpcs_out);
-               /* don't call nfs_commitdata_release - it tries to put
-                * the open_context which is not acquired until nfs_init_commit
-                * which has not been called on @data */
-               WARN_ON_ONCE(data->context);
-               nfs_commit_free(data);
-               return true;
+       rcu_read_lock();
+       list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+               if (!array->lseg || !pnfs_get_commit_array(array))
+                       continue;
+               rcu_read_unlock();
+               ret += pnfs_bucket_alloc_ds_commits(list, array->buckets,
+                               array->nbuckets, cinfo);
+               rcu_read_lock();
+               pnfs_put_commit_array(array, cinfo->inode);
        }
-
-       return false;
+       return ret;
 }
 
 /* This follows nfs_commit_list pretty closely */
@@ -262,6 +511,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
                             int (*initiate_commit)(struct nfs_commit_data *data,
                                                    int how))
 {
+       struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
        struct nfs_commit_data *data, *tmp;
        LIST_HEAD(list);
        unsigned int nreq = 0;
@@ -269,40 +519,25 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
        if (!list_empty(mds_pages)) {
                data = nfs_commitdata_alloc(true);
                data->ds_commit_index = -1;
-               list_add(&data->pages, &list);
+               list_splice_init(mds_pages, &data->pages);
+               list_add_tail(&data->list, &list);
+               atomic_inc(&cinfo->mds->rpcs_out);
                nreq++;
        }
 
-       nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
-
+       nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo);
        if (nreq == 0)
                goto out;
 
-       atomic_add(nreq, &cinfo->mds->rpcs_out);
-
-       list_for_each_entry_safe(data, tmp, &list, pages) {
-               list_del_init(&data->pages);
+       list_for_each_entry_safe(data, tmp, &list, list) {
+               list_del(&data->list);
                if (data->ds_commit_index < 0) {
-                       /* another commit raced with us */
-                       if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
-                               data, cinfo))
-                               continue;
-
-                       nfs_init_commit(data, mds_pages, NULL, cinfo);
+                       nfs_init_commit(data, NULL, NULL, cinfo);
                        nfs_initiate_commit(NFS_CLIENT(inode), data,
                                            NFS_PROTO(data->inode),
                                            data->mds_ops, how, 0);
                } else {
-                       LIST_HEAD(pages);
-
-                       pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
-
-                       /* another commit raced with us */
-                       if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
-                               data, cinfo))
-                               continue;
-
-                       nfs_init_commit(data, &pages, data->lseg, cinfo);
+                       nfs_init_commit(data, NULL, data->lseg, cinfo);
                        initiate_commit(data, how);
                }
        }
@@ -930,32 +1165,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
                                u32 ds_commit_idx)
 {
        struct list_head *list;
-       struct pnfs_commit_bucket *buckets;
+       struct pnfs_commit_array *array;
+       struct pnfs_commit_bucket *bucket;
 
        mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-       buckets = cinfo->ds->buckets;
-       list = &buckets[ds_commit_idx].written;
-       if (list_empty(list)) {
-               if (!pnfs_is_valid_lseg(lseg)) {
-                       mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-                       cinfo->completion_ops->resched_write(cinfo, req);
-                       return;
-               }
-               /* Non-empty buckets hold a reference on the lseg.  That ref
-                * is normally transferred to the COMMIT call and released
-                * there.  It could also be released if the last req is pulled
-                * off due to a rewrite, in which case it will be done in
-                * pnfs_common_clear_request_commit
-                */
-               WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
-               buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
-       }
+       array = pnfs_lookup_commit_array(cinfo->ds, lseg);
+       if (!array || !pnfs_is_valid_lseg(lseg))
+               goto out_resched;
+       bucket = &array->buckets[ds_commit_idx];
+       list = &bucket->written;
+       /* Non-empty buckets hold a reference on the lseg.  That ref
+        * is normally transferred to the COMMIT call and released
+        * there.  It could also be released if the last req is pulled
+        * off due to a rewrite, in which case it will be done in
+        * pnfs_common_clear_request_commit
+        */
+       if (!bucket->lseg)
+               bucket->lseg = pnfs_get_lseg(lseg);
        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
        cinfo->ds->nwritten++;
 
        nfs_request_add_commit_list_locked(req, list, cinfo);
        mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
        nfs_mark_page_unstable(req->wb_page, cinfo);
+       return;
+out_resched:
+       mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+       cinfo->completion_ops->resched_write(cinfo, req);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 
index 34bb9ad..13b22e8 100644 (file)
@@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task,
        trace_nfs_readpage_done(task, hdr);
 
        if (task->tk_status == -ESTALE) {
-               set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+               nfs_set_inode_stale(inode);
                nfs_mark_for_revalidate(inode);
        }
        return 0;
index bb14bed..59ef3b1 100644 (file)
@@ -176,6 +176,41 @@ void nfs_sb_deactive(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(nfs_sb_deactive);
 
+static int __nfs_list_for_each_server(struct list_head *head,
+               int (*fn)(struct nfs_server *, void *),
+               void *data)
+{
+       struct nfs_server *server, *last = NULL;
+       int ret = 0;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(server, head, client_link) {
+               if (!nfs_sb_active(server->super))
+                       continue;
+               rcu_read_unlock();
+               if (last)
+                       nfs_sb_deactive(last->super);
+               last = server;
+               ret = fn(server, data);
+               if (ret)
+                       goto out;
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+out:
+       if (last)
+               nfs_sb_deactive(last->super);
+       return ret;
+}
+
+int nfs_client_for_each_server(struct nfs_client *clp,
+               int (*fn)(struct nfs_server *, void *),
+               void *data)
+{
+       return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
 /*
  * Deliver file system statistics to userspace
  */
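
For reference, a standalone userspace sketch of the hand-over-hand reference pattern used by __nfs_list_for_each_server() above: pin the current entry, drop the list lock around the callback, and only release the previous entry after moving on. A plain mutex and counter stand in for the RCU read lock and the superblock active count; all names below are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

struct server {
	struct server *next;
	int active;                 /* stands in for nfs_sb_active()/nfs_sb_deactive() */
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static int for_each_server(struct server *head,
			   int (*fn)(struct server *, void *), void *data)
{
	struct server *srv, *last = NULL;
	int ret = 0;

	pthread_mutex_lock(&list_lock);
	for (srv = head; srv; srv = srv->next) {
		srv->active++;                  /* pin the current entry */
		pthread_mutex_unlock(&list_lock);
		if (last)
			last->active--;         /* previous entry may go away now */
		last = srv;
		ret = fn(srv, data);
		if (ret)
			goto out;
		pthread_mutex_lock(&list_lock); /* relock before following ->next */
	}
	pthread_mutex_unlock(&list_lock);
out:
	if (last)
		last->active--;
	return ret;
}

static int print_server(struct server *srv, void *data)
{
	(void)data;
	printf("server %d\n", srv->id);
	return 0;
}

int main(void)
{
	struct server c = { NULL, 0, 3 }, b = { &c, 0, 2 }, a = { &b, 0, 1 };

	return for_each_server(&a, print_server, NULL);
}
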
index 0effeee..b27ebdc 100644 (file)
@@ -98,7 +98,7 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
                .callback_ops = &nfs_unlink_ops,
                .callback_data = data,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
        struct rpc_task *task;
        struct inode *dir = d_inode(data->dentry->d_parent);
@@ -341,7 +341,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
                .callback_ops = &nfs_rename_ops,
                .workqueue = nfsiod_workqueue,
                .rpc_client = NFS_CLIENT(old_dir),
-               .flags = RPC_TASK_ASYNC,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
        };
 
        data = kzalloc(sizeof(*data), GFP_KERNEL);
index c478b77..df4b87c 100644 (file)
@@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
                kref_put(&ioc->refcount, nfs_io_completion_release);
 }
 
+static void
+nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
+{
+       if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
+               kref_get(&req->wb_kref);
+               atomic_long_inc(&NFS_I(inode)->nrequests);
+       }
+}
+
+static int
+nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+{
+       int ret;
+
+       if (!test_bit(PG_REMOVE, &req->wb_flags))
+               return 0;
+       ret = nfs_page_group_lock(req);
+       if (ret)
+               return ret;
+       if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
+               nfs_page_set_inode_ref(req, inode);
+       nfs_page_group_unlock(req);
+       return 0;
+}
+
 static struct nfs_page *
 nfs_page_private_request(struct page *page)
 {
@@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page)
        return req;
 }
 
+static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
+{
+       struct inode *inode = page_file_mapping(page)->host;
+       struct nfs_page *req, *head;
+       int ret;
+
+       for (;;) {
+               req = nfs_page_find_head_request(page);
+               if (!req)
+                       return req;
+               head = nfs_page_group_lock_head(req);
+               if (head != req)
+                       nfs_release_request(req);
+               if (IS_ERR(head))
+                       return head;
+               ret = nfs_cancel_remove_inode(head, inode);
+               if (ret < 0) {
+                       nfs_unlock_and_release_request(head);
+                       return ERR_PTR(ret);
+               }
+               /* Ensure that nobody removed the request before we locked it */
+               if (head == nfs_page_private_request(page))
+                       break;
+               if (PageSwapCache(page))
+                       break;
+               nfs_unlock_and_release_request(head);
+       }
+       return head;
+}
+
 /* Adjust the file length if we're writing beyond the end */
 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
 {
@@ -380,34 +435,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 }
 
 /*
- * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req
- *
- * this is a helper function for nfs_lock_and_join_requests
- *
- * @inode - inode associated with request page group, must be holding inode lock
- * @head  - head request of page group, must be holding head lock
- * @req   - request that couldn't lock and needs to wait on the req bit lock
- *
- * NOTE: this must be called holding page_group bit lock
- *       which will be released before returning.
- *
- * returns 0 on success, < 0 on error.
- */
-static void
-nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
-                         struct nfs_page *req)
-{
-       struct nfs_page *tmp;
-
-       /* relinquish all the locks successfully grabbed this run */
-       for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
-               if (!kref_read(&tmp->wb_kref))
-                       continue;
-               nfs_unlock_and_release_request(tmp);
-       }
-}
-
-/*
  * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
  *
  * @destroy_list - request list (using wb_this_page) terminated by @old_head
@@ -428,22 +455,29 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
                destroy_list = (subreq->wb_this_page == old_head) ?
                                   NULL : subreq->wb_this_page;
 
+               /* Note: lock subreq in order to change subreq->wb_head */
+               nfs_page_set_headlock(subreq);
                WARN_ON_ONCE(old_head != subreq->wb_head);
 
                /* make sure old group is not used */
                subreq->wb_this_page = subreq;
+               subreq->wb_head = subreq;
 
                clear_bit(PG_REMOVE, &subreq->wb_flags);
 
                /* Note: races with nfs_page_group_destroy() */
                if (!kref_read(&subreq->wb_kref)) {
                        /* Check if we raced with nfs_page_group_destroy() */
-                       if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
+                       if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+                               nfs_page_clear_headlock(subreq);
                                nfs_free_request(subreq);
+                       } else
+                               nfs_page_clear_headlock(subreq);
                        continue;
                }
+               nfs_page_clear_headlock(subreq);
 
-               subreq->wb_head = subreq;
+               nfs_release_request(old_head);
 
                if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
                        nfs_release_request(subreq);
@@ -457,105 +491,43 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 }
 
 /*
- * nfs_lock_and_join_requests - join all subreqs to the head req and return
- *                              a locked reference, cancelling any pending
- *                              operations for this page.
- *
- * @page - the page used to lookup the "page group" of nfs_page structures
+ * nfs_join_page_group - destroy subrequests of the head req
+ * @head: the head request of the "page group" of nfs_page structures
+ * @inode: Inode to which the request belongs.
  *
  * This function joins all sub requests to the head request by first
  * locking all requests in the group, cancelling any pending operations
  * and finally updating the head request to cover the whole range covered by
  * the (former) group.  All subrequests are removed from any write or commit
  * lists, unlinked from the group and destroyed.
- *
- * Returns a locked, referenced pointer to the head request - which after
- * this call is guaranteed to be the only request associated with the page.
- * Returns NULL if no requests are found for @page, or a ERR_PTR if an
- * error was encountered.
  */
-static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page)
+void
+nfs_join_page_group(struct nfs_page *head, struct inode *inode)
 {
-       struct inode *inode = page_file_mapping(page)->host;
-       struct nfs_page *head, *subreq;
+       struct nfs_page *subreq;
        struct nfs_page *destroy_list = NULL;
-       unsigned int total_bytes;
-       int ret;
+       unsigned int pgbase, off, bytes;
 
-try_again:
-       /*
-        * A reference is taken only on the head request which acts as a
-        * reference to the whole page group - the group will not be destroyed
-        * until the head reference is released.
-        */
-       head = nfs_page_find_head_request(page);
-       if (!head)
-               return NULL;
-
-       /* lock the page head first in order to avoid an ABBA inefficiency */
-       if (!nfs_lock_request(head)) {
-               ret = nfs_wait_on_request(head);
-               nfs_release_request(head);
-               if (ret < 0)
-                       return ERR_PTR(ret);
-               goto try_again;
-       }
-
-       /* Ensure that nobody removed the request before we locked it */
-       if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
-               nfs_unlock_and_release_request(head);
-               goto try_again;
-       }
-
-       ret = nfs_page_group_lock(head);
-       if (ret < 0)
-               goto release_request;
-
-       /* lock each request in the page group */
-       total_bytes = head->wb_bytes;
+       pgbase = head->wb_pgbase;
+       bytes = head->wb_bytes;
+       off = head->wb_offset;
        for (subreq = head->wb_this_page; subreq != head;
                        subreq = subreq->wb_this_page) {
-
-               if (!kref_get_unless_zero(&subreq->wb_kref)) {
-                       if (subreq->wb_offset == head->wb_offset + total_bytes)
-                               total_bytes += subreq->wb_bytes;
-                       continue;
-               }
-
-               while (!nfs_lock_request(subreq)) {
-                       /*
-                        * Unlock page to allow nfs_page_group_sync_on_bit()
-                        * to succeed
-                        */
-                       nfs_page_group_unlock(head);
-                       ret = nfs_wait_on_request(subreq);
-                       if (!ret)
-                               ret = nfs_page_group_lock(head);
-                       if (ret < 0) {
-                               nfs_unroll_locks(inode, head, subreq);
-                               nfs_release_request(subreq);
-                               goto release_request;
-                       }
-               }
-               /*
-                * Subrequests are always contiguous, non overlapping
-                * and in order - but may be repeated (mirrored writes).
-                */
-               if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
-                       /* keep track of how many bytes this group covers */
-                       total_bytes += subreq->wb_bytes;
-               } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
-                           ((subreq->wb_offset + subreq->wb_bytes) >
-                            (head->wb_offset + total_bytes)))) {
-                       nfs_page_group_unlock(head);
-                       nfs_unroll_locks(inode, head, subreq);
-                       nfs_unlock_and_release_request(subreq);
-                       ret = -EIO;
-                       goto release_request;
+               /* Subrequests should always form a contiguous range */
+               if (pgbase > subreq->wb_pgbase) {
+                       off -= pgbase - subreq->wb_pgbase;
+                       bytes += pgbase - subreq->wb_pgbase;
+                       pgbase = subreq->wb_pgbase;
                }
+               bytes = max(subreq->wb_pgbase + subreq->wb_bytes
+                               - pgbase, bytes);
        }
 
+       /* Set the head request's range to cover the former page group */
+       head->wb_pgbase = pgbase;
+       head->wb_bytes = bytes;
+       head->wb_offset = off;
+
        /* Now that all requests are locked, make sure they aren't on any list.
         * Commit list removal accounting is done after locks are dropped */
        subreq = head;
@@ -569,36 +541,52 @@ try_again:
                /* destroy list will be terminated by head */
                destroy_list = head->wb_this_page;
                head->wb_this_page = head;
-
-               /* change head request to cover whole range that
-                * the former page group covered */
-               head->wb_bytes = total_bytes;
        }
 
-       /* Postpone destruction of this request */
-       if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
-               set_bit(PG_INODE_REF, &head->wb_flags);
-               kref_get(&head->wb_kref);
-               atomic_long_inc(&NFS_I(inode)->nrequests);
-       }
+       nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+}
 
-       nfs_page_group_unlock(head);
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req
+ * @page: the page used to lookup the "page group" of nfs_page structures
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group.  All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or an ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page)
+{
+       struct inode *inode = page_file_mapping(page)->host;
+       struct nfs_page *head;
+       int ret;
 
-       nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+       /*
+        * A reference is taken only on the head request which acts as a
+        * reference to the whole page group - the group will not be destroyed
+        * until the head reference is released.
+        */
+       head = nfs_find_and_lock_page_request(page);
+       if (IS_ERR_OR_NULL(head))
+               return head;
 
-       /* Did we lose a race with nfs_inode_remove_request()? */
-       if (!(PagePrivate(page) || PageSwapCache(page))) {
+       /* lock each request in the page group */
+       ret = nfs_page_group_lock_subrequests(head);
+       if (ret < 0) {
                nfs_unlock_and_release_request(head);
-               return NULL;
+               return ERR_PTR(ret);
        }
 
-       /* still holds ref on head from nfs_page_find_head_request
-        * and still has lock on head from lock loop */
-       return head;
+       nfs_join_page_group(head, inode);
 
-release_request:
-       nfs_unlock_and_release_request(head);
-       return ERR_PTR(ret);
+       return head;
 }
 
 static void nfs_write_error(struct nfs_page *req, int error)
@@ -1707,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
                .callback_ops = call_ops,
                .callback_data = data,
                .workqueue = nfsiod_workqueue,
-               .flags = RPC_TASK_ASYNC | flags,
+               .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
                .priority = priority,
        };
        /* Set up the initial task struct.  */
@@ -1746,14 +1734,19 @@ void nfs_init_commit(struct nfs_commit_data *data,
                     struct pnfs_layout_segment *lseg,
                     struct nfs_commit_info *cinfo)
 {
-       struct nfs_page *first = nfs_list_entry(head->next);
-       struct nfs_open_context *ctx = nfs_req_openctx(first);
-       struct inode *inode = d_inode(ctx->dentry);
+       struct nfs_page *first;
+       struct nfs_open_context *ctx;
+       struct inode *inode;
 
        /* Set up the RPC argument and reply structs
         * NB: take care not to mess about with data->commit et al. */
 
-       list_splice_init(head, &data->pages);
+       if (head)
+               list_splice_init(head, &data->pages);
+
+       first = nfs_list_entry(data->pages.next);
+       ctx = nfs_req_openctx(first);
+       inode = d_inode(ctx->dentry);
 
        data->inode       = inode;
        data->cred        = ctx->cred;
@@ -1869,8 +1862,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 
                /* Okay, COMMIT succeeded, apparently. Check the verifier
                 * returned by the server against all stored verfs. */
-               if (verf->committed > NFS_UNSTABLE &&
-                   !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) {
+               if (nfs_write_match_verf(verf, req)) {
                        /* We have a match */
                        if (req->wb_page)
                                nfs_inode_remove_request(req);
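
The new nfs_join_page_group() above widens the head request so it spans every subrequest in the page group. A standalone sketch of that range arithmetic, using illustrative struct and field names rather than the kernel's:

#include <stdio.h>

struct req {
	unsigned int pgbase;          /* offset within the page */
	unsigned int bytes;           /* length of this subrequest */
	unsigned long long offset;    /* file offset */
};

/* Widen reqs[0] so it spans every range in reqs[0..n-1]. */
static void join_ranges(struct req *reqs, int n)
{
	unsigned int pgbase = reqs[0].pgbase;
	unsigned int bytes = reqs[0].bytes;
	unsigned long long off = reqs[0].offset;

	for (int i = 1; i < n; i++) {
		if (pgbase > reqs[i].pgbase) {
			off -= pgbase - reqs[i].pgbase;
			bytes += pgbase - reqs[i].pgbase;
			pgbase = reqs[i].pgbase;
		}
		if (reqs[i].pgbase + reqs[i].bytes - pgbase > bytes)
			bytes = reqs[i].pgbase + reqs[i].bytes - pgbase;
	}
	reqs[0].pgbase = pgbase;
	reqs[0].bytes = bytes;
	reqs[0].offset = off;
}

int main(void)
{
	struct req reqs[] = {
		{ .pgbase = 1024, .bytes = 512,  .offset = 5120 },
		{ .pgbase = 512,  .bytes = 512,  .offset = 4608 },
		{ .pgbase = 1536, .bytes = 1024, .offset = 5632 },
	};

	join_ranges(reqs, 3);
	printf("pgbase=%u bytes=%u offset=%llu\n",
	       reqs[0].pgbase, reqs[0].bytes, reqs[0].offset);
	/* prints: pgbase=512 bytes=2048 offset=4608 */
	return 0;
}
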
index 9fc47c2..9709cf2 100644 (file)
@@ -36,6 +36,13 @@ static int ovl_ccup_get(char *buf, const struct kernel_param *param)
 module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644);
 MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing");
 
+static bool ovl_must_copy_xattr(const char *name)
+{
+       return !strcmp(name, XATTR_POSIX_ACL_ACCESS) ||
+              !strcmp(name, XATTR_POSIX_ACL_DEFAULT) ||
+              !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+}
+
 int ovl_copy_xattr(struct dentry *old, struct dentry *new)
 {
        ssize_t list_size, size, value_size = 0;
@@ -107,8 +114,13 @@ retry:
                        continue; /* Discard */
                }
                error = vfs_setxattr(new, name, value, size, 0);
-               if (error)
-                       break;
+               if (error) {
+                       if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
+                               break;
+
+                       /* Ignore failure to copy unknown xattrs */
+                       error = 0;
+               }
        }
        kfree(value);
 out:
index 8e57d53..279009d 100644 (file)
@@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
        return err;
 }
 
-static struct dentry *ovl_lookup_temp(struct dentry *workdir)
+struct dentry *ovl_lookup_temp(struct dentry *workdir)
 {
        struct dentry *temp;
        char name[20];
@@ -243,6 +243,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
 
        ovl_dir_modified(dentry->d_parent, false);
        ovl_dentry_set_upper_alias(dentry);
+       ovl_dentry_update_reval(dentry, newdentry,
+                       DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
        if (!hardlink) {
                /*
                 * ovl_obtain_alias() can be called after ovl_create_real()
@@ -819,6 +822,28 @@ static bool ovl_pure_upper(struct dentry *dentry)
               !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry));
 }
 
+static void ovl_drop_nlink(struct dentry *dentry)
+{
+       struct inode *inode = d_inode(dentry);
+       struct dentry *alias;
+
+       /* Try to find another, hashed alias */
+       spin_lock(&inode->i_lock);
+       hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+               if (alias != dentry && !d_unhashed(alias))
+                       break;
+       }
+       spin_unlock(&inode->i_lock);
+
+       /*
+        * Changes to underlying layers may cause i_nlink to lose sync with
+        * reality.  In this case prevent the link count from going to zero
+        * prematurely.
+        */
+       if (inode->i_nlink > !!alias)
+               drop_nlink(inode);
+}
+
 static int ovl_do_remove(struct dentry *dentry, bool is_dir)
 {
        int err;
@@ -856,7 +881,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
                if (is_dir)
                        clear_nlink(dentry->d_inode);
                else
-                       drop_nlink(dentry->d_inode);
+                       ovl_drop_nlink(dentry);
        }
        ovl_nlink_end(dentry);
 
@@ -1201,7 +1226,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
                if (new_is_dir)
                        clear_nlink(d_inode(new));
                else
-                       drop_nlink(d_inode(new));
+                       ovl_drop_nlink(new);
        }
 
        ovl_dir_modified(old->d_parent, ovl_type_origin(old) ||
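
The new ovl_drop_nlink() above only decrements the link count while it stays above the number of hashed aliases it found, using "!!alias" to collapse the pointer to 0 or 1. A tiny illustrative sketch of that guard; names and values here are made up:

#include <stdio.h>

static unsigned int drop_nlink_guarded(unsigned int nlink, const void *other_alias)
{
	/* need at least 1 link, or 2 if another hashed alias still exists */
	if (nlink > !!other_alias)
		nlink--;
	return nlink;
}

int main(void)
{
	void *alias = &alias;   /* pretend another hashed alias was found */

	printf("%u\n", drop_nlink_guarded(1, NULL));   /* 0: safe to reach zero */
	printf("%u\n", drop_nlink_guarded(1, alias));  /* 1: kept, alias remains */
	printf("%u\n", drop_nlink_guarded(2, alias));  /* 1 */
	return 0;
}
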
index 6f54d70..475c61f 100644 (file)
@@ -308,29 +308,35 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
                ovl_set_flag(OVL_UPPERDATA, inode);
 
        dentry = d_find_any_alias(inode);
-       if (!dentry) {
-               dentry = d_alloc_anon(inode->i_sb);
-               if (!dentry)
-                       goto nomem;
-               oe = ovl_alloc_entry(lower ? 1 : 0);
-               if (!oe)
-                       goto nomem;
-
-               if (lower) {
-                       oe->lowerstack->dentry = dget(lower);
-                       oe->lowerstack->layer = lowerpath->layer;
-               }
-               dentry->d_fsdata = oe;
-               if (upper_alias)
-                       ovl_dentry_set_upper_alias(dentry);
+       if (dentry)
+               goto out_iput;
+
+       dentry = d_alloc_anon(inode->i_sb);
+       if (unlikely(!dentry))
+               goto nomem;
+       oe = ovl_alloc_entry(lower ? 1 : 0);
+       if (!oe)
+               goto nomem;
+
+       if (lower) {
+               oe->lowerstack->dentry = dget(lower);
+               oe->lowerstack->layer = lowerpath->layer;
        }
+       dentry->d_fsdata = oe;
+       if (upper_alias)
+               ovl_dentry_set_upper_alias(dentry);
+
+       ovl_dentry_update_reval(dentry, upper,
+                       DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
 
        return d_instantiate_anon(dentry, inode);
 
 nomem:
-       iput(inode);
        dput(dentry);
-       return ERR_PTR(-ENOMEM);
+       dentry = ERR_PTR(-ENOMEM);
+out_iput:
+       iput(inode);
+       return dentry;
 }
 
 /* Get the upper or lower dentry in stack which is on layer @idx */
index 79e8994..b0d42ec 100644 (file)
@@ -79,6 +79,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
 {
        bool samefs = ovl_same_fs(dentry->d_sb);
        unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+       unsigned int xinoshift = 64 - xinobits;
 
        if (samefs) {
                /*
@@ -89,22 +90,22 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
                stat->dev = dentry->d_sb->s_dev;
                return 0;
        } else if (xinobits) {
-               unsigned int shift = 64 - xinobits;
                /*
                 * All inode numbers of underlying fs should not be using the
                 * high xinobits, so we use high xinobits to partition the
                 * overlay st_ino address space. The high bits holds the fsid
-                * (upper fsid is 0). This way overlay inode numbers are unique
-                * and all inodes use overlay st_dev. Inode numbers are also
-                * persistent for a given layer configuration.
+                * (upper fsid is 0). The lowest xinobit is reserved for mapping
+                * the non-persistent inode number range in case of overflow.
+                * This way all overlay inode numbers are unique and use the
+                * overlay st_dev.
                 */
-               if (stat->ino >> shift) {
-                       pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
-                                           dentry, stat->ino, xinobits);
-               } else {
-                       stat->ino |= ((u64)fsid) << shift;
+               if (likely(!(stat->ino >> xinoshift))) {
+                       stat->ino |= ((u64)fsid) << (xinoshift + 1);
                        stat->dev = dentry->d_sb->s_dev;
                        return 0;
+               } else if (ovl_xino_warn(dentry->d_sb)) {
+                       pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+                                           dentry, stat->ino, xinobits);
                }
        }
 
@@ -504,7 +505,7 @@ static const struct address_space_operations ovl_aops = {
 
 /*
  * It is possible to stack overlayfs instance on top of another
- * overlayfs instance as lower layer. We need to annonate the
+ * overlayfs instance as lower layer. We need to annotate the
  * stackable i_mutex locks according to stack level of the super
  * block instance. An overlayfs instance can never be in stack
  * depth 0 (there is always a real fs below it).  An overlayfs
@@ -561,27 +562,73 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
 #endif
 }
 
-static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
-                          unsigned long ino, int fsid)
+static void ovl_next_ino(struct inode *inode)
+{
+       struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+       inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+       if (unlikely(!inode->i_ino))
+               inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+}
+
+static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
 {
        int xinobits = ovl_xino_bits(inode->i_sb);
+       unsigned int xinoshift = 64 - xinobits;
 
        /*
         * When d_ino is consistent with st_ino (samefs or i_ino has enough
         * bits to encode layer), set the same value used for st_ino to i_ino,
         * so inode number exposed via /proc/locks and a like will be
         * consistent with d_ino and st_ino values. An i_ino value inconsistent
-        * with d_ino also causes nfsd readdirplus to fail.  When called from
-        * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
-        * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
+        * with d_ino also causes nfsd readdirplus to fail.
         */
-       if (ovl_same_dev(inode->i_sb)) {
-               inode->i_ino = ino;
-               if (xinobits && fsid && !(ino >> (64 - xinobits)))
-                       inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
-       } else {
-               inode->i_ino = get_next_ino();
+       inode->i_ino = ino;
+       if (ovl_same_fs(inode->i_sb)) {
+               return;
+       } else if (xinobits && likely(!(ino >> xinoshift))) {
+               inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
+               return;
+       }
+
+       /*
+        * For directory inodes on non-samefs with xino disabled or xino
+        * overflow, we allocate a non-persistent inode number, to be used for
+        * resolving st_ino collisions in ovl_map_dev_ino().
+        *
+        * To avoid ino collision with legitimate xino values from upper
+        * layer (fsid 0), use the lowest xinobit to map the non
+        * persistent inode numbers to the unified st_ino address space.
+        */
+       if (S_ISDIR(inode->i_mode)) {
+               ovl_next_ino(inode);
+               if (xinobits) {
+                       inode->i_ino &= ~0UL >> xinobits;
+                       inode->i_ino |= 1UL << xinoshift;
+               }
        }
+}
+
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+                   unsigned long ino, int fsid)
+{
+       struct inode *realinode;
+
+       if (oip->upperdentry)
+               OVL_I(inode)->__upperdentry = oip->upperdentry;
+       if (oip->lowerpath && oip->lowerpath->dentry)
+               OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+       if (oip->lowerdata)
+               OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+
+       realinode = ovl_inode_real(inode);
+       ovl_copyattr(realinode, inode);
+       ovl_copyflags(realinode, inode);
+       ovl_map_ino(inode, ino, fsid);
+}
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
        inode->i_mode = mode;
        inode->i_flags |= S_NOCMTIME;
 #ifdef CONFIG_FS_POSIX_ACL
@@ -719,7 +766,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
 
        inode = new_inode(sb);
        if (inode)
-               ovl_fill_inode(inode, mode, rdev, 0, 0);
+               ovl_fill_inode(inode, mode, rdev);
 
        return inode;
 }
@@ -891,7 +938,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
        struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
        bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
                                        oip->index);
-       int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
+       int fsid = bylower ? lowerpath->layer->fsid : 0;
        bool is_dir, metacopy = false;
        unsigned long ino = 0;
        int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -941,9 +988,11 @@ struct inode *ovl_get_inode(struct super_block *sb,
                        err = -ENOMEM;
                        goto out_err;
                }
+               ino = realinode->i_ino;
+               fsid = lowerpath->layer->fsid;
        }
-       ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
-       ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
+       ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+       ovl_inode_init(inode, oip, ino, fsid);
 
        if (upperdentry && ovl_is_impuredir(upperdentry))
                ovl_set_flag(OVL_IMPURE, inode);
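
The st_ino changes in this file pack the fsid into the high xino bits and keep the lowest of those bits free for the non-persistent range used on overflow. A userspace sketch of that mapping; the helper names are illustrative, not the kernel API:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Returns true and writes the unified st_ino if ino fits, false on overflow. */
static bool map_xino(uint64_t ino, unsigned int fsid, unsigned int xinobits,
		     uint64_t *st_ino)
{
	unsigned int xinoshift = 64 - xinobits;

	if (ino >> xinoshift)           /* real ino uses the reserved high bits */
		return false;
	*st_ino = ino | ((uint64_t)fsid << (xinoshift + 1));
	return true;
}

int main(void)
{
	uint64_t st_ino;
	unsigned int xinobits = 2;      /* e.g. one lower fs: ilog2(1) + 2 */

	if (map_xino(12345, 1, xinobits, &st_ino))
		printf("fsid 1: st_ino=0x%llx\n", (unsigned long long)st_ino);
	if (!map_xino(1ULL << 63, 1, xinobits, &st_ino))
		printf("overflow: fall back to the non-persistent range\n");
	return 0;
}
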
index ed9e129..0db23ba 100644 (file)
@@ -845,7 +845,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                if (err)
                        goto out;
 
-               if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) {
+               if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
                        dput(upperdentry);
                        err = -EREMOTE;
                        goto out;
@@ -1076,6 +1076,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                        goto out_free_oe;
        }
 
+       ovl_dentry_update_reval(dentry, upperdentry,
+                       DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+
        revert_creds(old_cred);
        if (origin_path) {
                dput(origin_path->dentry);
index 3d3f2b8..e6f3670 100644 (file)
@@ -48,6 +48,12 @@ enum ovl_entry_flag {
        OVL_E_CONNECTED,
 };
 
+enum {
+       OVL_XINO_OFF,
+       OVL_XINO_AUTO,
+       OVL_XINO_ON,
+};
+
 /*
  * The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
  * where:
@@ -87,7 +93,7 @@ struct ovl_fb {
        u8 flags;       /* OVL_FH_FLAG_* */
        u8 type;        /* fid_type of fid */
        uuid_t uuid;    /* uuid of filesystem */
-       u32 fid[0];     /* file identifier should be 32bit aligned in-memory */
+       u32 fid[];      /* file identifier should be 32bit aligned in-memory */
 } __packed;
 
 /* In-memory and on-wire format for overlay file handle */
@@ -230,6 +236,8 @@ bool ovl_index_all(struct super_block *sb);
 bool ovl_verify_lower(struct super_block *sb);
 struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
 bool ovl_dentry_remote(struct dentry *dentry);
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+                            unsigned int mask);
 bool ovl_dentry_weird(struct dentry *dentry);
 enum ovl_path_type ovl_path_type(struct dentry *dentry);
 void ovl_path_upper(struct dentry *dentry, struct path *path);
@@ -264,8 +272,6 @@ void ovl_set_upperdata(struct inode *inode);
 bool ovl_redirect_dir(struct super_block *sb);
 const char *ovl_dentry_get_redirect(struct dentry *dentry);
 void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
-                   struct dentry *lowerdentry, struct dentry *lowerdata);
 void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
 void ovl_dir_modified(struct dentry *dentry, bool impurity);
 u64 ovl_dentry_version_get(struct dentry *dentry);
@@ -301,6 +307,16 @@ static inline bool ovl_is_impuredir(struct dentry *dentry)
        return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
 }
 
+/*
+ * With xino=auto, we make a best effort to keep all inodes on the same st_dev
+ * and d_ino consistent with st_ino.
+ * With xino=on, we make the same effort, but also warn if we fail.
+ */
+static inline bool ovl_xino_warn(struct super_block *sb)
+{
+       return OVL_FS(sb)->config.xino == OVL_XINO_ON;
+}
+
 /* All layers on same fs? */
 static inline bool ovl_same_fs(struct super_block *sb)
 {
@@ -410,6 +426,8 @@ struct ovl_inode_params {
        char *redirect;
        struct dentry *lowerdata;
 };
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+                   unsigned long ino, int fsid);
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
 struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
                               bool is_upper);
@@ -451,6 +469,7 @@ struct ovl_cattr {
 struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
                               struct ovl_cattr *attr);
 int ovl_cleanup(struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_lookup_temp(struct dentry *workdir);
 struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
 
 /* file.c */
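
The header change above replaces the zero-length "u32 fid[0]" with a C99 flexible array member. A small standalone sketch of how such a structure is sized and allocated, shown with a stand-in struct rather than the overlayfs file handle:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

struct fh {
	uint8_t  version;
	uint8_t  len;       /* number of fid words that follow */
	uint32_t fid[];     /* flexible array member: excluded from sizeof(struct fh) */
};

static struct fh *fh_alloc(const uint32_t *fid, uint8_t n)
{
	/* header and variable part are allocated together */
	struct fh *fh = malloc(sizeof(*fh) + n * sizeof(fh->fid[0]));

	if (!fh)
		return NULL;
	fh->version = 0;
	fh->len = n;
	memcpy(fh->fid, fid, n * sizeof(fh->fid[0]));
	return fh;
}

int main(void)
{
	uint32_t raw[] = { 0xdeadbeef, 0x1234 };
	struct fh *fh = fh_alloc(raw, 2);

	if (fh) {
		printf("header=%zu bytes, total=%zu bytes\n",
		       sizeof(*fh), sizeof(*fh) + fh->len * sizeof(fh->fid[0]));
		free(fh);
	}
	return 0;
}
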
index 89015ea..5762d80 100644 (file)
@@ -75,6 +75,8 @@ struct ovl_fs {
        struct inode *indexdir_trap;
        /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
        int xino_mode;
+       /* For allocation of non-persistent inode numbers */
+       atomic_long_t last_ino;
 };
 
 static inline struct ovl_fs *OVL_FS(struct super_block *sb)
index 40ac9ce..e452ff7 100644 (file)
@@ -438,15 +438,23 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
 
 /* Map inode number to lower fs unique range */
 static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
-                              const char *name, int namelen)
+                              const char *name, int namelen, bool warn)
 {
-       if (ino >> (64 - xinobits)) {
-               pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
-                                   namelen, name, ino, xinobits);
+       unsigned int xinoshift = 64 - xinobits;
+
+       if (unlikely(ino >> xinoshift)) {
+               if (warn) {
+                       pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
+                                           namelen, name, ino, xinobits);
+               }
                return ino;
        }
 
-       return ino | ((u64)fsid) << (64 - xinobits);
+       /*
+        * The lowest xinobit is reserved for mapping the non-persistent inode
+        * number range, but this range is only exposed via st_ino, not here.
+        */
+       return ino | ((u64)fsid) << (xinoshift + 1);
 }
 
 /*
@@ -515,7 +523,8 @@ get:
        } else if (xinobits && !OVL_TYPE_UPPER(type)) {
                ino = ovl_remap_lower_ino(ino, xinobits,
                                          ovl_layer_lower(this)->fsid,
-                                         p->name, p->len);
+                                         p->name, p->len,
+                                         ovl_xino_warn(dir->d_sb));
        }
 
 out:
@@ -645,6 +654,7 @@ struct ovl_readdir_translate {
        u64 parent_ino;
        int fsid;
        int xinobits;
+       bool xinowarn;
 };
 
 static int ovl_fill_real(struct dir_context *ctx, const char *name,
@@ -665,7 +675,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name,
                        ino = p->ino;
        } else if (rdt->xinobits) {
                ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
-                                         name, namelen);
+                                         name, namelen, rdt->xinowarn);
        }
 
        return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
@@ -696,6 +706,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
                .ctx.actor = ovl_fill_real,
                .orig_ctx = ctx,
                .xinobits = ovl_xino_bits(dir->d_sb),
+               .xinowarn = ovl_xino_warn(dir->d_sb),
        };
 
        if (rdt.xinobits && lower_layer)
index ac967f1..732ad54 100644 (file)
@@ -113,53 +113,54 @@ bug:
        return dentry;
 }
 
-static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
 {
-       struct ovl_entry *oe = dentry->d_fsdata;
-       unsigned int i;
        int ret = 1;
 
-       for (i = 0; i < oe->numlower; i++) {
-               struct dentry *d = oe->lowerstack[i].dentry;
-
-               if (d->d_flags & DCACHE_OP_REVALIDATE) {
-                       ret = d->d_op->d_revalidate(d, flags);
-                       if (ret < 0)
-                               return ret;
-                       if (!ret) {
-                               if (!(flags & LOOKUP_RCU))
-                                       d_invalidate(d);
-                               return -ESTALE;
-                       }
+       if (weak) {
+               if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE)
+                       ret =  d->d_op->d_weak_revalidate(d, flags);
+       } else if (d->d_flags & DCACHE_OP_REVALIDATE) {
+               ret = d->d_op->d_revalidate(d, flags);
+               if (!ret) {
+                       if (!(flags & LOOKUP_RCU))
+                               d_invalidate(d);
+                       ret = -ESTALE;
                }
        }
-       return 1;
+       return ret;
 }
 
-static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+static int ovl_dentry_revalidate_common(struct dentry *dentry,
+                                       unsigned int flags, bool weak)
 {
        struct ovl_entry *oe = dentry->d_fsdata;
+       struct dentry *upper;
        unsigned int i;
        int ret = 1;
 
-       for (i = 0; i < oe->numlower; i++) {
-               struct dentry *d = oe->lowerstack[i].dentry;
+       upper = ovl_dentry_upper(dentry);
+       if (upper)
+               ret = ovl_revalidate_real(upper, flags, weak);
 
-               if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
-                       ret = d->d_op->d_weak_revalidate(d, flags);
-                       if (ret <= 0)
-                               break;
-               }
+       for (i = 0; ret > 0 && i < oe->numlower; i++) {
+               ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags,
+                                         weak);
        }
        return ret;
 }
 
-static const struct dentry_operations ovl_dentry_operations = {
-       .d_release = ovl_dentry_release,
-       .d_real = ovl_d_real,
-};
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       return ovl_dentry_revalidate_common(dentry, flags, false);
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       return ovl_dentry_revalidate_common(dentry, flags, true);
+}
 
-static const struct dentry_operations ovl_reval_dentry_operations = {
+static const struct dentry_operations ovl_dentry_operations = {
        .d_release = ovl_dentry_release,
        .d_real = ovl_d_real,
        .d_revalidate = ovl_dentry_revalidate,
@@ -316,12 +317,6 @@ static const char *ovl_redirect_mode_def(void)
        return ovl_redirect_dir_def ? "on" : "off";
 }
 
-enum {
-       OVL_XINO_OFF,
-       OVL_XINO_AUTO,
-       OVL_XINO_ON,
-};
-
 static const char * const ovl_xino_str[] = {
        "off",
        "auto",
@@ -751,13 +746,12 @@ static int ovl_mount_dir(const char *name, struct path *path)
                ovl_unescape(tmp);
                err = ovl_mount_dir_noesc(tmp, path);
 
-               if (!err)
-                       if (ovl_dentry_remote(path->dentry)) {
-                               pr_err("filesystem on '%s' not supported as upperdir\n",
-                                      tmp);
-                               path_put_init(path);
-                               err = -EINVAL;
-                       }
+               if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+                       pr_err("filesystem on '%s' not supported as upperdir\n",
+                              tmp);
+                       path_put_init(path);
+                       err = -EINVAL;
+               }
                kfree(tmp);
        }
        return err;
@@ -778,7 +772,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs,
 }
 
 static int ovl_lower_dir(const char *name, struct path *path,
-                        struct ovl_fs *ofs, int *stack_depth, bool *remote)
+                        struct ovl_fs *ofs, int *stack_depth)
 {
        int fh_type;
        int err;
@@ -793,9 +787,6 @@ static int ovl_lower_dir(const char *name, struct path *path,
 
        *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
 
-       if (ovl_dentry_remote(path->dentry))
-               *remote = true;
-
        /*
         * The inodes index feature and NFS export need to encode and decode
         * file handles, so they require that all layers support them.
@@ -1074,11 +1065,73 @@ out:
        return err;
 }
 
+/*
+ * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported, and
+ * a negative value if an error is encountered.
+ */
+static int ovl_check_rename_whiteout(struct dentry *workdir)
+{
+       struct inode *dir = d_inode(workdir);
+       struct dentry *temp;
+       struct dentry *dest;
+       struct dentry *whiteout;
+       struct name_snapshot name;
+       int err;
+
+       inode_lock_nested(dir, I_MUTEX_PARENT);
+
+       temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0));
+       err = PTR_ERR(temp);
+       if (IS_ERR(temp))
+               goto out_unlock;
+
+       dest = ovl_lookup_temp(workdir);
+       err = PTR_ERR(dest);
+       if (IS_ERR(dest)) {
+               dput(temp);
+               goto out_unlock;
+       }
+
+       /* Name is inline and stable - using snapshot as a copy helper */
+       take_dentry_name_snapshot(&name, temp);
+       err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT);
+       if (err) {
+               if (err == -EINVAL)
+                       err = 0;
+               goto cleanup_temp;
+       }
+
+       whiteout = lookup_one_len(name.name.name, workdir, name.name.len);
+       err = PTR_ERR(whiteout);
+       if (IS_ERR(whiteout))
+               goto cleanup_temp;
+
+       err = ovl_is_whiteout(whiteout);
+
+       /* Best effort cleanup of whiteout and temp file */
+       if (err)
+               ovl_cleanup(dir, whiteout);
+       dput(whiteout);
+
+cleanup_temp:
+       ovl_cleanup(dir, temp);
+       release_dentry_name_snapshot(&name);
+       dput(temp);
+       dput(dest);
+
+out_unlock:
+       inode_unlock(dir);
+
+       return err;
+}
+
 static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
                            struct path *workpath)
 {
        struct vfsmount *mnt = ofs->upper_mnt;
        struct dentry *temp;
+       bool rename_whiteout;
+       bool d_type;
        int fh_type;
        int err;
 
@@ -1104,11 +1157,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
        if (err < 0)
                goto out;
 
-       /*
-        * We allowed this configuration and don't want to break users over
-        * kernel upgrade. So warn instead of erroring out.
-        */
-       if (!err)
+       d_type = err;
+       if (!d_type)
                pr_warn("upper fs needs to support d_type.\n");
 
        /* Check if upper/work fs supports O_TMPFILE */
@@ -1119,6 +1169,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
        else
                pr_warn("upper fs does not support tmpfile.\n");
 
+
+       /* Check if upper/work fs supports RENAME_WHITEOUT */
+       err = ovl_check_rename_whiteout(ofs->workdir);
+       if (err < 0)
+               goto out;
+
+       rename_whiteout = err;
+       if (!rename_whiteout)
+               pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
+
        /*
         * Check if upper/work fs supports trusted.overlay.* xattr
         */
@@ -1133,6 +1193,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
                vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE);
        }
 
+       /*
+        * We allowed sub-optimal upper fs configuration and don't want to break
+        * users over kernel upgrade, but we never allowed remote upper fs, so
+        * we can enforce strict requirements for remote upper fs.
+        */
+       if (ovl_dentry_remote(ofs->workdir) &&
+           (!d_type || !rename_whiteout || ofs->noxattr)) {
+               pr_err("upper fs missing required features.\n");
+               err = -EINVAL;
+               goto out;
+       }
+
        /* Check if upper/work fs supports file handles */
        fh_type = ovl_can_decode_fh(ofs->workdir->d_sb);
        if (ofs->config.index && !fh_type) {
@@ -1401,11 +1473,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
 
        /*
         * When all layers on same fs, overlay can use real inode numbers.
-        * With mount option "xino=on", mounter declares that there are enough
-        * free high bits in underlying fs to hold the unique fsid.
+        * With mount option "xino=<on|auto>", mounter declares that there are
+        * enough free high bits in underlying fs to hold the unique fsid.
         * If overlayfs does encounter underlying inodes using the high xino
         * bits reserved for fsid, it emits a warning and uses the original
-        * inode number.
+        * inode number or a non-persistent inode number allocated from a
+        * dedicated range.
         */
        if (ofs->numfs - !ofs->upper_mnt == 1) {
                if (ofs->config.xino == OVL_XINO_ON)
@@ -1413,14 +1486,16 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
                ofs->xino_mode = 0;
        } else if (ofs->config.xino == OVL_XINO_OFF) {
                ofs->xino_mode = -1;
-       } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
+       } else if (ofs->xino_mode < 0) {
                /*
                 * This is a roundup of number of bits needed for encoding
-                * fsid, where fsid 0 is reserved for upper fs even with
-                * lower only overlay.
+                * fsid, where fsid 0 is reserved for upper fs (even with
+                * lower only overlay), plus one extra bit reserved for the
+                * non-persistent inode number range that is used for resolving
+                * xino lower bits overflow.
                 */
-               BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31);
-               ofs->xino_mode = ilog2(ofs->numfs - 1) + 1;
+               BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30);
+               ofs->xino_mode = ilog2(ofs->numfs - 1) + 2;
        }
 
        if (ofs->xino_mode > 0) {
@@ -1440,7 +1515,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
        char *lowertmp, *lower;
        struct path *stack = NULL;
        unsigned int stacklen, numlower = 0, i;
-       bool remote = false;
        struct ovl_entry *oe;
 
        err = -ENOMEM;
@@ -1472,7 +1546,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
        lower = lowertmp;
        for (numlower = 0; numlower < stacklen; numlower++) {
                err = ovl_lower_dir(lower, &stack[numlower], ofs,
-                                   &sb->s_stack_depth, &remote);
+                                   &sb->s_stack_depth);
                if (err)
                        goto out_err;
 
@@ -1500,11 +1574,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
                oe->lowerstack[i].layer = &ofs->layers[i+1];
        }
 
-       if (remote)
-               sb->s_d_op = &ovl_reval_dentry_operations;
-       else
-               sb->s_d_op = &ovl_dentry_operations;
-
 out:
        for (i = 0; i < numlower; i++)
                path_put(&stack[i]);
@@ -1589,6 +1658,44 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
        return 0;
 }
 
+static struct dentry *ovl_get_root(struct super_block *sb,
+                                  struct dentry *upperdentry,
+                                  struct ovl_entry *oe)
+{
+       struct dentry *root;
+       struct ovl_path *lowerpath = &oe->lowerstack[0];
+       unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
+       int fsid = lowerpath->layer->fsid;
+       struct ovl_inode_params oip = {
+               .upperdentry = upperdentry,
+               .lowerpath = lowerpath,
+       };
+
+       root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+       if (!root)
+               return NULL;
+
+       root->d_fsdata = oe;
+
+       if (upperdentry) {
+               /* Root inode uses upper st_ino/i_ino */
+               ino = d_inode(upperdentry)->i_ino;
+               fsid = 0;
+               ovl_dentry_set_upper_alias(root);
+               if (ovl_is_impuredir(upperdentry))
+                       ovl_set_flag(OVL_IMPURE, d_inode(root));
+       }
+
+       /* Root is always merge -> can have whiteouts */
+       ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
+       ovl_dentry_set_flag(OVL_E_CONNECTED, root);
+       ovl_set_upperdata(d_inode(root));
+       ovl_inode_init(d_inode(root), &oip, ino, fsid);
+       ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE);
+
+       return root;
+}
+
 static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct path upperpath = { };
@@ -1598,6 +1705,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        struct cred *cred;
        int err;
 
+       sb->s_d_op = &ovl_dentry_operations;
+
        err = -ENOMEM;
        ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
        if (!ofs)
@@ -1624,6 +1733,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 
        sb->s_stack_depth = 0;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
+       atomic_long_set(&ofs->last_ino, 1);
        /* Assume underlying fs uses 32bit inodes unless proven otherwise */
        if (ofs->config.xino != OVL_XINO_OFF) {
                ofs->xino_mode = BITS_PER_LONG - 32;
@@ -1710,25 +1820,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_flags |= SB_POSIXACL;
 
        err = -ENOMEM;
-       root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
+       root_dentry = ovl_get_root(sb, upperpath.dentry, oe);
        if (!root_dentry)
                goto out_free_oe;
 
-       root_dentry->d_fsdata = oe;
-
        mntput(upperpath.mnt);
-       if (upperpath.dentry) {
-               ovl_dentry_set_upper_alias(root_dentry);
-               if (ovl_is_impuredir(upperpath.dentry))
-                       ovl_set_flag(OVL_IMPURE, d_inode(root_dentry));
-       }
-
-       /* Root is always merge -> can have whiteouts */
-       ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
-       ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
-       ovl_set_upperdata(d_inode(root_dentry));
-       ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
-                      ovl_dentry_lower(root_dentry), NULL);
 
        sb->s_root = root_dentry;
 
index 042f7eb..36b6078 100644 (file)
@@ -93,8 +93,24 @@ struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
 bool ovl_dentry_remote(struct dentry *dentry)
 {
        return dentry->d_flags &
-               (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
-                DCACHE_OP_REAL);
+               (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry,
+                            unsigned int mask)
+{
+       struct ovl_entry *oe = OVL_E(dentry);
+       unsigned int i, flags = 0;
+
+       if (upperdentry)
+               flags |= upperdentry->d_flags;
+       for (i = 0; i < oe->numlower; i++)
+               flags |= oe->lowerstack[i].dentry->d_flags;
+
+       spin_lock(&dentry->d_lock);
+       dentry->d_flags &= ~mask;
+       dentry->d_flags |= flags & mask;
+       spin_unlock(&dentry->d_lock);
 }
 
 bool ovl_dentry_weird(struct dentry *dentry)
@@ -386,24 +402,6 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
        oi->redirect = redirect;
 }
 
-void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
-                   struct dentry *lowerdentry, struct dentry *lowerdata)
-{
-       struct inode *realinode = d_inode(upperdentry ?: lowerdentry);
-
-       if (upperdentry)
-               OVL_I(inode)->__upperdentry = upperdentry;
-       if (lowerdentry)
-               OVL_I(inode)->lower = igrab(d_inode(lowerdentry));
-       if (lowerdata)
-               OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata));
-
-       ovl_copyattr(realinode, inode);
-       ovl_copyflags(realinode, inode);
-       if (!inode->i_ino)
-               inode->i_ino = realinode->i_ino;
-}
-
 void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
 {
        struct inode *upperinode = d_inode(upperdentry);
@@ -416,8 +414,6 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
        smp_wmb();
        OVL_I(inode)->__upperdentry = upperdentry;
        if (inode_unhashed(inode)) {
-               if (!inode->i_ino)
-                       inode->i_ino = upperinode->i_ino;
                inode->i_private = upperinode;
                __insert_inode_hash(inode, (unsigned long) upperinode);
        }
index 5efaf37..8e16f14 100644 (file)
@@ -635,28 +635,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
 int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
 {
-       unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0;
        struct mm_struct *mm = get_task_mm(task);
 
        if (mm) {
+               unsigned long size;
+               unsigned long resident = 0;
+               unsigned long shared = 0;
+               unsigned long text = 0;
+               unsigned long data = 0;
+
                size = task_statm(mm, &shared, &text, &data, &resident);
                mmput(mm);
-       }
-       /*
-        * For quick read, open code by putting numbers directly
-        * expected format is
-        * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
-        *               size, resident, shared, text, data);
-        */
-       seq_put_decimal_ull(m, "", size);
-       seq_put_decimal_ull(m, " ", resident);
-       seq_put_decimal_ull(m, " ", shared);
-       seq_put_decimal_ull(m, " ", text);
-       seq_put_decimal_ull(m, " ", 0);
-       seq_put_decimal_ull(m, " ", data);
-       seq_put_decimal_ull(m, " ", 0);
-       seq_putc(m, '\n');
 
+               /*
+                * For quick read, open code by putting numbers directly
+                * expected format is
+                * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+                *               size, resident, shared, text, data);
+                */
+               seq_put_decimal_ull(m, "", size);
+               seq_put_decimal_ull(m, " ", resident);
+               seq_put_decimal_ull(m, " ", shared);
+               seq_put_decimal_ull(m, " ", text);
+               seq_put_decimal_ull(m, " ", 0);
+               seq_put_decimal_ull(m, " ", data);
+               seq_put_decimal_ull(m, " ", 0);
+               seq_putc(m, '\n');
+       } else {
+               seq_write(m, "0 0 0 0 0 0 0\n", 14);
+       }
        return 0;
 }
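
As a quick illustration of the format preserved above (seven decimal fields: size, resident, shared, text, lib, data, dt, with lib and dt always reported as 0), a small userspace reader might look like the sketch below. It is only an example and is not part of the patch.

	#include <stdio.h>

	int main(void)
	{
		unsigned long size, resident, shared, text, lib, data, dt;
		FILE *f = fopen("/proc/self/statm", "r");

		if (!f)
			return 1;
		/* seven space-separated decimal fields, one line */
		if (fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
			   &size, &resident, &shared, &text, &lib, &data, &dt) == 7)
			printf("resident pages: %lu\n", resident);
		fclose(f);
		return 0;
	}
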
 
index c1dea9b..d0989a4 100644 (file)
@@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops cpuinfo_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = cpuinfo_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
index 3faed94..4ed6dab 100644 (file)
@@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
        return p;
 }
 
+static inline void pde_set_flags(struct proc_dir_entry *pde)
+{
+       if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
+               pde->flags |= PROC_ENTRY_PERMANENT;
+}
+
 struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                const struct proc_ops *proc_ops, void *data)
@@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
        if (!p)
                return NULL;
        p->proc_ops = proc_ops;
+       pde_set_flags(p);
        return proc_register(parent, p);
 }
 EXPORT_SYMBOL(proc_create_data);
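
For context, a minimal sketch of how a caller might use the new PROC_ENTRY_PERMANENT flag. It is illustrative only: the entry name and show function are made up, and the flag is only appropriate for entries created by built-in code that are never removed at runtime (the series deliberately leaves the generic proc_seq_ops/proc_single_ops non-permanent for that reason).

	static int example_show(struct seq_file *m, void *v)
	{
		seq_puts(m, "example\n");
		return 0;
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		return single_open(file, example_show, NULL);
	}

	static const struct proc_ops example_proc_ops = {
		/* safe only because this entry is built-in and never removed */
		.proc_flags	= PROC_ENTRY_PERMANENT,
		.proc_open	= example_open,
		.proc_read	= seq_read,
		.proc_lseek	= seq_lseek,
		.proc_release	= single_release,
	};

	/* e.g. from an __init function:
	 * proc_create("example", 0444, NULL, &example_proc_ops);
	 */

With the flag set, proc_reg_read() and friends skip the use_pde()/unuse_pde() reference counting, as the fs/proc/inode.c changes further below show.
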
@@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops proc_seq_ops = {
+       /* not permanent -- can call into arbitrary seq_operations */
        .proc_open      = proc_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
@@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops proc_single_ops = {
+       /* not permanent -- can call into arbitrary ->single_show */
        .proc_open      = proc_single_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
@@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 
        de = pde_subdir_find(parent, fn, len);
        if (de) {
-               rb_erase(&de->subdir_node, &parent->subdir);
-               if (S_ISDIR(de->mode)) {
-                       parent->nlink--;
+               if (unlikely(pde_is_permanent(de))) {
+                       WARN(1, "removing permanent /proc entry '%s'", de->name);
+                       de = NULL;
+               } else {
+                       rb_erase(&de->subdir_node, &parent->subdir);
+                       if (S_ISDIR(de->mode))
+                               parent->nlink--;
                }
        }
        write_unlock(&proc_subdir_lock);
@@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
+       if (unlikely(pde_is_permanent(root))) {
+               write_unlock(&proc_subdir_lock);
+               WARN(1, "removing permanent /proc entry '%s/%s'",
+                       root->parent->name, root->name);
+               return -EINVAL;
+       }
        rb_erase(&root->subdir_node, &parent->subdir);
 
        de = root;
        while (1) {
                next = pde_subdir_first(de);
                if (next) {
+                       if (unlikely(pde_is_permanent(root))) {
+                               write_unlock(&proc_subdir_lock);
+                               WARN(1, "removing permanent /proc entry '%s/%s'",
+                                       next->parent->name, next->name);
+                               return -EINVAL;
+                       }
                        rb_erase(&next->subdir_node, &de->subdir);
                        de = next;
                        continue;
index 1e730ea..fb4cace 100644 (file)
@@ -202,6 +202,7 @@ static void unuse_pde(struct proc_dir_entry *pde)
 
 /* pde is locked on entry, unlocked on exit */
 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
+       __releases(&pde->pde_unload_lock)
 {
        /*
         * close() (proc_reg_release()) can't delete an entry and proceed:
@@ -258,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de)
        spin_unlock(&de->pde_unload_lock);
 }
 
+static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence)
+{
+       typeof_member(struct proc_ops, proc_lseek) lseek;
+
+       lseek = pde->proc_ops->proc_lseek;
+       if (!lseek)
+               lseek = default_llseek;
+       return lseek(file, offset, whence);
+}
+
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        loff_t rv = -EINVAL;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_lseek) lseek;
 
-               lseek = pde->proc_ops->proc_lseek;
-               if (!lseek)
-                       lseek = default_llseek;
-               rv = lseek(file, offset, whence);
+       if (pde_is_permanent(pde)) {
+               return pde_lseek(pde, file, offset, whence);
+       } else if (use_pde(pde)) {
+               rv = pde_lseek(pde, file, offset, whence);
                unuse_pde(pde);
        }
        return rv;
 }
 
+static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+       typeof_member(struct proc_ops, proc_read) read;
+
+       read = pde->proc_ops->proc_read;
+       if (read)
+               return read(file, buf, count, ppos);
+       return -EIO;
+}
+
 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t rv = -EIO;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_read) read;
 
-               read = pde->proc_ops->proc_read;
-               if (read)
-                       rv = read(file, buf, count, ppos);
+       if (pde_is_permanent(pde)) {
+               return pde_read(pde, file, buf, count, ppos);
+       } else if (use_pde(pde)) {
+               rv = pde_read(pde, file, buf, count, ppos);
                unuse_pde(pde);
        }
        return rv;
 }
 
+static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+{
+       typeof_member(struct proc_ops, proc_write) write;
+
+       write = pde->proc_ops->proc_write;
+       if (write)
+               return write(file, buf, count, ppos);
+       return -EIO;
+}
+
 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t rv = -EIO;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_write) write;
 
-               write = pde->proc_ops->proc_write;
-               if (write)
-                       rv = write(file, buf, count, ppos);
+       if (pde_is_permanent(pde)) {
+               return pde_write(pde, file, buf, count, ppos);
+       } else if (use_pde(pde)) {
+               rv = pde_write(pde, file, buf, count, ppos);
                unuse_pde(pde);
        }
        return rv;
 }
 
+static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
+{
+       typeof_member(struct proc_ops, proc_poll) poll;
+
+       poll = pde->proc_ops->proc_poll;
+       if (poll)
+               return poll(file, pts);
+       return DEFAULT_POLLMASK;
+}
+
 static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        __poll_t rv = DEFAULT_POLLMASK;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_poll) poll;
 
-               poll = pde->proc_ops->proc_poll;
-               if (poll)
-                       rv = poll(file, pts);
+       if (pde_is_permanent(pde)) {
+               return pde_poll(pde, file, pts);
+       } else if (use_pde(pde)) {
+               rv = pde_poll(pde, file, pts);
                unuse_pde(pde);
        }
        return rv;
 }
 
+static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+       typeof_member(struct proc_ops, proc_ioctl) ioctl;
+
+       ioctl = pde->proc_ops->proc_ioctl;
+       if (ioctl)
+               return ioctl(file, cmd, arg);
+       return -ENOTTY;
+}
+
 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long rv = -ENOTTY;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_ioctl) ioctl;
 
-               ioctl = pde->proc_ops->proc_ioctl;
-               if (ioctl)
-                       rv = ioctl(file, cmd, arg);
+       if (pde_is_permanent(pde)) {
+               return pde_ioctl(pde, file, cmd, arg);
+       } else if (use_pde(pde)) {
+               rv = pde_ioctl(pde, file, cmd, arg);
                unuse_pde(pde);
        }
        return rv;
 }
 
 #ifdef CONFIG_COMPAT
+static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
+{
+       typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
+
+       compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+       if (compat_ioctl)
+               return compat_ioctl(file, cmd, arg);
+       return -ENOTTY;
+}
+
 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long rv = -ENOTTY;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
-               compat_ioctl = pde->proc_ops->proc_compat_ioctl;
-               if (compat_ioctl)
-                       rv = compat_ioctl(file, cmd, arg);
+       if (pde_is_permanent(pde)) {
+               return pde_compat_ioctl(pde, file, cmd, arg);
+       } else if (use_pde(pde)) {
+               rv = pde_compat_ioctl(pde, file, cmd, arg);
                unuse_pde(pde);
        }
        return rv;
 }
 #endif
 
+static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
+{
+       typeof_member(struct proc_ops, proc_mmap) mmap;
+
+       mmap = pde->proc_ops->proc_mmap;
+       if (mmap)
+               return mmap(file, vma);
+       return -EIO;
+}
+
 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct proc_dir_entry *pde = PDE(file_inode(file));
        int rv = -EIO;
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_mmap) mmap;
 
-               mmap = pde->proc_ops->proc_mmap;
-               if (mmap)
-                       rv = mmap(file, vma);
+       if (pde_is_permanent(pde)) {
+               return pde_mmap(pde, file, vma);
+       } else if (use_pde(pde)) {
+               rv = pde_mmap(pde, file, vma);
                unuse_pde(pde);
        }
        return rv;
 }
 
 static unsigned long
-proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
 {
-       struct proc_dir_entry *pde = PDE(file_inode(file));
-       unsigned long rv = -EIO;
-
-       if (use_pde(pde)) {
-               typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+       typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
 
-               get_area = pde->proc_ops->proc_get_unmapped_area;
+       get_area = pde->proc_ops->proc_get_unmapped_area;
 #ifdef CONFIG_MMU
-               if (!get_area)
-                       get_area = current->mm->get_unmapped_area;
+       if (!get_area)
+               get_area = current->mm->get_unmapped_area;
 #endif
+       if (get_area)
+               return get_area(file, orig_addr, len, pgoff, flags);
+       return orig_addr;
+}
+
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+                          unsigned long len, unsigned long pgoff,
+                          unsigned long flags)
+{
+       struct proc_dir_entry *pde = PDE(file_inode(file));
+       unsigned long rv = -EIO;
 
-               if (get_area)
-                       rv = get_area(file, orig_addr, len, pgoff, flags);
-               else
-                       rv = orig_addr;
+       if (pde_is_permanent(pde)) {
+               return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
+       } else if (use_pde(pde)) {
+               rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
                unuse_pde(pde);
        }
        return rv;
@@ -400,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file)
        typeof_member(struct proc_ops, proc_release) release;
        struct pde_opener *pdeo;
 
+       if (pde_is_permanent(pde)) {
+               open = pde->proc_ops->proc_open;
+               if (open)
+                       rv = open(inode, file);
+               return rv;
+       }
+
        /*
         * Ensure that
         * 1) PDE's ->release hook will be called no matter what
@@ -449,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file)
 {
        struct proc_dir_entry *pde = PDE(inode);
        struct pde_opener *pdeo;
+
+       if (pde_is_permanent(pde)) {
+               typeof_member(struct proc_ops, proc_release) release;
+
+               release = pde->proc_ops->proc_release;
+               if (release) {
+                       return release(inode, file);
+               }
+               return 0;
+       }
+
        spin_lock(&pde->pde_unload_lock);
        list_for_each_entry(pdeo, &pde->pde_openers, lh) {
                if (pdeo->file == file) {
index 9e294f0..917cc85 100644 (file)
@@ -61,6 +61,7 @@ struct proc_dir_entry {
        struct rb_node subdir_node;
        char *name;
        umode_t mode;
+       u8 flags;
        u8 namelen;
        char inline_name[];
 } __randomize_layout;
@@ -73,6 +74,11 @@ struct proc_dir_entry {
        0)
 #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))
 
+static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
+{
+       return pde->flags & PROC_ENTRY_PERMANENT;
+}
+
 extern struct kmem_cache *proc_dir_entry_cache;
 void pde_free(struct proc_dir_entry *pde);
 
index ec1b7d2..b38ad55 100644 (file)
@@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)
 
 
 static const struct proc_ops kmsg_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_read      = kmsg_read,
        .proc_poll      = kmsg_poll,
        .proc_open      = kmsg_open,
index 0449edf..46b3293 100644 (file)
@@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops stat_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = stat_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
index 3ba9ae8..8d382d4 100644 (file)
@@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
-static void vma_stop(struct proc_maps_private *priv)
-{
-       struct mm_struct *mm = priv->mm;
-
-       release_task_mempolicy(priv);
-       up_read(&mm->mmap_sem);
-       mmput(mm);
-}
-
-static struct vm_area_struct *
-m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
-{
-       if (vma == priv->tail_vma)
-               return NULL;
-       return vma->vm_next ?: priv->tail_vma;
-}
-
-static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
-{
-       if (m->count < m->size) /* vma is copied successfully */
-               m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
-}
-
 static void *m_start(struct seq_file *m, loff_t *ppos)
 {
        struct proc_maps_private *priv = m->private;
-       unsigned long last_addr = m->version;
+       unsigned long last_addr = *ppos;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
-       unsigned int pos = *ppos;
 
-       /* See m_cache_vma(). Zero at the start or after lseek. */
+       /* See m_next(). Zero at the start or after lseek. */
        if (last_addr == -1UL)
                return NULL;
 
@@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
                return ERR_PTR(-ESRCH);
 
        mm = priv->mm;
-       if (!mm || !mmget_not_zero(mm))
+       if (!mm || !mmget_not_zero(mm)) {
+               put_task_struct(priv->task);
+               priv->task = NULL;
                return NULL;
+       }
 
        if (down_read_killable(&mm->mmap_sem)) {
                mmput(mm);
+               put_task_struct(priv->task);
+               priv->task = NULL;
                return ERR_PTR(-EINTR);
        }
 
        hold_task_mempolicy(priv);
        priv->tail_vma = get_gate_vma(mm);
 
-       if (last_addr) {
-               vma = find_vma(mm, last_addr - 1);
-               if (vma && vma->vm_start <= last_addr)
-                       vma = m_next_vma(priv, vma);
-               if (vma)
-                       return vma;
-       }
-
-       m->version = 0;
-       if (pos < mm->map_count) {
-               for (vma = mm->mmap; pos; pos--) {
-                       m->version = vma->vm_start;
-                       vma = vma->vm_next;
-               }
+       vma = find_vma(mm, last_addr);
+       if (vma)
                return vma;
-       }
-
-       /* we do not bother to update m->version in this case */
-       if (pos == mm->map_count && priv->tail_vma)
-               return priv->tail_vma;
 
-       vma_stop(priv);
-       return NULL;
+       return priv->tail_vma;
 }
 
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 {
        struct proc_maps_private *priv = m->private;
-       struct vm_area_struct *next;
+       struct vm_area_struct *next, *vma = v;
+
+       if (vma == priv->tail_vma)
+               next = NULL;
+       else if (vma->vm_next)
+               next = vma->vm_next;
+       else
+               next = priv->tail_vma;
+
+       *ppos = next ? next->vm_start : -1UL;
 
-       (*pos)++;
-       next = m_next_vma(priv, v);
-       if (!next)
-               vma_stop(priv);
        return next;
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
        struct proc_maps_private *priv = m->private;
+       struct mm_struct *mm = priv->mm;
 
-       if (!IS_ERR_OR_NULL(v))
-               vma_stop(priv);
-       if (priv->task) {
-               put_task_struct(priv->task);
-               priv->task = NULL;
-       }
+       if (!priv->task)
+               return;
+
+       release_task_mempolicy(priv);
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+       put_task_struct(priv->task);
+       priv->task = NULL;
 }
 
 static int proc_maps_open(struct inode *inode, struct file *file,
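
The rework above relies on the seq_file convention that the position cookie passed to ->start()/->next() can hold an arbitrary resume key (here the next VMA start address, with -1 meaning end of sequence) rather than an element index, which is what makes the removed m->version/f_version caching unnecessary. Below is a stripped-down sketch of that pattern over a static array; it is purely illustrative, assumes <linux/seq_file.h> and <linux/kernel.h>, and is not taken from the patch.

	static const unsigned long ex_keys[] = { 10, 20, 30 };

	static void *ex_start(struct seq_file *m, loff_t *ppos)
	{
		size_t i;

		if (*ppos == -1)		/* sentinel written by ex_next() at EOF */
			return NULL;
		for (i = 0; i < ARRAY_SIZE(ex_keys); i++)
			if (ex_keys[i] >= (unsigned long)*ppos)
				return (void *)&ex_keys[i];
		return NULL;
	}

	static void *ex_next(struct seq_file *m, void *v, loff_t *ppos)
	{
		const unsigned long *next = (const unsigned long *)v + 1;

		if (next == ex_keys + ARRAY_SIZE(ex_keys)) {
			*ppos = -1;
			return NULL;
		}
		*ppos = *next;			/* resume key for the next ->start() */
		return (void *)next;
	}
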
@@ -363,7 +334,6 @@ done:
 static int show_map(struct seq_file *m, void *v)
 {
        show_map_vma(m, v);
-       m_cache_vma(m, v);
        return 0;
 }
 
@@ -847,8 +817,6 @@ static int show_smap(struct seq_file *m, void *v)
                seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
        show_smap_vma_flags(m, vma);
 
-       m_cache_vma(m, vma);
-
        return 0;
 }
 
@@ -1887,7 +1855,6 @@ static int show_numa_map(struct seq_file *m, void *v)
        seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
 out:
        seq_putc(m, '\n');
-       m_cache_vma(m, vma);
        return 0;
 }
 
index 59d819c..bbfa9b1 100644 (file)
@@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i
 }
 #endif
 
-#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
+#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
+       defined(__ARCH_WANT_SYS_LLSEEK)
 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
index 4075e41..5129efc 100644 (file)
@@ -842,7 +842,7 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
        struct item_head *pasted;
        struct buffer_info bi;
 
-                                                       buffer_info_init_right(tb, &bi);
+       buffer_info_init_right(tb, &bi);
        leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
 
        /* append item in R[0] */
index 45e1a5d..adb21be 100644 (file)
@@ -184,11 +184,12 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
        }
 
        /* we need to make sure nobody is changing the file size beneath us */
-{
-       int depth = reiserfs_write_unlock_nested(inode->i_sb);
-       inode_lock(inode);
-       reiserfs_write_lock_nested(inode->i_sb, depth);
-}
+       {
+               int depth = reiserfs_write_unlock_nested(inode->i_sb);
+
+               inode_lock(inode);
+               reiserfs_write_lock_nested(inode->i_sb, depth);
+       }
 
        reiserfs_write_lock(inode->i_sb);
 
index 959a066..1594687 100644 (file)
@@ -838,10 +838,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
         */
        INC_DIR_INODE_NLINK(dir)
 
-           retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
-                                       old_format_only(dir->i_sb) ?
-                                       EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
-                                       dentry, inode, &security);
+       retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
+                                   old_format_only(dir->i_sb) ?
+                                   EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
+                                   dentry, inode, &security);
        if (retval) {
                DEC_DIR_INODE_NLINK(dir)
                goto out_failed;
@@ -967,7 +967,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
        reiserfs_update_sd(&th, inode);
 
        DEC_DIR_INODE_NLINK(dir)
-           dir->i_size -= (DEH_SIZE + de.de_entrylen);
+       dir->i_size -= (DEH_SIZE + de.de_entrylen);
        reiserfs_update_sd(&th, dir);
 
        /* prevent empty directory from getting lost */
index 1600034..79781eb 100644 (file)
@@ -68,13 +68,6 @@ int seq_open(struct file *file, const struct seq_operations *op)
        p->file = file;
 
        /*
-        * Wrappers around seq_open(e.g. swaps_open) need to be
-        * aware of this. If they set f_version themselves, they
-        * should call seq_open first and then set f_version.
-        */
-       file->f_version = 0;
-
-       /*
         * seq_files support lseek() and pread().  They do not implement
         * write() at all, but we clear FMODE_PWRITE here for historical
         * reasons.
@@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset)
        int error = 0;
        void *p;
 
-       m->version = 0;
        m->index = 0;
        m->count = m->from = 0;
        if (!offset)
@@ -161,25 +153,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
        mutex_lock(&m->lock);
 
        /*
-        * seq_file->op->..m_start/m_stop/m_next may do special actions
-        * or optimisations based on the file->f_version, so we want to
-        * pass the file->f_version to those methods.
-        *
-        * seq_file->version is just copy of f_version, and seq_file
-        * methods can treat it simply as file version.
-        * It is copied in first and copied out after all operations.
-        * It is convenient to have it as  part of structure to avoid the
-        * need of passing another argument to all the seq_file methods.
-        */
-       m->version = file->f_version;
-
-       /*
         * if request is to read from zero offset, reset iterator to first
         * record as it might have been already advanced by previous requests
         */
        if (*ppos == 0) {
                m->index = 0;
-               m->version = 0;
                m->count = 0;
        }
 
@@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
                if (err) {
                        /* With prejudice... */
                        m->read_pos = 0;
-                       m->version = 0;
                        m->index = 0;
                        m->count = 0;
                        goto Done;
@@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
                m->buf = seq_buf_alloc(m->size <<= 1);
                if (!m->buf)
                        goto Enomem;
-               m->version = 0;
                p = m->op->start(m, &m->index);
        }
        m->op->stop(m, p);
@@ -287,7 +263,6 @@ Done:
                *ppos += copied;
                m->read_pos += copied;
        }
-       file->f_version = m->version;
        mutex_unlock(&m->lock);
        return copied;
 Enomem:
@@ -313,7 +288,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
        loff_t retval = -EINVAL;
 
        mutex_lock(&m->lock);
-       m->version = file->f_version;
        switch (whence) {
        case SEEK_CUR:
                offset += file->f_pos;
@@ -329,7 +303,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
                                /* with extreme prejudice... */
                                file->f_pos = 0;
                                m->read_pos = 0;
-                               m->version = 0;
                                m->index = 0;
                                m->count = 0;
                        } else {
@@ -340,7 +313,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
                        file->f_pos = offset;
                }
        }
-       file->f_version = m->version;
        mutex_unlock(&m->lock);
        return retval;
 }
index 703c1c3..e39fdec 100644 (file)
@@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
        if (!pmd_present(_pmd))
                goto out;
 
-       if (pmd_trans_huge(_pmd))
+       if (pmd_trans_huge(_pmd)) {
+               if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+                       ret = true;
                goto out;
+       }
 
        /*
         * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
@@ -328,6 +331,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
         */
        if (pte_none(*pte))
                ret = true;
+       if (!pte_write(*pte) && (reason & VM_UFFD_WP))
+               ret = true;
        pte_unmap(pte);
 
 out:
@@ -1287,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
        return 0;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma)
+static inline bool vma_can_userfault(struct vm_area_struct *vma,
+                                    unsigned long vm_flags)
 {
-       return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-               vma_is_shmem(vma);
+       /* FIXME: add WP support to hugetlbfs and shmem */
+       return vma_is_anonymous(vma) ||
+               ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
+                !(vm_flags & VM_UFFD_WP));
 }
 
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
@@ -1322,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
-       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+       if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
                vm_flags |= VM_UFFD_WP;
-               /*
-                * FIXME: remove the below error constraint by
-                * implementing the wprotect tracking mode.
-                */
-               ret = -EINVAL;
-               goto out;
-       }
 
        ret = validate_range(mm, &uffdio_register.range.start,
                             uffdio_register.range.len);
@@ -1380,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
                /* check not compatible vmas */
                ret = -EINVAL;
-               if (!vma_can_userfault(cur))
+               if (!vma_can_userfault(cur, vm_flags))
                        goto out_unlock;
 
                /*
@@ -1408,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                        if (end & (vma_hpagesize - 1))
                                goto out_unlock;
                }
+               if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
+                       goto out_unlock;
 
                /*
                 * Check that this vma isn't already owned by a
@@ -1437,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        do {
                cond_resched();
 
-               BUG_ON(!vma_can_userfault(vma));
+               BUG_ON(!vma_can_userfault(vma, vm_flags));
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);
                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1492,14 +1495,24 @@ out_unlock:
        up_write(&mm->mmap_sem);
        mmput(mm);
        if (!ret) {
+               __u64 ioctls_out;
+
+               ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
+                   UFFD_API_RANGE_IOCTLS;
+
+               /*
+                * Declare the WP ioctl only if the WP mode is
+                * specified and all checks passed with the range
+                */
+               if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
+                       ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
+
                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctls methods are guaranteed to
                 * succeed on this range.
                 */
-               if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
-                            UFFD_API_RANGE_IOCTLS,
-                            &user_uffdio_register->ioctls))
+               if (put_user(ioctls_out, &user_uffdio_register->ioctls))
                        ret = -EFAULT;
        }
 out:
@@ -1575,7 +1588,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
-               if (!vma_can_userfault(cur))
+               if (!vma_can_userfault(cur, cur->vm_flags))
                        goto out_unlock;
 
                found = true;
@@ -1589,7 +1602,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
        do {
                cond_resched();
 
-               BUG_ON(!vma_can_userfault(vma));
+               BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
 
                /*
                 * Nothing to do: this vma is already registered into this
@@ -1724,11 +1737,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
        ret = -EINVAL;
        if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
                goto out;
-       if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+       if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
                goto out;
        if (mmget_not_zero(ctx->mm)) {
                ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-                                  uffdio_copy.len, &ctx->mmap_changing);
+                                  uffdio_copy.len, &ctx->mmap_changing,
+                                  uffdio_copy.mode);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
@@ -1801,6 +1815,53 @@ out:
        return ret;
 }
 
+static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
+                                   unsigned long arg)
+{
+       int ret;
+       struct uffdio_writeprotect uffdio_wp;
+       struct uffdio_writeprotect __user *user_uffdio_wp;
+       struct userfaultfd_wake_range range;
+       bool mode_wp, mode_dontwake;
+
+       if (READ_ONCE(ctx->mmap_changing))
+               return -EAGAIN;
+
+       user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
+
+       if (copy_from_user(&uffdio_wp, user_uffdio_wp,
+                          sizeof(struct uffdio_writeprotect)))
+               return -EFAULT;
+
+       ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+                            uffdio_wp.range.len);
+       if (ret)
+               return ret;
+
+       if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
+                              UFFDIO_WRITEPROTECT_MODE_WP))
+               return -EINVAL;
+
+       mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
+       mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+
+       if (mode_wp && mode_dontwake)
+               return -EINVAL;
+
+       ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+                                 uffdio_wp.range.len, mode_wp,
+                                 &ctx->mmap_changing);
+       if (ret)
+               return ret;
+
+       if (!mode_wp && !mode_dontwake) {
+               range.start = uffdio_wp.range.start;
+               range.len = uffdio_wp.range.len;
+               wake_userfault(ctx, &range);
+       }
+       return ret;
+}
+
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
        /*
@@ -1882,6 +1943,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
        case UFFDIO_ZEROPAGE:
                ret = userfaultfd_zeropage(ctx, arg);
                break;
+       case UFFDIO_WRITEPROTECT:
+               ret = userfaultfd_writeprotect(ctx, arg);
+               break;
        }
        return ret;
 }
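
For orientation, a sketch of how userspace might exercise the new write-protect mode end to end: create a userfaultfd, register an anonymous, page-aligned range with UFFDIO_REGISTER_MODE_WP, then arm protection with UFFDIO_WRITEPROTECT. Error handling is minimal, the helper name is made up, and the snippet assumes a <linux/userfaultfd.h> that already carries the new UAPI; it is an illustration, not part of the patch.

	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* addr/len must be page-aligned and backed by anonymous memory */
	static int uffd_wp_range(void *addr, unsigned long len)
	{
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)addr, .len = len },
			.mode  = UFFDIO_REGISTER_MODE_WP,
		};
		struct uffdio_writeprotect wp = {
			.range = { .start = (unsigned long)addr, .len = len },
			.mode  = UFFDIO_WRITEPROTECT_MODE_WP,	/* arm write protection */
		};

		if (uffd < 0 ||
		    ioctl(uffd, UFFDIO_API, &api) ||
		    ioctl(uffd, UFFDIO_REGISTER, &reg) ||
		    ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
			return -1;

		/*
		 * Write faults in [addr, addr + len) are now reported on uffd.
		 * Clearing protection later means calling UFFDIO_WRITEPROTECT
		 * again with .mode = 0, which also wakes blocked faulters.
		 */
		return uffd;
	}
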
index 00266de..c526c5e 100644 (file)
@@ -328,6 +328,38 @@ xfs_validate_sb_common(
                return -EFSCORRUPTED;
        }
 
+       /* Validate the realtime geometry; stolen from xfs_repair */
+       if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+           sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+               xfs_notice(mp,
+                       "realtime extent sanity check failed");
+               return -EFSCORRUPTED;
+       }
+
+       if (sbp->sb_rblocks == 0) {
+               if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+                   sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+                       xfs_notice(mp,
+                               "realtime zeroed geometry check failed");
+                       return -EFSCORRUPTED;
+               }
+       } else {
+               uint64_t        rexts;
+               uint64_t        rbmblocks;
+
+               rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+               rbmblocks = howmany_64(sbp->sb_rextents,
+                                      NBBY * sbp->sb_blocksize);
+
+               if (sbp->sb_rextents != rexts ||
+                   sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+                   sbp->sb_rbmblocks != rbmblocks) {
+                       xfs_notice(mp,
+                               "realtime geometry sanity check failed");
+                       return -EFSCORRUPTED;
+               }
+       }
+
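
As a concrete illustration of the relationships the new check enforces (sb_rextents = sb_rblocks / sb_rextsize, sb_rbmblocks = howmany(sb_rextents, NBBY * sb_blocksize), sb_rextslog = index of the highest set bit of sb_rextents), here is a small standalone calculation with made-up geometry values; the howmany_64() macro below merely reimplements the idea for the example and is not the kernel definition.

	#include <stdint.h>
	#include <stdio.h>

	#define NBBY			8
	#define howmany_64(x, y)	(((x) + ((y) - 1)) / (y))	/* illustrative */

	int main(void)
	{
		uint64_t rblocks = 1000000, rextsize = 16, blocksize = 4096;
		uint64_t rextents = rblocks / rextsize;				/* 62500 */
		uint64_t rbmblocks = howmany_64(rextents, NBBY * blocksize);	/* 2 */
		int rextslog = 63 - __builtin_clzll(rextents);			/* 15 */

		printf("expect sb_rextents=%llu sb_rbmblocks=%llu sb_rextslog=%d\n",
		       (unsigned long long)rextents,
		       (unsigned long long)rbmblocks, rextslog);
		return 0;
	}
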
        if (sbp->sb_unit) {
                if (!xfs_sb_version_hasdalign(sbp) ||
                    sbp->sb_unit > sbp->sb_width ||
index f880141..9ec3eaf 100644 (file)
@@ -327,6 +327,9 @@ xfs_buf_free(
 
                        __free_page(page);
                }
+               if (current->reclaim_state)
+                       current->reclaim_state->reclaimed_slab +=
+                                                       bp->b_page_count;
        } else if (bp->b_flags & _XBF_KMEM)
                kmem_free(bp->b_addr);
        _xfs_buf_free_pages(bp);
@@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf(
 int __init
 xfs_buf_init(void)
 {
-       xfs_buf_zone = kmem_cache_create("xfs_buf",
-                                        sizeof(struct xfs_buf), 0,
-                                        SLAB_HWCACHE_ALIGN, NULL);
+       xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+                                        SLAB_HWCACHE_ALIGN |
+                                        SLAB_RECLAIM_ACCOUNT |
+                                        SLAB_MEM_SPREAD,
+                                        NULL);
        if (!xfs_buf_zone)
                goto out;
 
index 711376c..af2c8e5 100644 (file)
@@ -1105,8 +1105,8 @@ xfs_qm_dqflush(
         * Get the buffer containing the on-disk dquot
         */
        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp,
-                                  &xfs_dquot_buf_ops);
+                                  mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+                                  &bp, &xfs_dquot_buf_ops);
        if (error)
                goto out_unlock;
 
@@ -1177,7 +1177,7 @@ xfs_qm_dqflush(
 
 out_unlock:
        xfs_dqfunlock(dqp);
-       return -EIO;
+       return error;
 }
 
 /*
index cf65e2e..baad174 100644 (file)
@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
                if (!xfs_buf_delwri_queue(bp, buffer_list))
                        rval = XFS_ITEM_FLUSHING;
                xfs_buf_relse(bp);
-       }
+       } else if (error == -EAGAIN)
+               rval = XFS_ITEM_LOCKED;
 
        spin_lock(&lip->li_ailp->ail_lock);
 out_unlock:
index f1372f9..5a4b011 100644 (file)
@@ -15,7 +15,6 @@
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
 #include "xfs_icache.h"
-#include "xfs_log.h"
 #include "xfs_pnfs.h"
 
 /*
@@ -221,18 +220,7 @@ STATIC int
 xfs_fs_nfs_commit_metadata(
        struct inode            *inode)
 {
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_lsn_t               lsn = 0;
-
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (xfs_ipincount(ip))
-               lsn = ip->i_itemp->ili_last_lsn;
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       if (!lsn)
-               return 0;
-       return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+       return xfs_log_force_inode(XFS_I(inode));
 }
 
 const struct export_operations xfs_export_operations = {
index b8a4a3f..4b8bdec 100644 (file)
@@ -80,19 +80,9 @@ xfs_dir_fsync(
        int                     datasync)
 {
        struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_lsn_t               lsn = 0;
 
        trace_xfs_dir_fsync(ip);
-
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (xfs_ipincount(ip))
-               lsn = ip->i_itemp->ili_last_lsn;
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       if (!lsn)
-               return 0;
-       return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+       return xfs_log_force_inode(ip);
 }
 
 STATIC int
@@ -1069,7 +1059,11 @@ xfs_file_remap_range(
 
        ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
                        remap_flags);
+       if (ret)
+               goto out_unlock;
 
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_log_force_inode(dest);
 out_unlock:
        xfs_reflink_remap_unlock(file_in, file_out);
        if (ret)
index 14b922f..d177278 100644 (file)
@@ -1200,8 +1200,7 @@ xfs_create(
        unlock_dp_on_error = false;
 
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
-                                  resblks ?
-                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+                                       resblks - XFS_IALLOC_SPACE_RES(mp));
        if (error) {
                ASSERT(error != -ENOSPC);
                goto out_trans_cancel;
@@ -2504,6 +2503,88 @@ out:
 }
 
 /*
+ * Look up the inode number specified and mark it stale if it is found. If it is
+ * dirty, return the inode so it can be attached to the cluster buffer and
+ * processed appropriately when the cluster free transaction completes.
+ */
+static struct xfs_inode *
+xfs_ifree_get_one_inode(
+       struct xfs_perag        *pag,
+       struct xfs_inode        *free_ip,
+       xfs_ino_t               inum)
+{
+       struct xfs_mount        *mp = pag->pag_mount;
+       struct xfs_inode        *ip;
+
+retry:
+       rcu_read_lock();
+       ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+       /* Inode not in memory, nothing to do */
+       if (!ip)
+               goto out_rcu_unlock;
+
+       /*
+        * because this is an RCU protected lookup, we could find a recently
+        * freed or even reallocated inode during the lookup. We need to check
+        * under the i_flags_lock for a valid inode here. Skip it if it is not
+        * valid, the wrong inode or stale.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
+               spin_unlock(&ip->i_flags_lock);
+               goto out_rcu_unlock;
+       }
+       spin_unlock(&ip->i_flags_lock);
+
+       /*
+        * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+        * other inodes that we did not find in the list attached to the buffer
+        * and are not already marked stale. If we can't lock it, back off and
+        * retry.
+        */
+       if (ip != free_ip) {
+               if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                       rcu_read_unlock();
+                       delay(1);
+                       goto retry;
+               }
+
+               /*
+                * Check the inode number again in case we're racing with
+                * freeing in xfs_reclaim_inode().  See the comments in that
+                * function for more information as to why the initial check is
+                * not sufficient.
+                */
+               if (ip->i_ino != inum) {
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                       goto out_rcu_unlock;
+               }
+       }
+       rcu_read_unlock();
+
+       xfs_iflock(ip);
+       xfs_iflags_set(ip, XFS_ISTALE);
+
+       /*
+        * We don't need to attach clean inodes or those only with unlogged
+        * changes (which we throw away, anyway).
+        */
+       if (!ip->i_itemp || xfs_inode_clean(ip)) {
+               ASSERT(ip != free_ip);
+               xfs_ifunlock(ip);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               goto out_no_inode;
+       }
+       return ip;
+
+out_rcu_unlock:
+       rcu_read_unlock();
+out_no_inode:
+       return NULL;
+}
+
+/*
  * A big issue when freeing the inode cluster is that we _cannot_ skip any
  * inodes that are in memory - they all must be marked stale and attached to
  * the cluster buffer.
@@ -2603,77 +2684,11 @@ xfs_ifree_cluster(
                 * even trying to lock them.
                 */
                for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
-                       rcu_read_lock();
-                       ip = radix_tree_lookup(&pag->pag_ici_root,
-                                       XFS_INO_TO_AGINO(mp, (inum + i)));
-
-                       /* Inode not in memory, nothing to do */
-                       if (!ip) {
-                               rcu_read_unlock();
+                       ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
+                       if (!ip)
                                continue;
-                       }
-
-                       /*
-                        * because this is an RCU protected lookup, we could
-                        * find a recently freed or even reallocated inode
-                        * during the lookup. We need to check under the
-                        * i_flags_lock for a valid inode here. Skip it if it
-                        * is not valid, the wrong inode or stale.
-                        */
-                       spin_lock(&ip->i_flags_lock);
-                       if (ip->i_ino != inum + i ||
-                           __xfs_iflags_test(ip, XFS_ISTALE)) {
-                               spin_unlock(&ip->i_flags_lock);
-                               rcu_read_unlock();
-                               continue;
-                       }
-                       spin_unlock(&ip->i_flags_lock);
-
-                       /*
-                        * Don't try to lock/unlock the current inode, but we
-                        * _cannot_ skip the other inodes that we did not find
-                        * in the list attached to the buffer and are not
-                        * already marked stale. If we can't lock it, back off
-                        * and retry.
-                        */
-                       if (ip != free_ip) {
-                               if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                                       rcu_read_unlock();
-                                       delay(1);
-                                       goto retry;
-                               }
-
-                               /*
-                                * Check the inode number again in case we're
-                                * racing with freeing in xfs_reclaim_inode().
-                                * See the comments in that function for more
-                                * information as to why the initial check is
-                                * not sufficient.
-                                */
-                               if (ip->i_ino != inum + i) {
-                                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                       rcu_read_unlock();
-                                       continue;
-                               }
-                       }
-                       rcu_read_unlock();
-
-                       xfs_iflock(ip);
-                       xfs_iflags_set(ip, XFS_ISTALE);
 
-                       /*
-                        * we don't need to attach clean inodes or those only
-                        * with unlogged changes (which we throw away, anyway).
-                        */
                        iip = ip->i_itemp;
-                       if (!iip || xfs_inode_clean(ip)) {
-                               ASSERT(ip != free_ip);
-                               xfs_ifunlock(ip);
-                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                               continue;
-                       }
-
                        iip->ili_last_fields = iip->ili_fields;
                        iip->ili_fields = 0;
                        iip->ili_fsync_fields = 0;
@@ -3930,3 +3945,22 @@ xfs_irele(
        trace_xfs_irele(ip, _RET_IP_);
        iput(VFS_I(ip));
 }
+
+/*
+ * Ensure all committed transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+       struct xfs_inode        *ip)
+{
+       xfs_lsn_t               lsn = 0;
+
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       if (xfs_ipincount(ip))
+               lsn = ip->i_itemp->ili_last_lsn;
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       if (!lsn)
+               return 0;
+       return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
index 492e539..c6a63f6 100644 (file)
@@ -426,6 +426,7 @@ int         xfs_itruncate_extents_flags(struct xfs_trans **,
                                struct xfs_inode *, int, xfs_fsize_t, int);
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
 
+int            xfs_log_force_inode(struct xfs_inode *ip);
 void           xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
 
index 4a3d13d..f779cca 100644 (file)
@@ -552,7 +552,8 @@ xfs_inode_item_push(
                if (!xfs_buf_delwri_queue(bp, buffer_list))
                        rval = XFS_ITEM_FLUSHING;
                xfs_buf_relse(bp);
-       }
+       } else if (error == -EAGAIN)
+               rval = XFS_ITEM_LOCKED;
 
        spin_lock(&lip->li_ailp->ail_lock);
 out_unlock:
@@ -730,29 +731,27 @@ xfs_iflush_done(
         * holding the lock before removing the inode from the AIL.
         */
        if (need_ail) {
-               bool                    mlip_changed = false;
+               xfs_lsn_t       tail_lsn = 0;
 
                /* this is an opencoded batch version of xfs_trans_ail_delete */
                spin_lock(&ailp->ail_lock);
                list_for_each_entry(blip, &tmp, li_bio_list) {
                        if (INODE_ITEM(blip)->ili_logged &&
-                           blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
-                               mlip_changed |= xfs_ail_delete_one(ailp, blip);
-                       else {
+                           blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
+                               /*
+                                * xfs_ail_update_finish() only cares about the
+                                * lsn of the first tail item removed, any
+                                * others will be at the same or higher lsn so
+                                * we just ignore them.
+                                */
+                               xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
+                               if (!tail_lsn && lsn)
+                                       tail_lsn = lsn;
+                       } else {
                                xfs_clear_li_failed(blip);
                        }
                }
-
-               if (mlip_changed) {
-                       if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
-                               xlog_assign_tail_lsn_locked(ailp->ail_mount);
-                       if (list_empty(&ailp->ail_head))
-                               wake_up_all(&ailp->ail_empty);
-               }
-               spin_unlock(&ailp->ail_lock);
-
-               if (mlip_changed)
-                       xfs_log_space_wake(ailp->ail_mount);
+               xfs_ail_update_finish(ailp, tail_lsn);
        }
 
        /*
index 4a53768..00fda2e 100644 (file)
 kmem_zone_t    *xfs_log_ticket_zone;
 
 /* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
-       struct xlog             *log,
-       struct xlog_ticket      *ticket,
-       struct xlog_in_core     **iclog,
-       xfs_lsn_t               *commitlsnp);
-
 STATIC struct xlog *
 xlog_alloc_log(
        struct xfs_mount        *mp,
@@ -66,14 +59,6 @@ xlog_grant_push_ail(
        struct xlog             *log,
        int                     need_bytes);
 STATIC void
-xlog_regrant_reserve_log_space(
-       struct xlog             *log,
-       struct xlog_ticket      *ticket);
-STATIC void
-xlog_ungrant_log_space(
-       struct xlog             *log,
-       struct xlog_ticket      *ticket);
-STATIC void
 xlog_sync(
        struct xlog             *log,
        struct xlog_in_core     *iclog);
@@ -478,73 +463,6 @@ out_error:
        return error;
 }
 
-
-/*
- * NOTES:
- *
- *     1. currblock field gets updated at startup and after in-core logs
- *             marked as with WANT_SYNC.
- */
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation.  If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket.  If the ticket was one with a permanent reservation, then
- * a few operations are done differently.  Permanent reservation tickets by
- * default don't release the reservation.  They just commit the current
- * transaction with the belief that the reservation is still needed.  A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again.  By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
-       struct xfs_mount        *mp,
-       struct xlog_ticket      *ticket,
-       struct xlog_in_core     **iclog,
-       bool                    regrant)
-{
-       struct xlog             *log = mp->m_log;
-       xfs_lsn_t               lsn = 0;
-
-       if (XLOG_FORCED_SHUTDOWN(log) ||
-           /*
-            * If nothing was ever written, don't write out commit record.
-            * If we get an error, just continue and give back the log ticket.
-            */
-           (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
-            (xlog_commit_record(log, ticket, iclog, &lsn)))) {
-               lsn = (xfs_lsn_t) -1;
-               regrant = false;
-       }
-
-
-       if (!regrant) {
-               trace_xfs_log_done_nonperm(log, ticket);
-
-               /*
-                * Release ticket if not permanent reservation or a specific
-                * request has been made to release a permanent reservation.
-                */
-               xlog_ungrant_log_space(log, ticket);
-       } else {
-               trace_xfs_log_done_perm(log, ticket);
-
-               xlog_regrant_reserve_log_space(log, ticket);
-               /* If this ticket was a permanent reservation and we aren't
-                * trying to release it, reset the inited flags; so next time
-                * we write, a start record will be written out.
-                */
-               ticket->t_flags |= XLOG_TIC_INITED;
-       }
-
-       xfs_log_ticket_put(ticket);
-       return lsn;
-}
-
 static bool
 __xlog_state_release_iclog(
        struct xlog             *log,
@@ -869,32 +787,44 @@ xlog_wait_on_iclog(
 }
 
 /*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens.  Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
  */
-
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
-       struct xfs_mount        *mp)
+static int
+xlog_write_unmount_record(
+       struct xlog             *log,
+       struct xlog_ticket      *ticket,
+       xfs_lsn_t               *lsn,
+       uint                    flags)
 {
-       /* the data section must be 32 bit size aligned */
-       struct xfs_unmount_log_format magic = {
+       struct xfs_unmount_log_format ulf = {
                .magic = XLOG_UNMOUNT_TYPE,
        };
        struct xfs_log_iovec reg = {
-               .i_addr = &magic,
-               .i_len = sizeof(magic),
+               .i_addr = &ulf,
+               .i_len = sizeof(ulf),
                .i_type = XLOG_REG_TYPE_UNMOUNT,
        };
        struct xfs_log_vec vec = {
                .lv_niovecs = 1,
                .lv_iovecp = &reg,
        };
-       struct xlog             *log = mp->m_log;
+
+       /* account for space used by record data */
+       ticket->t_curr_res -= sizeof(ulf);
+       return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+       struct xlog             *log)
+{
+       struct xfs_mount        *mp = log->l_mp;
        struct xlog_in_core     *iclog;
        struct xlog_ticket      *tic = NULL;
        xfs_lsn_t               lsn;
@@ -905,23 +835,7 @@ xfs_log_write_unmount_record(
        if (error)
                goto out_err;
 
-       /*
-        * If we think the summary counters are bad, clear the unmount header
-        * flag in the unmount record so that the summary counters will be
-        * recalculated during log recovery at next mount.  Refer to
-        * xlog_check_unmount_rec for more details.
-        */
-       if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
-                       XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
-               xfs_alert(mp, "%s: will fix summary counters at next mount",
-                               __func__);
-               flags &= ~XLOG_UNMOUNT_TRANS;
-       }
-
-       /* remove inited flag, and account for space used */
-       tic->t_flags = 0;
-       tic->t_curr_res -= sizeof(magic);
-       error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+       error = xlog_write_unmount_record(log, tic, &lsn, flags);
        /*
         * At this point, we're umounting anyway, so there's no point in
         * transitioning log state to IOERROR. Just continue...
@@ -943,8 +857,7 @@ out_err:
 
        if (tic) {
                trace_xfs_log_umount_write(log, tic);
-               xlog_ungrant_log_space(log, tic);
-               xfs_log_ticket_put(tic);
+               xfs_log_ticket_ungrant(log, tic);
        }
 }
 
@@ -987,8 +900,22 @@ xfs_log_unmount_write(
 
        if (XLOG_FORCED_SHUTDOWN(log))
                return;
+
+       /*
+        * If we think the summary counters are bad, avoid writing the unmount
+        * record to force log recovery at next mount, after which the summary
+        * counters will be recalculated.  Refer to xlog_check_unmount_rec for
+        * more details.
+        */
+       if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+                       XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+               xfs_alert(mp, "%s: will fix summary counters at next mount",
+                               __func__);
+               return;
+       }
+
        xfs_log_unmount_verify_iclog(log);
-       xfs_log_write_unmount_record(mp);
+       xlog_unmount_write(log);
 }
 
 /*
@@ -1515,20 +1442,17 @@ out:
        return ERR_PTR(error);
 }      /* xlog_alloc_log */
 
-
 /*
  * Write out the commit record of a transaction associated with the given
- * ticket Return the lsn of the commit record.
+ * ticket to close off a running log write. Return the lsn of the commit record.
  */
-STATIC int
+int
 xlog_commit_record(
        struct xlog             *log,
        struct xlog_ticket      *ticket,
        struct xlog_in_core     **iclog,
-       xfs_lsn_t               *commitlsnp)
+       xfs_lsn_t               *lsn)
 {
-       struct xfs_mount *mp = log->l_mp;
-       int     error;
        struct xfs_log_iovec reg = {
                .i_addr = NULL,
                .i_len = 0,
@@ -1538,12 +1462,15 @@ xlog_commit_record(
                .lv_niovecs = 1,
                .lv_iovecp = &reg,
        };
+       int     error;
+
+       if (XLOG_FORCED_SHUTDOWN(log))
+               return -EIO;
 
-       ASSERT_ALWAYS(iclog);
-       error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
-                                       XLOG_COMMIT_TRANS);
+       error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
+                          false);
        if (error)
-               xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+               xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
        return error;
 }
 
@@ -1761,7 +1688,15 @@ xlog_write_iclog(
        iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
        iclog->ic_bio.bi_end_io = xlog_bio_end_io;
        iclog->ic_bio.bi_private = iclog;
-       iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+
+       /*
+        * We use REQ_SYNC | REQ_IDLE here to tell the block layer there are more
+        * IOs coming immediately after this one. This prevents the block layer
+        * writeback throttle from throttling log writes behind background
+        * metadata writeback and causing priority inversions.
+        */
+       iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
+                               REQ_IDLE | REQ_FUA;
        if (need_flush)
                iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
 
@@ -1981,7 +1916,7 @@ xlog_dealloc_log(
        log->l_mp->m_log = NULL;
        destroy_workqueue(log->l_ioend_workqueue);
        kmem_free(log);
-}      /* xlog_dealloc_log */
+}
 
 /*
  * Update counters atomically now that memcpy is done.
@@ -2118,23 +2053,21 @@ xlog_print_trans(
 }
 
 /*
- * Calculate the potential space needed by the log vector.  Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
+ * Calculate the potential space needed by the log vector.  We may need a start
+ * record, and each region gets its own struct xlog_op_header and may need to be
+ * double word aligned.
  */
 static int
 xlog_write_calc_vec_length(
        struct xlog_ticket      *ticket,
-       struct xfs_log_vec      *log_vector)
+       struct xfs_log_vec      *log_vector,
+       bool                    need_start_rec)
 {
        struct xfs_log_vec      *lv;
-       int                     headers = 0;
+       int                     headers = need_start_rec ? 1 : 0;
        int                     len = 0;
        int                     i;
 
-       /* acct for start rec of xact */
-       if (ticket->t_flags & XLOG_TIC_INITED)
-               headers++;
-
        for (lv = log_vector; lv; lv = lv->lv_next) {
                /* we don't write ordered log vectors */
                if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
@@ -2156,27 +2089,16 @@ xlog_write_calc_vec_length(
        return len;
 }
 
-/*
- * If first write for transaction, insert start record  We can't be trying to
- * commit if we are inited.  We can't have any "partial_copy" if we are inited.
- */
-static int
+static void
 xlog_write_start_rec(
        struct xlog_op_header   *ophdr,
        struct xlog_ticket      *ticket)
 {
-       if (!(ticket->t_flags & XLOG_TIC_INITED))
-               return 0;
-
        ophdr->oh_tid   = cpu_to_be32(ticket->t_tid);
        ophdr->oh_clientid = ticket->t_clientid;
        ophdr->oh_len = 0;
        ophdr->oh_flags = XLOG_START_TRANS;
        ophdr->oh_res2 = 0;
-
-       ticket->t_flags &= ~XLOG_TIC_INITED;
-
-       return sizeof(struct xlog_op_header);
 }
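
For illustration, a minimal userspace sketch of the length accounting described in the hunk above: every region pays for one op header, and the optional start record adds one more header on top. The 12-byte header size and the region sizes are assumptions for the example, not on-disk values.

#include <stdio.h>
#include <stdbool.h>

#define OP_HDR_LEN 12   /* assumed op header size for illustration */

/* Sum region payloads plus one header per region, plus an optional
 * start-record header, mirroring the need_start_rec accounting above. */
static int calc_vec_length(const int *regions, int nregions, bool need_start_rec)
{
        int headers = need_start_rec ? 1 : 0;
        int len = 0;

        for (int i = 0; i < nregions; i++) {
                headers++;
                len += regions[i];
        }
        return len + headers * OP_HDR_LEN;
}

int main(void)
{
        int regions[] = { 128, 256, 64 };

        printf("with start record:    %d bytes\n", calc_vec_length(regions, 3, true));
        printf("without start record: %d bytes\n", calc_vec_length(regions, 3, false));
        return 0;
}
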
 
 static xlog_op_header_t *
@@ -2365,13 +2287,14 @@ xlog_write(
        struct xlog_ticket      *ticket,
        xfs_lsn_t               *start_lsn,
        struct xlog_in_core     **commit_iclog,
-       uint                    flags)
+       uint                    flags,
+       bool                    need_start_rec)
 {
        struct xlog_in_core     *iclog = NULL;
-       struct xfs_log_iovec    *vecp;
-       struct xfs_log_vec      *lv;
+       struct xfs_log_vec      *lv = log_vector;
+       struct xfs_log_iovec    *vecp = lv->lv_iovecp;
+       int                     index = 0;
        int                     len;
-       int                     index;
        int                     partial_copy = 0;
        int                     partial_copy_len = 0;
        int                     contwr = 0;
@@ -2379,25 +2302,13 @@ xlog_write(
        int                     data_cnt = 0;
        int                     error = 0;
 
-       *start_lsn = 0;
-
-       len = xlog_write_calc_vec_length(ticket, log_vector);
-
        /*
-        * Region headers and bytes are already accounted for.
-        * We only need to take into account start records and
-        * split regions in this function.
+        * If this is a commit or unmount transaction, we don't need a start
+        * record to be written.  We do, however, have to account for the
+        * commit or unmount header that gets written. Hence we always have
+        * to account for an extra xlog_op_header here.
         */
-       if (ticket->t_flags & XLOG_TIC_INITED)
-               ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
-       /*
-        * Commit record headers need to be accounted for. These
-        * come in as separate writes so are easy to detect.
-        */
-       if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
-               ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
+       ticket->t_curr_res -= sizeof(struct xlog_op_header);
        if (ticket->t_curr_res < 0) {
                xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
                     "ctx ticket reservation ran out. Need to up reservation");
@@ -2405,9 +2316,8 @@ xlog_write(
                xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
        }
 
-       index = 0;
-       lv = log_vector;
-       vecp = lv->lv_iovecp;
+       len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
+       *start_lsn = 0;
        while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
                void            *ptr;
                int             log_offset;
@@ -2431,7 +2341,6 @@ xlog_write(
                while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
                        struct xfs_log_iovec    *reg;
                        struct xlog_op_header   *ophdr;
-                       int                     start_rec_copy;
                        int                     copy_len;
                        int                     copy_off;
                        bool                    ordered = false;
@@ -2447,11 +2356,15 @@ xlog_write(
                        ASSERT(reg->i_len % sizeof(int32_t) == 0);
                        ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
 
-                       start_rec_copy = xlog_write_start_rec(ptr, ticket);
-                       if (start_rec_copy) {
-                               record_cnt++;
+                       /*
+                        * Before we start formatting log vectors, we need to
+                        * write a start record. Only do this for the first
+                        * iclog we write to.
+                        */
+                       if (need_start_rec) {
+                               xlog_write_start_rec(ptr, ticket);
                                xlog_write_adv_cnt(&ptr, &len, &log_offset,
-                                                  start_rec_copy);
+                                               sizeof(struct xlog_op_header));
                        }
 
                        ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
@@ -2483,8 +2396,13 @@ xlog_write(
                                xlog_write_adv_cnt(&ptr, &len, &log_offset,
                                                   copy_len);
                        }
-                       copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+                       copy_len += sizeof(struct xlog_op_header);
                        record_cnt++;
+                       if (need_start_rec) {
+                               copy_len += sizeof(struct xlog_op_header);
+                               record_cnt++;
+                               need_start_rec = false;
+                       }
                        data_cnt += contwr ? copy_len : 0;
 
                        error = xlog_write_copy_finish(log, iclog, flags,
@@ -2541,14 +2459,6 @@ next_lv:
        return error;
 }
 
-
-/*****************************************************************************
- *
- *             State Machine functions
- *
- *****************************************************************************
- */
-
 static void
 xlog_state_activate_iclog(
        struct xlog_in_core     *iclog,
@@ -2909,7 +2819,7 @@ xlog_state_done_syncing(
         */
        wake_up_all(&iclog->ic_write_wait);
        spin_unlock(&log->l_icloglock);
-       xlog_state_do_callback(log);    /* also cleans log */
+       xlog_state_do_callback(log);
 }
 
 /*
@@ -3029,21 +2939,21 @@ restart:
 
        *logoffsetp = log_offset;
        return 0;
-}      /* xlog_state_get_iclog_space */
-
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth.  Also
- * move grant reservation head forward.
+}
+
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount.  Release part of current permanent unit reservation and reset
+ * current reservation to be one unit's worth.  Also move the grant reservation head
+ * forward.
  */
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
        struct xlog             *log,
        struct xlog_ticket      *ticket)
 {
-       trace_xfs_log_regrant_reserve_enter(log, ticket);
+       trace_xfs_log_ticket_regrant(log, ticket);
 
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
@@ -3055,21 +2965,20 @@ xlog_regrant_reserve_log_space(
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
 
-       trace_xfs_log_regrant_reserve_sub(log, ticket);
+       trace_xfs_log_ticket_regrant_sub(log, ticket);
 
        /* just return if we still have some of the pre-reserved space */
-       if (ticket->t_cnt > 0)
-               return;
+       if (!ticket->t_cnt) {
+               xlog_grant_add_space(log, &log->l_reserve_head.grant,
+                                    ticket->t_unit_res);
+               trace_xfs_log_ticket_regrant_exit(log, ticket);
 
-       xlog_grant_add_space(log, &log->l_reserve_head.grant,
-                                       ticket->t_unit_res);
-
-       trace_xfs_log_regrant_reserve_exit(log, ticket);
-
-       ticket->t_curr_res = ticket->t_unit_res;
-       xlog_tic_reset_res(ticket);
-}      /* xlog_regrant_reserve_log_space */
+               ticket->t_curr_res = ticket->t_unit_res;
+               xlog_tic_reset_res(ticket);
+       }
 
+       xfs_log_ticket_put(ticket);
+}
 
 /*
  * Give back the space left from a reservation.
@@ -3085,18 +2994,19 @@ xlog_regrant_reserve_log_space(
  * space, the count will stay at zero and the only space remaining will be
  * in the current reservation field.
  */
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
        struct xlog             *log,
        struct xlog_ticket      *ticket)
 {
-       int     bytes;
+       int                     bytes;
+
+       trace_xfs_log_ticket_ungrant(log, ticket);
 
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
 
-       trace_xfs_log_ungrant_enter(log, ticket);
-       trace_xfs_log_ungrant_sub(log, ticket);
+       trace_xfs_log_ticket_ungrant_sub(log, ticket);
 
        /*
         * If this is a permanent reservation ticket, we may be able to free
@@ -3111,18 +3021,15 @@ xlog_ungrant_log_space(
        xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
        xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
 
-       trace_xfs_log_ungrant_exit(log, ticket);
+       trace_xfs_log_ticket_ungrant_exit(log, ticket);
 
        xfs_log_space_wake(log->l_mp);
+       xfs_log_ticket_put(ticket);
 }
 
 /*
- * Mark the current iclog in the ring as WANT_SYNC and move the current iclog
- * pointer to the next iclog in the ring.
- *
- * When called from xlog_state_get_iclog_space(), the exact size of the iclog
- * has not yet been determined, all we know is that we have run out of space in
- * the current iclog.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
  */
 STATIC void
 xlog_state_switch_iclogs(
@@ -3167,7 +3074,7 @@ xlog_state_switch_iclogs(
        }
        ASSERT(iclog == log->l_iclog);
        log->l_iclog = iclog->ic_next;
-}      /* xlog_state_switch_iclogs */
+}
 
 /*
  * Write out all data in the in-core log as of this exact moment in time.
@@ -3374,13 +3281,6 @@ xfs_log_force_lsn(
        return ret;
 }
 
-/*****************************************************************************
- *
- *             TICKET functions
- *
- *****************************************************************************
- */
-
 /*
  * Free a used ticket when its refcount falls to zero.
  */
@@ -3529,7 +3429,6 @@ xlog_ticket_alloc(
        tic->t_ocnt             = cnt;
        tic->t_tid              = prandom_u32();
        tic->t_clientid         = client;
-       tic->t_flags            = XLOG_TIC_INITED;
        if (permanent)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
 
@@ -3538,13 +3437,6 @@ xlog_ticket_alloc(
        return tic;
 }
 
-
-/******************************************************************************
- *
- *             Log debug routines
- *
- ******************************************************************************
- */
 #if defined(DEBUG)
 /*
  * Make sure that the destination ptr is within the valid data region of
@@ -3630,7 +3522,7 @@ xlog_verify_tail_lsn(
        if (blocks < BTOBB(iclog->ic_offset) + 1)
                xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
     }
-}      /* xlog_verify_tail_lsn */
+}
 
 /*
  * Perform a number of checks on the iclog before writing to disk.
@@ -3733,7 +3625,7 @@ xlog_verify_iclog(
                }
                ptr += sizeof(xlog_op_header_t) + op_len;
        }
-}      /* xlog_verify_iclog */
+}
 #endif
 
 /*
index cc77cc3..1412d69 100644 (file)
@@ -105,10 +105,6 @@ struct xfs_log_item;
 struct xfs_item_ops;
 struct xfs_trans;
 
-xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
-                      struct xlog_ticket *ticket,
-                      struct xlog_in_core **iclog,
-                      bool regrant);
 int      xfs_log_force(struct xfs_mount *mp, uint flags);
 int      xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
                int *log_forced);
index 64cc0bf..b43f0e8 100644 (file)
@@ -669,6 +669,11 @@ xlog_cil_push_work(
        ASSERT(push_seq <= ctx->sequence);
 
        /*
+        * Wake up any background push waiters now this context is being pushed.
+        */
+        * Wake up any background push waiters now that this context is being pushed.
+
+       /*
         * Check if we've anything to push. If there is nothing, then we don't
         * move on to a new sequence number and so we have to be able to push
         * this sequence again later.
@@ -744,6 +749,7 @@ xlog_cil_push_work(
         */
        INIT_LIST_HEAD(&new_ctx->committing);
        INIT_LIST_HEAD(&new_ctx->busy_extents);
+       init_waitqueue_head(&new_ctx->push_wait);
        new_ctx->sequence = ctx->sequence + 1;
        new_ctx->cil = cil;
        cil->xc_ctx = new_ctx;
@@ -801,7 +807,7 @@ xlog_cil_push_work(
        lvhdr.lv_iovecp = &lhdr;
        lvhdr.lv_next = ctx->lv_chain;
 
-       error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+       error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
        if (error)
                goto out_abort_free_ticket;
 
@@ -839,10 +845,11 @@ restart:
        }
        spin_unlock(&cil->xc_push_lock);
 
-       /* xfs_log_done always frees the ticket on error. */
-       commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
-       if (commit_lsn == -1)
-               goto out_abort;
+       error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
+       if (error)
+               goto out_abort_free_ticket;
+
+       xfs_log_ticket_ungrant(log, tic);
 
        spin_lock(&commit_iclog->ic_callback_lock);
        if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
@@ -875,7 +882,7 @@ out_skip:
        return;
 
 out_abort_free_ticket:
-       xfs_log_ticket_put(tic);
+       xfs_log_ticket_ungrant(log, tic);
 out_abort:
        ASSERT(XLOG_FORCED_SHUTDOWN(log));
        xlog_cil_committed(ctx);
@@ -890,7 +897,7 @@ out_abort:
  */
 static void
 xlog_cil_push_background(
-       struct xlog     *log)
+       struct xlog     *log) __releases(cil->xc_ctx_lock)
 {
        struct xfs_cil  *cil = log->l_cilp;
 
@@ -904,14 +911,36 @@ xlog_cil_push_background(
         * don't do a background push if we haven't used up all the
         * space available yet.
         */
-       if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+       if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+               up_read(&cil->xc_ctx_lock);
                return;
+       }
 
        spin_lock(&cil->xc_push_lock);
        if (cil->xc_push_seq < cil->xc_current_sequence) {
                cil->xc_push_seq = cil->xc_current_sequence;
                queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
        }
+
+        * Drop the context lock now; we can't hold it if we need to sleep
+        * Drop the context lock now, we can't hold that if we need to sleep
+        * because we are over the blocking threshold. The push_lock is still
+        * held, so blocking threshold sleep/wakeup is still correctly
+        * serialised here.
+        */
+       up_read(&cil->xc_ctx_lock);
+
+       /*
+        * If we are well over the space limit, throttle the work that is being
+        * done until the push work on this context has begun.
+        */
+       if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+               trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+               ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+               xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+               return;
+       }
+
        spin_unlock(&cil->xc_push_lock);
 
 }
@@ -1007,7 +1036,10 @@ xfs_log_commit_cil(
        if (commit_lsn)
                *commit_lsn = xc_commit_lsn;
 
-       xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+       if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+               xfs_log_ticket_regrant(log, tp->t_ticket);
+       else
+               xfs_log_ticket_ungrant(log, tp->t_ticket);
        tp->t_ticket = NULL;
        xfs_trans_unreserve_and_mod_sb(tp);
 
@@ -1028,9 +1060,9 @@ xfs_log_commit_cil(
                if (lip->li_ops->iop_committing)
                        lip->li_ops->iop_committing(lip, xc_commit_lsn);
        }
-       xlog_cil_push_background(log);
 
-       up_read(&cil->xc_ctx_lock);
+       /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+       xlog_cil_push_background(log);
 }
 
 /*
@@ -1189,6 +1221,7 @@ xlog_cil_init(
 
        INIT_LIST_HEAD(&ctx->committing);
        INIT_LIST_HEAD(&ctx->busy_extents);
+       init_waitqueue_head(&ctx->push_wait);
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
index 2b0aec3..ec22c7a 100644 (file)
@@ -51,13 +51,11 @@ enum xlog_iclog_state {
 };
 
 /*
- * Flags to log ticket
+ * Log ticket flags
  */
-#define XLOG_TIC_INITED                0x1     /* has been initialized */
-#define XLOG_TIC_PERM_RESERV   0x2     /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV   0x1     /* permanent reservation */
 
 #define XLOG_TIC_FLAGS \
-       { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
        { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
 
 /*
@@ -242,6 +240,7 @@ struct xfs_cil_ctx {
        struct xfs_log_vec      *lv_chain;      /* logvecs being pushed */
        struct list_head        iclog_entry;
        struct list_head        committing;     /* ctx committing list */
+       wait_queue_head_t       push_wait;      /* background push throttle */
        struct work_struct      discard_endio_work;
 };
 
@@ -318,13 +317,53 @@ struct xfs_cil {
  * tries to keep 25% of the log free, so we need to keep below that limit or we
  * risk running out of free log space to start any new transactions.
  *
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits.  A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ *   made without excessive blocking of incoming transaction commits. This is
+ *   defined to be 12.5% of the log space - half the 25% push threshold of the
+ *   AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ *   close to peak relogging efficiency. This is defined to be 16x the iclog
+ *   buffer window (32MB) as measurements have shown this to be roughly the
+ *   point of diminishing performance increases under highly concurrent
+ *   modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yield the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
  */
-#define XLOG_CIL_SPACE_LIMIT(log)      (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log)      \
+       min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log)     \
+       (XLOG_CIL_SPACE_LIMIT(log) * 2)
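
For a rough feel of how the two thresholds above scale with log size, a small userspace sketch follows. The 2 MB in-core log buffer window is an assumption for illustration; sixteen times that window gives the 32 MB cap mentioned in the comment.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* min(log/8, 16 * iclog window), mirroring XLOG_CIL_SPACE_LIMIT() above */
static long long cil_push_limit(long long log_size, long long iclog_window)
{
        return MIN(log_size >> 3, iclog_window << 4);
}

int main(void)
{
        long long window = 2LL << 20;           /* assumed 2 MB window */
        long long logs[] = { 64LL << 20, 512LL << 20, 2LL << 30 };

        for (int i = 0; i < 3; i++) {
                long long push = cil_push_limit(logs[i], window);

                printf("log %4lld MB: background push at %2lld MB, block at %2lld MB\n",
                       logs[i] >> 20, push >> 20, (push * 2) >> 20);
        }
        return 0;
}
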
 
 /*
 * ticket grant locks, queues and accounting have their own cachelines
@@ -439,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
 
 void   xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
 void   xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
-       struct xlog             *log,
-       struct xfs_log_vec      *log_vector,
-       struct xlog_ticket      *tic,
-       xfs_lsn_t               *start_lsn,
-       struct xlog_in_core     **commit_iclog,
-       uint                    flags);
+int    xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+               struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+               struct xlog_in_core **commit_iclog, uint flags,
+               bool need_start_rec);
+int    xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+               struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void   xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void   xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
 
 /*
  * When we crack an atomic LSN, we sample it first so that the value will not
index 88ab09e..50c4342 100644 (file)
@@ -167,6 +167,7 @@ typedef struct xfs_mount {
        struct xfs_kobj         m_error_meta_kobj;
        struct xfs_error_cfg    m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
        struct xstats           m_stats;        /* per-fs stats */
+       struct ratelimit_state  m_flush_inodes_ratelimit;
 
        struct workqueue_struct *m_buf_workqueue;
        struct workqueue_struct *m_unwritten_workqueue;
index cabdb75..c225691 100644 (file)
@@ -121,12 +121,11 @@ xfs_qm_dqpurge(
 {
        struct xfs_mount        *mp = dqp->q_mount;
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
+       int                     error = -EAGAIN;
 
        xfs_dqlock(dqp);
-       if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
-               xfs_dqunlock(dqp);
-               return -EAGAIN;
-       }
+       if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+               goto out_unlock;
 
        dqp->dq_flags |= XFS_DQ_FREEING;
 
@@ -139,7 +138,6 @@ xfs_qm_dqpurge(
         */
        if (XFS_DQ_IS_DIRTY(dqp)) {
                struct xfs_buf  *bp = NULL;
-               int             error;
 
                /*
                 * We don't care about getting disk errors here. We need
@@ -149,6 +147,8 @@ xfs_qm_dqpurge(
                if (!error) {
                        error = xfs_bwrite(bp);
                        xfs_buf_relse(bp);
+               } else if (error == -EAGAIN) {
+                       goto out_unlock;
                }
                xfs_dqflock(dqp);
        }
@@ -174,6 +174,10 @@ xfs_qm_dqpurge(
 
        xfs_qm_dqdestroy(dqp);
        return 0;
+
+out_unlock:
+       xfs_dqunlock(dqp);
+       return error;
 }
 
 /*
index 2094386..abf06bf 100644 (file)
@@ -528,6 +528,9 @@ xfs_flush_inodes(
 {
        struct super_block      *sb = mp->m_super;
 
+       if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
+               return;
+
        if (down_read_trylock(&sb->s_umount)) {
                sync_inodes_sb(sb);
                up_read(&sb->s_umount);
@@ -1366,6 +1369,17 @@ xfs_fc_fill_super(
        if (error)
                goto out_free_names;
 
+       /*
+        * Cap the number of invocations of xfs_flush_inodes to 16 for every
+        * quarter of a second.  The magic numbers here were determined by
+        * observation neither to cause stalls in writeback when there are a
+        * lot of IO threads and the fs is near ENOSPC, nor cause any fstest
+        * regressions.  YMMV.
+        */
+       ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
+       ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
+                       RATELIMIT_MSG_ON_RELEASE);
+
        error = xfs_init_mount_workqueues(mp);
        if (error)
                goto out_close_devices;
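
The hunk above caps xfs_flush_inodes() at 16 invocations per quarter second. A minimal userspace sketch of that fixed-window throttling follows; the struct and helpers are illustrative and are not the kernel's ratelimit implementation.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit {
        double interval;        /* window length in seconds */
        int burst;              /* calls allowed per window */
        double window_start;
        int used;
};

static double now_sec(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec + ts.tv_nsec / 1e9;
}

/* Allow at most 'burst' calls per 'interval'; callers drop the rest. */
static bool ratelimit_allow(struct ratelimit *rl)
{
        double now = now_sec();

        if (now - rl->window_start >= rl->interval) {
                rl->window_start = now;
                rl->used = 0;
        }
        return rl->used++ < rl->burst;
}

int main(void)
{
        /* 16 calls per quarter second, mirroring HZ / 4 and a burst of 16 */
        struct ratelimit rl = { 0.25, 16, now_sec(), 0 };
        int allowed = 0;

        for (int i = 0; i < 100; i++)
                if (ratelimit_allow(&rl))
                        allowed++;
        printf("allowed %d of 100 back-to-back calls\n", allowed);
        return 0;
}
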
@@ -1861,7 +1875,8 @@ xfs_init_zones(void)
 
        xfs_ili_zone = kmem_cache_create("xfs_ili",
                                         sizeof(struct xfs_inode_log_item), 0,
-                                        SLAB_MEM_SPREAD, NULL);
+                                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                        NULL);
        if (!xfs_ili_zone)
                goto out_destroy_inode_zone;
 
index fa0fa3c..13fb4b9 100644 (file)
@@ -176,7 +176,6 @@ xfs_symlink(
                return -ENAMETOOLONG;
        ASSERT(pathlen > 0);
 
-       udqp = gdqp = NULL;
        prid = xfs_get_initial_prid(dp);
 
        /*
index efc7751..a4323a6 100644 (file)
@@ -1001,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 DEFINE_EVENT(xfs_loggrant_class, name, \
        TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
        TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
 DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
@@ -1011,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
 DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
 
 DECLARE_EVENT_CLASS(xfs_log_item_class,
        TP_PROTO(struct xfs_log_item *lip),
index 1adc6bc..28b983f 100644 (file)
@@ -9,6 +9,7 @@
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
+#include "xfs_log_priv.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
 #include "xfs_extent_busy.h"
@@ -150,8 +151,9 @@ xfs_trans_reserve(
        uint                    blocks,
        uint                    rtextents)
 {
-       int             error = 0;
-       bool            rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+       struct xfs_mount        *mp = tp->t_mountp;
+       int                     error = 0;
+       bool                    rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
        /* Mark this thread as being in a transaction */
        current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -162,7 +164,7 @@ xfs_trans_reserve(
         * fail if the count would go below zero.
         */
        if (blocks > 0) {
-               error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+               error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
                if (error != 0) {
                        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
                        return -ENOSPC;
@@ -191,9 +193,9 @@ xfs_trans_reserve(
 
                if (tp->t_ticket != NULL) {
                        ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
-                       error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+                       error = xfs_log_regrant(mp, tp->t_ticket);
                } else {
-                       error = xfs_log_reserve(tp->t_mountp,
+                       error = xfs_log_reserve(mp,
                                                resp->tr_logres,
                                                resp->tr_logcount,
                                                &tp->t_ticket, XFS_TRANSACTION,
@@ -213,7 +215,7 @@ xfs_trans_reserve(
         * fail if the count would go below zero.
         */
        if (rtextents > 0) {
-               error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+               error = xfs_mod_frextents(mp, -((int64_t)rtextents));
                if (error) {
                        error = -ENOSPC;
                        goto undo_log;
@@ -229,7 +231,7 @@ xfs_trans_reserve(
         */
 undo_log:
        if (resp->tr_logres > 0) {
-               xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
+               xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
                tp->t_ticket = NULL;
                tp->t_log_res = 0;
                tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -237,7 +239,7 @@ undo_log:
 
 undo_blocks:
        if (blocks > 0) {
-               xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+               xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
                tp->t_blk_res = 0;
        }
 
@@ -1004,9 +1006,10 @@ out_unreserve:
         */
        xfs_trans_unreserve_and_mod_dquots(tp);
        if (tp->t_ticket) {
-               commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
-               if (commit_lsn == -1 && !error)
-                       error = -EIO;
+               if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
+                       xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+               else
+                       xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
                tp->t_ticket = NULL;
        }
        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1065,7 +1068,7 @@ xfs_trans_cancel(
        xfs_trans_unreserve_and_mod_dquots(tp);
 
        if (tp->t_ticket) {
-               xfs_log_done(mp, tp->t_ticket, NULL, false);
+               xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
                tp->t_ticket = NULL;
        }
 
index 2ef0dfb..5642535 100644 (file)
@@ -109,17 +109,25 @@ xfs_ail_next(
  * We need the AIL lock in order to get a coherent read of the lsn of the last
  * item in the AIL.
  */
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+       struct xfs_ail          *ailp)
+{
+       struct xfs_log_item     *lip = xfs_ail_min(ailp);
+
+       if (lip)
+               return lip->li_lsn;
+       return 0;
+}
+
 xfs_lsn_t
 xfs_ail_min_lsn(
        struct xfs_ail          *ailp)
 {
-       xfs_lsn_t               lsn = 0;
-       struct xfs_log_item     *lip;
+       xfs_lsn_t               lsn;
 
        spin_lock(&ailp->ail_lock);
-       lip = xfs_ail_min(ailp);
-       if (lip)
-               lsn = lip->li_lsn;
+       lsn = __xfs_ail_min_lsn(ailp);
        spin_unlock(&ailp->ail_lock);
 
        return lsn;
@@ -681,6 +689,28 @@ xfs_ail_push_all_sync(
        finish_wait(&ailp->ail_empty, &wait);
 }
 
+void
+xfs_ail_update_finish(
+       struct xfs_ail          *ailp,
+       xfs_lsn_t               old_lsn) __releases(ailp->ail_lock)
+{
+       struct xfs_mount        *mp = ailp->ail_mount;
+
+       /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+       if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+               spin_unlock(&ailp->ail_lock);
+               return;
+       }
+
+       if (!XFS_FORCED_SHUTDOWN(mp))
+               xlog_assign_tail_lsn_locked(mp);
+
+       if (list_empty(&ailp->ail_head))
+               wake_up_all(&ailp->ail_empty);
+       spin_unlock(&ailp->ail_lock);
+       xfs_log_space_wake(mp);
+}
+
 /*
  * xfs_trans_ail_update - bulk AIL insertion operation.
  *
@@ -712,7 +742,7 @@ xfs_trans_ail_update_bulk(
        xfs_lsn_t               lsn) __releases(ailp->ail_lock)
 {
        struct xfs_log_item     *mlip;
-       int                     mlip_changed = 0;
+       xfs_lsn_t               tail_lsn = 0;
        int                     i;
        LIST_HEAD(tmp);
 
@@ -727,9 +757,10 @@ xfs_trans_ail_update_bulk(
                                continue;
 
                        trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+                       if (mlip == lip && !tail_lsn)
+                               tail_lsn = lip->li_lsn;
+
                        xfs_ail_delete(ailp, lip);
-                       if (mlip == lip)
-                               mlip_changed = 1;
                } else {
                        trace_xfs_ail_insert(lip, 0, lsn);
                }
@@ -740,23 +771,23 @@ xfs_trans_ail_update_bulk(
        if (!list_empty(&tmp))
                xfs_ail_splice(ailp, cur, &tmp, lsn);
 
-       if (mlip_changed) {
-               if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
-                       xlog_assign_tail_lsn_locked(ailp->ail_mount);
-               spin_unlock(&ailp->ail_lock);
-
-               xfs_log_space_wake(ailp->ail_mount);
-       } else {
-               spin_unlock(&ailp->ail_lock);
-       }
+       xfs_ail_update_finish(ailp, tail_lsn);
 }
 
-bool
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
 xfs_ail_delete_one(
        struct xfs_ail          *ailp,
        struct xfs_log_item     *lip)
 {
        struct xfs_log_item     *mlip = xfs_ail_min(ailp);
+       xfs_lsn_t               lsn = lip->li_lsn;
 
        trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
        xfs_ail_delete(ailp, lip);
@@ -764,7 +795,9 @@ xfs_ail_delete_one(
        clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
        lip->li_lsn = 0;
 
-       return mlip == lip;
+       if (mlip == lip)
+               return lsn;
+       return 0;
 }
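
A minimal userspace sketch of the pattern implemented here: deletion reports the old tail LSN only when the tail item itself was removed, and the finish step skips the tail update and wakeups when the minimum did not actually move. The array-based AIL and printf stand-ins are purely illustrative.

#include <stdio.h>
#include <stdint.h>

#define NITEMS 5

static uint64_t ail[NITEMS] = { 10, 20, 30, 40, 50 };  /* sorted by lsn */
static int nr = NITEMS;

static uint64_t ail_min(void)
{
        return nr ? ail[0] : 0;
}

/* Remove the item at 'idx'; return its lsn only if it was the tail item. */
static uint64_t ail_delete_one(int idx)
{
        uint64_t lsn = ail[idx];
        int was_tail = (idx == 0);

        for (int i = idx; i < nr - 1; i++)
                ail[i] = ail[i + 1];
        nr--;
        return was_tail ? lsn : 0;
}

static void ail_update_finish(uint64_t old_lsn)
{
        /* if the tail lsn hasn't changed, skip the tail update and wakeups */
        if (!old_lsn || old_lsn == ail_min()) {
                printf("tail unchanged, nothing to do\n");
                return;
        }
        printf("tail moved from %llu to %llu: update log tail, wake waiters\n",
               (unsigned long long)old_lsn, (unsigned long long)ail_min());
}

int main(void)
{
        ail_update_finish(ail_delete_one(2));   /* middle item: no-op */
        ail_update_finish(ail_delete_one(0));   /* tail item: tail moves */
        return 0;
}
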
 
 /**
@@ -792,10 +825,10 @@ void
 xfs_trans_ail_delete(
        struct xfs_ail          *ailp,
        struct xfs_log_item     *lip,
-       int                     shutdown_type) __releases(ailp->ail_lock)
+       int                     shutdown_type)
 {
        struct xfs_mount        *mp = ailp->ail_mount;
-       bool                    mlip_changed;
+       xfs_lsn_t               tail_lsn;
 
        if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
                spin_unlock(&ailp->ail_lock);
@@ -808,17 +841,8 @@ xfs_trans_ail_delete(
                return;
        }
 
-       mlip_changed = xfs_ail_delete_one(ailp, lip);
-       if (mlip_changed) {
-               if (!XFS_FORCED_SHUTDOWN(mp))
-                       xlog_assign_tail_lsn_locked(mp);
-               if (list_empty(&ailp->ail_head))
-                       wake_up_all(&ailp->ail_empty);
-       }
-
-       spin_unlock(&ailp->ail_lock);
-       if (mlip_changed)
-               xfs_log_space_wake(ailp->ail_mount);
+       tail_lsn = xfs_ail_delete_one(ailp, lip);
+       xfs_ail_update_finish(ailp, tail_lsn);
 }
 
 int
index 2e073c1..35655ea 100644 (file)
@@ -91,9 +91,11 @@ xfs_trans_ail_update(
        xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
 }
 
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+                       __releases(ailp->ail_lock);
 void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
-               int shutdown_type) __releases(ailp->ail_lock);
+               int shutdown_type);
 
 static inline void
 xfs_trans_ail_remove(
index e2e2bef..329b8c8 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/mm_types.h>
 #include <linux/bug.h>
 #include <linux/errno.h>
+#include <asm-generic/pgtable_uffd.h>
 
 #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
new file mode 100644 (file)
index 0000000..828966d
--- /dev/null
@@ -0,0 +1,66 @@
+#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
+#define _ASM_GENERIC_PGTABLE_UFFD_H
+
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static __always_inline int pte_uffd_wp(pte_t pte)
+{
+       return 0;
+}
+
+static __always_inline int pmd_uffd_wp(pmd_t pmd)
+{
+       return 0;
+}
+
+static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+       return pte;
+}
+
+static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+       return pmd;
+}
+
+static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+       return pte;
+}
+
+static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+       return pmd;
+}
+
+static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+       return pte;
+}
+
+static __always_inline int pte_swp_uffd_wp(pte_t pte)
+{
+       return 0;
+}
+
+static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+       return pte;
+}
+
+static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
+{
+       return pmd;
+}
+
+static inline int pmd_swp_uffd_wp(pmd_t pmd)
+{
+       return 0;
+}
+
+static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
+{
+       return pmd;
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
+#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
index f391f6b..3f1649a 100644 (file)
@@ -13,6 +13,7 @@
 
 #include <linux/mmu_notifier.h>
 #include <linux/swap.h>
+#include <linux/hugetlb_inline.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
@@ -398,7 +399,7 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
         * We rely on tlb_end_vma() to issue a flush, such that when we reset
         * these values the batch is empty.
         */
-       tlb->vma_huge = !!(vma->vm_flags & VM_HUGETLB);
+       tlb->vma_huge = is_vm_hugetlb_page(vma);
        tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
 }
 
diff --git a/include/dt-bindings/clock/k210-clk.h b/include/dt-bindings/clock/k210-clk.h
new file mode 100644 (file)
index 0000000..5a2fd64
--- /dev/null
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019-20 Sean Anderson <seanga2@gmail.com>
+ * Copyright (c) 2020 Western Digital Corporation or its affiliates.
+ */
+#ifndef K210_CLK_H
+#define K210_CLK_H
+
+/*
+ * Arbitrary identifiers for clocks.
+ * The structure is: in0 -> pll0 -> aclk -> cpu
+ *
+ * Since we use the hardware defaults for now, set all these to the same clock.
+ */
+#define K210_CLK_PLL0   0
+#define K210_CLK_PLL1   0
+#define K210_CLK_ACLK   0
+#define K210_CLK_CPU    0
+
+#endif /* K210_CLK_H */
index 9e1256a..0ce7dfc 100644 (file)
@@ -6,6 +6,7 @@
  * Author: Jacek Anaszewski <j.anaszewski@samsung.com>
  *
  * Copyright (C) 2019 Jacek Anaszewski <jacek.anaszewski@gmail.com>
+ * Copyright (C) 2020 Pavel Machek <pavel@ucw.cz>
  */
 
 #ifndef __DT_BINDINGS_LEDS_H
 #define LED_COLOR_ID_MAX       8
 
 /* Standard LED functions */
+/* Keyboard LEDs, usually it would be input4::capslock etc. */
+/*   Obsolete equivalent: "shift-key-light" */
+#define LED_FUNCTION_CAPSLOCK "capslock"
+#define LED_FUNCTION_SCROLLLOCK "scrolllock"
+#define LED_FUNCTION_NUMLOCK "numlock"
+/*   Obsolete equivalents: "tpacpi::thinklight" (IBM/Lenovo Thinkpads),
+     "lp5523:kb{1,2,3,4,5,6}" (Nokia N900) */
+#define LED_FUNCTION_KBD_BACKLIGHT "kbd_backlight"
+
+/* System LEDs, usually found on system body.
+   platform::mute (etc) is sometimes seen, :mute would be better */
+#define LED_FUNCTION_POWER "power"
+#define LED_FUNCTION_DISK "disk"
+
+/*   Obsolete: "platform:*:charging" (allwinner sun50i) */
+#define LED_FUNCTION_CHARGING "charging"
+/*   Used for RGB notification LEDs common on phones.
+     Obsolete equivalents: "status-led:{red,green,blue}" (Motorola Droid 4),
+     "lp5523:{r,g,b}" (Nokia N900) */
+#define LED_FUNCTION_STATUS "status"
+
+#define LED_FUNCTION_MICMUTE "micmute"
+#define LED_FUNCTION_MUTE "mute"
+
+/* Miscellaneous functions. Use functions above if you can. */
 #define LED_FUNCTION_ACTIVITY "activity"
 #define LED_FUNCTION_ALARM "alarm"
 #define LED_FUNCTION_BACKLIGHT "backlight"
 #define LED_FUNCTION_BLUETOOTH "bluetooth"
 #define LED_FUNCTION_BOOT "boot"
 #define LED_FUNCTION_CPU "cpu"
-#define LED_FUNCTION_CAPSLOCK "capslock"
-#define LED_FUNCTION_CHARGING "charging"
 #define LED_FUNCTION_DEBUG "debug"
-#define LED_FUNCTION_DISK "disk"
 #define LED_FUNCTION_DISK_ACTIVITY "disk-activity"
 #define LED_FUNCTION_DISK_ERR "disk-err"
 #define LED_FUNCTION_DISK_READ "disk-read"
 #define LED_FUNCTION_FLASH "flash"
 #define LED_FUNCTION_HEARTBEAT "heartbeat"
 #define LED_FUNCTION_INDICATOR "indicator"
-#define LED_FUNCTION_KBD_BACKLIGHT "kbd_backlight"
 #define LED_FUNCTION_LAN "lan"
 #define LED_FUNCTION_MAIL "mail"
 #define LED_FUNCTION_MTD "mtd"
-#define LED_FUNCTION_MICMUTE "micmute"
-#define LED_FUNCTION_MUTE "mute"
-#define LED_FUNCTION_NUMLOCK "numlock"
 #define LED_FUNCTION_PANIC "panic"
 #define LED_FUNCTION_PROGRAMMING "programming"
-#define LED_FUNCTION_POWER "power"
 #define LED_FUNCTION_RX "rx"
 #define LED_FUNCTION_SD "sd"
-#define LED_FUNCTION_SCROLLLOCK "scrolllock"
 #define LED_FUNCTION_STANDBY "standby"
-#define LED_FUNCTION_STATUS "status"
 #define LED_FUNCTION_TORCH "torch"
 #define LED_FUNCTION_TX "tx"
 #define LED_FUNCTION_USB "usb"
index 9f70b78..d661cd0 100644 (file)
@@ -416,9 +416,30 @@ extern void acpi_osi_setup(char *str);
 extern bool acpi_osi_is_win8(void);
 
 #ifdef CONFIG_ACPI_NUMA
-int acpi_map_pxm_to_online_node(int pxm);
 int acpi_map_pxm_to_node(int pxm);
 int acpi_get_node(acpi_handle handle);
+
+/**
+ * acpi_map_pxm_to_online_node - Map proximity ID to online node
+ * @pxm: ACPI proximity ID
+ *
+ * This is similar to acpi_map_pxm_to_node(), but always returns an online
+ * node.  When the mapped node from a given proximity ID is offline, it
+ * looks up the node distance table and returns the nearest online node.
+ *
+ * ACPI device drivers, which are called after the NUMA initialization has
+ * completed in the kernel, can call this interface to obtain their device
+ * NUMA topology from ACPI tables.  Such drivers do not have to deal with
+ * offline nodes.  A node may be offline when a device proximity ID is
+ * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
+ * "numa=off" on x86.
+ */
+static inline int acpi_map_pxm_to_online_node(int pxm)
+{
+       int node = acpi_map_pxm_to_node(pxm);
+
+       return numa_map_to_online_node(node);
+}
 #else
 static inline int acpi_map_pxm_to_online_node(int pxm)
 {
index 47f54b4..9acf654 100644 (file)
@@ -162,7 +162,7 @@ static inline __u8 ror8(__u8 word, unsigned int shift)
  *
  * This is safe to use for 16- and 8-bit types as well.
  */
-static inline __s32 sign_extend32(__u32 value, int index)
+static __always_inline __s32 sign_extend32(__u32 value, int index)
 {
        __u8 shift = 31 - index;
        return (__s32)(value << shift) >> shift;
@@ -173,7 +173,7 @@ static inline __s32 sign_extend32(__u32 value, int index)
  * @value: value to sign extend
  * @index: 0 based bit index (0<=index<64) to sign bit
  */
-static inline __s64 sign_extend64(__u64 value, int index)
+static __always_inline __s64 sign_extend64(__u64 value, int index)
 {
        __u8 shift = 63 - index;
        return (__s64)(value << shift) >> shift;
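
Only the __always_inline annotation changes above; the arithmetic is untouched. For reference, a small userspace sketch of what sign_extend32() computes for a 12-bit two's-complement field (the cast-and-arithmetic-shift behaviour assumes a typical two's-complement compiler):

#include <stdio.h>
#include <stdint.h>

/* Extend a field whose sign bit sits at bit 'index' to a full 32-bit value. */
static inline int32_t sign_extend32(uint32_t value, int index)
{
        uint8_t shift = 31 - index;

        return (int32_t)(value << shift) >> shift;
}

int main(void)
{
        /* A 12-bit two's-complement field: sign bit is bit 11 */
        printf("%d\n", sign_extend32(0xFFF, 11));   /* -1    */
        printf("%d\n", sign_extend32(0x800, 11));   /* -2048 */
        printf("%d\n", sign_extend32(0x7FF, 11));   /*  2047 */
        return 0;
}
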
index a740bbc..4671fbf 100644 (file)
  * position @h. For example
  * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
  */
-#define GENMASK(h, l) \
+#if !defined(__ASSEMBLY__) && \
+       (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000)
+#include <linux/build_bug.h>
+#define GENMASK_INPUT_CHECK(h, l) \
+       (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \
+               __builtin_constant_p((l) > (h)), (l) > (h), 0)))
+#else
+/*
+ * BUILD_BUG_ON_ZERO is not available in h files included from asm files, so
+ * disable the input check if that is the case.
+ */
+#define GENMASK_INPUT_CHECK(h, l) 0
+#endif
+
+#define __GENMASK(h, l) \
        (((~UL(0)) - (UL(1) << (l)) + 1) & \
         (~UL(0) >> (BITS_PER_LONG - 1 - (h))))
+#define GENMASK(h, l) \
+       (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
 
-#define GENMASK_ULL(h, l) \
+#define __GENMASK_ULL(h, l) \
        (((~ULL(0)) - (ULL(1) << (l)) + 1) & \
         (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
+#define GENMASK_ULL(h, l) \
+       (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
 
 #endif /* __LINUX_BITS_H */
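
A self-contained userspace sketch of the GENMASK() construction and the new compile-time argument check; CHECK_ZERO below stands in for BUILD_BUG_ON_ZERO and, like the kernel version, relies on GCC extensions. A 64-bit unsigned long is assumed.

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* Stand-in for BUILD_BUG_ON_ZERO: evaluates to 0, or breaks the build when
 * cond is a non-zero constant (negative-width anonymous bitfield). */
#define CHECK_ZERO(cond) ((int)(sizeof(struct { int:(-!!(cond)); })))

#define GENMASK_INPUT_CHECK(h, l) \
        CHECK_ZERO(__builtin_choose_expr(__builtin_constant_p((l) > (h)), \
                                         (l) > (h), 0))

#define __GENMASK(h, l) \
        (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
#define GENMASK(h, l) (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))

int main(void)
{
        printf("GENMASK(39, 21) = %#lx\n", GENMASK(39, 21));
        /* GENMASK(21, 39) would now fail to compile: (l) > (h) is a non-zero
         * constant, so the check macro produces a negative bitfield width. */
        return 0;
}
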
index e4a6949..35f8ffe 100644 (file)
@@ -46,6 +46,7 @@ struct blkcg_gq;
 struct blkcg {
        struct cgroup_subsys_state      css;
        spinlock_t                      lock;
+       refcount_t                      online_pin;
 
        struct radix_tree_root          blkg_tree;
        struct blkcg_gq __rcu           *blkg_hint;
@@ -56,7 +57,6 @@ struct blkcg {
        struct list_head                all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head                cgwb_list;
-       refcount_t                      cgwb_refcnt;
 #endif
 };
 
@@ -412,47 +412,38 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
 
 extern void blkcg_destroy_blkgs(struct blkcg *blkcg);
 
-#ifdef CONFIG_CGROUP_WRITEBACK
-
 /**
- * blkcg_cgwb_get - get a reference for blkcg->cgwb_list
+ * blkcg_pin_online - pin online state
  * @blkcg: blkcg of interest
  *
- * This is used to track the number of active wb's related to a blkcg.
+ * While pinned, a blkcg is kept online.  This is primarily used to
+ * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
+ * while an associated cgwb is still active.
  */
-static inline void blkcg_cgwb_get(struct blkcg *blkcg)
+static inline void blkcg_pin_online(struct blkcg *blkcg)
 {
-       refcount_inc(&blkcg->cgwb_refcnt);
+       refcount_inc(&blkcg->online_pin);
 }
 
 /**
- * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list
+ * blkcg_unpin_online - unpin online state
  * @blkcg: blkcg of interest
  *
- * This is used to track the number of active wb's related to a blkcg.
- * When this count goes to zero, all active wb has finished so the
+ * This is primarily used to impedance-match blkg and cgwb lifetimes so
+ * that blkg doesn't go offline while an associated cgwb is still active.
+ * When this count goes to zero, all active cgwbs have finished so the
  * blkcg can continue destruction by calling blkcg_destroy_blkgs().
- * This work may occur in cgwb_release_workfn() on the cgwb_release
- * workqueue.
  */
-static inline void blkcg_cgwb_put(struct blkcg *blkcg)
+static inline void blkcg_unpin_online(struct blkcg *blkcg)
 {
-       if (refcount_dec_and_test(&blkcg->cgwb_refcnt))
+       do {
+               if (!refcount_dec_and_test(&blkcg->online_pin))
+                       break;
                blkcg_destroy_blkgs(blkcg);
+               blkcg = blkcg_parent(blkcg);
+       } while (blkcg);
 }
 
-#else
-
-static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
-
-static inline void blkcg_cgwb_put(struct blkcg *blkcg)
-{
-       /* wb isn't being accounted, so trigger destruction right away */
-       blkcg_destroy_blkgs(blkcg);
-}
-
-#endif
-
 /**
  * blkg_path - format cgroup path of blkg
  * @blkg: blkg of interest
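blkcg_unpin_online() above now also walks up the hierarchy: when a blkcg's last pin is dropped it is destroyed and the unpin propagates to its parent. A standalone model of just that control flow (struct demo_cg and the demo_* names are illustrative, not kernel API):

#include <stdio.h>

struct demo_cg {
        const char *name;
        int online_pin;
        struct demo_cg *parent;
};

/* Mirrors the loop in blkcg_unpin_online(): destroy on the last unpin,
 * then hand the unpin to the parent. */
static void demo_unpin_online(struct demo_cg *cg)
{
        do {
                if (--cg->online_pin != 0)
                        break;
                printf("destroying %s\n", cg->name);
                cg = cg->parent;
        } while (cg);
}

int main(void)
{
        struct demo_cg root = { "root", 2, NULL };
        struct demo_cg child = { "child", 1, &root };

        demo_unpin_online(&child);      /* destroys child, root keeps one pin */
        demo_unpin_online(&root);       /* destroys root */
        return 0;
}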
index cb21c5c..ebf5ba6 100644 (file)
@@ -444,8 +444,9 @@ union ceph_mds_request_args {
        } __attribute__ ((packed)) lookupino;
 } __attribute__ ((packed));
 
-#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+#define CEPH_MDS_FLAG_REPLAY           1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
 
 struct ceph_mds_request_head {
        __le64 oldest_client_tid;
@@ -530,6 +531,9 @@ struct ceph_mds_reply_lease {
        __le32 seq;
 } __attribute__ ((packed));
 
+#define CEPH_LEASE_VALID        (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4       /* primary linkage */
+
 struct ceph_mds_reply_dirfrag {
        __le32 frag;            /* fragment */
        __le32 auth;            /* auth mds, if this is a delegation point */
@@ -564,6 +568,7 @@ struct ceph_filelock {
 #define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
 #define CEPH_FILE_MODE_LAZY       4  /* lazy io */
 #define CEPH_FILE_MODE_BITS       4
+#define CEPH_FILE_MODE_MASK       ((1 << CEPH_FILE_MODE_BITS) - 1)
 
 int ceph_flags_to_mode(int flags);
 
@@ -655,10 +660,19 @@ int ceph_flags_to_mode(int flags);
 #define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
                           CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
                           CEPH_CAP_PIN)
+#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
+                          CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
+                          CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
 
 #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
                        CEPH_LOCK_IXATTR)
 
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE    CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK    CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS   (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+                                CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
 int ceph_caps_for_mode(int mode);
 
 enum {
index cf5e840..8b3a1a7 100644 (file)
@@ -2,22 +2,8 @@
 #ifndef _FS_CEPH_DEBUGFS_H
 #define _FS_CEPH_DEBUGFS_H
 
-#include <linux/ceph/ceph_debug.h>
 #include <linux/ceph/types.h>
 
-#define CEPH_DEFINE_SHOW_FUNC(name)                                    \
-static int name##_open(struct inode *inode, struct file *file)         \
-{                                                                      \
-       return single_open(file, name, inode->i_private);               \
-}                                                                      \
-                                                                       \
-static const struct file_operations name##_fops = {                    \
-       .open           = name##_open,                                  \
-       .read           = seq_read,                                     \
-       .llseek         = seq_lseek,                                    \
-       .release        = single_release,                               \
-};
-
 /* debugfs.c */
 extern void ceph_debugfs_init(void);
 extern void ceph_debugfs_cleanup(void);
index ec73ebc..525b7c3 100644 (file)
@@ -272,6 +272,7 @@ extern struct kmem_cache *ceph_cap_flush_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 extern struct kmem_cache *ceph_dir_file_cachep;
+extern struct kmem_cache *ceph_mds_request_cachep;
 
 /* ceph_common.c */
 extern bool libceph_compatible(void *data);
index 5a62dbd..9d9f745 100644 (file)
@@ -509,23 +509,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
                   struct page *req_page, size_t req_len,
                   struct page **resp_pages, size_t *resp_len);
 
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-                              struct ceph_vino vino,
-                              struct ceph_file_layout *layout,
-                              u64 off, u64 *plen,
-                              u32 truncate_seq, u64 truncate_size,
-                              struct page **pages, int nr_pages,
-                              int page_align);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
-                               struct ceph_vino vino,
-                               struct ceph_file_layout *layout,
-                               struct ceph_snap_context *sc,
-                               u64 off, u64 len,
-                               u32 truncate_seq, u64 truncate_size,
-                               struct timespec64 *mtime,
-                               struct page **pages, int nr_pages);
-
 int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
                        u64 src_snapid, u64 src_version,
                        struct ceph_object_id *src_oid,
index 5e88e7e..034b0a6 100644 (file)
@@ -347,7 +347,7 @@ static inline void *offset_to_ptr(const int *off)
  * compiler has support to do so.
  */
 #define compiletime_assert(condition, msg) \
-       _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
+       _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
 
 #define compiletime_assert_atomic_type(t)                              \
        compiletime_assert(__native_word(t),                            \
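Switching the generated identifier suffix in compiletime_assert() from __LINE__ to __COUNTER__ matters because __COUNTER__ yields a fresh value for every macro expansion, so two assertions that end up on the same source line can no longer produce colliding __compiletime_assert_<n> symbols. A standalone illustration of the difference (GCC and Clang provide __COUNTER__):

#include <stdio.h>

int main(void)
{
        /* __COUNTER__ increments on each expansion, even within one line. */
        printf("%d %d %d\n", __COUNTER__, __COUNTER__, __COUNTER__);   /* 0 1 2 */
        /* __LINE__ is identical for every expansion on a given line. */
        printf("%d %d\n", __LINE__, __LINE__);
        return 0;
}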
index 72393a8..e970f97 100644 (file)
@@ -129,22 +129,13 @@ struct ftrace_likely_data {
 #define __compiler_offsetof(a, b)      __builtin_offsetof(a, b)
 
 /*
- * Force always-inline if the user requests it so via the .config.
  * Prefer gnu_inline, so that extern inline functions do not emit an
  * externally visible function. This makes extern inline behave as per gnu89
  * semantics rather than c99. This prevents multiple symbol definition errors
  * of extern inline functions at link time.
  * A lot of inline functions can cause havoc with function tracing.
- * Do not use __always_inline here, since currently it expands to inline again
- * (which would break users of __always_inline).
  */
-#if !defined(CONFIG_OPTIMIZE_INLINING)
-#define inline inline __attribute__((__always_inline__)) __gnu_inline \
-       __inline_maybe_unused notrace
-#else
-#define inline inline                                    __gnu_inline \
-       __inline_maybe_unused notrace
-#endif
+#define inline inline __gnu_inline __inline_maybe_unused notrace
 
 /*
  * gcc provides both __inline__ and __inline as alternate spellings of
index 328c2db..d7af5d2 100644 (file)
@@ -13,6 +13,7 @@
 typedef unsigned long dax_entry_t;
 
 struct iomap_ops;
+struct iomap;
 struct dax_device;
 struct dax_operations {
        /*
@@ -34,6 +35,8 @@ struct dax_operations {
        /* copy_to_iter: required operation for fs-dax direct-i/o */
        size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
                        struct iov_iter *);
+       /* zero_page_range: required operation. Zero page range   */
+       int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 };
 
 extern struct attribute_group dax_attribute_group;
@@ -199,6 +202,8 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+                       size_t nr_pages);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
@@ -210,20 +215,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index);
-
-#ifdef CONFIG_FS_DAX
-int __dax_zero_page_range(struct block_device *bdev,
-               struct dax_device *dax_dev, sector_t sector,
-               unsigned int offset, unsigned int length);
-#else
-static inline int __dax_zero_page_range(struct block_device *bdev,
-               struct dax_device *dax_dev, sector_t sector,
-               unsigned int offset, unsigned int length)
-{
-       return -ENXIO;
-}
-#endif
-
+int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size,
+                       struct iomap *iomap);
 static inline bool dax_mapping(struct address_space *mapping)
 {
        return mapping->host && IS_DAX(mapping->host);
index 4635f95..79a6e37 100644 (file)
@@ -75,7 +75,7 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *dfc);
 
 #else /* !CONFIG_DEVFREQ_THERMAL */
 
-struct thermal_cooling_device *
+static inline struct thermal_cooling_device *
 of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df,
                                  struct devfreq_cooling_power *dfc_power)
 {
index 475668c..af48d9d 100644 (file)
@@ -141,6 +141,8 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn);
 typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
                void *addr, size_t bytes, struct iov_iter *i);
+typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
+               size_t nr_pages);
 #define PAGE_SECTORS (PAGE_SIZE / 512)
 
 void dm_error(const char *message);
@@ -195,6 +197,7 @@ struct target_type {
        dm_dax_direct_access_fn direct_access;
        dm_dax_copy_iter_fn dax_copy_from_iter;
        dm_dax_copy_iter_fn dax_copy_to_iter;
+       dm_dax_zero_page_range_fn dax_zero_page_range;
 
        /* For internal device-mapper use. */
        struct list_head list;
index 1311f27..ac8e37c 100644 (file)
@@ -42,9 +42,8 @@ struct device_node;
 struct fwnode_handle;
 struct iommu_ops;
 struct iommu_group;
-struct iommu_fwspec;
 struct dev_pin_info;
-struct iommu_param;
+struct dev_iommu;
 
 /**
  * struct subsys_interface - interfaces to device functions
@@ -513,8 +512,7 @@ struct dev_links_info {
  *             gone away. This should be set by the allocator of the
  *             device (i.e. the bus driver that discovered the device).
  * @iommu_group: IOMMU group the device belongs to.
- * @iommu_fwspec: IOMMU-specific properties supplied by firmware.
- * @iommu_param: Per device generic IOMMU runtime data
+ * @iommu:     Per device generic IOMMU runtime data
  *
  * @offline_disabled: If set, the device is permanently online.
  * @offline:   Set after successful invocation of bus type's .offline().
@@ -613,8 +611,7 @@ struct device {
 
        void    (*release)(struct device *dev);
        struct iommu_group      *iommu_group;
-       struct iommu_fwspec     *iommu_fwspec;
-       struct iommu_param      *iommu_param;
+       struct dev_iommu        *iommu;
 
        bool                    offline_disabled:1;
        bool                    offline:1;
index ac3f488..3c383dd 100644 (file)
@@ -125,6 +125,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_RESIZEFS_FLAG               0x00004000
 #define CP_DISABLED_QUICK_FLAG         0x00002000
 #define CP_DISABLED_FLAG               0x00001000
 #define CP_QUOTA_NEED_FSCK_FLAG                0x00000800
index be27548..4aba4c8 100644 (file)
@@ -124,6 +124,8 @@ struct vm_area_struct;
  *
  * Reclaim modifiers
  * ~~~~~~~~~~~~~~~~~
+ * Please note that all the following flags are only applicable to sleepable
+ * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
  *
  * %__GFP_IO can start physical IO.
  *
index f2df224..cfbb0a8 100644 (file)
@@ -46,7 +46,7 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                         pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
-                       int prot_numa);
+                       unsigned long cp_flags);
 vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
                                   pgprot_t pgprot, bool write);
 
index bb331e6..7bc961d 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/irqreturn.h>
 #include <linux/platform_data/cros_ec_commands.h>
 #include <linux/platform_data/cros_ec_proto.h>
+#include <linux/platform_data/cros_ec_sensorhub.h>
 
 enum {
        CROS_EC_SENSOR_X,
@@ -29,8 +30,7 @@ enum {
  */
 #define CROS_EC_SAMPLE_SIZE  (sizeof(s64) * 2)
 
-/* Minimum sampling period to use when device is suspending */
-#define CROS_EC_MIN_SUSPEND_SAMPLING_FREQUENCY 1000  /* 1 second */
+typedef irqreturn_t (*cros_ec_sensors_capture_t)(int irq, void *p);
 
 /**
  * struct cros_ec_sensors_core_state - state data for EC sensors IIO driver
@@ -50,7 +50,9 @@ enum {
  *                             the timestamp. The timestamp is always last and
  *                             is always 8-byte aligned.
  * @read_ec_sensors_data:      function used for accessing sensors values
- * @cuur_sampl_freq:           current sampling period
+ * @fifo_max_event_count:      Size of the EC sensor FIFO
+ * @frequencies:               Table of known available frequencies:
+ *                             0, Min and Max in mHz
  */
 struct cros_ec_sensors_core_state {
        struct cros_ec_device *ec;
@@ -73,101 +75,34 @@ struct cros_ec_sensors_core_state {
        int (*read_ec_sensors_data)(struct iio_dev *indio_dev,
                                    unsigned long scan_mask, s16 *data);
 
-       int curr_sampl_freq;
-
-       /* Table of known available frequencies : 0, Min and Max in mHz */
-       int frequencies[3];
+       u32 fifo_max_event_count;
+       int frequencies[6];
 };
 
-/**
- * cros_ec_sensors_read_lpc() - retrieve data from EC shared memory
- * @indio_dev: pointer to IIO device
- * @scan_mask: bitmap of the sensor indices to scan
- * @data:      location to store data
- *
- * This is the safe function for reading the EC data. It guarantees that the
- * data sampled was not modified by the EC while being read.
- *
- * Return: 0 on success, -errno on failure.
- */
 int cros_ec_sensors_read_lpc(struct iio_dev *indio_dev, unsigned long scan_mask,
                             s16 *data);
 
-/**
- * cros_ec_sensors_read_cmd() - retrieve data using the EC command protocol
- * @indio_dev: pointer to IIO device
- * @scan_mask: bitmap of the sensor indices to scan
- * @data:      location to store data
- *
- * Return: 0 on success, -errno on failure.
- */
 int cros_ec_sensors_read_cmd(struct iio_dev *indio_dev, unsigned long scan_mask,
                             s16 *data);
 
 struct platform_device;
-/**
- * cros_ec_sensors_core_init() - basic initialization of the core structure
- * @pdev:              platform device created for the sensors
- * @indio_dev:         iio device structure of the device
- * @physical_device:   true if the device refers to a physical device
- *
- * Return: 0 on success, -errno on failure.
- */
 int cros_ec_sensors_core_init(struct platform_device *pdev,
-                             struct iio_dev *indio_dev, bool physical_device);
+                             struct iio_dev *indio_dev, bool physical_device,
+                             cros_ec_sensors_capture_t trigger_capture,
+                             cros_ec_sensorhub_push_data_cb_t push_data);
 
-/**
- * cros_ec_sensors_capture() - the trigger handler function
- * @irq:       the interrupt number.
- * @p:         a pointer to the poll function.
- *
- * On a trigger event occurring, if the pollfunc is attached then this
- * handler is called as a threaded interrupt (and hence may sleep). It
- * is responsible for grabbing data from the device and pushing it into
- * the associated buffer.
- *
- * Return: IRQ_HANDLED
- */
 irqreturn_t cros_ec_sensors_capture(int irq, void *p);
+int cros_ec_sensors_push_data(struct iio_dev *indio_dev,
+                             s16 *data,
+                             s64 timestamp);
 
-/**
- * cros_ec_motion_send_host_cmd() - send motion sense host command
- * @st:                pointer to state information for device
- * @opt_length:        optional length to reduce the response size, useful on the data
- *             path. Otherwise, the maximal allowed response size is used
- *
- * When called, the sub-command is assumed to be set in param->cmd.
- *
- * Return: 0 on success, -errno on failure.
- */
 int cros_ec_motion_send_host_cmd(struct cros_ec_sensors_core_state *st,
                                 u16 opt_length);
 
-/**
- * cros_ec_sensors_core_read() - function to request a value from the sensor
- * @st:                pointer to state information for device
- * @chan:      channel specification structure table
- * @val:       will contain one element making up the returned value
- * @val2:      will contain another element making up the returned value
- * @mask:      specifies which values to be requested
- *
- * Return:     the type of value returned by the device
- */
 int cros_ec_sensors_core_read(struct cros_ec_sensors_core_state *st,
                              struct iio_chan_spec const *chan,
                              int *val, int *val2, long mask);
 
-/**
- * cros_ec_sensors_core_read_avail() - get available values
- * @indio_dev:         pointer to state information for device
- * @chan:      channel specification structure table
- * @vals:      list of available values
- * @type:      type of data returned
- * @length:    number of data returned in the array
- * @mask:      specifies which values to be requested
- *
- * Return:     an error code, IIO_AVAIL_RANGE or IIO_AVAIL_LIST
- */
 int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
                                    struct iio_chan_spec const *chan,
                                    const int **vals,
@@ -175,23 +110,12 @@ int cros_ec_sensors_core_read_avail(struct iio_dev *indio_dev,
                                    int *length,
                                    long mask);
 
-/**
- * cros_ec_sensors_core_write() - function to write a value to the sensor
- * @st:                pointer to state information for device
- * @chan:      channel specification structure table
- * @val:       first part of value to write
- * @val2:      second part of value to write
- * @mask:      specifies which values to write
- *
- * Return:     the type of value returned by the device
- */
 int cros_ec_sensors_core_write(struct cros_ec_sensors_core_state *st,
                               struct iio_chan_spec const *chan,
                               int val, int val2, long mask);
 
-extern const struct dev_pm_ops cros_ec_sensors_pm_ops;
-
 /* List of extended channel specification for all sensors */
 extern const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[];
+extern const struct attribute *cros_ec_sensor_fifo_attributes[];
 
 #endif  /* __CROS_EC_SENSORS_CORE_H */
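With this change the capture handler and the FIFO push callback are passed to cros_ec_sensors_core_init() and wired up by the core, and the separate cros_ec_sensors_pm_ops export goes away. A hypothetical probe-time sketch of the new signature (demo_sensor_probe and the surrounding channel/buffer setup are assumed, not part of this patch):

static int demo_sensor_probe(struct platform_device *pdev)
{
        struct iio_dev *indio_dev;
        int ret;

        indio_dev = devm_iio_device_alloc(&pdev->dev,
                                sizeof(struct cros_ec_sensors_core_state));
        if (!indio_dev)
                return -ENOMEM;

        ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
                                        cros_ec_sensors_capture,
                                        cros_ec_sensors_push_data);
        if (ret)
                return ret;

        /* channel setup, triggered-buffer setup, iio_device_register(), ... */
        return 0;
}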
index eed58ed..17f56a0 100644 (file)
@@ -629,6 +629,8 @@ static inline clockid_t iio_device_get_clock(const struct iio_dev *indio_dev)
        return indio_dev->clock_id;
 }
 
+int iio_device_set_clock(struct iio_dev *indio_dev, clockid_t clock_id);
+
 /**
  * dev_to_iio_dev() - Get IIO device struct from a device struct
  * @dev:               The device embedded in the IIO device
index b1c44bb..8394c56 100644 (file)
@@ -77,8 +77,6 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
                size_t size, unsigned long flags);
 void devm_memunmap(struct device *dev, void *addr);
 
-void *__devm_memremap_pages(struct device *dev, struct resource *res);
-
 #ifdef CONFIG_PCI
 /*
  * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and
index d1b5f4d..7ef8b0b 100644 (file)
@@ -365,17 +365,20 @@ struct iommu_fault_param {
 };
 
 /**
- * struct iommu_param - collection of per-device IOMMU data
+ * struct dev_iommu - Collection of per-device IOMMU data
  *
  * @fault_param: IOMMU detected device fault reporting data
+ * @fwspec:     IOMMU fwspec data
+ * @priv:       IOMMU Driver private data
  *
  * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
  *     struct iommu_group      *iommu_group;
- *     struct iommu_fwspec     *iommu_fwspec;
  */
-struct iommu_param {
+struct dev_iommu {
        struct mutex lock;
-       struct iommu_fault_param *fault_param;
+       struct iommu_fault_param        *fault_param;
+       struct iommu_fwspec             *fwspec;
+       void                            *priv;
 };
 
 int  iommu_device_register(struct iommu_device *iommu);
@@ -588,11 +591,10 @@ struct iommu_group *fsl_mc_device_group(struct device *dev);
 struct iommu_fwspec {
        const struct iommu_ops  *ops;
        struct fwnode_handle    *iommu_fwnode;
-       void                    *iommu_priv;
        u32                     flags;
        u32                     num_pasid_bits;
        unsigned int            num_ids;
-       u32                     ids[1];
+       u32                     ids[];
 };
 
 /* ATS is supported */
@@ -614,13 +616,26 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
 
 static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
 {
-       return dev->iommu_fwspec;
+       if (dev->iommu)
+               return dev->iommu->fwspec;
+       else
+               return NULL;
 }
 
 static inline void dev_iommu_fwspec_set(struct device *dev,
                                        struct iommu_fwspec *fwspec)
 {
-       dev->iommu_fwspec = fwspec;
+       dev->iommu->fwspec = fwspec;
+}
+
+static inline void *dev_iommu_priv_get(struct device *dev)
+{
+       return dev->iommu->priv;
+}
+
+static inline void dev_iommu_priv_set(struct device *dev, void *priv)
+{
+       dev->iommu->priv = priv;
 }
 
 int iommu_probe_device(struct device *dev);
@@ -1073,6 +1088,10 @@ static inline int iommu_sva_unbind_gpasid(struct iommu_domain *domain,
        return -ENODEV;
 }
 
+static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
+{
+       return NULL;
+}
 #endif /* CONFIG_IOMMU_API */
 
 #ifdef CONFIG_IOMMU_DEBUGFS
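For IOMMU drivers the practical effect is that per-device private data moves from the old iommu_fwspec->iommu_priv pointer into the new dev->iommu container, reached through dev_iommu_priv_get()/dev_iommu_priv_set(). A hypothetical driver-side sketch (the demo_* names and the data layout are illustrative, not from this patch):

struct demo_iommu_dev_data {
        u32 stream_id;
};

static int demo_iommu_add_device(struct device *dev)
{
        struct demo_iommu_dev_data *data;

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return -ENOMEM;

        dev_iommu_priv_set(dev, data);
        return 0;
}

static void demo_iommu_remove_device(struct device *dev)
{
        kfree(dev_iommu_priv_get(dev));
        dev_iommu_priv_set(dev, NULL);
}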
index 75353e5..2451962 100644 (file)
@@ -25,6 +25,7 @@ struct device_node;
  * LED Core
  */
 
+/* This is obsolete/useless. We now support variable maximum brightness. */
 enum led_brightness {
        LED_OFF         = 0,
        LED_ON          = 1,
diff --git a/include/linux/leds_pwm.h b/include/linux/leds_pwm.h
deleted file mode 100644 (file)
index 93d101d..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PWM LED driver data - see drivers/leds/leds-pwm.c
- */
-#ifndef __LINUX_LEDS_PWM_H
-#define __LINUX_LEDS_PWM_H
-
-struct led_pwm {
-       const char      *name;
-       const char      *default_trigger;
-       unsigned        pwm_id __deprecated;
-       u8              active_low;
-       unsigned        max_brightness;
-       unsigned        pwm_period_ns;
-};
-
-struct led_pwm_platform_data {
-       int                     num_leds;
-       struct led_pwm  *leds;
-};
-
-#endif
index 9df091b..18da405 100644 (file)
@@ -37,6 +37,8 @@ enum {
        NDD_WORK_PENDING = 4,
        /* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
        NDD_NOBLK = 5,
+       /* dimm supports namespace labels */
+       NDD_LABELING = 6,
 
        /* need to set a limit somewhere, but yes, this is likely overkill */
        ND_IOCTL_MAX_BUFLEN = SZ_4M,
index 0b8d791..439a89e 100644 (file)
@@ -26,7 +26,6 @@
 struct memory_block {
        unsigned long start_section_nr;
        unsigned long state;            /* serialized by the dev->lock */
-       int section_count;              /* serialized by mem_sysfs_mutex */
        int online_type;                /* for passing data to online routine */
        int phys_device;                /* to which fru does this belong? */
        struct device dev;
index f4d5915..ef55115 100644 (file)
@@ -47,9 +47,13 @@ enum {
 
 /* Types for control the zone type of onlined and offlined memory */
 enum {
-       MMOP_OFFLINE = -1,
-       MMOP_ONLINE_KEEP,
+       /* Offline the memory. */
+       MMOP_OFFLINE = 0,
+       /* Online the memory. Zone depends, see default_zone_for_pfn(). */
+       MMOP_ONLINE,
+       /* Online the memory to ZONE_NORMAL. */
        MMOP_ONLINE_KERNEL,
+       /* Online the memory to ZONE_MOVABLE. */
        MMOP_ONLINE_MOVABLE,
 };
 
@@ -113,7 +117,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size,
                        struct mhp_restrictions *restrictions);
 extern u64 max_mem_size;
 
-extern bool memhp_auto_online;
+extern int memhp_online_type_from_str(const char *str);
+
+/* Default online_type (MMOP_*) when new memory blocks are added. */
+extern int memhp_default_online_type;
 /* If movable_node boot option specified */
 extern bool movable_node_enabled;
 static inline bool movable_node_is_enabled(void)
index 60d97e8..5f5b2df 100644 (file)
@@ -98,8 +98,6 @@ struct dev_pagemap_ops {
  * @ref: reference count that pins the devm_memremap_pages() mapping
  * @internal_ref: internal reference if @ref is not provided by the caller
  * @done: completion for @internal_ref
- * @dev: host device of the mapping for debug
- * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify detailed behavior
  * @ops: method table
@@ -136,6 +134,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
+unsigned long memremap_compat_align(void);
 #else
 static inline void *devm_memremap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
@@ -169,6 +168,12 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                unsigned long nr_pfns)
 {
 }
+
+/* when memremap_pages() is disabled all archs can remap a single page */
+static inline unsigned long memremap_compat_align(void)
+{
+       return PAGE_SIZE;
+}
 #endif /* CONFIG_ZONE_DEVICE */
 
 static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
@@ -176,4 +181,5 @@ static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
        if (pgmap)
                percpu_ref_put(pgmap->ref);
 }
+
 #endif /* _LINUX_MEMREMAP_H_ */
diff --git a/include/linux/mfd/iqs62x.h b/include/linux/mfd/iqs62x.h
new file mode 100644 (file)
index 0000000..043d3b6
--- /dev/null
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Azoteq IQS620A/621/622/624/625 Multi-Function Sensors
+ *
+ * Copyright (C) 2019 Jeff LaBundy <jeff@labundy.com>
+ */
+
+#ifndef __LINUX_MFD_IQS62X_H
+#define __LINUX_MFD_IQS62X_H
+
+#define IQS620_PROD_NUM                                0x41
+#define IQS621_PROD_NUM                                0x46
+#define IQS622_PROD_NUM                                0x42
+#define IQS624_PROD_NUM                                0x43
+#define IQS625_PROD_NUM                                0x4E
+
+#define IQS621_ALS_FLAGS                       0x16
+#define IQS622_ALS_FLAGS                       0x14
+
+#define IQS624_HALL_UI                         0x70
+#define IQS624_HALL_UI_WHL_EVENT               BIT(4)
+#define IQS624_HALL_UI_INT_EVENT               BIT(3)
+#define IQS624_HALL_UI_AUTO_CAL                        BIT(2)
+
+#define IQS624_INTERVAL_DIV                    0x7D
+
+#define IQS620_GLBL_EVENT_MASK                 0xD7
+#define IQS620_GLBL_EVENT_MASK_PMU             BIT(6)
+
+#define IQS62X_NUM_KEYS                                16
+#define IQS62X_NUM_EVENTS                      (IQS62X_NUM_KEYS + 5)
+
+#define IQS62X_EVENT_SIZE                      10
+
+enum iqs62x_ui_sel {
+       IQS62X_UI_PROX,
+       IQS62X_UI_SAR1,
+};
+
+enum iqs62x_event_reg {
+       IQS62X_EVENT_NONE,
+       IQS62X_EVENT_SYS,
+       IQS62X_EVENT_PROX,
+       IQS62X_EVENT_HYST,
+       IQS62X_EVENT_HALL,
+       IQS62X_EVENT_ALS,
+       IQS62X_EVENT_IR,
+       IQS62X_EVENT_WHEEL,
+       IQS62X_EVENT_INTER,
+       IQS62X_EVENT_UI_LO,
+       IQS62X_EVENT_UI_HI,
+};
+
+enum iqs62x_event_flag {
+       /* keys */
+       IQS62X_EVENT_PROX_CH0_T,
+       IQS62X_EVENT_PROX_CH0_P,
+       IQS62X_EVENT_PROX_CH1_T,
+       IQS62X_EVENT_PROX_CH1_P,
+       IQS62X_EVENT_PROX_CH2_T,
+       IQS62X_EVENT_PROX_CH2_P,
+       IQS62X_EVENT_HYST_POS_T,
+       IQS62X_EVENT_HYST_POS_P,
+       IQS62X_EVENT_HYST_NEG_T,
+       IQS62X_EVENT_HYST_NEG_P,
+       IQS62X_EVENT_SAR1_ACT,
+       IQS62X_EVENT_SAR1_QRD,
+       IQS62X_EVENT_SAR1_MOVE,
+       IQS62X_EVENT_SAR1_HALT,
+       IQS62X_EVENT_WHEEL_UP,
+       IQS62X_EVENT_WHEEL_DN,
+
+       /* switches */
+       IQS62X_EVENT_HALL_N_T,
+       IQS62X_EVENT_HALL_N_P,
+       IQS62X_EVENT_HALL_S_T,
+       IQS62X_EVENT_HALL_S_P,
+
+       /* everything else */
+       IQS62X_EVENT_SYS_RESET,
+};
+
+struct iqs62x_event_data {
+       u16 ui_data;
+       u8 als_flags;
+       u8 ir_flags;
+       u8 interval;
+};
+
+struct iqs62x_event_desc {
+       enum iqs62x_event_reg reg;
+       u8 mask;
+       u8 val;
+};
+
+struct iqs62x_dev_desc {
+       const char *dev_name;
+       const struct mfd_cell *sub_devs;
+       int num_sub_devs;
+
+       u8 prod_num;
+       u8 sw_num;
+       const u8 *cal_regs;
+       int num_cal_regs;
+
+       u8 prox_mask;
+       u8 sar_mask;
+       u8 hall_mask;
+       u8 hyst_mask;
+       u8 temp_mask;
+       u8 als_mask;
+       u8 ir_mask;
+
+       u8 prox_settings;
+       u8 als_flags;
+       u8 hall_flags;
+       u8 hyst_shift;
+
+       u8 interval;
+       u8 interval_div;
+
+       u8 clk_div;
+       const char *fw_name;
+       const enum iqs62x_event_reg (*event_regs)[IQS62X_EVENT_SIZE];
+};
+
+struct iqs62x_core {
+       const struct iqs62x_dev_desc *dev_desc;
+       struct i2c_client *client;
+       struct regmap *regmap;
+       struct blocking_notifier_head nh;
+       struct list_head fw_blk_head;
+       struct completion fw_done;
+       enum iqs62x_ui_sel ui_sel;
+};
+
+extern const struct iqs62x_event_desc iqs62x_events[IQS62X_NUM_EVENTS];
+
+#endif /* __LINUX_MFD_IQS62X_H */
index a59bf32..e07f6e6 100644 (file)
@@ -620,7 +620,5 @@ struct rk808 {
        long                            variant;
        const struct regmap_config      *regmap_cfg;
        const struct regmap_irq_chip    *regmap_irq_chip;
-       void                            (*pm_pwroff_fn)(void);
-       void                            (*pm_pwroff_prep_fn)(void);
 };
 #endif /* __LINUX_REGULATOR_RK808_H */
index d62ef48..fba0df1 100644 (file)
 #define RN5T618_INTPOL                 0x9c
 #define RN5T618_INTEN                  0x9d
 #define RN5T618_INTMON                 0x9e
+
+#define RN5T618_RTC_SECONDS     0xA0
+#define RN5T618_RTC_MDAY        0xA4
+#define RN5T618_RTC_MONTH       0xA5
+#define RN5T618_RTC_YEAR        0xA6
+#define RN5T618_RTC_ADJUST      0xA7
+#define RN5T618_RTC_ALARM_Y_SEC 0xA8
+#define RN5T618_RTC_DAL_MONTH   0xAC
+#define RN5T618_RTC_CTRL1       0xAE
+#define RN5T618_RTC_CTRL2       0xAF
+
 #define RN5T618_PREVINDAC              0xb0
 #define RN5T618_BATDAC                 0xb1
 #define RN5T618_CHGCTL1                        0xb3
@@ -242,9 +253,24 @@ enum {
        RC5T619,
 };
 
+/* RN5T618 IRQ definitions */
+enum {
+       RN5T618_IRQ_SYS = 0,
+       RN5T618_IRQ_DCDC,
+       RN5T618_IRQ_RTC,
+       RN5T618_IRQ_ADC,
+       RN5T618_IRQ_GPIO,
+       RN5T618_IRQ_CHG,
+       RN5T618_NR_IRQS,
+};
+
 struct rn5t618 {
        struct regmap *regmap;
+       struct device *dev;
        long variant;
+
+       int irq;
+       struct regmap_irq_chip_data *irq_data;
 };
 
 #endif /* __LINUX_MFD_RN5T618_H */
diff --git a/include/linux/mfd/sc27xx-pmic.h b/include/linux/mfd/sc27xx-pmic.h
new file mode 100644 (file)
index 0000000..57e45c0
--- /dev/null
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MFD_SC27XX_PMIC_H
+#define __LINUX_MFD_SC27XX_PMIC_H
+
+extern enum usb_charger_type sprd_pmic_detect_charger_type(struct device *dev);
+
+#endif /* __LINUX_MFD_SC27XX_PMIC_H */
index 986986f..75aa94d 100644 (file)
@@ -89,7 +89,6 @@ enum wm831x_watchdog_action {
 
 struct wm831x_watchdog_pdata {
        enum wm831x_watchdog_action primary, secondary;
-       int update_gpio;
        unsigned int software:1;
 };
 
index 7dd5c4c..e2f938c 100644 (file)
@@ -629,6 +629,12 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma)
 
        return false;
 }
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+}
+
 #ifdef CONFIG_SHMEM
 /*
  * The vma_is_shmem is not inline because it is used only by slow
@@ -1765,9 +1771,26 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks);
+
+/*
+ * Flags used by change_protection().  For now we make it a bitmap so
+ * that we can pass in multiple flags just like parameters.  However
+ * for now all the callers are only use one of the flags at the same
+ * time.
+ */
+/* Whether we should allow dirty bit accounting */
+#define  MM_CP_DIRTY_ACCT                  (1UL << 0)
+/* Whether this protection change is for NUMA hints */
+#define  MM_CP_PROT_NUMA                   (1UL << 1)
+/* Whether this change is for write protecting */
+#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
+#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
+#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
+                                           MM_CP_UFFD_WP_RESOLVE)
+
 extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, pgprot_t newprot,
-                             int dirty_accountable, int prot_numa);
+                             unsigned long cp_flags);
 extern int mprotect_fixup(struct vm_area_struct *vma,
                          struct vm_area_struct **pprev, unsigned long start,
                          unsigned long end, unsigned long newflags);
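The cp_flags bitmap replaces the old dirty_accountable/prot_numa integer arguments; today each caller passes exactly one of the MM_CP_* flags. Hypothetical call sites sketching the new interface (the surrounding vma/start/end/newprot variables are assumed, not shown in this patch):

        /* NUMA hinting path, previously prot_numa=1: */
        change_protection(vma, start, end, PAGE_NONE, MM_CP_PROT_NUMA);

        /* mprotect() path that may allow dirty accounting: */
        change_protection(vma, start, end, newprot, MM_CP_DIRTY_ACCT);

        /* userfaultfd write-protect resolution: */
        change_protection(vma, start, end, newprot, MM_CP_UFFD_WP_RESOLVE);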
index 6f2fef7..219bef4 100644 (file)
@@ -6,19 +6,20 @@
 #include <linux/swap.h>
 
 /**
- * page_is_file_cache - should the page be on a file LRU or anon LRU?
+ * page_is_file_lru - should the page be on a file LRU or anon LRU?
  * @page: the page to test
  *
- * Returns 1 if @page is page cache page backed by a regular filesystem,
- * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
- * Used by functions that manipulate the LRU lists, to sort a page
- * onto the right LRU list.
+ * Returns 1 if @page is a regular filesystem backed page cache page or a lazily
+ * freed anonymous page (e.g. via MADV_FREE).  Returns 0 if @page is a normal
+ * anonymous page, a tmpfs page or otherwise ram or swap backed page.  Used by
+ * functions that manipulate the LRU lists, to sort a page onto the right LRU
+ * list.
  *
  * We would like to get this info without a page flag, but the state
  * needs to survive until the page is last deleted from the LRU, which
  * could be as far down as __page_cache_release.
  */
-static inline int page_is_file_cache(struct page *page)
+static inline int page_is_file_lru(struct page *page)
 {
        return !PageSwapBacked(page);
 }
@@ -75,7 +76,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
  */
 static inline enum lru_list page_lru_base_type(struct page *page)
 {
-       if (page_is_file_cache(page))
+       if (page_is_file_lru(page))
                return LRU_INACTIVE_FILE;
        return LRU_INACTIVE_ANON;
 }
index dd555e6..4aba6c0 100644 (file)
@@ -289,8 +289,8 @@ struct vm_userfaultfd_ctx {};
 #endif /* CONFIG_USERFAULTFD */
 
 /*
- * This struct defines a memory VMM memory area. There is one of these
- * per VM-area/task.  A VM area is any part of the process virtual memory
+ * This struct describes a virtual memory area. There is one of these
+ * per VM-area/task. A VM area is any part of the process virtual memory
  * space that has a special rule for the page-fault handlers (ie a shared
  * library, the executable area etc).
  */
index e84d448..1b9de7d 100644 (file)
@@ -100,41 +100,6 @@ struct free_area {
        unsigned long           nr_free;
 };
 
-/* Used for pages not on another list */
-static inline void add_to_free_area(struct page *page, struct free_area *area,
-                            int migratetype)
-{
-       list_add(&page->lru, &area->free_list[migratetype]);
-       area->nr_free++;
-}
-
-/* Used for pages not on another list */
-static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
-                                 int migratetype)
-{
-       list_add_tail(&page->lru, &area->free_list[migratetype]);
-       area->nr_free++;
-}
-
-#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
-/* Used to preserve page allocation order entropy */
-void add_to_free_area_random(struct page *page, struct free_area *area,
-               int migratetype);
-#else
-static inline void add_to_free_area_random(struct page *page,
-               struct free_area *area, int migratetype)
-{
-       add_to_free_area(page, area, migratetype);
-}
-#endif
-
-/* Used for pages which are on another list */
-static inline void move_to_free_area(struct page *page, struct free_area *area,
-                            int migratetype)
-{
-       list_move(&page->lru, &area->free_list[migratetype]);
-}
-
 static inline struct page *get_page_from_free_area(struct free_area *area,
                                            int migratetype)
 {
@@ -142,15 +107,6 @@ static inline struct page *get_page_from_free_area(struct free_area *area,
                                        struct page, lru);
 }
 
-static inline void del_page_from_free_area(struct page *page,
-               struct free_area *area)
-{
-       list_del(&page->lru);
-       __ClearPageBuddy(page);
-       set_page_private(page, 0);
-       area->nr_free--;
-}
-
 static inline bool free_area_empty(struct free_area *area, int migratetype)
 {
        return list_empty(&area->free_list[migratetype]);
@@ -708,7 +664,6 @@ struct deferred_split {
  * Memory statistics and page replacement data structures are maintained on a
  * per-zone basis.
  */
-struct bootmem_data;
 typedef struct pglist_data {
        struct zone node_zones[MAX_NR_ZONES];
        struct zonelist node_zonelists[MAX_ZONELISTS];
@@ -1172,6 +1127,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)
 
 #define SUBSECTION_SHIFT 21
+#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)
 
 #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
 #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
@@ -1187,7 +1143,9 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
 
 struct mem_section_usage {
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
+#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
 };
index 5d5b91e..73eda45 100644 (file)
@@ -354,6 +354,7 @@ static inline unsigned long nfs_save_change_attribute(struct inode *dir)
 extern int nfs_sync_mapping(struct address_space *mapping);
 extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
 extern void nfs_zap_caches(struct inode *);
+extern void nfs_set_inode_stale(struct inode *inode);
 extern void nfs_invalidate_atime(struct inode *);
 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
                                struct nfs_fattr *, struct nfs4_label *);
index 0bbd587..c32c152 100644 (file)
@@ -139,9 +139,14 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern void nfs_unlock_request(struct nfs_page *req);
 extern void nfs_unlock_and_release_request(struct nfs_page *);
+extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req);
+extern int nfs_page_group_lock_subrequests(struct nfs_page *head);
+extern void nfs_join_page_group(struct nfs_page *head, struct inode *inode);
 extern int nfs_page_group_lock(struct nfs_page *);
 extern void nfs_page_group_unlock(struct nfs_page *);
 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
+extern int nfs_page_set_headlock(struct nfs_page *req);
+extern void nfs_page_clear_headlock(struct nfs_page *req);
 extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
 
 /*
index 6838c14..4402304 100644 (file)
@@ -1266,16 +1266,25 @@ struct nfstime4 {
 struct pnfs_commit_bucket {
        struct list_head written;
        struct list_head committing;
-       struct pnfs_layout_segment *wlseg;
-       struct pnfs_layout_segment *clseg;
+       struct pnfs_layout_segment *lseg;
        struct nfs_writeverf direct_verf;
 };
 
+struct pnfs_commit_array {
+       struct list_head cinfo_list;
+       struct list_head lseg_list;
+       struct pnfs_layout_segment *lseg;
+       struct rcu_head rcu;
+       refcount_t refcount;
+       unsigned int nbuckets;
+       struct pnfs_commit_bucket buckets[];
+};
+
 struct pnfs_ds_commit_info {
-       int nwritten;
-       int ncommitting;
-       int nbuckets;
-       struct pnfs_commit_bucket *buckets;
+       struct list_head commits;
+       unsigned int nwritten;
+       unsigned int ncommitting;
+       const struct pnfs_commit_ops *ops;
 };
 
 struct nfs41_state_protection {
@@ -1386,22 +1395,11 @@ struct nfs41_free_stateid_res {
        unsigned int                    status;
 };
 
-static inline void
-nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
-{
-       kfree(cinfo->buckets);
-}
-
 #else
 
 struct pnfs_ds_commit_info {
 };
 
-static inline void
-nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
-{
-}
-
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_V4_2
index 110b0e5..a42df80 100644 (file)
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_NUMA_H
 #define _LINUX_NUMA_H
-
+#include <linux/types.h>
 
 #ifdef CONFIG_NODES_SHIFT
 #define NODES_SHIFT     CONFIG_NODES_SHIFT
 
 #define        NUMA_NO_NODE    (-1)
 
+/* optionally keep NUMA memory info available post init */
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+#define __initdata_or_meminfo
+#else
+#define __initdata_or_meminfo __initdata
+#endif
+
+#ifdef CONFIG_NUMA
+/* Generic implementation available */
+int numa_map_to_online_node(int node);
+
+/*
+ * Optional architecture specific implementation, users need a "depends
+ * on $ARCH"
+ */
+int phys_to_target_node(phys_addr_t addr);
+#else
+static inline int numa_map_to_online_node(int node)
+{
+       return NUMA_NO_NODE;
+}
+
+static inline int phys_to_target_node(phys_addr_t addr)
+{
+       return NUMA_NO_NODE;
+}
+#endif
+
 #endif /* _LINUX_NUMA_H */
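numa_map_to_online_node() gives callers a generic way to turn a possibly-offline (or NUMA_NO_NODE) node hint into a usable online node, with phys_to_target_node() as an optional arch hook for translating a physical address into that hint. A hypothetical consumer sketch (demo_alloc_near and its use are illustrative, not from this patch):

static void *demo_alloc_near(phys_addr_t addr, size_t size)
{
        int nid = numa_map_to_online_node(phys_to_target_node(addr));

        /* NUMA_NO_NODE simply means "no preference" for the allocator. */
        return kzalloc_node(size, GFP_KERNEL, nid);
}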
index 6d0d70f..10f8162 100644 (file)
@@ -270,8 +270,6 @@ struct nvme_fc_remote_port {
  *
  * Host/Initiator Transport Entrypoints/Parameters:
  *
- * @module:  The LLDD module using the interface
- *
  * @localport_delete:  The LLDD initiates deletion of a localport via
  *       nvme_fc_deregister_localport(). However, the teardown is
  *       asynchronous. This routine is called upon the completion of the
@@ -385,8 +383,6 @@ struct nvme_fc_remote_port {
  *       Value is Mandatory. Allowed to be zero.
  */
 struct nvme_fc_port_template {
-       struct module   *module;
-
        /* initiator-based functions */
        void    (*localport_delete)(struct nvme_fc_local_port *);
        void    (*remoteport_delete)(struct nvme_fc_remote_port *);
index 77de28b..222f6f7 100644 (file)
  * page_waitqueue(page) is a wait queue of all tasks waiting for the page
  * to become unlocked.
  *
+ * PG_swapbacked is set when a page uses swap as its backing storage.  These are
+ * usually PageAnon or shmem pages but please note that even anonymous pages
+ * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
+ * a result of MADV_FREE).
+ *
  * PG_uptodate tells whether the page's contents is valid.  When a read
  * completes, the page becomes uptodate, unless a disk I/O error happened.
  *
@@ -163,6 +168,9 @@ enum pageflags {
 
        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,
+
+       /* Only valid for buddy pages. Used to track pages that are reported */
+       PG_reported = PG_uptodate,
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -432,6 +440,14 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
 /*
+ * PageReported() is used to track reported free pages within the Buddy
+ * allocator. We can use the non-atomic version of the test and set
+ * operations as both should be shielded with the zone lock to prevent
+ * any possible races on the setting or clearing of the bit.
+ */
+__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
+
+/*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
  * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
new file mode 100644 (file)
index 0000000..3b99e0e
--- /dev/null
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PAGE_REPORTING_H
+#define _LINUX_PAGE_REPORTING_H
+
+#include <linux/mmzone.h>
+#include <linux/scatterlist.h>
+
+/* This value should always be a power of 2, see page_reporting_cycle() */
+#define PAGE_REPORTING_CAPACITY                32
+
+struct page_reporting_dev_info {
+       /* function that alters pages to make them "reported" */
+       int (*report)(struct page_reporting_dev_info *prdev,
+                     struct scatterlist *sg, unsigned int nents);
+
+       /* work struct for processing reports */
+       struct delayed_work work;
+
+       /* Current state of page reporting */
+       atomic_t state;
+};
+
+/* Tear-down and bring-up for page reporting devices */
+void page_reporting_unregister(struct page_reporting_dev_info *prdev);
+int page_reporting_register(struct page_reporting_dev_info *prdev);
+#endif /*_LINUX_PAGE_REPORTING_H */
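page_reporting.h is the consumer-facing half of free page reporting: a driver (virtio-balloon being the expected in-tree user) supplies a report() callback that receives batches of up to PAGE_REPORTING_CAPACITY free pages as a scatterlist and marks them unused on the backend. A hypothetical consumer sketch (the demo_* names are illustrative, not from this patch):

static int demo_report(struct page_reporting_dev_info *prdev,
                       struct scatterlist *sgl, unsigned int nents)
{
        struct scatterlist *sg;
        unsigned int i;

        for_each_sg(sgl, sg, nents, i)
                pr_info("free range: pfn %lu, %u bytes\n",
                        page_to_pfn(sg_page(sg)), sg->length);

        return 0;       /* success */
}

static struct page_reporting_dev_info demo_prdev = {
        .report = demo_report,
};

/* page_reporting_register(&demo_prdev);  ...  page_reporting_unregister(&demo_prdev); */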
index f562824..a8f7bd8 100644 (file)
@@ -341,9 +341,7 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
        if (PageHuge(head))
                return head;
 
-       VM_BUG_ON_PAGE(PageTail(head), head);
-
-       return head + (index & (compound_nr(head) - 1));
+       return head + (index & (hpage_nr_pages(head) - 1));
 }
 
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
index 4f05249..0a4f54d 100644 (file)
@@ -78,9 +78,9 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
  */
 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
-       s64 ret = fbc->count;
+       /* Prevent reloads of fbc->count */
+       s64 ret = READ_ONCE(fbc->count);
 
-       barrier();              /* Prevent reloads of fbc->count */
        if (ret >= 0)
                return ret;
        return 0;
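The point of READ_ONCE() here is that the sign test and the returned value come from a single load of fbc->count, so a concurrent update on another CPU cannot slip in via a compiler-generated reload. A standalone model of that guarantee (DEMO_READ_ONCE is an illustrative volatile-load mirror, not the kernel macro):

#define DEMO_READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static long long demo_read_positive(const long long *count)
{
        /* One volatile load; the compiler may not re-read *count afterwards. */
        long long ret = DEMO_READ_ONCE(*count);

        return ret >= 0 ? ret : 0;
}

int main(void)
{
        long long c = -5;

        return (int)demo_read_positive(&c);     /* 0 */
}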
index ba59147..3832433 100644 (file)
@@ -125,6 +125,9 @@ struct cros_ec_command {
  * @host_event_wake_mask: Mask of host events that cause wake from suspend.
  * @last_event_time: exact time from the hard irq when we got notified of
  *     a new event.
+ * @notifier_ready: The notifier_block to let the kernel re-query EC
+ *                 communication protocol when the EC sends
+ *                 EC_HOST_EVENT_INTERFACE_READY.
  * @ec: The platform_device used by the mfd driver to interface with the
  *      main EC.
  * @pd: The platform_device used by the mfd driver to interface with the
@@ -166,6 +169,7 @@ struct cros_ec_device {
        u32 host_event_wake_mask;
        u32 last_resume_result;
        ktime_t last_event_time;
+       struct notifier_block notifier_ready;
 
        /* The platform devices used by the mfd driver */
        struct platform_device *ec;
index bef7ffc..c588be8 100644 (file)
@@ -8,8 +8,13 @@
 #ifndef __LINUX_PLATFORM_DATA_CROS_EC_SENSORHUB_H
 #define __LINUX_PLATFORM_DATA_CROS_EC_SENSORHUB_H
 
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
 #include <linux/platform_data/cros_ec_commands.h>
 
+struct iio_dev;
+
 /**
  * struct cros_ec_sensor_platform - ChromeOS EC sensor platform information.
  * @sensor_num: Id of the sensor, as reported by the EC.
@@ -19,12 +24,170 @@ struct cros_ec_sensor_platform {
 };
 
 /**
+ * typedef cros_ec_sensorhub_push_data_cb_t - Callback function to send datum
+ *                                           to specific sensors.
+ *
+ * @indio_dev: The IIO device that will process the sample.
+ * @data: Vector array of the ring sample.
+ * @timestamp: Timestamp in host timespace when the sample was acquired by
+ *             the EC.
+ */
+typedef int (*cros_ec_sensorhub_push_data_cb_t)(struct iio_dev *indio_dev,
+                                               s16 *data,
+                                               s64 timestamp);
+
+struct cros_ec_sensorhub_sensor_push_data {
+       struct iio_dev *indio_dev;
+       cros_ec_sensorhub_push_data_cb_t push_data_cb;
+};
+
+enum {
+       CROS_EC_SENSOR_LAST_TS,
+       CROS_EC_SENSOR_NEW_TS,
+       CROS_EC_SENSOR_ALL_TS
+};
+
+struct cros_ec_sensors_ring_sample {
+       u8  sensor_id;
+       u8  flag;
+       s16 vector[3];
+       s64 timestamp;
+} __packed;
+
+/* State used for cros_ec_ring_fix_overflow */
+struct cros_ec_sensors_ec_overflow_state {
+       s64 offset;
+       s64 last;
+};
+
+/* Length of the filter, how long to remember entries for */
+#define CROS_EC_SENSORHUB_TS_HISTORY_SIZE 64
+
+/**
+ * struct cros_ec_sensors_ts_filter_state - Timestamp filter state.
+ *
+ * @x_offset: x is EC interrupt time. x_offset is its last value.
+ * @y_offset: y is the difference between AP and EC time, y_offset is its last
+ *            value.
+ * @x_history: The past history of x, relative to x_offset.
+ * @y_history: The past history of y, relative to y_offset.
+ * @m_history: rate between y and x.
+ * @history_len: Amount of valid historic data in the arrays.
+ * @temp_buf: Temporary buffer used when updating the filter.
+ * @median_m: median value of m_history
+ * @median_error: final error to apply to AP interrupt timestamp to get the
+ *                "true timestamp" the event occurred.
+ */
+struct cros_ec_sensors_ts_filter_state {
+       s64 x_offset, y_offset;
+       s64 x_history[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+       s64 y_history[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+       s64 m_history[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+       int history_len;
+
+       s64 temp_buf[CROS_EC_SENSORHUB_TS_HISTORY_SIZE];
+
+       s64 median_m;
+       s64 median_error;
+};
+
+/* struct cros_ec_sensors_ts_batch_state - State of batch of a single sensor.
+ *
+ * Used to store the information needed to batch data using the median filter.
+ *
+ * @penul_ts: last but one batch timestamp (penultimate timestamp).
+ *           Used for timestamp spreading calculations
+ *           when a batch shows up.
+ * @penul_len: last but one batch length.
+ * @last_ts: Last batch timestamp.
+ * @last_len: Last batch length.
+ * @newest_sensor_event: Last sensor timestamp.
+ */
+struct cros_ec_sensors_ts_batch_state {
+       s64 penul_ts;
+       int penul_len;
+       s64 last_ts;
+       int last_len;
+       s64 newest_sensor_event;
+};
+
+/*
  * struct cros_ec_sensorhub - Sensor Hub device data.
  *
+ * @dev: Device object, mostly used for logging.
  * @ec: Embedded Controller where the hub is located.
+ * @sensor_num: Number of MEMS sensors present in the EC.
+ * @msg: Structure to send FIFO requests.
+ * @params: Pointer to parameters in msg.
+ * @resp: Pointer to responses in msg.
+ * @cmd_lock : Lock for sending msg.
+ * @notifier: Notifier to kick the FIFO interrupt.
+ * @ring: Preprocessed ring to store events.
+ * @fifo_timestamp: Array for event timestamp and spreading.
+ * @fifo_info: Copy of FIFO information coming from the EC.
+ * @fifo_size: Size of the ring.
+ * @batch_state: Per sensor information of the last batches received.
+ * @overflow_a: For handling timestamp overflow for a time (sensor events)
+ * @overflow_b: For handling timestamp overflow for b time (ec interrupts)
+ * @filter: Median filter structure.
+ * @tight_timestamps: Set to true when the EC supports tight timestamping:
+ *                   The timestamps reported from the EC have low jitter.
+ *                   Timestamps also come before every sample. Set either
+ *                   by feature bits coming from the EC or userspace.
+ * @future_timestamp_count: Statistics used to compute shaved time.
+ *                         This occurs when timestamp interpolation from EC
+ *                         time to AP time accidentally puts timestamps in
+ *                         the future. These timestamps are clamped to
+ *                         `now` and these count/total_ns maintain the
+ *                         statistics for how much time was removed in a
+ *                         given period.
+ * @future_timestamp_total_ns: Total amount of time shaved.
+ * @push_data: Array of callback to send datums to iio sensor object.
  */
 struct cros_ec_sensorhub {
+       struct device *dev;
        struct cros_ec_dev *ec;
+       int sensor_num;
+
+       struct cros_ec_command *msg;
+       struct ec_params_motion_sense *params;
+       struct ec_response_motion_sense *resp;
+       struct mutex cmd_lock;  /* Lock for protecting msg structure. */
+
+       struct notifier_block notifier;
+
+       struct cros_ec_sensors_ring_sample *ring;
+
+       ktime_t fifo_timestamp[CROS_EC_SENSOR_ALL_TS];
+       struct ec_response_motion_sense_fifo_info *fifo_info;
+       int fifo_size;
+
+       struct cros_ec_sensors_ts_batch_state *batch_state;
+
+       struct cros_ec_sensors_ec_overflow_state overflow_a;
+       struct cros_ec_sensors_ec_overflow_state overflow_b;
+
+       struct cros_ec_sensors_ts_filter_state filter;
+
+       int tight_timestamps;
+
+       s32 future_timestamp_count;
+       s64 future_timestamp_total_ns;
+
+       struct cros_ec_sensorhub_sensor_push_data *push_data;
 };
 
+int cros_ec_sensorhub_register_push_data(struct cros_ec_sensorhub *sensorhub,
+                                        u8 sensor_num,
+                                        struct iio_dev *indio_dev,
+                                        cros_ec_sensorhub_push_data_cb_t cb);
+
+void cros_ec_sensorhub_unregister_push_data(struct cros_ec_sensorhub *sensorhub,
+                                           u8 sensor_num);
+
+int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub);
+void cros_ec_sensorhub_ring_remove(void *arg);
+int cros_ec_sensorhub_ring_fifo_enable(struct cros_ec_sensorhub *sensorhub,
+                                      bool on);
+
 #endif   /* __LINUX_PLATFORM_DATA_CROS_EC_SENSORHUB_H */
diff --git a/include/linux/platform_data/cros_usbpd_notify.h b/include/linux/platform_data/cros_usbpd_notify.h
new file mode 100644 (file)
index 0000000..4f27917
--- /dev/null
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ChromeOS EC Power Delivery Notifier Driver
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_CROS_USBPD_NOTIFY_H
+#define __LINUX_PLATFORM_DATA_CROS_USBPD_NOTIFY_H
+
+#include <linux/notifier.h>
+
+int cros_usbpd_register_notify(struct notifier_block *nb);
+
+void cros_usbpd_unregister_notify(struct notifier_block *nb);
+
+#endif  /* __LINUX_PLATFORM_DATA_CROS_USBPD_NOTIFY_H */
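
A small sketch of the consumer side of this notifier API; only the register/unregister calls come from the header above, and the meaning of the action/data arguments passed to the callback is an assumption.

#include <linux/notifier.h>
#include <linux/platform_device.h>
#include <linux/platform_data/cros_usbpd_notify.h>

static int my_usbpd_event(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	/* React to the USB Power Delivery event here. */
	return NOTIFY_OK;
}

static struct notifier_block my_usbpd_nb = {
	.notifier_call = my_usbpd_event,
};

static int my_driver_probe(struct platform_device *pdev)
{
	return cros_usbpd_register_notify(&my_usbpd_nb);
}

static int my_driver_remove(struct platform_device *pdev)
{
	cros_usbpd_unregister_notify(&my_usbpd_nb);
	return 0;
}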
diff --git a/include/linux/platform_data/leds-kirkwood-ns2.h b/include/linux/platform_data/leds-kirkwood-ns2.h
deleted file mode 100644 (file)
index eb8a686..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Platform data structure for Network Space v2 LED driver
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2.  This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#ifndef __LEDS_KIRKWOOD_NS2_H
-#define __LEDS_KIRKWOOD_NS2_H
-
-enum ns2_led_modes {
-       NS_V2_LED_OFF,
-       NS_V2_LED_ON,
-       NS_V2_LED_SATA,
-};
-
-struct ns2_led_modval {
-       enum ns2_led_modes      mode;
-       int                     cmd_level;
-       int                     slow_level;
-};
-
-struct ns2_led {
-       const char      *name;
-       const char      *default_trigger;
-       unsigned        cmd;
-       unsigned        slow;
-       int             num_modes;
-       struct ns2_led_modval *modval;
-};
-
-struct ns2_led_platform_data {
-       int             num_leds;
-       struct ns2_led  *leds;
-};
-
-#endif /* __LEDS_KIRKWOOD_NS2_H */
index afede15..25f46a9 100644 (file)
@@ -8,8 +8,8 @@
 #ifndef WILCO_EC_H
 #define WILCO_EC_H
 
-#include <linux/device.h>
-#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
 
 /* Message flags for using the mailbox() interface */
 #define WILCO_EC_FLAG_NO_RESPONSE      BIT(0) /* EC does not respond */
 /* Normal commands have a maximum 32 bytes of data */
 #define EC_MAILBOX_DATA_SIZE           32
 
+struct device;
+struct resource;
+struct platform_device;
+
 /**
  * struct wilco_ec_device - Wilco Embedded Controller handle.
  * @dev: Device handle.
index 40a7982..45c05fd 100644 (file)
@@ -5,6 +5,7 @@
 #ifndef _LINUX_PROC_FS_H
 #define _LINUX_PROC_FS_H
 
+#include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/fs.h>
 
@@ -12,7 +13,21 @@ struct proc_dir_entry;
 struct seq_file;
 struct seq_operations;
 
+enum {
+       /*
+        * All /proc entries using this ->proc_ops instance are never removed.
+        *
+        * If in doubt, ignore this flag.
+        */
+#ifdef MODULE
+       PROC_ENTRY_PERMANENT = 0U,
+#else
+       PROC_ENTRY_PERMANENT = 1U << 0,
+#endif
+};
+
 struct proc_ops {
+       unsigned int proc_flags;
        int     (*proc_open)(struct inode *, struct file *);
        ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *);
        ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *);
@@ -25,7 +40,7 @@ struct proc_ops {
 #endif
        int     (*proc_mmap)(struct file *, struct vm_area_struct *);
        unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-};
+} __randomize_layout;
 
 #ifdef CONFIG_PROC_FS
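
The flag is intended for /proc entries that are created once and never removed, so lookups can skip the removal protection; the sysvipc hunk later in this series uses it exactly this way. A hedged sketch with the standard seq_file helpers:

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int my_stats_show(struct seq_file *m, void *v)
{
	seq_puts(m, "example\n");
	return 0;
}

static int my_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, my_stats_show, NULL);
}

static const struct proc_ops my_stats_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,	/* entry is never removed */
	.proc_open	= my_stats_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};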
 
index 8ea265a..06086cb 100644 (file)
@@ -16,8 +16,6 @@ struct platform_pwm_backlight_data {
        unsigned int *levels;
        unsigned int post_pwm_on_delay;
        unsigned int pwm_off_delay;
-       /* TODO remove once all users are switched to gpiod_* API */
-       int enable_gpio;
        int (*init)(struct device *dev);
        int (*notify)(struct device *dev, int brightness);
        void (*notify_after)(struct device *dev, int brightness);
index 770c2bf..1672cf6 100644 (file)
@@ -21,7 +21,6 @@ struct seq_file {
        size_t pad_until;
        loff_t index;
        loff_t read_pos;
-       u64 version;
        struct mutex lock;
        const struct seq_operations *op;
        int poll_event;
index d56fefe..7a35a69 100644 (file)
@@ -78,6 +78,7 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 extern int shmem_unuse(unsigned int type, bool frontswap,
                       unsigned long *fs_pages_to_unuse);
 
+extern bool shmem_huge_enabled(struct vm_area_struct *vma);
 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end);
@@ -114,15 +115,6 @@ static inline bool shmem_file(struct file *file)
 extern bool shmem_charge(struct inode *inode, long pages);
 extern void shmem_uncharge(struct inode *inode, long pages);
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-extern bool shmem_huge_enabled(struct vm_area_struct *vma);
-#else
-static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
-{
-       return false;
-}
-#endif
-
 #ifdef CONFIG_SHMEM
 extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
                                  struct vm_area_struct *dst_vma,
index edf4bec..0b85761 100644 (file)
@@ -11,9 +11,6 @@ struct corgi_lcd_platform_data {
        int     default_intensity;
        int     limit_mask;
 
-       int     gpio_backlight_on;      /* -1 if n/a */
-       int     gpio_backlight_cont;    /* -1 if n/a */
-
        void (*notify)(int intensity);
        void (*kick_battery)(void);
 };
index 3efa97d..24d49c7 100644 (file)
@@ -19,4 +19,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
                               unsigned long **entries);
 
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
+
 #endif
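
A sketch of how the newly exported filter_irq_stacks() combines with the existing depot API; the stack_trace_save() call and its signature are assumptions from <linux/stacktrace.h>, not part of this hunk.

#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

static depot_stack_handle_t save_filtered_stack(gfp_t gfp)
{
	unsigned long entries[64];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	/* Drop the IRQ-entry frames before depositing the trace. */
	nr = filter_irq_stacks(entries, nr);
	return stack_depot_save(entries, nr, gfp);
}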
index a6ef351..df696ef 100644 (file)
@@ -132,6 +132,7 @@ struct rpc_task_setup {
 #define RPC_TASK_TIMEOUT       0x1000          /* fail with ETIMEDOUT on timeout */
 #define RPC_TASK_NOCONNECT     0x2000          /* return ENOTCONN if not connected */
 #define RPC_TASK_NO_RETRANS_TIMEOUT    0x4000          /* wait forever for a reply */
+#define RPC_TASK_CRED_NOREF    0x8000          /* No refcount on the credential */
 
 #define RPC_IS_ASYNC(t)                ((t)->tk_flags & RPC_TASK_ASYNC)
 #define RPC_IS_SWAPPER(t)      ((t)->tk_flags & RPC_TASK_SWAPPER)
index 8529d6e..01bb419 100644 (file)
@@ -184,7 +184,6 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
 extern void xdr_shift_buf(struct xdr_buf *, size_t);
 extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
-extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int);
 extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
 
index 877fd23..d9b7c91 100644 (file)
@@ -68,6 +68,8 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 
        if (pte_swp_soft_dirty(pte))
                pte = pte_swp_clear_soft_dirty(pte);
+       if (pte_swp_uffd_wp(pte))
+               pte = pte_swp_clear_uffd_wp(pte);
        arch_entry = __pte_to_swp_entry(pte);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
@@ -348,7 +350,8 @@ static inline void num_poisoned_pages_inc(void)
 }
 #endif
 
-#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
+#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
+    defined(CONFIG_DEVICE_PRIVATE)
 static inline int non_swap_entry(swp_entry_t entry)
 {
        return swp_type(entry) >= MAX_SWAPFILES;
index 126913c..c91b1e3 100644 (file)
@@ -364,6 +364,9 @@ struct thermal_trip {
 
 /* Function declarations */
 #ifdef CONFIG_THERMAL_OF
+int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+                                 struct device_node *sensor_np,
+                                 u32 *id);
 struct thermal_zone_device *
 thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
                                const struct thermal_zone_of_device_ops *ops);
@@ -375,6 +378,13 @@ struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
 void devm_thermal_zone_of_sensor_unregister(struct device *dev,
                                            struct thermal_zone_device *tz);
 #else
+
+static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+                                        struct device_node *sensor_np,
+                                        u32 *id)
+{
+       return -ENOENT;
+}
 static inline struct thermal_zone_device *
 thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
                                const struct thermal_zone_of_device_ops *ops)
index ac9d71e..a8e5f3e 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
 
 #include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <asm-generic/pgtable_uffd.h>
 
 /*
  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -34,11 +36,14 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                            unsigned long src_start, unsigned long len,
-                           bool *mmap_changing);
+                           bool *mmap_changing, __u64 mode);
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
                              unsigned long dst_start,
                              unsigned long len,
                              bool *mmap_changing);
+extern int mwriteprotect_range(struct mm_struct *dst_mm,
+                              unsigned long start, unsigned long len,
+                              bool enable_wp, bool *mmap_changing);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
@@ -52,6 +57,23 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma)
        return vma->vm_flags & VM_UFFD_MISSING;
 }
 
+static inline bool userfaultfd_wp(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_UFFD_WP;
+}
+
+static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
+                                     pte_t pte)
+{
+       return userfaultfd_wp(vma) && pte_uffd_wp(pte);
+}
+
+static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
+                                          pmd_t pmd)
+{
+       return userfaultfd_wp(vma) && pmd_uffd_wp(pmd);
+}
+
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
        return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
@@ -96,6 +118,24 @@ static inline bool userfaultfd_missing(struct vm_area_struct *vma)
        return false;
 }
 
+static inline bool userfaultfd_wp(struct vm_area_struct *vma)
+{
+       return false;
+}
+
+static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
+                                     pte_t pte)
+{
+       return false;
+}
+
+static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
+                                          pmd_t pmd)
+{
+       return false;
+}
+
+
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
        return false;
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
new file mode 100644 (file)
index 0000000..733acfb
--- /dev/null
@@ -0,0 +1,253 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VDPA_H
+#define _LINUX_VDPA_H
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/vhost_iotlb.h>
+
+/**
+ * vDPA callback definition.
+ * @callback: interrupt callback function
+ * @private: the data passed to the callback function
+ */
+struct vdpa_callback {
+       irqreturn_t (*callback)(void *data);
+       void *private;
+};
+
+/**
+ * vDPA device - representation of a vDPA device
+ * @dev: underlying device
+ * @dma_dev: the actual device that is performing DMA
+ * @config: the configuration ops for this device.
+ * @index: device index
+ */
+struct vdpa_device {
+       struct device dev;
+       struct device *dma_dev;
+       const struct vdpa_config_ops *config;
+       unsigned int index;
+};
+
+/**
+ * vdpa_config_ops - operations for configuring a vDPA device.
+ * Note: vDPA device drivers are required to implement all of the
+ * operations unless an operation is marked optional in the following
+ * list.
+ *
+ * @set_vq_address:            Set the address of virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             @desc_area: address of desc area
+ *                             @driver_area: address of driver area
+ *                             @device_area: address of device area
+ *                             Returns integer: success (0) or error (< 0)
+ * @set_vq_num:                        Set the size of virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             @num: the size of virtqueue
+ * @kick_vq:                   Kick the virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ * @set_vq_cb:                 Set the interrupt callback function for
+ *                             a virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             @cb: virtio-vdev interrupt callback structure
+ * @set_vq_ready:              Set ready status for a virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             @ready: ready (true) or not ready (false)
+ * @get_vq_ready:              Get ready status for a virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             Returns boolean: ready (true) or not (false)
+ * @set_vq_state:              Set the state for a virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             @state: virtqueue state (last_avail_idx)
+ *                             Returns integer: success (0) or error (< 0)
+ * @get_vq_state:              Get the state for a virtqueue
+ *                             @vdev: vdpa device
+ *                             @idx: virtqueue index
+ *                             Returns virtqueue state (last_avail_idx)
+ * @get_vq_align:              Get the virtqueue align requirement
+ *                             for the device
+ *                             @vdev: vdpa device
+ *                             Returns virtqueue align requirement
+ * @get_features:              Get virtio features supported by the device
+ *                             @vdev: vdpa device
+ *                             Returns the virtio features supported by the
+ *                             device
+ * @set_features:              Set virtio features supported by the driver
+ *                             @vdev: vdpa device
+ *                             @features: features supported by the driver
+ *                             Returns integer: success (0) or error (< 0)
+ * @set_config_cb:             Set the config interrupt callback
+ *                             @vdev: vdpa device
+ *                             @cb: virtio-vdev interrupt callback structure
+ * @get_vq_num_max:            Get the max size of virtqueue
+ *                             @vdev: vdpa device
+ *                             Returns u16: max size of virtqueue
+ * @get_device_id:             Get virtio device id
+ *                             @vdev: vdpa device
+ *                             Returns u32: virtio device id
+ * @get_vendor_id:             Get id for the vendor that provides this device
+ *                             @vdev: vdpa device
+ *                             Returns u32: virtio vendor id
+ * @get_status:                        Get the device status
+ *                             @vdev: vdpa device
+ *                             Returns u8: virtio device status
+ * @set_status:                        Set the device status
+ *                             @vdev: vdpa device
+ *                             @status: virtio device status
+ * @get_config:                        Read from device specific configuration space
+ *                             @vdev: vdpa device
+ *                             @offset: offset from the beginning of
+ *                             configuration space
+ *                             @buf: buffer to read into
+ *                             @len: the length to read from
+ *                             configuration space
+ * @set_config:                        Write to device specific configuration space
+ *                             @vdev: vdpa device
+ *                             @offset: offset from the beginning of
+ *                             configuration space
+ *                             @buf: buffer used to write from
+ *                             @len: the length to write to
+ *                             configuration space
+ * @get_generation:            Get device config generation (optional)
+ *                             @vdev: vdpa device
+ *                             Returns u32: device generation
+ * @set_map:                   Set device memory mapping (optional)
+ *                             Needed for devices using device-specific
+ *                             DMA translation (on-chip IOMMU)
+ *                             @vdev: vdpa device
+ *                             @iotlb: vhost memory mapping to be
+ *                             used by the vDPA
+ *                             Returns integer: success (0) or error (< 0)
+ * @dma_map:                   Map an area of PA to IOVA (optional)
+ *                             Needed for devices using device-specific
+ *                             DMA translation (on-chip IOMMU) and
+ *                             preferring incremental mapping.
+ *                             @vdev: vdpa device
+ *                             @iova: iova to be mapped
+ *                             @size: size of the area
+ *                             @pa: physical address for the map
+ *                             @perm: device access permission (VHOST_MAP_XX)
+ *                             Returns integer: success (0) or error (< 0)
+ * @dma_unmap:                 Unmap an area of IOVA (optional but
+ *                             must be implemented with dma_map)
+ *                             Needed for devices using device-specific
+ *                             DMA translation (on-chip IOMMU) and
+ *                             preferring incremental unmapping.
+ *                             @vdev: vdpa device
+ *                             @iova: iova to be unmapped
+ *                             @size: size of the area
+ *                             Returns integer: success (0) or error (< 0)
+ * @free:                      Free resources that belong to the vDPA device (optional)
+ *                             @vdev: vdpa device
+ */
+struct vdpa_config_ops {
+       /* Virtqueue ops */
+       int (*set_vq_address)(struct vdpa_device *vdev,
+                             u16 idx, u64 desc_area, u64 driver_area,
+                             u64 device_area);
+       void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
+       void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
+       void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
+                         struct vdpa_callback *cb);
+       void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
+       bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx);
+       int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state);
+       u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx);
+
+       /* Device ops */
+       u16 (*get_vq_align)(struct vdpa_device *vdev);
+       u64 (*get_features)(struct vdpa_device *vdev);
+       int (*set_features)(struct vdpa_device *vdev, u64 features);
+       void (*set_config_cb)(struct vdpa_device *vdev,
+                             struct vdpa_callback *cb);
+       u16 (*get_vq_num_max)(struct vdpa_device *vdev);
+       u32 (*get_device_id)(struct vdpa_device *vdev);
+       u32 (*get_vendor_id)(struct vdpa_device *vdev);
+       u8 (*get_status)(struct vdpa_device *vdev);
+       void (*set_status)(struct vdpa_device *vdev, u8 status);
+       void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
+                          void *buf, unsigned int len);
+       void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
+                          const void *buf, unsigned int len);
+       u32 (*get_generation)(struct vdpa_device *vdev);
+
+       /* DMA ops */
+       int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
+       int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size,
+                      u64 pa, u32 perm);
+       int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size);
+
+       /* Free device resources */
+       void (*free)(struct vdpa_device *vdev);
+};
+
+struct vdpa_device *__vdpa_alloc_device(struct device *parent,
+                                       const struct vdpa_config_ops *config,
+                                       size_t size);
+
+#define vdpa_alloc_device(dev_struct, member, parent, config)   \
+                         container_of(__vdpa_alloc_device( \
+                                      parent, config, \
+                                      sizeof(dev_struct) + \
+                                      BUILD_BUG_ON_ZERO(offsetof( \
+                                      dev_struct, member))), \
+                                      dev_struct, member)
+
+int vdpa_register_device(struct vdpa_device *vdev);
+void vdpa_unregister_device(struct vdpa_device *vdev);
+
+/**
+ * vdpa_driver - operations for a vDPA driver
+ * @driver: underlying device driver
+ * @probe: the function to call when a device is found.  Returns 0 or -errno.
+ * @remove: the function to call when a device is removed.
+ */
+struct vdpa_driver {
+       struct device_driver driver;
+       int (*probe)(struct vdpa_device *vdev);
+       void (*remove)(struct vdpa_device *vdev);
+};
+
+#define vdpa_register_driver(drv) \
+       __vdpa_register_driver(drv, THIS_MODULE)
+int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner);
+void vdpa_unregister_driver(struct vdpa_driver *drv);
+
+#define module_vdpa_driver(__vdpa_driver) \
+       module_driver(__vdpa_driver, vdpa_register_driver,      \
+                     vdpa_unregister_driver)
+
+static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *driver)
+{
+       return container_of(driver, struct vdpa_driver, driver);
+}
+
+static inline struct vdpa_device *dev_to_vdpa(struct device *_dev)
+{
+       return container_of(_dev, struct vdpa_device, dev);
+}
+
+static inline void *vdpa_get_drvdata(const struct vdpa_device *vdev)
+{
+       return dev_get_drvdata(&vdev->dev);
+}
+
+static inline void vdpa_set_drvdata(struct vdpa_device *vdev, void *data)
+{
+       dev_set_drvdata(&vdev->dev, data);
+}
+
+static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev)
+{
+       return vdev->dma_dev;
+}
+#endif /* _LINUX_VDPA_H */
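
To show how the registration helpers above fit together, here is a skeleton vDPA bus driver; device matching and the actual config ops are deliberately left out, and the driver name is a placeholder.

static int my_vdpa_probe(struct vdpa_device *vdev)
{
	void *priv = NULL;	/* driver state, allocated as needed */

	vdpa_set_drvdata(vdev, priv);
	return 0;
}

static void my_vdpa_remove(struct vdpa_device *vdev)
{
	/* Tear down vdpa_get_drvdata(vdev) here. */
}

static struct vdpa_driver my_vdpa_driver = {
	.driver = {
		.name	= "my_vdpa",
	},
	.probe	= my_vdpa_probe,
	.remove	= my_vdpa_remove,
};

module_vdpa_driver(my_vdpa_driver);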
diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h
new file mode 100644 (file)
index 0000000..6b09b78
--- /dev/null
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VHOST_IOTLB_H
+#define _LINUX_VHOST_IOTLB_H
+
+#include <linux/interval_tree_generic.h>
+
+struct vhost_iotlb_map {
+       struct rb_node rb;
+       struct list_head link;
+       u64 start;
+       u64 last;
+       u64 size;
+       u64 addr;
+#define VHOST_MAP_RO 0x1
+#define VHOST_MAP_WO 0x2
+#define VHOST_MAP_RW 0x3
+       u32 perm;
+       u32 flags_padding;
+       u64 __subtree_last;
+};
+
+#define VHOST_IOTLB_FLAG_RETIRE 0x1
+
+struct vhost_iotlb {
+       struct rb_root_cached root;
+       struct list_head list;
+       unsigned int limit;
+       unsigned int nmaps;
+       unsigned int flags;
+};
+
+int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last,
+                         u64 addr, unsigned int perm);
+void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last);
+
+struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags);
+void vhost_iotlb_free(struct vhost_iotlb *iotlb);
+void vhost_iotlb_reset(struct vhost_iotlb *iotlb);
+
+struct vhost_iotlb_map *
+vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last);
+struct vhost_iotlb_map *
+vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last);
+
+void vhost_iotlb_map_free(struct vhost_iotlb *iotlb,
+                         struct vhost_iotlb_map *map);
+#endif
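
A short sketch of the IOTLB lifecycle using only the helpers declared above; the limit, the IOVA range and the target address are arbitrary illustration values.

#include <linux/vhost_iotlb.h>

static int iotlb_demo(u64 host_addr)
{
	struct vhost_iotlb *iotlb;
	struct vhost_iotlb_map *map;
	int ret;

	iotlb = vhost_iotlb_alloc(2048, VHOST_IOTLB_FLAG_RETIRE);
	if (!iotlb)
		return -ENOMEM;

	/* Map the IOVA range [0x1000, 0x1fff] to host_addr, read/write. */
	ret = vhost_iotlb_add_range(iotlb, 0x1000, 0x1fff, host_addr,
				    VHOST_MAP_RW);
	if (!ret) {
		map = vhost_iotlb_itree_first(iotlb, 0x1000, 0x1fff);
		if (map)
			pr_info("mapped at %llx, perm %x\n",
				map->addr, map->perm);
	}

	vhost_iotlb_free(iotlb);
	return ret;
}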
index 47a3441..ffef0f2 100644 (file)
@@ -73,9 +73,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                THP_FAULT_ALLOC,
                THP_FAULT_FALLBACK,
+               THP_FAULT_FALLBACK_CHARGE,
                THP_COLLAPSE_ALLOC,
                THP_COLLAPSE_ALLOC_FAILED,
                THP_FILE_ALLOC,
+               THP_FILE_FALLBACK,
+               THP_FILE_FALLBACK_CHARGE,
                THP_FILE_MAPPED,
                THP_SPLIT_PAGE,
                THP_SPLIT_PAGE_FAILED,
@@ -115,6 +118,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
+#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; })
+#define THP_FILE_FALLBACK_CHARGE ({ BUILD_BUG(); 0; })
 #define THP_FILE_MAPPED ({ BUILD_BUG(); 0; })
 #endif
 
index d237087..bd0503c 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/virtio_byteorder.h>
 #include <linux/uio.h>
 #include <linux/slab.h>
+#include <linux/dma-direction.h>
+#include <linux/vhost_iotlb.h>
 #include <asm/barrier.h>
 
 /* virtio_ring with information needed for host access. */
@@ -39,6 +41,9 @@ struct vringh {
        /* The vring (note: it may contain user pointers!) */
        struct vring vring;
 
+       /* IOTLB for this vring */
+       struct vhost_iotlb *iotlb;
+
        /* The function to call to notify the guest about added buffers */
        void (*notify)(struct vringh *);
 };
@@ -248,4 +253,35 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
 {
        return __cpu_to_virtio64(vringh_is_little_endian(vrh), val);
 }
+
+void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb);
+
+int vringh_init_iotlb(struct vringh *vrh, u64 features,
+                     unsigned int num, bool weak_barriers,
+                     struct vring_desc *desc,
+                     struct vring_avail *avail,
+                     struct vring_used *used);
+
+int vringh_getdesc_iotlb(struct vringh *vrh,
+                        struct vringh_kiov *riov,
+                        struct vringh_kiov *wiov,
+                        u16 *head,
+                        gfp_t gfp);
+
+ssize_t vringh_iov_pull_iotlb(struct vringh *vrh,
+                             struct vringh_kiov *riov,
+                             void *dst, size_t len);
+ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
+                             struct vringh_kiov *wiov,
+                             const void *src, size_t len);
+
+void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num);
+
+int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len);
+
+bool vringh_notify_enable_iotlb(struct vringh *vrh);
+void vringh_notify_disable_iotlb(struct vringh *vrh);
+
+int vringh_need_notify_iotlb(struct vringh *vrh);
+
 #endif /* _LINUX_VRINGH_H */
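
A sketch of wiring an IOTLB into a host-side vring with the new helpers; the ring memory and feature bits are supplied by the caller and are placeholders here.

static int my_vringh_setup(struct vringh *vrh, struct vhost_iotlb *iotlb,
			   u64 features, unsigned int num,
			   struct vring_desc *desc,
			   struct vring_avail *avail,
			   struct vring_used *used)
{
	int ret;

	/* Initialise the host-side accessors for this ring... */
	ret = vringh_init_iotlb(vrh, features, num, false, desc, avail, used);
	if (ret)
		return ret;

	/* ...and route every ring access through the IOTLB translation. */
	vringh_set_iotlb(vrh, iotlb);
	return 0;
}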
index 67a9783..d97adfc 100644 (file)
@@ -153,7 +153,8 @@ TRACE_DEFINE_ENUM(CP_PAUSE);
 #define show_compress_algorithm(type)                                  \
        __print_symbolic(type,                                          \
                { COMPRESS_LZO,         "LZO" },                        \
-               { COMPRESS_LZ4,         "LZ4" })
+               { COMPRESS_LZ4,         "LZ4" },                        \
+               { COMPRESS_ZSTD,        "ZSTD" })
 
 struct f2fs_sb_info;
 struct f2fs_io_info;
index d82a0f4..70e32ff 100644 (file)
@@ -13,6 +13,7 @@
        EM( SCAN_PMD_NULL,              "pmd_null")                     \
        EM( SCAN_EXCEED_NONE_PTE,       "exceed_none_pte")              \
        EM( SCAN_PTE_NON_PRESENT,       "pte_non_present")              \
+       EM( SCAN_PTE_UFFD_WP,           "pte_uffd_wp")                  \
        EM( SCAN_PAGE_RO,               "no_writable_page")             \
        EM( SCAN_LACK_REFERENCED_PAGE,  "lack_referenced_page")         \
        EM( SCAN_PAGE_NULL,             "page_null")                    \
index a1675d4..5fb7520 100644 (file)
@@ -154,6 +154,7 @@ IF_HAVE_PG_IDLE(PG_idle,            "idle"          )
        {VM_ACCOUNT,                    "account"       },              \
        {VM_NORESERVE,                  "noreserve"     },              \
        {VM_HUGETLB,                    "hugetlb"       },              \
+       {VM_SYNC,                       "sync"          },              \
        __VM_ARCH_SPECIFIC_1                            ,               \
        {VM_WIPEONFORK,                 "wipeonfork"    },              \
        {VM_DONTDUMP,                   "dontdump"      },              \
index 9238d23..051f26f 100644 (file)
@@ -104,12 +104,12 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
        TP_fast_assign(
                __entry->r_xprt = r_xprt;
                __entry->rc = rc;
-               __entry->connect_status = r_xprt->rx_ep.rep_connected;
+               __entry->connect_status = r_xprt->rx_ep->re_connect_status;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d",
+       TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d",
                __get_str(addr), __get_str(port), __entry->r_xprt,
                __entry->rc, __entry->connect_status
        )
@@ -228,20 +228,20 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
        TP_ARGS(wc, frwr),
 
        TP_STRUCT__entry(
-               __field(const void *, mr)
+               __field(u32, mr_id)
                __field(unsigned int, status)
                __field(unsigned int, vendor_err)
        ),
 
        TP_fast_assign(
-               __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr);
+               __entry->mr_id = frwr->fr_mr->res.id;
                __entry->status = wc->status;
                __entry->vendor_err = __entry->status ? wc->vendor_err : 0;
        ),
 
        TP_printk(
-               "mr=%p: %s (%u/0x%x)",
-               __entry->mr, rdma_show_wc_status(__entry->status),
+               "mr.id=%u: %s (%u/0x%x)",
+               __entry->mr_id, rdma_show_wc_status(__entry->status),
                __entry->status, __entry->vendor_err
        )
 );
@@ -274,7 +274,8 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
        TP_ARGS(mr),
 
        TP_STRUCT__entry(
-               __field(const void *, mr)
+               __field(u32, mr_id)
+               __field(int, nents)
                __field(u32, handle)
                __field(u32, length)
                __field(u64, offset)
@@ -282,15 +283,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
        ),
 
        TP_fast_assign(
-               __entry->mr = mr;
+               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
                __entry->offset = mr->mr_offset;
                __entry->dir    = mr->mr_dir;
        ),
 
-       TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)",
-               __entry->mr, __entry->length,
+       TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)",
+               __entry->mr_id, __entry->nents, __entry->length,
                (unsigned long long)__entry->offset, __entry->handle,
                xprtrdma_show_direction(__entry->dir)
        )
@@ -340,68 +342,37 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event,
  ** Connection events
  **/
 
-TRACE_EVENT(xprtrdma_cm_event,
-       TP_PROTO(
-               const struct rpcrdma_xprt *r_xprt,
-               struct rdma_cm_event *event
-       ),
-
-       TP_ARGS(r_xprt, event),
-
-       TP_STRUCT__entry(
-               __field(const void *, r_xprt)
-               __field(unsigned int, event)
-               __field(int, status)
-               __string(addr, rpcrdma_addrstr(r_xprt))
-               __string(port, rpcrdma_portstr(r_xprt))
-       ),
-
-       TP_fast_assign(
-               __entry->r_xprt = r_xprt;
-               __entry->event = event->event;
-               __entry->status = event->status;
-               __assign_str(addr, rpcrdma_addrstr(r_xprt));
-               __assign_str(port, rpcrdma_portstr(r_xprt));
-       ),
-
-       TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)",
-               __get_str(addr), __get_str(port),
-               __entry->r_xprt, rdma_show_cm_event(__entry->event),
-               __entry->event, __entry->status
-       )
-);
-
 TRACE_EVENT(xprtrdma_inline_thresh,
        TP_PROTO(
-               const struct rpcrdma_xprt *r_xprt
+               const struct rpcrdma_ep *ep
        ),
 
-       TP_ARGS(r_xprt),
+       TP_ARGS(ep),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
                __field(unsigned int, inline_send)
                __field(unsigned int, inline_recv)
                __field(unsigned int, max_send)
                __field(unsigned int, max_recv)
-               __string(addr, rpcrdma_addrstr(r_xprt))
-               __string(port, rpcrdma_portstr(r_xprt))
+               __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+               __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
        ),
 
        TP_fast_assign(
-               const struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+               const struct rdma_cm_id *id = ep->re_id;
 
-               __entry->r_xprt = r_xprt;
-               __entry->inline_send = ep->rep_inline_send;
-               __entry->inline_recv = ep->rep_inline_recv;
-               __entry->max_send = ep->rep_max_inline_send;
-               __entry->max_recv = ep->rep_max_inline_recv;
-               __assign_str(addr, rpcrdma_addrstr(r_xprt));
-               __assign_str(port, rpcrdma_portstr(r_xprt));
+               __entry->inline_send = ep->re_inline_send;
+               __entry->inline_recv = ep->re_inline_recv;
+               __entry->max_send = ep->re_max_inline_send;
+               __entry->max_recv = ep->re_max_inline_recv;
+               memcpy(__entry->srcaddr, &id->route.addr.src_addr,
+                      sizeof(struct sockaddr_in6));
+               memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
+                      sizeof(struct sockaddr_in6));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
+       TP_printk("%pISpc -> %pISpc neg send/recv=%u/%u, calc send/recv=%u/%u",
+               __entry->srcaddr, __entry->dstaddr,
                __entry->inline_send, __entry->inline_recv,
                __entry->max_send, __entry->max_recv
        )
@@ -409,11 +380,10 @@ TRACE_EVENT(xprtrdma_inline_thresh,
 
 DEFINE_CONN_EVENT(connect);
 DEFINE_CONN_EVENT(disconnect);
+DEFINE_CONN_EVENT(flush_dct);
 
 DEFINE_RXPRT_EVENT(xprtrdma_create);
 DEFINE_RXPRT_EVENT(xprtrdma_op_destroy);
-DEFINE_RXPRT_EVENT(xprtrdma_remove);
-DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
 DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
 DEFINE_RXPRT_EVENT(xprtrdma_op_close);
 DEFINE_RXPRT_EVENT(xprtrdma_op_setport);
@@ -480,32 +450,33 @@ TRACE_EVENT(xprtrdma_op_set_cto,
 
 TRACE_EVENT(xprtrdma_qp_event,
        TP_PROTO(
-               const struct rpcrdma_xprt *r_xprt,
+               const struct rpcrdma_ep *ep,
                const struct ib_event *event
        ),
 
-       TP_ARGS(r_xprt, event),
+       TP_ARGS(ep, event),
 
        TP_STRUCT__entry(
-               __field(const void *, r_xprt)
-               __field(unsigned int, event)
+               __field(unsigned long, event)
                __string(name, event->device->name)
-               __string(addr, rpcrdma_addrstr(r_xprt))
-               __string(port, rpcrdma_portstr(r_xprt))
+               __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+               __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
        ),
 
        TP_fast_assign(
-               __entry->r_xprt = r_xprt;
+               const struct rdma_cm_id *id = ep->re_id;
+
                __entry->event = event->event;
                __assign_str(name, event->device->name);
-               __assign_str(addr, rpcrdma_addrstr(r_xprt));
-               __assign_str(port, rpcrdma_portstr(r_xprt));
+               memcpy(__entry->srcaddr, &id->route.addr.src_addr,
+                      sizeof(struct sockaddr_in6));
+               memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
+                      sizeof(struct sockaddr_in6));
        ),
 
-       TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)",
-               __get_str(addr), __get_str(port), __entry->r_xprt,
-               __get_str(name), rdma_show_ib_event(__entry->event),
-               __entry->event
+       TP_printk("%pISpc -> %pISpc device=%s %s (%lu)",
+               __entry->srcaddr, __entry->dstaddr, __get_str(name),
+               rdma_show_ib_event(__entry->event), __entry->event
        )
 );
 
@@ -801,7 +772,7 @@ TRACE_EVENT(xprtrdma_post_recvs,
                __entry->r_xprt = r_xprt;
                __entry->count = count;
                __entry->status = status;
-               __entry->posted = r_xprt->rx_ep.rep_receive_count;
+               __entry->posted = r_xprt->rx_ep->re_receive_count;
                __assign_str(addr, rpcrdma_addrstr(r_xprt));
                __assign_str(port, rpcrdma_portstr(r_xprt));
        ),
@@ -920,17 +891,17 @@ TRACE_EVENT(xprtrdma_frwr_alloc,
        TP_ARGS(mr, rc),
 
        TP_STRUCT__entry(
-               __field(const void *, mr)
+               __field(u32, mr_id)
                __field(int, rc)
        ),
 
        TP_fast_assign(
-               __entry->mr = mr;
-               __entry->rc     = rc;
+               __entry->mr_id = mr->frwr.fr_mr->res.id;
+               __entry->rc = rc;
        ),
 
-       TP_printk("mr=%p: rc=%d",
-               __entry->mr, __entry->rc
+       TP_printk("mr.id=%u: rc=%d",
+               __entry->mr_id, __entry->rc
        )
 );
 
@@ -943,7 +914,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
        TP_ARGS(mr, rc),
 
        TP_STRUCT__entry(
-               __field(const void *, mr)
+               __field(u32, mr_id)
+               __field(int, nents)
                __field(u32, handle)
                __field(u32, length)
                __field(u64, offset)
@@ -952,7 +924,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
        ),
 
        TP_fast_assign(
-               __entry->mr = mr;
+               __entry->mr_id  = mr->frwr.fr_mr->res.id;
+               __entry->nents  = mr->mr_nents;
                __entry->handle = mr->mr_handle;
                __entry->length = mr->mr_length;
                __entry->offset = mr->mr_offset;
@@ -960,8 +933,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
                __entry->rc     = rc;
        ),
 
-       TP_printk("mr=%p %u@0x%016llx:0x%08x (%s): rc=%d",
-               __entry->mr, __entry->length,
+       TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s): rc=%d",
+               __entry->mr_id, __entry->nents, __entry->length,
                (unsigned long long)__entry->offset, __entry->handle,
                xprtrdma_show_direction(__entry->dir),
                __entry->rc
@@ -977,21 +950,21 @@ TRACE_EVENT(xprtrdma_frwr_sgerr,
        TP_ARGS(mr, sg_nents),
 
        TP_STRUCT__entry(
-               __field(const void *, mr)
+               __field(u32, mr_id)
                __field(u64, addr)
                __field(u32, dir)
                __field(int, nents)
        ),
 
        TP_fast_assign(
-               __entry->mr = mr;
+               __entry->mr_id = mr->frwr.fr_mr->res.id;
                __entry->addr = mr->mr_sg->dma_address;
                __entry->dir = mr->mr_dir;
                __entry->nents = sg_nents;
        ),
 
-       TP_printk("mr=%p dma addr=0x%llx (%s) sg_nents=%d",
-               __entry->mr, __entry->addr,
+       TP_printk("mr.id=%u DMA addr=0x%llx (%s) sg_nents=%d",
+               __entry->mr_id, __entry->addr,
                xprtrdma_show_direction(__entry->dir),
                __entry->nents
        )
@@ -1006,7 +979,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
        TP_ARGS(mr, num_mapped),
 
        TP_STRUCT__entry(
-               __field(const void *, mr)
+               __field(u32, mr_id)
                __field(u64, addr)
                __field(u32, dir)
                __field(int, num_mapped)
@@ -1014,15 +987,15 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
        ),
 
        TP_fast_assign(
-               __entry->mr = mr;
+               __entry->mr_id = mr->frwr.fr_mr->res.id;
                __entry->addr = mr->mr_sg->dma_address;
                __entry->dir = mr->mr_dir;
                __entry->num_mapped = num_mapped;
                __entry->nents = mr->mr_nents;
        ),
 
-       TP_printk("mr=%p dma addr=0x%llx (%s) nents=%d of %d",
-               __entry->mr, __entry->addr,
+       TP_printk("mr.id=%u DMA addr=0x%llx (%s) nents=%d of %d",
+               __entry->mr_id, __entry->addr,
                xprtrdma_show_direction(__entry->dir),
                __entry->num_mapped, __entry->nents
        )
@@ -1031,7 +1004,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
 DEFINE_MR_EVENT(localinv);
 DEFINE_MR_EVENT(map);
 DEFINE_MR_EVENT(unmap);
-DEFINE_MR_EVENT(remoteinv);
+DEFINE_MR_EVENT(reminv);
 DEFINE_MR_EVENT(recycle);
 
 TRACE_EVENT(xprtrdma_dma_maperr,
index a5ab297..74bb594 100644 (file)
@@ -323,7 +323,7 @@ TRACE_EVENT(mm_vmscan_writepage,
        TP_fast_assign(
                __entry->pfn = page_to_pfn(page);
                __entry->reclaim_flags = trace_reclaim_flags(
-                                               page_is_file_cache(page));
+                                               page_is_file_lru(page));
        ),
 
        TP_printk("page=%p pfn=%lu flags=%s",
index 6923dc7..b6a835d 100644 (file)
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
 /*
  * Input event codes
  *
index 48f1a7c..e7e98bd 100644 (file)
@@ -19,7 +19,8 @@
  * means the userland is reading).
  */
 #define UFFD_API ((__u64)0xAA)
-#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |           \
+#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |    \
+                          UFFD_FEATURE_EVENT_FORK |            \
                           UFFD_FEATURE_EVENT_REMAP |           \
                           UFFD_FEATURE_EVENT_REMOVE |  \
                           UFFD_FEATURE_EVENT_UNMAP |           \
@@ -34,7 +35,8 @@
 #define UFFD_API_RANGE_IOCTLS                  \
        ((__u64)1 << _UFFDIO_WAKE |             \
         (__u64)1 << _UFFDIO_COPY |             \
-        (__u64)1 << _UFFDIO_ZEROPAGE)
+        (__u64)1 << _UFFDIO_ZEROPAGE |         \
+        (__u64)1 << _UFFDIO_WRITEPROTECT)
 #define UFFD_API_RANGE_IOCTLS_BASIC            \
        ((__u64)1 << _UFFDIO_WAKE |             \
         (__u64)1 << _UFFDIO_COPY)
@@ -52,6 +54,7 @@
 #define _UFFDIO_WAKE                   (0x02)
 #define _UFFDIO_COPY                   (0x03)
 #define _UFFDIO_ZEROPAGE               (0x04)
+#define _UFFDIO_WRITEPROTECT           (0x06)
 #define _UFFDIO_API                    (0x3F)
 
 /* userfaultfd ioctl ids */
@@ -68,6 +71,8 @@
                                      struct uffdio_copy)
 #define UFFDIO_ZEROPAGE                _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
                                      struct uffdio_zeropage)
+#define UFFDIO_WRITEPROTECT    _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
+                                     struct uffdio_writeprotect)
 
 /* read() structure */
 struct uffd_msg {
@@ -203,13 +208,14 @@ struct uffdio_copy {
        __u64 dst;
        __u64 src;
        __u64 len;
+#define UFFDIO_COPY_MODE_DONTWAKE              ((__u64)1<<0)
        /*
-        * There will be a wrprotection flag later that allows to map
-        * pages wrprotected on the fly. And such a flag will be
-        * available if the wrprotection ioctl are implemented for the
-        * range according to the uffdio_register.ioctls.
+        * UFFDIO_COPY_MODE_WP will map the page write protected on
+        * the fly.  UFFDIO_COPY_MODE_WP is available only if the
+        * write protected ioctl is implemented for the range
+        * according to the uffdio_register.ioctls.
         */
-#define UFFDIO_COPY_MODE_DONTWAKE              ((__u64)1<<0)
+#define UFFDIO_COPY_MODE_WP                    ((__u64)1<<1)
        __u64 mode;
 
        /*
@@ -231,4 +237,24 @@ struct uffdio_zeropage {
        __s64 zeropage;
 };
 
+struct uffdio_writeprotect {
+       struct uffdio_range range;
+/*
+ * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
+ * unset the flag to undo protection of a range which was previously
+ * write protected.
+ *
+ * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
+ * any wait thread after the operation succeeds.
+ *
+ * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
+ * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
+ * protection (WP=0) in response to a page fault wakes the faulting
+ * task unless DONTWAKE is set.
+ */
+#define UFFDIO_WRITEPROTECT_MODE_WP            ((__u64)1<<0)
+#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE      ((__u64)1<<1)
+       __u64 mode;
+};
+
 #endif /* _LINUX_USERFAULTFD_H */
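
From userspace the new ioctl is a single call on a userfaultfd whose range is already registered for write-protect mode; a hedged sketch (error handling and the registration step itself omitted):

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

/* Toggle write protection on 'len' bytes at 'addr'.  The region must
 * already be registered with UFFDIO_REGISTER_MODE_WP on this uffd.
 */
static int uffd_wp_range(int uffd, void *addr, unsigned long len, int enable)
{
	struct uffdio_writeprotect wp = {
		.range = {
			.start = (unsigned long)addr,
			.len   = len,
		},
		.mode = enable ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
}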
index 40d028e..9fe72e4 100644 (file)
 #define VHOST_VSOCK_SET_GUEST_CID      _IOW(VHOST_VIRTIO, 0x60, __u64)
 #define VHOST_VSOCK_SET_RUNNING                _IOW(VHOST_VIRTIO, 0x61, int)
 
+/* VHOST_VDPA specific defines */
+
+/* Get the device id. The device ids follow the same definition of
+ * the device id defined in virtio-spec.
+ */
+#define VHOST_VDPA_GET_DEVICE_ID       _IOR(VHOST_VIRTIO, 0x70, __u32)
+/* Get and set the status. The status bits follow the same definition
+ * of the device status defined in virtio-spec.
+ */
+#define VHOST_VDPA_GET_STATUS          _IOR(VHOST_VIRTIO, 0x71, __u8)
+#define VHOST_VDPA_SET_STATUS          _IOW(VHOST_VIRTIO, 0x72, __u8)
+/* Get and set the device config. The device config follows the same
+ * definition of the device config defined in virtio-spec.
+ */
+#define VHOST_VDPA_GET_CONFIG          _IOR(VHOST_VIRTIO, 0x73, \
+                                            struct vhost_vdpa_config)
+#define VHOST_VDPA_SET_CONFIG          _IOW(VHOST_VIRTIO, 0x74, \
+                                            struct vhost_vdpa_config)
+/* Enable/disable the ring. */
+#define VHOST_VDPA_SET_VRING_ENABLE    _IOW(VHOST_VIRTIO, 0x75, \
+                                            struct vhost_vring_state)
+/* Get the max ring size. */
+#define VHOST_VDPA_GET_VRING_NUM       _IOR(VHOST_VIRTIO, 0x76, __u16)
+
 #endif
index c907290..669457c 100644 (file)
@@ -119,6 +119,14 @@ struct vhost_scsi_target {
        unsigned short reserved;
 };
 
+/* VHOST_VDPA specific definitions */
+
+struct vhost_vdpa_config {
+       __u32 off;
+       __u32 len;
+       __u8 buf[0];
+};
+
 /* Feature bits */
 /* Log all write descriptors. Can be changed while device is active. */
 #define VHOST_F_LOG_ALL 26
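
For illustration, a userspace sketch that exercises two of the new VHOST_VDPA ioctls; the /dev/vhost-vdpa-0 node name is an assumption for the example.

#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	__u32 device_id;
	__u8 status;
	int fd = open("/dev/vhost-vdpa-0", O_RDWR);	/* assumed node name */

	if (fd < 0)
		return 1;
	if (ioctl(fd, VHOST_VDPA_GET_DEVICE_ID, &device_id) == 0 &&
	    ioctl(fd, VHOST_VDPA_GET_STATUS, &status) == 0)
		printf("virtio device id %u, status 0x%x\n",
		       device_id, status);
	close(fd);
	return 0;
}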
index a1966cd..1997439 100644 (file)
@@ -36,6 +36,7 @@
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM        2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT        3 /* VQ to report free pages */
 #define VIRTIO_BALLOON_F_PAGE_POISON   4 /* Guest is using page poisoning */
+#define VIRTIO_BALLOON_F_REPORTING     5 /* Page reporting virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
index 237e36a..48e3c29 100644 (file)
 #define VIRTIO_IOMMU_F_MMIO                    5
 
 struct virtio_iommu_range_64 {
-       __le64                                  start;
-       __le64                                  end;
+       __u64                                   start;
+       __u64                                   end;
 };
 
 struct virtio_iommu_range_32 {
-       __le32                                  start;
-       __le32                                  end;
+       __u32                                   start;
+       __u32                                   end;
 };
 
 struct virtio_iommu_config {
        /* Supported page sizes */
-       __le64                                  page_size_mask;
+       __u64                                   page_size_mask;
        /* Supported IOVA range */
        struct virtio_iommu_range_64            input_range;
        /* Max domain ID size */
        struct virtio_iommu_range_32            domain_range;
        /* Probe buffer size */
-       __le32                                  probe_size;
+       __u32                                   probe_size;
 };
 
 /* Request types */
index a3715a3..19d23e5 100644 (file)
@@ -57,6 +57,9 @@
                                         * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23  /* Set MAC address */
 
+#define VIRTIO_NET_F_HASH_REPORT  57   /* Supports hash report */
+#define VIRTIO_NET_F_RSS         60    /* Supports RSS RX steering */
+#define VIRTIO_NET_F_RSC_EXT     61    /* extended coalescing info */
 #define VIRTIO_NET_F_STANDBY     62    /* Act as standby for another device
                                         * with the same MAC.
                                         */
 #define VIRTIO_NET_S_LINK_UP   1       /* Link is up */
 #define VIRTIO_NET_S_ANNOUNCE  2       /* Announcement is needed */
 
+/* supported/enabled hash types */
+#define VIRTIO_NET_RSS_HASH_TYPE_IPv4          (1 << 0)
+#define VIRTIO_NET_RSS_HASH_TYPE_TCPv4         (1 << 1)
+#define VIRTIO_NET_RSS_HASH_TYPE_UDPv4         (1 << 2)
+#define VIRTIO_NET_RSS_HASH_TYPE_IPv6          (1 << 3)
+#define VIRTIO_NET_RSS_HASH_TYPE_TCPv6         (1 << 4)
+#define VIRTIO_NET_RSS_HASH_TYPE_UDPv6         (1 << 5)
+#define VIRTIO_NET_RSS_HASH_TYPE_IP_EX         (1 << 6)
+#define VIRTIO_NET_RSS_HASH_TYPE_TCP_EX        (1 << 7)
+#define VIRTIO_NET_RSS_HASH_TYPE_UDP_EX        (1 << 8)
+
 struct virtio_net_config {
        /* The config defining mac address (if VIRTIO_NET_F_MAC) */
        __u8 mac[ETH_ALEN];
@@ -92,6 +106,12 @@ struct virtio_net_config {
         * Any other value stands for unknown.
         */
        __u8 duplex;
+       /* maximum size of RSS key */
+       __u8 rss_max_key_size;
+       /* maximum number of indirection table entries */
+       __le16 rss_max_indirection_table_length;
+       /* bitmask of supported VIRTIO_NET_RSS_HASH_ types */
+       __le32 supported_hash_types;
 } __attribute__((packed));
 
 /*
@@ -104,6 +124,7 @@ struct virtio_net_config {
 struct virtio_net_hdr_v1 {
 #define VIRTIO_NET_HDR_F_NEEDS_CSUM    1       /* Use csum_start, csum_offset */
 #define VIRTIO_NET_HDR_F_DATA_VALID    2       /* Csum is valid */
+#define VIRTIO_NET_HDR_F_RSC_INFO      4       /* rsc info in csum_ fields */
        __u8 flags;
 #define VIRTIO_NET_HDR_GSO_NONE                0       /* Not a GSO frame */
 #define VIRTIO_NET_HDR_GSO_TCPV4       1       /* GSO frame, IPv4 TCP (TSO) */
@@ -113,11 +134,46 @@ struct virtio_net_hdr_v1 {
        __u8 gso_type;
        __virtio16 hdr_len;     /* Ethernet + IP + tcp/udp hdrs */
        __virtio16 gso_size;    /* Bytes to append to hdr_len per frame */
-       __virtio16 csum_start;  /* Position to start checksumming from */
-       __virtio16 csum_offset; /* Offset after that to place checksum */
+       union {
+               struct {
+                       __virtio16 csum_start;
+                       __virtio16 csum_offset;
+               };
+               /* Checksum calculation */
+               struct {
+                       /* Position to start checksumming from */
+                       __virtio16 start;
+                       /* Offset after that to place checksum */
+                       __virtio16 offset;
+               } csum;
+               /* Receive Segment Coalescing */
+               struct {
+                       /* Number of coalesced segments */
+                       __le16 segments;
+                       /* Number of duplicated acks */
+                       __le16 dup_acks;
+               } rsc;
+       };
        __virtio16 num_buffers; /* Number of merged rx buffers */
 };
 
+struct virtio_net_hdr_v1_hash {
+       struct virtio_net_hdr_v1 hdr;
+       __le32 hash_value;
+#define VIRTIO_NET_HASH_REPORT_NONE            0
+#define VIRTIO_NET_HASH_REPORT_IPv4            1
+#define VIRTIO_NET_HASH_REPORT_TCPv4           2
+#define VIRTIO_NET_HASH_REPORT_UDPv4           3
+#define VIRTIO_NET_HASH_REPORT_IPv6            4
+#define VIRTIO_NET_HASH_REPORT_TCPv6           5
+#define VIRTIO_NET_HASH_REPORT_UDPv6           6
+#define VIRTIO_NET_HASH_REPORT_IPv6_EX         7
+#define VIRTIO_NET_HASH_REPORT_TCPv6_EX        8
+#define VIRTIO_NET_HASH_REPORT_UDPv6_EX        9
+       __le16 hash_report;
+       __le16 padding;
+};
+
 #ifndef VIRTIO_NET_NO_LEGACY
 /* This header comes first in the scatter-gather list.
  * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must
@@ -228,7 +284,9 @@ struct virtio_net_ctrl_mac {
 
 /*
  * Control Receive Flow Steering
- *
+ */
+#define VIRTIO_NET_CTRL_MQ   4
+/*
  * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET
  * enables Receive Flow Steering, specifying the number of the transmit and
  * receive queues that will be used. After the command is consumed and acked by
@@ -241,12 +299,48 @@ struct virtio_net_ctrl_mq {
        __virtio16 virtqueue_pairs;
 };
 
-#define VIRTIO_NET_CTRL_MQ   4
  #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET        0
  #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN        1
  #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX        0x8000
 
 /*
+ * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as
+ * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET and additionally configures receive
+ * steering to use a hash calculated over the incoming packet to decide
+ * which receive virtqueue the packet is placed on. The command also
+ * provides the parameters for calculating the hash and selecting the
+ * receive virtqueue.
+ */
+struct virtio_net_rss_config {
+       __le32 hash_types;
+       __le16 indirection_table_mask;
+       __le16 unclassified_queue;
+       __le16 indirection_table[1/* + indirection_table_mask */];
+       __le16 max_tx_vq;
+       __u8 hash_key_length;
+       __u8 hash_key_data[/* hash_key_length */];
+};
+
+ #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG          1
+
+/*
+ * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device to
+ * include the calculated hash value and the hash report type in the
+ * virtio header of each packet. It also provides the parameters for
+ * the hash calculation. The command requires the VIRTIO_NET_F_HASH_REPORT
+ * feature to be negotiated, which extends the virtio header layout as
+ * defined in virtio_net_hdr_v1_hash.
+ */
+struct virtio_net_hash_config {
+       __le32 hash_types;
+       /* for compatibility with virtio_net_rss_config */
+       __le16 reserved[4];
+       __u8 hash_key_length;
+       __u8 hash_key_data[/* hash_key_length */];
+};
+
+ #define VIRTIO_NET_CTRL_MQ_HASH_CONFIG         2
+
+/*
  * Control network offloads
  *
  * Reconfigures the network offloads that Guest can handle.
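
As a sketch of how a driver might build the VIRTIO_NET_CTRL_MQ_HASH_CONFIG payload defined above; sending it over the control virtqueue is driver specific and not shown, and kzalloc()/cpu_to_le32() are the usual kernel helpers assumed here.

static struct virtio_net_hash_config *build_hash_config(const u8 *key,
							u8 key_len,
							size_t *out_len)
{
	struct virtio_net_hash_config *cfg;
	size_t len = sizeof(*cfg) + key_len;

	cfg = kzalloc(len, GFP_KERNEL);
	if (!cfg)
		return NULL;

	/* Hash TCP and UDP over IPv4, reported per packet in the header. */
	cfg->hash_types = cpu_to_le32(VIRTIO_NET_RSS_HASH_TYPE_IPv4 |
				      VIRTIO_NET_RSS_HASH_TYPE_TCPv4 |
				      VIRTIO_NET_RSS_HASH_TYPE_UDPv4);
	cfg->hash_key_length = key_len;
	memcpy(cfg->hash_key_data, key, key_len);

	*out_len = len;
	return cfg;
}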
index 1c12059..9e22ee8 100644 (file)
@@ -16,6 +16,10 @@ config GCC_VERSION
        default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC
        default 0
 
+config LD_VERSION
+       int
+       default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh)
+
 config CC_IS_CLANG
        def_bool $(success,$(CC) --version | head -n 1 | grep -q clang)
 
@@ -872,7 +876,7 @@ config BLK_CGROUP
        This option only enables generic Block IO controller infrastructure.
        One needs to also enable actual IO controlling logic/policy. For
        enabling proportional weight division of disk bandwidth in CFQ, set
-       CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
+       CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
        CONFIG_BLK_DEV_THROTTLING=y.
 
        See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
@@ -1538,7 +1542,6 @@ config AIO
 
 config IO_URING
        bool "Enable IO uring support" if EXPERT
-       select ANON_INODES
        select IO_WQ
        default y
        help
@@ -1556,6 +1559,11 @@ config ADVISE_SYSCALLS
          applications use these syscalls, you can disable this option to save
          space.
 
+config HAVE_ARCH_USERFAULTFD_WP
+       bool
+       help
+         Arch has userfaultfd write protection support
+
 config MEMBARRIER
        bool "Enable membarrier() system call" if EXPERT
        default y
index 49a05ba..dc8307b 100644 (file)
@@ -239,11 +239,10 @@ static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
                info->msg_tree_rightmost = rb_prev(node);
 
        rb_erase(node, &info->msg_tree);
-       if (info->node_cache) {
+       if (info->node_cache)
                kfree(leaf);
-       } else {
+       else
                info->node_cache = leaf;
-       }
 }
 
 static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
index ce1ca9f..0ba6add 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1332,7 +1332,7 @@ static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf,
        }
 }
 
-long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
+static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version)
 {
        struct ipc_namespace *ns;
        struct shmid64_ds sem64;
index fe61df5..97638eb 100644 (file)
@@ -885,6 +885,7 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops sysvipc_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = sysvipc_proc_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
index 7fa0c4a..8a44b93 100644 (file)
@@ -6,7 +6,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_KERNEL_XZ=y
 # CONFIG_KERNEL_LZO is not set
 # CONFIG_KERNEL_LZ4 is not set
-CONFIG_OPTIMIZE_INLINING=y
 # CONFIG_SLAB is not set
 # CONFIG_SLUB is not set
 CONFIG_SLOB=y
index 81e6d80..55e4441 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
+#include <linux/hugetlb.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
@@ -7973,7 +7974,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                flags |= MAP_EXECUTABLE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
-       if (vma->vm_flags & VM_HUGETLB)
+       if (is_vm_hugetlb_page(vma))
                flags |= MAP_HUGETLB;
 
        if (file) {
index 7681f87..b0ea5eb 100644 (file)
@@ -34,7 +34,8 @@ u32 __initdata __visible main_extable_sort_needed = 1;
 /* Sort the kernel's built-in exception table */
 void __init sort_main_extable(void)
 {
-       if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+       if (main_extable_sort_needed &&
+           &__stop___ex_table > &__start___ex_table) {
                pr_notice("Sorting __ex_table...\n");
                sort_extable(__start___ex_table, __stop___ex_table);
        }
index d2a967b..4385f3d 100644 (file)
@@ -361,6 +361,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
        if (new) {
                *new = *orig;
                INIT_LIST_HEAD(&new->anon_vma_chain);
+               new->vm_next = new->vm_prev = NULL;
        }
        return new;
 }
@@ -553,14 +554,15 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
-                       /* VM_WIPEONFORK gets a clean slate in the child. */
+                       /*
+                        * VM_WIPEONFORK gets a clean slate in the child.
+                        * Don't prepare anon_vma until fault since we don't
+                        * copy page for current vma.
+                        */
                        tmp->anon_vma = NULL;
-                       if (anon_vma_prepare(tmp))
-                               goto fail_nomem_anon_vma_fork;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
-               tmp->vm_next = tmp->vm_prev = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file_inode(file);
index e5eb5ea..5e891c3 100644 (file)
@@ -58,7 +58,7 @@ struct gcov_node {
        struct dentry *dentry;
        struct dentry **links;
        int num_loaded;
-       char name[0];
+       char name[];
 };
 
 static const char objtree[] = OBJTREE;
index 801ee4b..acb8355 100644 (file)
@@ -38,7 +38,7 @@ static struct gcov_info *gcov_info_head;
 struct gcov_fn_info {
        unsigned int ident;
        unsigned int checksum;
-       unsigned int n_ctrs[0];
+       unsigned int n_ctrs[];
 };
 
 /**
@@ -78,7 +78,7 @@ struct gcov_info {
        unsigned int                    n_functions;
        const struct gcov_fn_info       *functions;
        unsigned int                    ctr_mask;
-       struct gcov_ctr_info            counts[0];
+       struct gcov_ctr_info            counts[];
 };
 
 /**
@@ -352,7 +352,7 @@ struct gcov_iterator {
        unsigned int count;
 
        int num_types;
-       struct type_info type_info[0];
+       struct type_info type_info[];
 };
 
 static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
index ec37563..908fdf5 100644 (file)
@@ -68,7 +68,7 @@ struct gcov_fn_info {
        unsigned int ident;
        unsigned int lineno_checksum;
        unsigned int cfg_checksum;
-       struct gcov_ctr_info ctrs[0];
+       struct gcov_ctr_info ctrs[];
 };
 
 /**
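
Several hunks in this area replace the old GCC zero-length array idiom (a trailing member[0]) with a C99 flexible array member (member[]). Both occupy no storage and rely on the caller allocating extra trailing space, but the flexible form lets compilers and static checkers flag misuse. A minimal, generic C illustration of the idiom (not taken from the kernel tree):

	#include <stdlib.h>
	#include <string.h>

	struct msg {
		size_t len;
		char   data[];		/* flexible array member, must be last */
	};

	static struct msg *msg_alloc(const char *s)
	{
		size_t len = strlen(s) + 1;
		struct msg *m = malloc(sizeof(*m) + len);	/* header + payload */

		if (m) {
			m->len = len;
			memcpy(m->data, s, len);
		}
		return m;
	}
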
index a9b3f66..16c8c60 100644 (file)
@@ -175,7 +175,6 @@ unsigned long kallsyms_lookup_name(const char *name)
        }
        return module_kallsyms_lookup_name(name);
 }
-EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                      unsigned long),
@@ -194,7 +193,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
        }
        return module_kallsyms_on_each_symbol(fn, data);
 }
-EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
 
 static unsigned long get_symbol_pos(unsigned long addr,
                                    unsigned long *symbolsize,
index bc6addd..8b2b311 100644 (file)
@@ -35,7 +35,7 @@
  *                    (u64) THREAD_SIZE * 8UL);
  *
 * Needing less than 50 threads would mean we're dealing with systems
- * smaller than 3200 pages. This assuems you are capable of having ~13M memory,
+ * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
 * and this would only be an upper limit, after which the OOM killer
  * would take effect. Systems like these are very unlikely if modules are
  * enabled.
index 33569a0..646f1e2 100644 (file)
@@ -1515,7 +1515,7 @@ struct module_sect_attr {
 struct module_sect_attrs {
        struct attribute_group grp;
        unsigned int nsections;
-       struct module_sect_attr attrs[0];
+       struct module_sect_attr attrs[];
 };
 
 static ssize_t module_sect_show(struct module_attribute *mattr,
@@ -1608,7 +1608,7 @@ static void remove_sect_attrs(struct module *mod)
 struct module_notes_attrs {
        struct kobject *dir;
        unsigned int notes;
-       struct bin_attribute attrs[0];
+       struct bin_attribute attrs[];
 };
 
 static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
@@ -4355,6 +4355,7 @@ static int modules_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops modules_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = modules_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
index ef90eb1..7959449 100644 (file)
@@ -196,6 +196,50 @@ unlock:
        return res;
 }
 
+struct compat_resume_swap_area {
+       compat_loff_t offset;
+       u32 dev;
+} __packed;
+
+static int snapshot_set_swap_area(struct snapshot_data *data,
+               void __user *argp)
+{
+       sector_t offset;
+       dev_t swdev;
+
+       if (swsusp_swap_in_use())
+               return -EPERM;
+
+       if (in_compat_syscall()) {
+               struct compat_resume_swap_area swap_area;
+
+               if (copy_from_user(&swap_area, argp, sizeof(swap_area)))
+                       return -EFAULT;
+               swdev = new_decode_dev(swap_area.dev);
+               offset = swap_area.offset;
+       } else {
+               struct resume_swap_area swap_area;
+
+               if (copy_from_user(&swap_area, argp, sizeof(swap_area)))
+                       return -EFAULT;
+               swdev = new_decode_dev(swap_area.dev);
+               offset = swap_area.offset;
+       }
+
+       /*
+        * User space encodes device types as two-byte values,
+        * so we need to recode them
+        */
+       if (!swdev) {
+               data->swap = -1;
+               return -EINVAL;
+       }
+       data->swap = swap_type_of(swdev, offset, NULL);
+       if (data->swap < 0)
+               return -ENODEV;
+       return 0;
+}
+
 static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                                                        unsigned long arg)
 {
@@ -351,34 +395,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                break;
 
        case SNAPSHOT_SET_SWAP_AREA:
-               if (swsusp_swap_in_use()) {
-                       error = -EPERM;
-               } else {
-                       struct resume_swap_area swap_area;
-                       dev_t swdev;
-
-                       error = copy_from_user(&swap_area, (void __user *)arg,
-                                       sizeof(struct resume_swap_area));
-                       if (error) {
-                               error = -EFAULT;
-                               break;
-                       }
-
-                       /*
-                        * User space encodes device types as two-byte values,
-                        * so we need to recode them
-                        */
-                       swdev = new_decode_dev(swap_area.dev);
-                       if (swdev) {
-                               offset = swap_area.offset;
-                               data->swap = swap_type_of(swdev, offset, NULL);
-                               if (data->swap < 0)
-                                       error = -ENODEV;
-                       } else {
-                               data->swap = -1;
-                               error = -EINVAL;
-                       }
-               }
+               error = snapshot_set_swap_area(data, (void __user *)arg);
                break;
 
        default:
@@ -393,12 +410,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 }
 
 #ifdef CONFIG_COMPAT
-
-struct compat_resume_swap_area {
-       compat_loff_t offset;
-       u32 dev;
-} __packed;
-
 static long
 snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -409,33 +420,13 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case SNAPSHOT_AVAIL_SWAP_SIZE:
        case SNAPSHOT_ALLOC_SWAP_PAGE:
        case SNAPSHOT_CREATE_IMAGE:
+       case SNAPSHOT_SET_SWAP_AREA:
                return snapshot_ioctl(file, cmd,
                                      (unsigned long) compat_ptr(arg));
-
-       case SNAPSHOT_SET_SWAP_AREA: {
-               struct compat_resume_swap_area __user *u_swap_area =
-                       compat_ptr(arg);
-               struct resume_swap_area swap_area;
-               mm_segment_t old_fs;
-               int err;
-
-               err = get_user(swap_area.offset, &u_swap_area->offset);
-               err |= get_user(swap_area.dev, &u_swap_area->dev);
-               if (err)
-                       return -EFAULT;
-               old_fs = get_fs();
-               set_fs(KERNEL_DS);
-               err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
-                                    (unsigned long) &swap_area);
-               set_fs(old_fs);
-               return err;
-       }
-
        default:
                return snapshot_ioctl(file, cmd, arg);
        }
 }
-
 #endif /* CONFIG_COMPAT */
 
 static const struct file_operations snapshot_fops = {
index d7fb20a..1ea3ddd 100644 (file)
@@ -2799,7 +2799,7 @@ static void task_numa_work(struct callback_head *work)
                 * Skip inaccessible VMAs to avoid any confusion between
                 * PROT_NONE and NUMA hinting ptes
                 */
-               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+               if (!vma_is_accessible(vma))
                        continue;
 
                do {
index bc7e563..5d53f96 100644 (file)
@@ -615,6 +615,9 @@ config ARCH_HAS_PMEM_API
 config MEMREGION
        bool
 
+config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
+       bool
+
 # use memcpy to implement user copies for nommu architectures
 config UACCESS_MEMCPY
        bool
index d1398ce..50c1f5f 100644 (file)
@@ -305,18 +305,6 @@ config HEADERS_INSTALL
          user-space program samples. It is also needed by some features such
          as uapi header sanity checks.
 
-config OPTIMIZE_INLINING
-       def_bool y
-       help
-         This option determines if the kernel forces gcc to inline the functions
-         developers have marked 'inline'. Doing so takes away freedom from gcc to
-         do what it thinks is best, which is desirable for the gcc 3.x series of
-         compilers. The gcc 4.x series have a rewritten inlining algorithm and
-         enabling this option will generate a smaller kernel there. Hopefully
-         this algorithm is so good that allowing gcc 4.x and above to make the
-         decision will become the default in the future. Until then this option
-         is there to test gcc for this.
-
 config DEBUG_SECTION_MISMATCH
        bool "Enable full Section mismatch analysis"
        help
@@ -988,6 +976,18 @@ config WQ_WATCHDOG
          state.  This can be configured through kernel parameter
          "workqueue.watchdog_thresh" and its sysfs counterpart.
 
+config TEST_LOCKUP
+       tristate "Test module to generate lockups"
+       help
+         This builds the "test_lockup" module that helps to make sure
+         that watchdogs and lockup detectors are working properly.
+
+         Depending on its module parameters, it can emulate a soft or hard
+         lockup, a "hung task", or hold an arbitrary lock for a long time.
+         It can also generate a series of lockups with cooling-down periods.
+
+         If unsure, say N.
+
 endmenu # "Debug lockups and hangs"
 
 menu "Scheduler Debugging"
@@ -1655,7 +1655,7 @@ config FAILSLAB
          Provide fault-injection capability for kmalloc.
 
 config FAIL_PAGE_ALLOC
-       bool "Fault-injection capabilitiy for alloc_pages()"
+       bool "Fault-injection capability for alloc_pages()"
        depends on FAULT_INJECTION
        help
          Provide fault-injection capability for alloc_pages().
index 0e04fcb..48469c9 100644 (file)
@@ -2,18 +2,50 @@
 config ARCH_HAS_UBSAN_SANITIZE_ALL
        bool
 
-config UBSAN
+menuconfig UBSAN
        bool "Undefined behaviour sanity checker"
        help
-         This option enables undefined behaviour sanity checker
+         This option enables the Undefined Behaviour sanity checker.
          Compile-time instrumentation is used to detect various undefined
-         behaviours in runtime. Various types of checks may be enabled
-         via boot parameter ubsan_handle
-         (see: Documentation/dev-tools/ubsan.rst).
+         behaviours at runtime. For more details, see:
+         Documentation/dev-tools/ubsan.rst
+
+if UBSAN
+
+config UBSAN_TRAP
+       bool "On Sanitizer warnings, abort the running kernel code"
+       depends on $(cc-option, -fsanitize-undefined-trap-on-error)
+       help
+         Building kernels with Sanitizer features enabled tends to grow
+         the kernel size by around 5%, due to adding all the debugging
+         text on failure paths. To avoid this, Sanitizer instrumentation
+         can just issue a trap. This reduces the kernel size overhead but
+         turns all warnings (including potentially harmless conditions)
+         into full exceptions that abort the running kernel code
+         (regardless of context, locks held, etc), which may destabilize
+         the system. For some system builders this is an acceptable
+         trade-off.
+
+config UBSAN_BOUNDS
+       bool "Perform array index bounds checking"
+       default UBSAN
+       help
+         This option enables detection of directly indexed out of bounds
+         array accesses, where the array size is known at compile time.
+         Note that this does not protect against array overflows via bad calls
+         to the {str,mem}*cpy() family of functions (that is addressed
+         by CONFIG_FORTIFY_SOURCE).
+
+config UBSAN_MISC
+       bool "Enable all other Undefined Behavior sanity checks"
+       default UBSAN
+       help
+         This option enables all sanity checks that don't have their
+         own Kconfig options. Disable this if you only want to have
+         individually selected checks.
 
 config UBSAN_SANITIZE_ALL
        bool "Enable instrumentation for the entire kernel"
-       depends on UBSAN
        depends on ARCH_HAS_UBSAN_SANITIZE_ALL
 
        # We build with -Wno-maybe-uninitialized, but we still want to
@@ -30,7 +62,6 @@ config UBSAN_SANITIZE_ALL
 
 config UBSAN_NO_ALIGNMENT
        bool "Disable checking of pointers alignment"
-       depends on UBSAN
        default y if HAVE_EFFICIENT_UNALIGNED_ACCESS
        help
          This option disables the check of unaligned memory accesses.
@@ -43,7 +74,9 @@ config UBSAN_ALIGNMENT
 
 config TEST_UBSAN
        tristate "Module for testing for undefined behavior detection"
-       depends on m && UBSAN
+       depends on m
        help
          This is a test module for UBSAN.
          It triggers various kinds of undefined behavior and detects them.
+
+endif  # if UBSAN
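
As the UBSAN_BOUNDS help text says, the option only covers directly indexed arrays whose size is known at compile time; overflows through memcpy()-style helpers are instead the domain of CONFIG_FORTIFY_SOURCE. A hypothetical example of the class of bug it reports (the ubsan.c change later in this series names the report "array-index-out-of-bounds"):

	static int table[8];

	void set_entry(unsigned int idx, int val)
	{
		/*
		 * With CONFIG_UBSAN_BOUNDS, passing idx >= 8 produces an
		 * "array-index-out-of-bounds" report at runtime instead of
		 * silently corrupting whatever follows 'table' in memory.
		 */
		table[idx] = val;
	}
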
index 09a8acb..685aee6 100644 (file)
@@ -87,9 +87,11 @@ obj-$(CONFIG_TEST_KMOD) += test_kmod.o
 obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
 obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
 obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
+CFLAGS_test_stackinit.o += $(call cc-disable-warning, switch-unreachable)
 obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o
 obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
 obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o
+obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
@@ -221,6 +223,10 @@ obj-$(CONFIG_MEMREGION) += memregion.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 obj-$(CONFIG_IRQ_POLL) += irq_poll.o
 
+# stackdepot.c should not be instrumented or call instrumented functions.
+# Prevent the compiler from calling builtins like memcmp() or bcmp() from this
+# file.
+CFLAGS_stackdepot.o += -fno-builtin
 obj-$(CONFIG_STACKDEPOT) += stackdepot.o
 KASAN_SANITIZE_stackdepot.o := n
 KCOV_INSTRUMENT_stackdepot.o := n
@@ -280,7 +286,9 @@ quiet_cmd_build_OID_registry = GEN     $@
 clean-files    += oid_registry_data.c
 
 obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
+ifneq ($(CONFIG_UBSAN_TRAP),y)
 obj-$(CONFIG_UBSAN) += ubsan.o
+endif
 
 UBSAN_SANITIZE_ubsan.o := n
 KASAN_SANITIZE_ubsan.o := n
index 5db6d3a..052d3fb 100644 (file)
--- a/lib/bch.c
+++ b/lib/bch.c
  */
 struct gf_poly {
        unsigned int deg;    /* polynomial degree */
-       unsigned int c[0];   /* polynomial terms */
+       unsigned int c[];   /* polynomial terms */
 };
 
 /* given its degree, compute a polynomial size in bytes */
index aae17d9..8f199f4 100644 (file)
@@ -1031,7 +1031,7 @@ static int __init dynamic_debug_init(void)
        int n = 0, entries = 0, modct = 0;
        int verbose_bytes = 0;
 
-       if (__start___verbose == __stop___verbose) {
+       if (&__start___verbose == &__stop___verbose) {
                pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n");
                return 1;
        }
index abc86c6..8545872 100644 (file)
@@ -503,7 +503,7 @@ struct rb_node *rb_next(const struct rb_node *node)
        if (node->rb_right) {
                node = node->rb_right;
                while (node->rb_left)
-                       node=node->rb_left;
+                       node = node->rb_left;
                return (struct rb_node *)node;
        }
 
@@ -535,7 +535,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
        if (node->rb_left) {
                node = node->rb_left;
                while (node->rb_right)
-                       node=node->rb_right;
+                       node = node->rb_right;
                return (struct rb_node *)node;
        }
 
index 5813072..5d63a88 100644 (file)
@@ -832,7 +832,7 @@ EXPORT_SYMBOL(sg_miter_stop);
  * @buflen:             The number of bytes to copy
  * @skip:               Number of bytes to skip before copying
  * @to_buffer:          transfer direction (true == from an sg list to a
- *                      buffer, false == from a buffer to an sg list
+ *                      buffer, false == from a buffer to an sg list)
  *
  * Returns the number of copied bytes.
  *
index 81c69c0..2caffc6 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include <linux/gfp.h>
+#include <linux/interrupt.h>
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -202,9 +203,20 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
                               unsigned long **entries)
 {
        union handle_parts parts = { .handle = handle };
-       void *slab = stack_slabs[parts.slabindex];
+       void *slab;
        size_t offset = parts.offset << STACK_ALLOC_ALIGN;
-       struct stack_record *stack = slab + offset;
+       struct stack_record *stack;
+
+       *entries = NULL;
+       if (parts.slabindex > depot_index) {
+               WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n",
+                       parts.slabindex, depot_index, handle);
+               return 0;
+       }
+       slab = stack_slabs[parts.slabindex];
+       if (!slab)
+               return 0;
+       stack = slab + offset;
 
        *entries = stack->entries;
        return stack->size;
@@ -305,3 +317,26 @@ fast_exit:
        return retval;
 }
 EXPORT_SYMBOL_GPL(stack_depot_save);
+
+static inline int in_irqentry_text(unsigned long ptr)
+{
+       return (ptr >= (unsigned long)&__irqentry_text_start &&
+               ptr < (unsigned long)&__irqentry_text_end) ||
+               (ptr >= (unsigned long)&__softirqentry_text_start &&
+                ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+unsigned int filter_irq_stacks(unsigned long *entries,
+                                            unsigned int nr_entries)
+{
+       unsigned int i;
+
+       for (i = 0; i < nr_entries; i++) {
+               if (in_irqentry_text(entries[i])) {
+                       /* Include the irqentry function into the stack. */
+                       return i + 1;
+               }
+       }
+       return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
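
filter_irq_stacks() trims a captured trace so that it ends at the hard/soft-IRQ entry function, keeping frames of the interrupted task out of the depot and improving deduplication. A sketch of how a caller might combine it with the existing stack trace and depot APIs (assumed usage, not code from this patch):

	#include <linux/kernel.h>
	#include <linux/stacktrace.h>
	#include <linux/stackdepot.h>

	static depot_stack_handle_t save_current_stack(gfp_t flags)
	{
		unsigned long entries[64];
		unsigned int nr;

		nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
		nr = filter_irq_stacks(entries, nr);	/* stop at the irqentry frame */
		return stack_depot_save(entries, nr, flags);
	}
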
index 61ed71c..6b13150 100644 (file)
@@ -278,6 +278,8 @@ static void __init test_replace(void)
        unsigned int nlongs = DIV_ROUND_UP(nbits, BITS_PER_LONG);
        DECLARE_BITMAP(bmap, 1024);
 
+       BUILD_BUG_ON(EXP2_IN_BITS < nbits * 2);
+
        bitmap_zero(bmap, 1024);
        bitmap_replace(bmap, &exp2[0 * nlongs], &exp2[1 * nlongs], exp2_to_exp3_mask, nbits);
        expect_eq_bitmap(bmap, exp3_0_1, nbits);
index 9cf7762..e651c37 100644 (file)
@@ -204,7 +204,7 @@ static void test_kmod_put_module(struct kmod_test_device_info *info)
        case TEST_KMOD_DRIVER:
                break;
        case TEST_KMOD_FS_TYPE:
-               if (info && info->fs_sync && info->fs_sync->owner)
+               if (info->fs_sync && info->fs_sync->owner)
                        module_put(info->fs_sync->owner);
                break;
        default:
diff --git a/lib/test_lockup.c b/lib/test_lockup.c
new file mode 100644 (file)
index 0000000..ea09ca3
--- /dev/null
@@ -0,0 +1,599 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test module to generate lockups
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/file.h>
+
+static unsigned int time_secs;
+module_param(time_secs, uint, 0600);
+MODULE_PARM_DESC(time_secs, "lockup time in seconds, default 0");
+
+static unsigned int time_nsecs;
+module_param(time_nsecs, uint, 0600);
+MODULE_PARM_DESC(time_nsecs, "nanoseconds part of lockup time, default 0");
+
+static unsigned int cooldown_secs;
+module_param(cooldown_secs, uint, 0600);
+MODULE_PARM_DESC(cooldown_secs, "cooldown time between iterations in seconds, default 0");
+
+static unsigned int cooldown_nsecs;
+module_param(cooldown_nsecs, uint, 0600);
+MODULE_PARM_DESC(cooldown_nsecs, "nanoseconds part of cooldown, default 0");
+
+static unsigned int iterations = 1;
+module_param(iterations, uint, 0600);
+MODULE_PARM_DESC(iterations, "lockup iterations, default 1");
+
+static bool all_cpus;
+module_param(all_cpus, bool, 0400);
+MODULE_PARM_DESC(all_cpus, "trigger lockup at all cpus at once");
+
+static int wait_state;
+static char *state = "R";
+module_param(state, charp, 0400);
+MODULE_PARM_DESC(state, "wait in 'R' running (default), 'D' uninterruptible, 'K' killable, 'S' interruptible state");
+
+static bool use_hrtimer;
+module_param(use_hrtimer, bool, 0400);
+MODULE_PARM_DESC(use_hrtimer, "use high-resolution timer for sleeping");
+
+static bool iowait;
+module_param(iowait, bool, 0400);
+MODULE_PARM_DESC(iowait, "account sleep time as iowait");
+
+static bool lock_read;
+module_param(lock_read, bool, 0400);
+MODULE_PARM_DESC(lock_read, "lock read-write locks for read");
+
+static bool lock_single;
+module_param(lock_single, bool, 0400);
+MODULE_PARM_DESC(lock_single, "acquire locks only at one cpu");
+
+static bool reacquire_locks;
+module_param(reacquire_locks, bool, 0400);
+MODULE_PARM_DESC(reacquire_locks, "release and reacquire locks/irq/preempt between iterations");
+
+static bool touch_softlockup;
+module_param(touch_softlockup, bool, 0600);
+MODULE_PARM_DESC(touch_softlockup, "touch soft-lockup watchdog between iterations");
+
+static bool touch_hardlockup;
+module_param(touch_hardlockup, bool, 0600);
+MODULE_PARM_DESC(touch_hardlockup, "touch hard-lockup watchdog between iterations");
+
+static bool call_cond_resched;
+module_param(call_cond_resched, bool, 0600);
+MODULE_PARM_DESC(call_cond_resched, "call cond_resched() between iterations");
+
+static bool measure_lock_wait;
+module_param(measure_lock_wait, bool, 0400);
+MODULE_PARM_DESC(measure_lock_wait, "measure lock wait time");
+
+static unsigned long lock_wait_threshold = ULONG_MAX;
+module_param(lock_wait_threshold, ulong, 0400);
+MODULE_PARM_DESC(lock_wait_threshold, "print lock wait time longer than this in nanoseconds, default off");
+
+static bool test_disable_irq;
+module_param_named(disable_irq, test_disable_irq, bool, 0400);
+MODULE_PARM_DESC(disable_irq, "disable interrupts: generate hard-lockups");
+
+static bool disable_softirq;
+module_param(disable_softirq, bool, 0400);
+MODULE_PARM_DESC(disable_softirq, "disable bottom-half irq handlers");
+
+static bool disable_preempt;
+module_param(disable_preempt, bool, 0400);
+MODULE_PARM_DESC(disable_preempt, "disable preemption: generate soft-lockups");
+
+static bool lock_rcu;
+module_param(lock_rcu, bool, 0400);
+MODULE_PARM_DESC(lock_rcu, "grab rcu_read_lock: generate rcu stalls");
+
+static bool lock_mmap_sem;
+module_param(lock_mmap_sem, bool, 0400);
+MODULE_PARM_DESC(lock_mmap_sem, "lock mm->mmap_sem: block procfs interfaces");
+
+static unsigned long lock_rwsem_ptr;
+module_param_unsafe(lock_rwsem_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_rwsem_ptr, "lock rw_semaphore at address");
+
+static unsigned long lock_mutex_ptr;
+module_param_unsafe(lock_mutex_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_mutex_ptr, "lock mutex at address");
+
+static unsigned long lock_spinlock_ptr;
+module_param_unsafe(lock_spinlock_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_spinlock_ptr, "lock spinlock at address");
+
+static unsigned long lock_rwlock_ptr;
+module_param_unsafe(lock_rwlock_ptr, ulong, 0400);
+MODULE_PARM_DESC(lock_rwlock_ptr, "lock rwlock at address");
+
+static unsigned int alloc_pages_nr;
+module_param_unsafe(alloc_pages_nr, uint, 0600);
+MODULE_PARM_DESC(alloc_pages_nr, "allocate and free pages under locks");
+
+static unsigned int alloc_pages_order;
+module_param(alloc_pages_order, uint, 0400);
+MODULE_PARM_DESC(alloc_pages_order, "page order to allocate");
+
+static gfp_t alloc_pages_gfp = GFP_KERNEL;
+module_param_unsafe(alloc_pages_gfp, uint, 0400);
+MODULE_PARM_DESC(alloc_pages_gfp, "allocate pages with this gfp_mask, default GFP_KERNEL");
+
+static bool alloc_pages_atomic;
+module_param(alloc_pages_atomic, bool, 0400);
+MODULE_PARM_DESC(alloc_pages_atomic, "allocate pages with GFP_ATOMIC");
+
+static bool reallocate_pages;
+module_param(reallocate_pages, bool, 0400);
+MODULE_PARM_DESC(reallocate_pages, "free and allocate pages between iterations");
+
+struct file *test_file;
+struct inode *test_inode;
+static char test_file_path[256];
+module_param_string(file_path, test_file_path, sizeof(test_file_path), 0400);
+MODULE_PARM_DESC(file_path, "file path to test");
+
+static bool test_lock_inode;
+module_param_named(lock_inode, test_lock_inode, bool, 0400);
+MODULE_PARM_DESC(lock_inode, "lock file -> inode -> i_rwsem");
+
+static bool test_lock_mapping;
+module_param_named(lock_mapping, test_lock_mapping, bool, 0400);
+MODULE_PARM_DESC(lock_mapping, "lock file -> mapping -> i_mmap_rwsem");
+
+static bool test_lock_sb_umount;
+module_param_named(lock_sb_umount, test_lock_sb_umount, bool, 0400);
+MODULE_PARM_DESC(lock_sb_umount, "lock file -> sb -> s_umount");
+
+static atomic_t alloc_pages_failed = ATOMIC_INIT(0);
+
+static atomic64_t max_lock_wait = ATOMIC64_INIT(0);
+
+static struct task_struct *main_task;
+static int master_cpu;
+
+static void test_lock(bool master, bool verbose)
+{
+       u64 uninitialized_var(wait_start);
+
+       if (measure_lock_wait)
+               wait_start = local_clock();
+
+       if (lock_mutex_ptr && master) {
+               if (verbose)
+                       pr_notice("lock mutex %ps\n", (void *)lock_mutex_ptr);
+               mutex_lock((struct mutex *)lock_mutex_ptr);
+       }
+
+       if (lock_rwsem_ptr && master) {
+               if (verbose)
+                       pr_notice("lock rw_semaphore %ps\n",
+                                 (void *)lock_rwsem_ptr);
+               if (lock_read)
+                       down_read((struct rw_semaphore *)lock_rwsem_ptr);
+               else
+                       down_write((struct rw_semaphore *)lock_rwsem_ptr);
+       }
+
+       if (lock_mmap_sem && master) {
+               if (verbose)
+                       pr_notice("lock mmap_sem pid=%d\n", main_task->pid);
+               if (lock_read)
+                       down_read(&main_task->mm->mmap_sem);
+               else
+                       down_write(&main_task->mm->mmap_sem);
+       }
+
+       if (test_disable_irq)
+               local_irq_disable();
+
+       if (disable_softirq)
+               local_bh_disable();
+
+       if (disable_preempt)
+               preempt_disable();
+
+       if (lock_rcu)
+               rcu_read_lock();
+
+       if (lock_spinlock_ptr && master) {
+               if (verbose)
+                       pr_notice("lock spinlock %ps\n",
+                                 (void *)lock_spinlock_ptr);
+               spin_lock((spinlock_t *)lock_spinlock_ptr);
+       }
+
+       if (lock_rwlock_ptr && master) {
+               if (verbose)
+                       pr_notice("lock rwlock %ps\n",
+                                 (void *)lock_rwlock_ptr);
+               if (lock_read)
+                       read_lock((rwlock_t *)lock_rwlock_ptr);
+               else
+                       write_lock((rwlock_t *)lock_rwlock_ptr);
+       }
+
+       if (measure_lock_wait) {
+               s64 cur_wait = local_clock() - wait_start;
+               s64 max_wait = atomic64_read(&max_lock_wait);
+
+               do {
+                       if (cur_wait < max_wait)
+                               break;
+                       max_wait = atomic64_cmpxchg(&max_lock_wait,
+                                                   max_wait, cur_wait);
+               } while (max_wait != cur_wait);
+
+               if (cur_wait > lock_wait_threshold)
+                       pr_notice_ratelimited("lock wait %lld ns\n", cur_wait);
+       }
+}
+
+static void test_unlock(bool master, bool verbose)
+{
+       if (lock_rwlock_ptr && master) {
+               if (lock_read)
+                       read_unlock((rwlock_t *)lock_rwlock_ptr);
+               else
+                       write_unlock((rwlock_t *)lock_rwlock_ptr);
+               if (verbose)
+                       pr_notice("unlock rwlock %ps\n",
+                                 (void *)lock_rwlock_ptr);
+       }
+
+       if (lock_spinlock_ptr && master) {
+               spin_unlock((spinlock_t *)lock_spinlock_ptr);
+               if (verbose)
+                       pr_notice("unlock spinlock %ps\n",
+                                 (void *)lock_spinlock_ptr);
+       }
+
+       if (lock_rcu)
+               rcu_read_unlock();
+
+       if (disable_preempt)
+               preempt_enable();
+
+       if (disable_softirq)
+               local_bh_enable();
+
+       if (test_disable_irq)
+               local_irq_enable();
+
+       if (lock_mmap_sem && master) {
+               if (lock_read)
+                       up_read(&main_task->mm->mmap_sem);
+               else
+                       up_write(&main_task->mm->mmap_sem);
+               if (verbose)
+                       pr_notice("unlock mmap_sem pid=%d\n", main_task->pid);
+       }
+
+       if (lock_rwsem_ptr && master) {
+               if (lock_read)
+                       up_read((struct rw_semaphore *)lock_rwsem_ptr);
+               else
+                       up_write((struct rw_semaphore *)lock_rwsem_ptr);
+               if (verbose)
+                       pr_notice("unlock rw_semaphore %ps\n",
+                                 (void *)lock_rwsem_ptr);
+       }
+
+       if (lock_mutex_ptr && master) {
+               mutex_unlock((struct mutex *)lock_mutex_ptr);
+               if (verbose)
+                       pr_notice("unlock mutex %ps\n",
+                                 (void *)lock_mutex_ptr);
+       }
+}
+
+static void test_alloc_pages(struct list_head *pages)
+{
+       struct page *page;
+       unsigned int i;
+
+       for (i = 0; i < alloc_pages_nr; i++) {
+               page = alloc_pages(alloc_pages_gfp, alloc_pages_order);
+               if (!page) {
+                       atomic_inc(&alloc_pages_failed);
+                       break;
+               }
+               list_add(&page->lru, pages);
+       }
+}
+
+static void test_free_pages(struct list_head *pages)
+{
+       struct page *page, *next;
+
+       list_for_each_entry_safe(page, next, pages, lru)
+               __free_pages(page, alloc_pages_order);
+       INIT_LIST_HEAD(pages);
+}
+
+static void test_wait(unsigned int secs, unsigned int nsecs)
+{
+       if (wait_state == TASK_RUNNING) {
+               if (secs)
+                       mdelay(secs * MSEC_PER_SEC);
+               if (nsecs)
+                       ndelay(nsecs);
+               return;
+       }
+
+       __set_current_state(wait_state);
+       if (use_hrtimer) {
+               ktime_t time;
+
+               time = ns_to_ktime((u64)secs * NSEC_PER_SEC + nsecs);
+               schedule_hrtimeout(&time, HRTIMER_MODE_REL);
+       } else {
+               schedule_timeout(secs * HZ + nsecs_to_jiffies(nsecs));
+       }
+}
+
+static void test_lockup(bool master)
+{
+       u64 lockup_start = local_clock();
+       unsigned int iter = 0;
+       LIST_HEAD(pages);
+
+       pr_notice("Start on CPU%d\n", raw_smp_processor_id());
+
+       test_lock(master, true);
+
+       test_alloc_pages(&pages);
+
+       while (iter++ < iterations && !signal_pending(main_task)) {
+
+               if (iowait)
+                       current->in_iowait = 1;
+
+               test_wait(time_secs, time_nsecs);
+
+               if (iowait)
+                       current->in_iowait = 0;
+
+               if (reallocate_pages)
+                       test_free_pages(&pages);
+
+               if (reacquire_locks)
+                       test_unlock(master, false);
+
+               if (touch_softlockup)
+                       touch_softlockup_watchdog();
+
+               if (touch_hardlockup)
+                       touch_nmi_watchdog();
+
+               if (call_cond_resched)
+                       cond_resched();
+
+               test_wait(cooldown_secs, cooldown_nsecs);
+
+               if (reacquire_locks)
+                       test_lock(master, false);
+
+               if (reallocate_pages)
+                       test_alloc_pages(&pages);
+       }
+
+       pr_notice("Finish on CPU%d in %lld ns\n", raw_smp_processor_id(),
+                 local_clock() - lockup_start);
+
+       test_free_pages(&pages);
+
+       test_unlock(master, true);
+}
+
+DEFINE_PER_CPU(struct work_struct, test_works);
+
+static void test_work_fn(struct work_struct *work)
+{
+       test_lockup(!lock_single ||
+                   work == per_cpu_ptr(&test_works, master_cpu));
+}
+
+static bool test_kernel_ptr(unsigned long addr, int size)
+{
+       void *ptr = (void *)addr;
+       char buf;
+
+       if (!addr)
+               return false;
+
+       /* should be at least readable kernel address */
+       if (access_ok(ptr, 1) ||
+           access_ok(ptr + size - 1, 1) ||
+           probe_kernel_address(ptr, buf) ||
+           probe_kernel_address(ptr + size - 1, buf)) {
+               pr_err("invalid kernel ptr: %#lx\n", addr);
+               return true;
+       }
+
+       return false;
+}
+
+static bool __maybe_unused test_magic(unsigned long addr, int offset,
+                                     unsigned int expected)
+{
+       void *ptr = (void *)addr + offset;
+       unsigned int magic = 0;
+
+       if (!addr)
+               return false;
+
+       if (probe_kernel_address(ptr, magic) || magic != expected) {
+               pr_err("invalid magic at %#lx + %#x = %#x, expected %#x\n",
+                      addr, offset, magic, expected);
+               return true;
+       }
+
+       return false;
+}
+
+static int __init test_lockup_init(void)
+{
+       u64 test_start = local_clock();
+
+       main_task = current;
+
+       switch (state[0]) {
+       case 'S':
+               wait_state = TASK_INTERRUPTIBLE;
+               break;
+       case 'D':
+               wait_state = TASK_UNINTERRUPTIBLE;
+               break;
+       case 'K':
+               wait_state = TASK_KILLABLE;
+               break;
+       case 'R':
+               wait_state = TASK_RUNNING;
+               break;
+       default:
+               pr_err("unknown state=%s\n", state);
+               return -EINVAL;
+       }
+
+       if (alloc_pages_atomic)
+               alloc_pages_gfp = GFP_ATOMIC;
+
+       if (test_kernel_ptr(lock_spinlock_ptr, sizeof(spinlock_t)) ||
+           test_kernel_ptr(lock_rwlock_ptr, sizeof(rwlock_t)) ||
+           test_kernel_ptr(lock_mutex_ptr, sizeof(struct mutex)) ||
+           test_kernel_ptr(lock_rwsem_ptr, sizeof(struct rw_semaphore)))
+               return -EINVAL;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+       if (test_magic(lock_spinlock_ptr,
+                      offsetof(spinlock_t, rlock.magic),
+                      SPINLOCK_MAGIC) ||
+           test_magic(lock_rwlock_ptr,
+                      offsetof(rwlock_t, magic),
+                      RWLOCK_MAGIC) ||
+           test_magic(lock_mutex_ptr,
+                      offsetof(struct mutex, wait_lock.rlock.magic),
+                      SPINLOCK_MAGIC) ||
+           test_magic(lock_rwsem_ptr,
+                      offsetof(struct rw_semaphore, wait_lock.magic),
+                      SPINLOCK_MAGIC))
+               return -EINVAL;
+#endif
+
+       if ((wait_state != TASK_RUNNING ||
+            (call_cond_resched && !reacquire_locks) ||
+            (alloc_pages_nr && gfpflags_allow_blocking(alloc_pages_gfp))) &&
+           (test_disable_irq || disable_softirq || disable_preempt ||
+            lock_rcu || lock_spinlock_ptr || lock_rwlock_ptr)) {
+               pr_err("refuse to sleep in atomic context\n");
+               return -EINVAL;
+       }
+
+       if (lock_mmap_sem && !main_task->mm) {
+               pr_err("no mm to lock mmap_sem\n");
+               return -EINVAL;
+       }
+
+       if (test_file_path[0]) {
+               test_file = filp_open(test_file_path, O_RDONLY, 0);
+               if (IS_ERR(test_file)) {
+                       pr_err("cannot find file_path\n");
+                       return -EINVAL;
+               }
+               test_inode = file_inode(test_file);
+       } else if (test_lock_inode ||
+                  test_lock_mapping ||
+                  test_lock_sb_umount) {
+               pr_err("no file to lock\n");
+               return -EINVAL;
+       }
+
+       if (test_lock_inode && test_inode)
+               lock_rwsem_ptr = (unsigned long)&test_inode->i_rwsem;
+
+       if (test_lock_mapping && test_file && test_file->f_mapping)
+               lock_rwsem_ptr = (unsigned long)&test_file->f_mapping->i_mmap_rwsem;
+
+       if (test_lock_sb_umount && test_inode)
+               lock_rwsem_ptr = (unsigned long)&test_inode->i_sb->s_umount;
+
+       pr_notice("START pid=%d time=%u +%u ns cooldown=%u +%u ns iterations=%u state=%s %s%s%s%s%s%s%s%s%s%s%s\n",
+                 main_task->pid, time_secs, time_nsecs,
+                 cooldown_secs, cooldown_nsecs, iterations, state,
+                 all_cpus ? "all_cpus " : "",
+                 iowait ? "iowait " : "",
+                 test_disable_irq ? "disable_irq " : "",
+                 disable_softirq ? "disable_softirq " : "",
+                 disable_preempt ? "disable_preempt " : "",
+                 lock_rcu ? "lock_rcu " : "",
+                 lock_read ? "lock_read " : "",
+                 touch_softlockup ? "touch_softlockup " : "",
+                 touch_hardlockup ? "touch_hardlockup " : "",
+                 call_cond_resched ? "call_cond_resched " : "",
+                 reacquire_locks ? "reacquire_locks " : "");
+
+       if (alloc_pages_nr)
+               pr_notice("ALLOCATE PAGES nr=%u order=%u gfp=%pGg %s\n",
+                         alloc_pages_nr, alloc_pages_order, &alloc_pages_gfp,
+                         reallocate_pages ? "reallocate_pages " : "");
+
+       if (all_cpus) {
+               unsigned int cpu;
+
+               cpus_read_lock();
+
+               preempt_disable();
+               master_cpu = smp_processor_id();
+               for_each_online_cpu(cpu) {
+                       INIT_WORK(per_cpu_ptr(&test_works, cpu), test_work_fn);
+                       queue_work_on(cpu, system_highpri_wq,
+                                     per_cpu_ptr(&test_works, cpu));
+               }
+               preempt_enable();
+
+               for_each_online_cpu(cpu)
+                       flush_work(per_cpu_ptr(&test_works, cpu));
+
+               cpus_read_unlock();
+       } else {
+               test_lockup(true);
+       }
+
+       if (measure_lock_wait)
+               pr_notice("Maximum lock wait: %lld ns\n",
+                         atomic64_read(&max_lock_wait));
+
+       if (alloc_pages_nr)
+               pr_notice("Page allocation failed %u times\n",
+                         atomic_read(&alloc_pages_failed));
+
+       pr_notice("FINISH in %llu ns\n", local_clock() - test_start);
+
+       if (test_file)
+               fput(test_file);
+
+       if (signal_pending(main_task))
+               return -EINTR;
+
+       return -EAGAIN;
+}
+module_init(test_lockup_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Konstantin Khlebnikov <khlebnikov@yandex-team.ru>");
+MODULE_DESCRIPTION("Test module to generate lockups");
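
The module performs its entire run from test_lockup_init() and then returns -EAGAIN (or -EINTR if interrupted), so it never remains loaded. As a usage sketch built only from the parameters declared above, something like "modprobe test_lockup time_secs=30 state=R disable_irq=1" would spin for 30 seconds with interrupts disabled (the condition the hard-lockup watchdog is meant to catch), and adding all_cpus=1 would queue the same work on every online CPU through the system_highpri_wq workqueue.
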
index 2d7d257..f93b1e1 100644 (file)
@@ -92,8 +92,9 @@ static bool range_contains(char *haystack_start, size_t haystack_size,
  * @var_type: type to be tested for zeroing initialization
  * @which: is this a SCALAR, STRING, or STRUCT type?
  * @init_level: what kind of initialization is performed
+ * @xfail: is this test expected to fail?
  */
-#define DEFINE_TEST_DRIVER(name, var_type, which)              \
+#define DEFINE_TEST_DRIVER(name, var_type, which, xfail)       \
 /* Returns 0 on success, 1 on failure. */                      \
 static noinline __init int test_ ## name (void)                        \
 {                                                              \
@@ -139,13 +140,14 @@ static noinline __init int test_ ## name (void)                   \
        for (sum = 0, i = 0; i < target_size; i++)              \
                sum += (check_buf[i] == 0xFF);                  \
                                                                \
-       if (sum == 0)                                           \
+       if (sum == 0) {                                         \
                pr_info(#name " ok\n");                         \
-       else                                                    \
-               pr_warn(#name " FAIL (uninit bytes: %d)\n",     \
-                       sum);                                   \
-                                                               \
-       return (sum != 0);                                      \
+               return 0;                                       \
+       } else {                                                \
+               pr_warn(#name " %sFAIL (uninit bytes: %d)\n",   \
+                       (xfail) ? "X" : "", sum);               \
+               return (xfail) ? 0 : 1;                         \
+       }                                                       \
 }
 #define DEFINE_TEST(name, var_type, which, init_level)         \
 /* no-op to force compiler into ignoring "uninitialized" vars */\
@@ -189,7 +191,7 @@ static noinline __init int leaf_ ## name(unsigned long sp,  \
                                                                \
        return (int)buf[0] | (int)buf[sizeof(buf) - 1];         \
 }                                                              \
-DEFINE_TEST_DRIVER(name, var_type, which)
+DEFINE_TEST_DRIVER(name, var_type, which, 0)
 
 /* Structure with no padding. */
 struct test_packed {
@@ -326,8 +328,14 @@ static noinline __init int leaf_switch_2_none(unsigned long sp, bool fill,
        return __leaf_switch_none(2, fill);
 }
 
-DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR);
-DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR);
+/*
+ * These are expected to fail for most configurations because neither
+ * GCC nor Clang has a way to perform initialization of variables in
+ * non-code areas (i.e. in a switch statement before the first "case").
+ * https://bugs.llvm.org/show_bug.cgi?id=44916
+ */
+DEFINE_TEST_DRIVER(switch_1_none, uint64_t, SCALAR, 1);
+DEFINE_TEST_DRIVER(switch_2_none, uint64_t, SCALAR, 1);
 
 static int __init test_stackinit_init(void)
 {
index b352903..277cb44 100644 (file)
@@ -52,7 +52,7 @@ struct ts_bm
        u8 *            pattern;
        unsigned int    patlen;
        unsigned int    bad_shift[ASIZE];
-       unsigned int    good_shift[0];
+       unsigned int    good_shift[];
 };
 
 static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
index 9c873ca..ab749ec 100644 (file)
@@ -32,7 +32,7 @@
 struct ts_fsm
 {
        unsigned int            ntokens;
-       struct ts_fsm_token     tokens[0];
+       struct ts_fsm_token     tokens[];
 };
 
 /* other values derived from ctype.h */
index 94617e0..c77a3d5 100644 (file)
@@ -36,7 +36,7 @@ struct ts_kmp
 {
        u8 *            pattern;
        unsigned int    pattern_len;
-       unsigned int    prefix_tbl[0];
+       unsigned int    prefix_tbl[];
 };
 
 static unsigned int kmp_find(struct ts_config *conf, struct ts_state *state)
index 7b9b58a..f8c0ccf 100644 (file)
@@ -45,13 +45,6 @@ static bool was_reported(struct source_location *location)
        return test_and_set_bit(REPORTED_BIT, &location->reported);
 }
 
-static void print_source_location(const char *prefix,
-                               struct source_location *loc)
-{
-       pr_err("%s %s:%d:%d\n", prefix, loc->file_name,
-               loc->line & LINE_MASK, loc->column & COLUMN_MASK);
-}
-
 static bool suppress_report(struct source_location *loc)
 {
        return current->in_ubsan || was_reported(loc);
@@ -140,13 +133,14 @@ static void val_to_string(char *str, size_t size, struct type_descriptor *type,
        }
 }
 
-static void ubsan_prologue(struct source_location *location)
+static void ubsan_prologue(struct source_location *loc, const char *reason)
 {
        current->in_ubsan++;
 
        pr_err("========================================"
                "========================================\n");
-       print_source_location("UBSAN: Undefined behaviour in", location);
+       pr_err("UBSAN: %s in %s:%d:%d\n", reason, loc->file_name,
+               loc->line & LINE_MASK, loc->column & COLUMN_MASK);
 }
 
 static void ubsan_epilogue(void)
@@ -156,6 +150,17 @@ static void ubsan_epilogue(void)
                "========================================\n");
 
        current->in_ubsan--;
+
+       if (panic_on_warn) {
+               /*
+                * This thread may hit another WARN() in the panic path.
+                * Resetting this prevents additional WARN() from panicking the
+                * system on this thread.  Other threads are blocked by the
+                * panic_mutex in panic().
+                */
+               panic_on_warn = 0;
+               panic("panic_on_warn set ...\n");
+       }
 }
 
 static void handle_overflow(struct overflow_data *data, void *lhs,
@@ -169,12 +174,12 @@ static void handle_overflow(struct overflow_data *data, void *lhs,
        if (suppress_report(&data->location))
                return;
 
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, type_is_signed(type) ?
+                       "signed-integer-overflow" :
+                       "unsigned-integer-overflow");
 
        val_to_string(lhs_val_str, sizeof(lhs_val_str), type, lhs);
        val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs);
-       pr_err("%s integer overflow:\n",
-               type_is_signed(type) ? "signed" : "unsigned");
        pr_err("%s %c %s cannot be represented in type %s\n",
                lhs_val_str,
                op,
@@ -214,7 +219,7 @@ void __ubsan_handle_negate_overflow(struct overflow_data *data,
        if (suppress_report(&data->location))
                return;
 
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, "negation-overflow");
 
        val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val);
 
@@ -234,7 +239,7 @@ void __ubsan_handle_divrem_overflow(struct overflow_data *data,
        if (suppress_report(&data->location))
                return;
 
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, "division-overflow");
 
        val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs);
 
@@ -253,7 +258,7 @@ static void handle_null_ptr_deref(struct type_mismatch_data_common *data)
        if (suppress_report(data->location))
                return;
 
-       ubsan_prologue(data->location);
+       ubsan_prologue(data->location, "null-ptr-deref");
 
        pr_err("%s null pointer of type %s\n",
                type_check_kinds[data->type_check_kind],
@@ -268,7 +273,7 @@ static void handle_misaligned_access(struct type_mismatch_data_common *data,
        if (suppress_report(data->location))
                return;
 
-       ubsan_prologue(data->location);
+       ubsan_prologue(data->location, "misaligned-access");
 
        pr_err("%s misaligned address %p for type %s\n",
                type_check_kinds[data->type_check_kind],
@@ -284,7 +289,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data_common *data,
        if (suppress_report(data->location))
                return;
 
-       ubsan_prologue(data->location);
+       ubsan_prologue(data->location, "object-size-mismatch");
        pr_err("%s address %p with insufficient space\n",
                type_check_kinds[data->type_check_kind],
                (void *) ptr);
@@ -343,7 +348,7 @@ void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, void *index)
        if (suppress_report(&data->location))
                return;
 
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, "array-index-out-of-bounds");
 
        val_to_string(index_str, sizeof(index_str), data->index_type, index);
        pr_err("index %s is out of range for type %s\n", index_str,
@@ -364,7 +369,7 @@ void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data,
        if (suppress_report(&data->location))
                goto out;
 
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, "shift-out-of-bounds");
 
        val_to_string(rhs_str, sizeof(rhs_str), rhs_type, rhs);
        val_to_string(lhs_str, sizeof(lhs_str), lhs_type, lhs);
@@ -396,7 +401,7 @@ EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds);
 
 void __ubsan_handle_builtin_unreachable(struct unreachable_data *data)
 {
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, "unreachable");
        pr_err("calling __builtin_unreachable()\n");
        ubsan_epilogue();
        panic("can't return from __builtin_unreachable()");
@@ -411,7 +416,7 @@ void __ubsan_handle_load_invalid_value(struct invalid_value_data *data,
        if (suppress_report(&data->location))
                return;
 
-       ubsan_prologue(&data->location);
+       ubsan_prologue(&data->location, "invalid-load");
 
        val_to_string(val_str, sizeof(val_str), data->type, val);
 
index ab80933..c1acc34 100644 (file)
@@ -139,6 +139,10 @@ config HAVE_FAST_GUP
 config ARCH_KEEP_MEMBLOCK
        bool
 
+# Keep arch NUMA mapping infrastructure post-init.
+config NUMA_KEEP_MEMINFO
+       bool
+
 config MEMORY_ISOLATION
        bool
 
@@ -154,6 +158,7 @@ config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        depends on SPARSEMEM || X86_64_ACPI_NUMA
        depends on ARCH_ENABLE_MEMORY_HOTPLUG
+       select NUMA_KEEP_MEMINFO if NUMA
 
 config MEMORY_HOTPLUG_SPARSE
        def_bool y
@@ -237,6 +242,17 @@ config COMPACTION
          linux-mm@kvack.org.
 
 #
+# support for free page reporting
+config PAGE_REPORTING
+       bool "Free page reporting"
+       def_bool n
+       help
+         Free page reporting allows for the incremental acquisition of
+         free pages from the buddy allocator for the purpose of reporting
+         those pages to another entity, such as a hypervisor, so that the
+         memory can be freed within the host for other uses.
+
+#
 # support for page migration
 #
 config MIGRATION
@@ -420,10 +436,6 @@ config THP_SWAP
 
          For selection by architectures with reasonable THP sizes.
 
-config TRANSPARENT_HUGE_PAGECACHE
-       def_bool y
-       depends on TRANSPARENT_HUGEPAGE
-
 #
 # UP and nommu archs use km based percpu allocator
 #
@@ -526,7 +538,6 @@ config MEM_SOFT_DIRTY
 config ZSWAP
        bool "Compressed cache for swap pages (EXPERIMENTAL)"
        depends on FRONTSWAP && CRYPTO=y
-       select CRYPTO_LZO
        select ZPOOL
        help
          A lightweight compressed cache for swap pages.  It takes
@@ -542,6 +553,123 @@ config ZSWAP
          they have not been fully explored on the large set of potential
          configurations and workloads that exist.
 
+choice
+       prompt "Compressed cache for swap pages default compressor"
+       depends on ZSWAP
+       default ZSWAP_COMPRESSOR_DEFAULT_LZO
+       help
+         Selects the default compression algorithm for the compressed cache
+         for swap pages.
+
+         For an overview of what kind of performance can be expected from
+         a particular compression algorithm please refer to the benchmarks
+         available at the following LWN page:
+         https://lwn.net/Articles/751795/
+
+         If in doubt, select 'LZO'.
+
+         The selection made here can be overridden by using the kernel
+         command line 'zswap.compressor=' option.
+
+config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+       bool "Deflate"
+       select CRYPTO_DEFLATE
+       help
+         Use the Deflate algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZO
+       bool "LZO"
+       select CRYPTO_LZO
+       help
+         Use the LZO algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_842
+       bool "842"
+       select CRYPTO_842
+       help
+         Use the 842 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4
+       bool "LZ4"
+       select CRYPTO_LZ4
+       help
+         Use the LZ4 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+       bool "LZ4HC"
+       select CRYPTO_LZ4HC
+       help
+         Use the LZ4HC algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+       bool "zstd"
+       select CRYPTO_ZSTD
+       help
+         Use the zstd algorithm as the default compression algorithm.
+endchoice
+
+config ZSWAP_COMPRESSOR_DEFAULT
+       string
+       depends on ZSWAP
+       default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+       default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
+       default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
+       default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
+       default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+       default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+       default ""
+
+choice
+       prompt "Compressed cache for swap pages default allocator"
+       depends on ZSWAP
+       default ZSWAP_ZPOOL_DEFAULT_ZBUD
+       help
+         Selects the default allocator for the compressed cache for
+         swap pages.
+         The default is 'zbud' for compatibility; however, please read
+         the description of each of the allocators below before making
+         a choice.
+
+         The selection made here can be overridden by using the kernel
+         command line 'zswap.zpool=' option.
+
+config ZSWAP_ZPOOL_DEFAULT_ZBUD
+       bool "zbud"
+       select ZBUD
+       help
+         Use the zbud allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+       bool "z3fold"
+       select Z3FOLD
+       help
+         Use the z3fold allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+       bool "zsmalloc"
+       select ZSMALLOC
+       help
+         Use the zsmalloc allocator as the default allocator.
+endchoice
+
+config ZSWAP_ZPOOL_DEFAULT
+       string
+       depends on ZSWAP
+       default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
+       default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+       default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+       default ""
+
+config ZSWAP_DEFAULT_ON
+       bool "Enable the compressed cache for swap pages by default"
+       depends on ZSWAP
+       help
+         If selected, the compressed cache for swap pages will be enabled
+         at boot, otherwise it will be disabled.
+
+         The selection made here can be overridden by using the kernel
+         command line 'zswap.enabled=' option.
+
 config ZPOOL
        tristate "Common API for compressed memory storage"
        help
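The help texts above note that each of the new defaults can still be overridden at boot. Putting the three module parameters they mention together, an illustrative kernel command line (values chosen arbitrarily from the options defined above) would be:

    zswap.enabled=1 zswap.compressor=zstd zswap.zpool=z3fold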
@@ -714,7 +842,7 @@ config GUP_GET_PTE_LOW_HIGH
 
 config READ_ONLY_THP_FOR_FS
        bool "Read-only THP for filesystems (EXPERIMENTAL)"
-       depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+       depends on TRANSPARENT_HUGEPAGE && SHMEM
 
        help
          Allow khugepaged to put read-only file-backed pages in THP.
index dbc8346..fccd375 100644 (file)
@@ -111,3 +111,4 @@ obj-$(CONFIG_HMM_MIRROR) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
index 62f05f6..c81b4f3 100644 (file)
@@ -491,8 +491,8 @@ static void cgwb_release_workfn(struct work_struct *work)
        css_put(wb->blkcg_css);
        mutex_unlock(&wb->bdi->cgwb_release_mutex);
 
-       /* triggers blkg destruction if cgwb_refcnt becomes zero */
-       blkcg_cgwb_put(blkcg);
+       /* triggers blkg destruction if no online users left */
+       blkcg_unpin_online(blkcg);
 
        fprop_local_destroy_percpu(&wb->memcg_completions);
        percpu_ref_exit(&wb->refcnt);
@@ -592,7 +592,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
                        list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
-                       blkcg_cgwb_get(blkcg);
+                       blkcg_pin_online(blkcg);
                        css_get(memcg_css);
                        css_get(blkcg_css);
                }
index df3da2f..46f0fcc 100644 (file)
@@ -481,6 +481,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
  */
 static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
                                                struct compact_control *cc)
+       __acquires(lock)
 {
        /* Track if the lock is contended in async mode */
        if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
@@ -989,7 +990,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                /* Successfully isolated */
                del_page_from_lru_list(page, lruvec, page_lru(page));
                mod_node_page_state(page_pgdat(page),
-                               NR_ISOLATED_ANON + page_is_file_cache(page),
+                               NR_ISOLATED_ANON + page_is_file_lru(page),
                                hpage_nr_pages(page));
 
 isolate_success:
index fe5d330..f9fb9bb 100644 (file)
@@ -144,9 +144,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
        else if (size < 4)
                size = 4;
 
-       if ((size % align) != 0)
-               size = ALIGN(size, align);
-
+       size = ALIGN(size, align);
        allocation = max_t(size_t, size, PAGE_SIZE);
 
        if (!boundary)
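The simplification above relies on ALIGN() already being a no-op for sizes that are a multiple of the alignment, which is why the explicit (size % align) test could go. Illustrative values (not from the patch):

    /* ALIGN(x, a) rounds x up to the next multiple of a. */
    ALIGN(60, 64) == 64
    ALIGN(64, 64) == 64   /* already aligned: unchanged */
    ALIGN(65, 64) == 128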
index 0fbdc8e..23a051a 100644 (file)
@@ -1693,6 +1693,11 @@ EXPORT_SYMBOL(pagecache_get_page);
  * Any shadow entries of evicted pages, or swap entries from
  * shmem/tmpfs, are included in the returned array.
  *
+ * If it finds a Transparent Huge Page, head or tail, find_get_entries()
+ * stops at that page: the caller is likely to have a better way to handle
+ * the compound page as a whole, and then skip its extent, than repeatedly
+ * calling find_get_entries() to return all its tails.
+ *
  * Return: the number of pages and shadow entries which were found.
  */
 unsigned find_get_entries(struct address_space *mapping,
@@ -1724,8 +1729,15 @@ unsigned find_get_entries(struct address_space *mapping,
                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas)))
                        goto put_page;
-               page = find_subpage(page, xas.xa_index);
 
+               /*
+                * Terminate early on finding a THP, to allow the caller to
+                * handle it all at once; but continue if this is hugetlbfs.
+                */
+               if (PageTransHuge(page) && !PageHuge(page)) {
+                       page = find_subpage(page, xas.xa_index);
+                       nr_entries = ret + 1;
+               }
 export:
                indices[ret] = xas.xa_index;
                entries[ret] = page;
index da3e031..6076df8 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -351,7 +351,8 @@ static struct page *no_page_table(struct vm_area_struct *vma,
         * But we can only make this optimization where a hole would surely
         * be zero-filled if handle_mm_fault() actually did handle it.
         */
-       if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
+       if ((flags & FOLL_DUMP) &&
+                       (vma_is_anonymous(vma) || !vma->vm_ops->fault))
                return ERR_PTR(-EFAULT);
        return NULL;
 }
@@ -1101,7 +1102,7 @@ retry:
                                goto retry;
                        case -EBUSY:
                                ret = 0;
-                               /* FALLTHRU */
+                               fallthrough;
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
@@ -1325,10 +1326,12 @@ retry:
                 * start trying again otherwise it can loop forever.
                 */
 
-               if (fatal_signal_pending(current))
+               if (fatal_signal_pending(current)) {
+                       if (!pages_done)
+                               pages_done = -EINTR;
                        break;
+               }
 
-               *locked = 1;
                ret = down_read_killable(&mm->mmap_sem);
                if (ret) {
                        BUG_ON(ret > 0);
@@ -1337,6 +1340,7 @@ retry:
                        break;
                }
 
+               *locked = 1;
                ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
                                       pages, NULL, locked);
                if (!*locked) {
@@ -1416,7 +1420,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
         * We want mlock to succeed for regions that have any permissions
         * other than PROT_NONE.
         */
-       if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+       if (vma_is_accessible(vma))
                gup_flags |= FOLL_FORCE;
 
        /*
@@ -1676,7 +1680,7 @@ check_again:
                                        list_add_tail(&head->lru, &cma_page_list);
                                        mod_node_page_state(page_pgdat(head),
                                                            NR_ISOLATED_ANON +
-                                                           page_is_file_cache(head),
+                                                           page_is_file_lru(head),
                                                            hpage_nr_pages(head));
                                }
                        }
index 0f9389f..6ecd104 100644 (file)
@@ -326,7 +326,7 @@ static struct attribute *hugepage_attr[] = {
        &defrag_attr.attr,
        &use_zero_page_attr.attr,
        &hpage_pmd_size_attr.attr,
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
        &shmem_enabled_attr.attr,
 #endif
 #ifdef CONFIG_DEBUG_VM
@@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
        if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
+               count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                return VM_FAULT_FALLBACK;
        }
 
@@ -1043,6 +1044,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        ret = -EAGAIN;
        pmd = *src_pmd;
 
+       /*
+        * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+        * does not have the VM_UFFD_WP, which means that the uffd
+        * fork event is not enabled.
+        */
+       if (!(vma->vm_flags & VM_UFFD_WP))
+               pmd = pmd_clear_uffd_wp(pmd);
+
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);
@@ -1446,6 +1455,7 @@ alloc:
                        put_page(page);
                ret |= VM_FAULT_FALLBACK;
                count_vm_event(THP_FAULT_FALLBACK);
+               count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                goto out;
        }
 
@@ -1977,13 +1987,16 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-               unsigned long addr, pgprot_t newprot, int prot_numa)
+               unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t entry;
        bool preserve_write;
        int ret;
+       bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+       bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
@@ -2050,6 +2063,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        entry = pmd_modify(entry, newprot);
        if (preserve_write)
                entry = pmd_mk_savedwrite(entry);
+       if (uffd_wp) {
+               entry = pmd_wrprotect(entry);
+               entry = pmd_mkuffd_wp(entry);
+       } else if (uffd_wp_resolve) {
+               /*
+                * Leave the write bit to be handled by the page fault
+                * handler, so that things like COW can be handled
+                * properly.
+                */
+               entry = pmd_clear_uffd_wp(entry);
+       }
        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);
        BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2198,7 +2222,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        struct page *page;
        pgtable_t pgtable;
        pmd_t old_pmd, _pmd;
-       bool young, write, soft_dirty, pmd_migration = false;
+       bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
        unsigned long addr;
        int i;
 
@@ -2273,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                write = is_write_migration_entry(entry);
                young = false;
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
+               uffd_wp = pmd_swp_uffd_wp(old_pmd);
        } else {
                page = pmd_page(old_pmd);
                if (pmd_dirty(old_pmd))
@@ -2280,6 +2305,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                write = pmd_write(old_pmd);
                young = pmd_young(old_pmd);
                soft_dirty = pmd_soft_dirty(old_pmd);
+               uffd_wp = pmd_uffd_wp(old_pmd);
        }
        VM_BUG_ON_PAGE(!page_count(page), page);
        page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2304,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
+                       if (uffd_wp)
+                               entry = pte_swp_mkuffd_wp(entry);
                } else {
                        entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
                        entry = maybe_mkwrite(entry, vma);
@@ -2313,6 +2341,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                                entry = pte_mkold(entry);
                        if (soft_dirty)
                                entry = pte_mksoft_dirty(entry);
+                       if (uffd_wp)
+                               entry = pte_mkuffd_wp(entry);
                }
                pte = pte_offset_map(&_pmd, addr);
                BUG_ON(!pte_none(*pte));
index f9ea1e5..f5fb53f 100644 (file)
@@ -2010,6 +2010,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
  * of size 'delta'.
  */
 static int gather_surplus_pages(struct hstate *h, int delta)
+       __must_hold(&hugetlb_lock)
 {
        struct list_head surplus_list;
        struct page *page, *tmp;
index c2d7ae6..aabf65d 100644 (file)
@@ -467,14 +467,14 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
        switch (MEMFILE_ATTR(cft->private)) {
        case RES_RSVD_USAGE:
                counter = &h_cg->rsvd_hugepage[idx];
-               /* Fall through. */
+               fallthrough;
        case RES_USAGE:
                val = (u64)page_counter_read(counter);
                seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        case RES_RSVD_LIMIT:
                counter = &h_cg->rsvd_hugepage[idx];
-               /* Fall through. */
+               fallthrough;
        case RES_LIMIT:
                val = (u64)counter->max;
                if (val == limit)
@@ -514,7 +514,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_RSVD_LIMIT:
                rsvd = true;
-               /* Fall through. */
+               fallthrough;
        case RES_LIMIT:
                mutex_lock(&hugetlb_limit_mutex);
                ret = page_counter_set_max(
index 2d58ae1..b5634e7 100644 (file)
@@ -180,6 +180,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
 }
 
 extern int __isolate_free_page(struct page *page, unsigned int order);
+extern void __putback_isolated_page(struct page *page, unsigned int order,
+                                   int mt);
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
 extern void __free_pages_core(struct page *page, unsigned int order);
index e61b4a4..2906358 100644 (file)
@@ -15,7 +15,6 @@
  */
 
 #include <linux/export.h>
-#include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/kasan.h>
 #include <linux/kernel.h>
 #include "kasan.h"
 #include "../slab.h"
 
-static inline int in_irqentry_text(unsigned long ptr)
-{
-       return (ptr >= (unsigned long)&__irqentry_text_start &&
-               ptr < (unsigned long)&__irqentry_text_end) ||
-               (ptr >= (unsigned long)&__softirqentry_text_start &&
-                ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-static inline unsigned int filter_irq_stacks(unsigned long *entries,
-                                            unsigned int nr_entries)
-{
-       unsigned int i;
-
-       for (i = 0; i < nr_entries; i++) {
-               if (in_irqentry_text(entries[i])) {
-                       /* Include the irqentry function into the stack. */
-                       return i + 1;
-               }
-       }
-       return nr_entries;
-}
-
 static inline depot_stack_handle_t save_stack(gfp_t flags)
 {
        unsigned long entries[KASAN_STACK_DEPTH];
index cf5c17d..80f23c9 100644 (file)
@@ -92,8 +92,16 @@ static void end_report(unsigned long *flags)
        pr_err("==================================================================\n");
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
        spin_unlock_irqrestore(&report_lock, *flags);
-       if (panic_on_warn)
+       if (panic_on_warn) {
+               /*
+                * This thread may hit another WARN() in the panic path.
+                * Resetting this prevents additional WARN() from panicking the
+                * system on this thread.  Other threads are blocked by the
+                * panic_mutex in panic().
+                */
+               panic_on_warn = 0;
                panic("panic_on_warn set ...\n");
+       }
        kasan_enable_current();
 }
 
index c659c68..99d77ff 100644 (file)
@@ -29,6 +29,7 @@ enum scan_result {
        SCAN_PMD_NULL,
        SCAN_EXCEED_NONE_PTE,
        SCAN_PTE_NON_PRESENT,
+       SCAN_PTE_UFFD_WP,
        SCAN_PAGE_RO,
        SCAN_LACK_REFERENCED_PAGE,
        SCAN_PAGE_NULL,
@@ -414,8 +415,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
            (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
             vma->vm_file &&
             (vm_flags & VM_DENYWRITE))) {
-               if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
-                       return false;
                return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                                HPAGE_PMD_NR);
        }
@@ -513,7 +512,7 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-       dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
+       dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
        unlock_page(page);
        putback_lru_page(page);
 }
@@ -613,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        goto out;
                }
                inc_node_page_state(page,
-                               NR_ISOLATED_ANON + page_is_file_cache(page));
+                               NR_ISOLATED_ANON + page_is_file_lru(page));
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -1139,6 +1138,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                pte_t pteval = *_pte;
                if (is_swap_pte(pteval)) {
                        if (++unmapped <= khugepaged_max_ptes_swap) {
+                               /*
+                                * Always be strict with uffd-wp
+                                * enabled swap entries.  Please see
+                                * comment below for pte_uffd_wp().
+                                */
+                               if (pte_swp_uffd_wp(pteval)) {
+                                       result = SCAN_PTE_UFFD_WP;
+                                       goto out_unmap;
+                               }
                                continue;
                        } else {
                                result = SCAN_EXCEED_SWAP_PTE;
@@ -1158,6 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        result = SCAN_PTE_NON_PRESENT;
                        goto out_unmap;
                }
+               if (pte_uffd_wp(pteval)) {
+                       /*
+                        * Don't collapse the page if any of the small
+                        * PTEs are armed with uffd write protection.
+                        * Here we can also mark the new huge pmd as
+                        * write protected if any of the small ones is
+                        * marked, but that could bring unknown
+                        * userfault messages that fall outside of
+                        * the registered range.  So, just be simple.
+                        */
+                       result = SCAN_PTE_UFFD_WP;
+                       goto out_unmap;
+               }
                if (pte_write(pteval))
                        writable = true;
 
@@ -1258,7 +1279,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
        }
 }
 
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
 /*
  * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
  * khugepaged should try to collapse the page table.
@@ -1973,6 +1994,8 @@ skip:
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+               if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
+                       goto skip;
 
                while (khugepaged_scan.address < hend) {
                        int ret;
@@ -1984,14 +2007,10 @@ skip:
                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
                                  hend);
                        if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
-                               struct file *file;
+                               struct file *file = get_file(vma->vm_file);
                                pgoff_t pgoff = linear_page_index(vma,
                                                khugepaged_scan.address);
 
-                               if (shmem_file(vma->vm_file)
-                                   && !shmem_huge_enabled(vma))
-                                       goto skip;
-                               file = get_file(vma->vm_file);
                                up_read(&mm->mmap_sem);
                                ret = 1;
                                khugepaged_scan_file(mm, file, pgoff, hpage);
index d17c7d5..a558da9 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -455,7 +455,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 /*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
- *     if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
+ *     if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
  *             put_page(page);
  *
  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
@@ -2813,8 +2813,7 @@ static int ksm_memory_callback(struct notifier_block *self,
                 */
                ksm_check_stable_tree(mn->start_pfn,
                                      mn->start_pfn + mn->nr_pages);
-               /* fallthrough */
-
+               fallthrough;
        case MEM_CANCEL_OFFLINE:
                mutex_lock(&ksm_thread_mutex);
                ksm_run &= ~KSM_RUN_OFFLINE;
index 8de5e37..4d5294c 100644 (file)
@@ -223,7 +223,7 @@ restart:
                switch (ret) {
                case LRU_REMOVED_RETRY:
                        assert_spin_locked(&nlru->lock);
-                       /* fall through */
+                       fallthrough;
                case LRU_REMOVED:
                        isolated++;
                        nlru->nr_items--;
index ca19486..05b4ec2 100644 (file)
@@ -2254,7 +2254,8 @@ static void reclaim_high(struct mem_cgroup *memcg,
                        continue;
                memcg_memory_event(memcg, MEMCG_HIGH);
                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
-       } while ((memcg = parent_mem_cgroup(memcg)));
+       } while ((memcg = parent_mem_cgroup(memcg)) &&
+                !mem_cgroup_is_root(memcg));
 }
 
 static void high_work_func(struct work_struct *work)
@@ -5812,7 +5813,7 @@ retry:
                switch (get_mctgt_type(vma, addr, ptent, &target)) {
                case MC_TARGET_DEVICE:
                        device = true;
-                       /* fall through */
+                       fallthrough;
                case MC_TARGET_PAGE:
                        page = target.page;
                        /*
index 1c961cd..a96364b 100644 (file)
@@ -1810,7 +1810,7 @@ static int __soft_offline_page(struct page *page, int flags)
                 */
                if (!__PageMovable(page))
                        inc_node_page_state(page, NR_ISOLATED_ANON +
-                                               page_is_file_cache(page));
+                                               page_is_file_lru(page));
                list_add(&page->lru, &pagelist);
                ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
                                        MIGRATE_SYNC, MR_MEMORY_FAILURE);
index 586271f..19874d1 100644 (file)
@@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                pte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(*src_pte))
                                        pte = pte_swp_mksoft_dirty(pte);
+                               if (pte_swp_uffd_wp(*src_pte))
+                                       pte = pte_swp_mkuffd_wp(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
                } else if (is_device_private_entry(entry)) {
@@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                            is_cow_mapping(vm_flags)) {
                                make_device_private_entry_read(&entry);
                                pte = swp_entry_to_pte(entry);
+                               if (pte_swp_uffd_wp(*src_pte))
+                                       pte = pte_swp_mkuffd_wp(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
                }
@@ -785,6 +789,14 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);
 
+       /*
+        * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
+        * does not have the VM_UFFD_WP, which means that the uffd
+        * fork event is not enabled.
+        */
+       if (!(vm_flags & VM_UFFD_WP))
+               pte = pte_clear_uffd_wp(pte);
+
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
@@ -1940,7 +1952,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
  * @vma: user vma to map to
  * @addr: target user address to start at
  * @pfn: page frame number of kernel physical memory address
- * @size: size of map area
+ * @size: size of mapping area
  * @prot: page protection flags for this mapping
  *
  * Note: this is only safe if the mm semaphore is held when called.
@@ -2752,6 +2764,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
 
+       if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+               return handle_userfault(vmf, VM_UFFD_WP);
+       }
+
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
        if (!vmf->page) {
                /*
@@ -3085,6 +3102,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        flush_icache_page(vma, page);
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
+       if (pte_swp_uffd_wp(vmf->orig_pte)) {
+               pte = pte_mkuffd_wp(pte);
+               pte = pte_wrprotect(pte);
+       }
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
        vmf->orig_pte = pte;
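The hunks above make write faults on uffd-wp protected PTEs, whether present or freshly swapped in, reach handle_userfault(VM_UFFD_WP). A minimal userspace sketch of arming that protection, assuming the UFFDIO_REGISTER_MODE_WP / UFFDIO_WRITEPROTECT uAPI introduced alongside this series (error handling omitted, identifiers purely illustrative):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/userfaultfd.h>

    /* Sketch only: the region must be page-aligned. */
    static int wp_protect(void *addr, size_t len)
    {
            int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
            struct uffdio_api api = { .api = UFFD_API };
            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)addr, .len = len },
                    .mode  = UFFDIO_REGISTER_MODE_WP,
            };
            struct uffdio_writeprotect wp = {
                    .range = { .start = (unsigned long)addr, .len = len },
                    .mode  = UFFDIO_WRITEPROTECT_MODE_WP,
            };

            ioctl(uffd, UFFDIO_API, &api);
            ioctl(uffd, UFFDIO_REGISTER, &reg);
            ioctl(uffd, UFFDIO_WRITEPROTECT, &wp); /* writes now fault to uffd */
            return uffd;
    }

A write into the range then faults to the userfaultfd, and the monitor clears the protection with another UFFDIO_WRITEPROTECT call whose mode does not include UFFDIO_WRITEPROTECT_MODE_WP.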
@@ -3373,7 +3394,7 @@ map_pte:
        return 0;
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void deposit_prealloc_pte(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
@@ -3475,8 +3496,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
        pte_t entry;
        vm_fault_t ret;
 
-       if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
-                       IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+       if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
                /* THP on COW? */
                VM_BUG_ON_PAGE(memcg, page);
 
@@ -3949,8 +3969,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 /* `inline' is required to avoid gcc 4.1.2 build error */
 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 {
-       if (vma_is_anonymous(vmf->vma))
+       if (vma_is_anonymous(vmf->vma)) {
+               if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+                       return handle_userfault(vmf, VM_UFFD_WP);
                return do_huge_pmd_wp_page(vmf, orig_pmd);
+       }
        if (vmf->vma->vm_ops->huge_fault) {
                vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
@@ -3964,11 +3987,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
        return VM_FAULT_FALLBACK;
 }
 
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
-       return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
-}
-
 static vm_fault_t create_huge_pud(struct vm_fault *vmf)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                    \
index 19389cd..635e8e2 100644 (file)
@@ -67,18 +67,17 @@ void put_online_mems(void)
 bool movable_node_enabled = false;
 
 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
 #else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
 #endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);
 
 static int __init setup_memhp_default_state(char *str)
 {
-       if (!strcmp(str, "online"))
-               memhp_auto_online = true;
-       else if (!strcmp(str, "offline"))
-               memhp_auto_online = false;
+       const int online_type = memhp_online_type_from_str(str);
+
+       if (online_type >= 0)
+               memhp_default_online_type = online_type;
 
        return 1;
 }
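Assuming this parser stays wired to the existing memhp_default_state= boot parameter (the __setup() registration is outside this hunk), the default online type can be chosen at boot, for example:

    memhp_default_state=online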
@@ -105,7 +104,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
        unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        char *resource_name = "System RAM";
 
-       if (start + size > max_mem_size)
+       /*
+        * Make sure the value parsed from 'mem=' only restricts memory adding
+        * while booting, so that memory hotplug won't be impacted. Please
+        * refer to the documentation of 'mem=' in kernel-parameters.txt for
+        * more details.
+        */
+       if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
                return ERR_PTR(-E2BIG);
 
        /*
@@ -301,8 +306,9 @@ static int check_hotplug_memory_addressable(unsigned long pfn,
 int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
                struct mhp_restrictions *restrictions)
 {
+       const unsigned long end_pfn = pfn + nr_pages;
+       unsigned long cur_nr_pages;
        int err;
-       unsigned long nr, start_sec, end_sec;
        struct vmem_altmap *altmap = restrictions->altmap;
 
        err = check_hotplug_memory_addressable(pfn, nr_pages);
@@ -325,18 +331,13 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
        if (err)
                return err;
 
-       start_sec = pfn_to_section_nr(pfn);
-       end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
-       for (nr = start_sec; nr <= end_sec; nr++) {
-               unsigned long pfns;
-
-               pfns = min(nr_pages, PAGES_PER_SECTION
-                               - (pfn & ~PAGE_SECTION_MASK));
-               err = sparse_add_section(nid, pfn, pfns, altmap);
+       for (; pfn < end_pfn; pfn += cur_nr_pages) {
+               /* Select all remaining pages up to the next section boundary */
+               cur_nr_pages = min(end_pfn - pfn,
+                                  SECTION_ALIGN_UP(pfn + 1) - pfn);
+               err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
                if (err)
                        break;
-               pfn += pfns;
-               nr_pages -= pfns;
                cond_resched();
        }
        vmemmap_populate_print_last();
@@ -494,7 +495,7 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages,
                             unsigned long map_offset,
                             struct vmem_altmap *altmap)
 {
-       struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
+       struct mem_section *ms = __pfn_to_section(pfn);
 
        if (WARN_ON_ONCE(!valid_section(ms)))
                return;
@@ -528,7 +529,8 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
        for (; pfn < end_pfn; pfn += cur_nr_pages) {
                cond_resched();
                /* Select all remaining pages up to the next section boundary */
-               cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+               cur_nr_pages = min(end_pfn - pfn,
+                                  SECTION_ALIGN_UP(pfn + 1) - pfn);
                __remove_section(pfn, cur_nr_pages, map_offset, altmap);
                map_offset = 0;
        }
@@ -988,6 +990,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
 
 static int online_memory_block(struct memory_block *mem, void *arg)
 {
+       mem->online_type = memhp_default_online_type;
        return device_online(&mem->dev);
 }
 
@@ -1060,7 +1063,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
        mem_hotplug_done();
 
        /* online pages if requested */
-       if (memhp_auto_online)
+       if (memhp_default_online_type != MMOP_OFFLINE)
                walk_memory_blocks(start, size, NULL, online_memory_block);
 
        return ret;
@@ -1317,7 +1320,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                        list_add_tail(&page->lru, &source);
                        if (!__PageMovable(page))
                                inc_node_page_state(page, NR_ISOLATED_ANON +
-                                                   page_is_file_cache(page));
+                                                   page_is_file_lru(page));
 
                } else {
                        pr_warn("failed to isolate pfn %lx\n", pfn);
index 5fb427a..48ba972 100644 (file)
@@ -127,6 +127,32 @@ static struct mempolicy default_policy = {
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @node is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+       int min_dist = INT_MAX, dist, n, min_node;
+
+       if (node == NUMA_NO_NODE || node_online(node))
+               return node;
+
+       min_node = node;
+       for_each_online_node(n) {
+               dist = node_distance(node, n);
+               if (dist < min_dist) {
+                       min_dist = dist;
+                       min_node = n;
+               }
+       }
+
+       return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+
 struct mempolicy *get_task_policy(struct task_struct *p)
 {
        struct mempolicy *pol = p->mempolicy;
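numa_map_to_online_node() gives callers a node that is guaranteed to be online when the requested one may not be. A hypothetical caller (identifiers below are illustrative, not part of this patch) could use it as:

    /* Illustrative only: fall back to the nearest online node. */
    int nid = numa_map_to_online_node(preferred_nid);
    struct page *page = __alloc_pages_node(nid, GFP_KERNEL, 0);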
@@ -442,6 +468,7 @@ static inline bool queue_pages_required(struct page *page,
  */
 static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
+       __releases(ptl)
 {
        int ret = 0;
        struct page *page;
@@ -627,7 +654,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 {
        int nr_updated;
 
-       nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+       nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
        if (nr_updated)
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 
@@ -678,8 +705,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
        if (flags & MPOL_MF_LAZY) {
                /* Similar to task_numa_work, skip inaccessible VMAs */
-               if (!is_vm_hugetlb_page(vma) &&
-                       (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+               if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
                        !(vma->vm_flags & VM_MIXEDMAP))
                        change_prot_numa(vma, start, endvma);
                return 1;
@@ -881,7 +907,6 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 
        switch (p->mode) {
        case MPOL_BIND:
-               /* Fall through */
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
@@ -897,12 +922,15 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 
 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 {
-       struct page *p;
+       struct page *p = NULL;
        int err;
 
        int locked = 1;
        err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
-       if (err >= 0) {
+       if (err == 0) {
+               /* E.g. GUP interrupted by fatal signal */
+               err = -EFAULT;
+       } else if (err > 0) {
                err = page_to_nid(p);
                put_page(p);
        }
@@ -1023,7 +1051,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
                if (!isolate_lru_page(head)) {
                        list_add_tail(&head->lru, pagelist);
                        mod_node_page_state(page_pgdat(head),
-                               NR_ISOLATED_ANON + page_is_file_cache(head),
+                               NR_ISOLATED_ANON + page_is_file_lru(head),
                                hpage_nr_pages(head));
                } else if (flags & MPOL_MF_STRICT) {
                        /*
@@ -2066,7 +2094,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
                break;
 
        case MPOL_BIND:
-               /* Fall through */
        case MPOL_INTERLEAVE:
                *mask =  mempolicy->v.nodes;
                break;
@@ -2333,7 +2360,6 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
        switch (a->mode) {
        case MPOL_BIND:
-               /* Fall through */
        case MPOL_INTERLEAVE:
                return !!nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
index 9b2c97c..bbf457c 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/pfn_t.h>
 #include <linux/swap.h>
+#include <linux/mmzone.h>
 #include <linux/swapops.h>
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 
 static DEFINE_XARRAY(pgmap_array);
 
+/*
+ * The memremap() and memremap_pages() interfaces are alternately used
+ * to map persistent memory namespaces. These interfaces place different
+ * constraints on the alignment and size of the mapping (namespace).
+ * memremap() can map individual PAGE_SIZE pages. memremap_pages() can
+ * only map subsections (2MB), and at least one architecture (PowerPC)
+ * the minimum mapping granularity of memremap_pages() is 16MB.
+ *
+ * The role of memremap_compat_align() is to communicate the minimum
+ * arch supported alignment of a namespace such that it can freely
+ * switch modes without violating the arch constraint. Namely, do not
+ * allow a namespace to be PAGE_SIZE aligned since that namespace may be
+ * reconfigured into a mode that requires SUBSECTION_SIZE alignment.
+ */
+#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
+unsigned long memremap_compat_align(void)
+{
+       return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
+
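A consumer of memremap_compat_align() would validate a proposed namespace layout against the arch minimum before committing to it; a hedged sketch (ns_start is an illustrative variable, not from this patch):

    /* Illustrative only: reject starts below the arch mapping granularity. */
    if (!IS_ALIGNED(ns_start, memremap_compat_align()))
            return -EINVAL;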
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
 EXPORT_SYMBOL(devmap_managed_key);
index 7ded070..7160c15 100644 (file)
@@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l)
                        put_page(page);
                } else {
                        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-                                       page_is_file_cache(page), -hpage_nr_pages(page));
+                                       page_is_file_lru(page), -hpage_nr_pages(page));
                        putback_lru_page(page);
                }
        }
@@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                entry = pte_to_swp_entry(*pvmw.pte);
                if (is_write_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
+               else if (pte_swp_uffd_wp(*pvmw.pte))
+                       pte = pte_mkuffd_wp(pte);
 
                if (unlikely(is_zone_device_page(new))) {
                        if (is_device_private_page(new)) {
                                entry = make_device_private_entry(new, pte_write(pte));
                                pte = swp_entry_to_pte(entry);
+                               if (pte_swp_uffd_wp(*pvmw.pte))
+                                       pte = pte_mkuffd_wp(pte);
                        }
                }
 
@@ -647,6 +651,14 @@ void migrate_page_states(struct page *newpage, struct page *page)
        if (PageWriteback(newpage))
                end_page_writeback(newpage);
 
+       /*
+        * PG_readahead shares the same bit with PG_reclaim.  The above
+        * end_page_writeback() may clear PG_readahead mistakenly, so set the
+        * bit after that.
+        */
+       if (PageReadahead(page))
+               SetPageReadahead(newpage);
+
        copy_page_owner(page, newpage);
 
        mem_cgroup_migrate(page, newpage);
@@ -1211,7 +1223,7 @@ out:
                 */
                if (likely(!__PageMovable(page)))
                        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-                                       page_is_file_cache(page), -hpage_nr_pages(page));
+                                       page_is_file_lru(page), -hpage_nr_pages(page));
        }
 
        /*
@@ -1518,9 +1530,6 @@ static int do_move_pages_to_node(struct mm_struct *mm,
 {
        int err;
 
-       if (list_empty(pagelist))
-               return 0;
-
        err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
                        MIGRATE_SYNC, MR_SYSCALL);
        if (err)
@@ -1587,7 +1596,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
                err = 1;
                list_add_tail(&head->lru, pagelist);
                mod_node_page_state(page_pgdat(head),
-                       NR_ISOLATED_ANON + page_is_file_cache(head),
+                       NR_ISOLATED_ANON + page_is_file_lru(head),
                        hpage_nr_pages(head));
        }
 out_putpage:
@@ -1602,6 +1611,32 @@ out:
        return err;
 }
 
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+               struct list_head *pagelist, int __user *status,
+               int start, int i, unsigned long nr_pages)
+{
+       int err;
+
+       if (list_empty(pagelist))
+               return 0;
+
+       err = do_move_pages_to_node(mm, pagelist, node);
+       if (err) {
+               /*
+                * Positive err means the number of failed
+                * pages to migrate.  Since we are going to
+                * abort and return the number of non-migrated
+                * pages, so need to incude the rest of the
+                * nr_pages that have not been attempted as
+                * well.
+                */
+               if (err > 0)
+                       err += nr_pages - i - 1;
+               return err;
+       }
+       return store_status(status, start, node, i - start);
+}
+
 /*
  * Migrate an array of page address onto an array of nodes and fill
  * the corresponding array of status.
@@ -1645,21 +1680,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                        current_node = node;
                        start = i;
                } else if (node != current_node) {
-                       err = do_move_pages_to_node(mm, &pagelist, current_node);
-                       if (err) {
-                               /*
-                                * Positive err means the number of failed
-                                * pages to migrate.  Since we are going to
-                                * abort and return the number of non-migrated
-                                * pages, so need to incude the rest of the
-                                * nr_pages that have not been attempted as
-                                * well.
-                                */
-                               if (err > 0)
-                                       err += nr_pages - i - 1;
-                               goto out;
-                       }
-                       err = store_status(status, start, current_node, i - start);
+                       err = move_pages_and_store_status(mm, current_node,
+                                       &pagelist, status, start, i, nr_pages);
                        if (err)
                                goto out;
                        start = i;
@@ -1673,49 +1695,29 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                err = add_page_for_migration(mm, addr, current_node,
                                &pagelist, flags & MPOL_MF_MOVE_ALL);
 
-               if (!err) {
-                       /* The page is already on the target node */
-                       err = store_status(status, i, current_node, 1);
-                       if (err)
-                               goto out_flush;
-                       continue;
-               } else if (err > 0) {
+               if (err > 0) {
                        /* The page is successfully queued for migration */
                        continue;
                }
 
-               err = store_status(status, i, err, 1);
+               /*
+                * If the page is already on the target node (!err), store the
+                * node; otherwise, store the err.
+                */
+               err = store_status(status, i, err ? : current_node, 1);
                if (err)
                        goto out_flush;
 
-               err = do_move_pages_to_node(mm, &pagelist, current_node);
-               if (err) {
-                       if (err > 0)
-                               err += nr_pages - i - 1;
+               err = move_pages_and_store_status(mm, current_node, &pagelist,
+                               status, start, i, nr_pages);
+               if (err)
                        goto out;
-               }
-               if (i > start) {
-                       err = store_status(status, start, current_node, i - start);
-                       if (err)
-                               goto out;
-               }
                current_node = NUMA_NO_NODE;
        }
 out_flush:
-       if (list_empty(&pagelist))
-               return err;
-
        /* Make sure we do not overwrite the existing error */
-       err1 = do_move_pages_to_node(mm, &pagelist, current_node);
-       /*
-        * Don't have to report non-attempted pages here since:
-        *     - If the above loop is done gracefully all pages have been
-        *       attempted.
-        *     - If the above loop is aborted it means a fatal error
-        *       happened, should return ret.
-        */
-       if (!err1)
-               err1 = store_status(status, start, current_node, i - start);
+       err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+                               status, start, i, nr_pages);
        if (err >= 0)
                err = err1;
 out:
@@ -1957,7 +1959,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
                return 0;
        }
 
-       page_lru = page_is_file_cache(page);
+       page_lru = page_is_file_lru(page);
        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
                                hpage_nr_pages(page));
 
@@ -1993,7 +1995,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
         * Don't migrate file pages that are mapped in multiple processes
         * with execute permissions as they are probably shared libraries.
         */
-       if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+       if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
            (vma->vm_flags & VM_EXEC))
                goto out;
 
@@ -2001,7 +2003,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
         * Also do not migrate dirty pages as not all filesystems can move
         * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
         */
-       if (page_is_file_cache(page) && PageDirty(page))
+       if (page_is_file_lru(page) && PageDirty(page))
                goto out;
 
        isolated = numamigrate_isolate_page(pgdat, page);
@@ -2016,7 +2018,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
                if (!list_empty(&migratepages)) {
                        list_del(&page->lru);
                        dec_node_page_state(page, NR_ISOLATED_ANON +
-                                       page_is_file_cache(page));
+                                       page_is_file_lru(page));
                        putback_lru_page(page);
                }
                isolated = 0;
@@ -2046,7 +2048,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        pg_data_t *pgdat = NODE_DATA(node);
        int isolated = 0;
        struct page *new_page = NULL;
-       int page_lru = page_is_file_cache(page);
+       int page_lru = page_is_file_lru(page);
        unsigned long start = address & HPAGE_PMD_MASK;
 
        new_page = alloc_pages_node(node,
@@ -2340,6 +2342,8 @@ again:
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pte))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pte))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, addr, ptep, swp_pte);
 
                        /*
index 5c91838..7da6991 100644 (file)
@@ -37,7 +37,7 @@ void __init mminit_verify_zonelist(void)
                struct zonelist *zonelist;
                int i, listid, zoneid;
 
-               BUG_ON(MAX_ZONELISTS > 2);
+               BUILD_BUG_ON(MAX_ZONELISTS > 2);
                for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
 
                        /* Identify the zone and nodelist */
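Since MAX_ZONELISTS is a compile-time constant, the switch to BUILD_BUG_ON() moves the check from runtime to build time: BUILD_BUG_ON(1) aborts the compile, while BUILD_BUG_ON(0) generates no code at all, so the invariant above now costs nothing at runtime.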
index 94ae183..8d77dbb 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1460,7 +1460,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
-                       /* fall through */
+                       fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
@@ -1487,8 +1487,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
-                       /* fall through */
+                       fallthrough;
                case MAP_PRIVATE:
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
@@ -2358,8 +2357,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                gap_addr = TASK_SIZE;
 
        next = vma->vm_next;
-       if (next && next->vm_start < gap_addr &&
-                       (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+       if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
                if (!(next->vm_flags & VM_GROWSUP))
                        return -ENOMEM;
                /* Check that both stack segments have the same anon_vma? */
@@ -2440,7 +2438,7 @@ int expand_downwards(struct vm_area_struct *vma,
        prev = vma->vm_prev;
        /* Check that both stack segments have the same anon_vma? */
        if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
-                       (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
+                       vma_is_accessible(prev)) {
                if (address - prev->vm_end < stack_guard_gap)
                        return -ENOMEM;
        }
index 311c0da..1d823b0 100644 (file)
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
-               int dirty_accountable, int prot_numa)
+               unsigned long cp_flags)
 {
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;
        int target_node = NUMA_NO_NODE;
+       bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
+       bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+       bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
        /*
         * Can be called with only the mmap_sem for reading by
@@ -98,7 +102,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 * it cannot move them all from MIGRATE_ASYNC
                                 * context.
                                 */
-                               if (page_is_file_cache(page) && PageDirty(page))
+                               if (page_is_file_lru(page) && PageDirty(page))
                                        continue;
 
                                /*
@@ -114,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        if (preserve_write)
                                ptent = pte_mk_savedwrite(ptent);
 
+                       if (uffd_wp) {
+                               ptent = pte_wrprotect(ptent);
+                               ptent = pte_mkuffd_wp(ptent);
+                       } else if (uffd_wp_resolve) {
+                               /*
+                                * Leave the write bit to be handled
+                                * by PF interrupt handler, then
+                                * things like COW could be properly
+                                * handled.
+                                */
+                               ptent = pte_clear_uffd_wp(ptent);
+                       }
+
                        /* Avoid taking write faults for known dirty pages */
                        if (dirty_accountable && pte_dirty(ptent) &&
                                        (pte_soft_dirty(ptent) ||
@@ -122,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        }
                        ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
                        pages++;
-               } else if (IS_ENABLED(CONFIG_MIGRATION)) {
+               } else if (is_swap_pte(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
+                       pte_t newpte;
 
                        if (is_write_migration_entry(entry)) {
-                               pte_t newpte;
                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
@@ -135,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
-                               set_pte_at(vma->vm_mm, addr, pte, newpte);
-
-                               pages++;
-                       }
-
-                       if (is_write_device_private_entry(entry)) {
-                               pte_t newpte;
-
+                               if (pte_swp_uffd_wp(oldpte))
+                                       newpte = pte_swp_mkuffd_wp(newpte);
+                       } else if (is_write_device_private_entry(entry)) {
                                /*
                                 * We do not preserve soft-dirtiness. See
                                 * copy_one_pte() for explanation.
                                 */
                                make_device_private_entry_read(&entry);
                                newpte = swp_entry_to_pte(entry);
-                               set_pte_at(vma->vm_mm, addr, pte, newpte);
+                               if (pte_swp_uffd_wp(oldpte))
+                                       newpte = pte_swp_mkuffd_wp(newpte);
+                       } else {
+                               newpte = oldpte;
+                       }
+
+                       if (uffd_wp)
+                               newpte = pte_swp_mkuffd_wp(newpte);
+                       else if (uffd_wp_resolve)
+                               newpte = pte_swp_clear_uffd_wp(newpte);
 
+                       if (!pte_same(oldpte, newpte)) {
+                               set_pte_at(vma->vm_mm, addr, pte, newpte);
                                pages++;
                        }
                }
@@ -188,7 +211,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
 
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                pud_t *pud, unsigned long addr, unsigned long end,
-               pgprot_t newprot, int dirty_accountable, int prot_numa)
+               pgprot_t newprot, unsigned long cp_flags)
 {
        pmd_t *pmd;
        unsigned long next;
@@ -229,7 +252,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        } else {
                                int nr_ptes = change_huge_pmd(vma, pmd, addr,
-                                               newprot, prot_numa);
+                                                             newprot, cp_flags);
 
                                if (nr_ptes) {
                                        if (nr_ptes == HPAGE_PMD_NR) {
@@ -244,7 +267,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                        /* fall through, the trans huge pmd just split */
                }
                this_pages = change_pte_range(vma, pmd, addr, next, newprot,
-                                dirty_accountable, prot_numa);
+                                             cp_flags);
                pages += this_pages;
 next:
                cond_resched();
@@ -260,7 +283,7 @@ next:
 
 static inline unsigned long change_pud_range(struct vm_area_struct *vma,
                p4d_t *p4d, unsigned long addr, unsigned long end,
-               pgprot_t newprot, int dirty_accountable, int prot_numa)
+               pgprot_t newprot, unsigned long cp_flags)
 {
        pud_t *pud;
        unsigned long next;
@@ -272,7 +295,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
                if (pud_none_or_clear_bad(pud))
                        continue;
                pages += change_pmd_range(vma, pud, addr, next, newprot,
-                                dirty_accountable, prot_numa);
+                                         cp_flags);
        } while (pud++, addr = next, addr != end);
 
        return pages;
@@ -280,7 +303,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
 
 static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
                pgd_t *pgd, unsigned long addr, unsigned long end,
-               pgprot_t newprot, int dirty_accountable, int prot_numa)
+               pgprot_t newprot, unsigned long cp_flags)
 {
        p4d_t *p4d;
        unsigned long next;
@@ -292,7 +315,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
                if (p4d_none_or_clear_bad(p4d))
                        continue;
                pages += change_pud_range(vma, p4d, addr, next, newprot,
-                                dirty_accountable, prot_numa);
+                                         cp_flags);
        } while (p4d++, addr = next, addr != end);
 
        return pages;
@@ -300,7 +323,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
 
 static unsigned long change_protection_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, pgprot_t newprot,
-               int dirty_accountable, int prot_numa)
+               unsigned long cp_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
@@ -317,7 +340,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                pages += change_p4d_range(vma, pgd, addr, next, newprot,
-                                dirty_accountable, prot_numa);
+                                         cp_flags);
        } while (pgd++, addr = next, addr != end);
 
        /* Only flush the TLB if we actually modified any entries: */
@@ -330,14 +353,17 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 
 unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, pgprot_t newprot,
-                      int dirty_accountable, int prot_numa)
+                      unsigned long cp_flags)
 {
        unsigned long pages;
 
+       BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+
        if (is_vm_hugetlb_page(vma))
                pages = hugetlb_change_protection(vma, start, end, newprot);
        else
-               pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+               pages = change_protection_range(vma, start, end, newprot,
+                                               cp_flags);
 
        return pages;
 }
@@ -459,7 +485,7 @@ success:
        vma_set_page_prot(vma);
 
        change_protection(vma, start, end, vma->vm_page_prot,
-                         dirty_accountable, 0);
+                         dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
 
        /*
         * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
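
The mprotect.c changes fold the dirty_accountable and prot_numa parameters into a single cp_flags word, add the userfaultfd write-protect bits, and reject the nonsensical case where both MM_CP_UFFD_WP and MM_CP_UFFD_WP_RESOLVE are set at once. A minimal userspace sketch of that flag-word pattern follows; the flag names only mirror the kernel's and the body just reports what was requested.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define CP_DIRTY_ACCT       (1UL << 0)
#define CP_PROT_NUMA        (1UL << 1)
#define CP_UFFD_WP          (1UL << 2)
#define CP_UFFD_WP_RESOLVE  (1UL << 3)
#define CP_UFFD_WP_ALL      (CP_UFFD_WP | CP_UFFD_WP_RESOLVE)

static unsigned long change_range(unsigned long len, unsigned long cp_flags)
{
	/* wrprotect and wrprotect-resolve cannot both be requested */
	assert((cp_flags & CP_UFFD_WP_ALL) != CP_UFFD_WP_ALL);

	/* decode the bits once, as change_pte_range() now does */
	bool dirty_accountable = cp_flags & CP_DIRTY_ACCT;
	bool uffd_wp           = cp_flags & CP_UFFD_WP;

	printf("len=%lu dirty_acct=%d uffd_wp=%d\n",
	       len, dirty_accountable, uffd_wp);
	return len;
}

int main(void)
{
	change_range(4096, CP_DIRTY_ACCT);
	change_range(4096, CP_UFFD_WP);
	return 0;
}
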
index e5f76da..114c56c 100644 (file)
@@ -74,6 +74,7 @@
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
+#include "page_reporting.h"
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -864,6 +865,78 @@ compaction_capture(struct capture_control *capc, struct page *page,
 }
 #endif /* CONFIG_COMPACTION */
 
+/* Used for pages not on another list */
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+                                   unsigned int order, int migratetype)
+{
+       struct free_area *area = &zone->free_area[order];
+
+       list_add(&page->lru, &area->free_list[migratetype]);
+       area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
+                                        unsigned int order, int migratetype)
+{
+       struct free_area *area = &zone->free_area[order];
+
+       list_add_tail(&page->lru, &area->free_list[migratetype]);
+       area->nr_free++;
+}
+
+/* Used for pages which are on another list */
+static inline void move_to_free_list(struct page *page, struct zone *zone,
+                                    unsigned int order, int migratetype)
+{
+       struct free_area *area = &zone->free_area[order];
+
+       list_move(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+                                          unsigned int order)
+{
+       /* clear reported state and update reported page count */
+       if (page_reported(page))
+               __ClearPageReported(page);
+
+       list_del(&page->lru);
+       __ClearPageBuddy(page);
+       set_page_private(page, 0);
+       zone->free_area[order].nr_free--;
+}
+
+/*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In case
+ * that is happening, add the free page to the tail of the list
+ * so it's less likely to be used soon and more likely to be merged
+ * as a higher order page
+ */
+static inline bool
+buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
+                  struct page *page, unsigned int order)
+{
+       struct page *higher_page, *higher_buddy;
+       unsigned long combined_pfn;
+
+       if (order >= MAX_ORDER - 2)
+               return false;
+
+       if (!pfn_valid_within(buddy_pfn))
+               return false;
+
+       combined_pfn = buddy_pfn & pfn;
+       higher_page = page + (combined_pfn - pfn);
+       buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
+       higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+
+       return pfn_valid_within(buddy_pfn) &&
+              page_is_buddy(higher_page, higher_buddy, order + 1);
+}
+
 /*
  * Freeing function for a buddy system allocator.
  *
@@ -891,13 +964,14 @@ compaction_capture(struct capture_control *capc, struct page *page,
 static inline void __free_one_page(struct page *page,
                unsigned long pfn,
                struct zone *zone, unsigned int order,
-               int migratetype)
+               int migratetype, bool report)
 {
-       unsigned long combined_pfn;
+       struct capture_control *capc = task_capc(zone);
        unsigned long uninitialized_var(buddy_pfn);
-       struct page *buddy;
+       unsigned long combined_pfn;
        unsigned int max_order;
-       struct capture_control *capc = task_capc(zone);
+       struct page *buddy;
+       bool to_tail;
 
        max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
 
@@ -932,7 +1006,7 @@ continue_merging:
                if (page_is_guard(buddy))
                        clear_page_guard(zone, buddy, order, migratetype);
                else
-                       del_page_from_free_area(buddy, &zone->free_area[order]);
+                       del_page_from_free_list(buddy, zone, order);
                combined_pfn = buddy_pfn & pfn;
                page = page + (combined_pfn - pfn);
                pfn = combined_pfn;
@@ -966,35 +1040,19 @@ continue_merging:
 done_merging:
        set_page_order(page, order);
 
-       /*
-        * If this is not the largest possible page, check if the buddy
-        * of the next-highest order is free. If it is, it's possible
-        * that pages are being freed that will coalesce soon. In case,
-        * that is happening, add the free page to the tail of the list
-        * so it's less likely to be used soon and more likely to be merged
-        * as a higher order page
-        */
-       if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
-                       && !is_shuffle_order(order)) {
-               struct page *higher_page, *higher_buddy;
-               combined_pfn = buddy_pfn & pfn;
-               higher_page = page + (combined_pfn - pfn);
-               buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
-               higher_buddy = higher_page + (buddy_pfn - combined_pfn);
-               if (pfn_valid_within(buddy_pfn) &&
-                   page_is_buddy(higher_page, higher_buddy, order + 1)) {
-                       add_to_free_area_tail(page, &zone->free_area[order],
-                                             migratetype);
-                       return;
-               }
-       }
-
        if (is_shuffle_order(order))
-               add_to_free_area_random(page, &zone->free_area[order],
-                               migratetype);
+               to_tail = shuffle_pick_tail();
+       else
+               to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
+
+       if (to_tail)
+               add_to_free_list_tail(page, zone, order, migratetype);
        else
-               add_to_free_area(page, &zone->free_area[order], migratetype);
+               add_to_free_list(page, zone, order, migratetype);
 
+       /* Notify page reporting subsystem of freed page */
+       if (report)
+               page_reporting_notify_free(order);
 }
 
 /*
@@ -1311,7 +1369,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                if (unlikely(isolated_pageblocks))
                        mt = get_pageblock_migratetype(page);
 
-               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+               __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
                trace_mm_page_pcpu_drain(page, 0, mt);
        }
        spin_unlock(&zone->lock);
@@ -1327,7 +1385,7 @@ static void free_one_page(struct zone *zone,
                is_migrate_isolate(migratetype))) {
                migratetype = get_pfnblock_migratetype(page, pfn);
        }
-       __free_one_page(page, pfn, zone, order, migratetype);
+       __free_one_page(page, pfn, zone, order, migratetype, true);
        spin_unlock(&zone->lock);
 }
 
@@ -2008,13 +2066,11 @@ void __init init_cma_reserved_pageblock(struct page *page)
  * -- nyc
  */
 static inline void expand(struct zone *zone, struct page *page,
-       int low, int high, struct free_area *area,
-       int migratetype)
+       int low, int high, int migratetype)
 {
        unsigned long size = 1 << high;
 
        while (high > low) {
-               area--;
                high--;
                size >>= 1;
                VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
@@ -2028,7 +2084,7 @@ static inline void expand(struct zone *zone, struct page *page,
                if (set_page_guard(zone, &page[size], high, migratetype))
                        continue;
 
-               add_to_free_area(&page[size], area, migratetype);
+               add_to_free_list(&page[size], zone, high, migratetype);
                set_page_order(&page[size], high);
        }
 }
@@ -2186,8 +2242,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                page = get_page_from_free_area(area, migratetype);
                if (!page)
                        continue;
-               del_page_from_free_area(page, area);
-               expand(zone, page, order, current_order, area, migratetype);
+               del_page_from_free_list(page, zone, current_order);
+               expand(zone, page, order, current_order, migratetype);
                set_pcppage_migratetype(page, migratetype);
                return page;
        }
@@ -2261,7 +2317,7 @@ static int move_freepages(struct zone *zone,
                VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
                order = page_order(page);
-               move_to_free_area(page, &zone->free_area[order], migratetype);
+               move_to_free_list(page, zone, order, migratetype);
                page += 1 << order;
                pages_moved += 1 << order;
        }
@@ -2377,7 +2433,6 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
                unsigned int alloc_flags, int start_type, bool whole_block)
 {
        unsigned int current_order = page_order(page);
-       struct free_area *area;
        int free_pages, movable_pages, alike_pages;
        int old_block_type;
 
@@ -2448,8 +2503,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
        return;
 
 single_page:
-       area = &zone->free_area[current_order];
-       move_to_free_area(page, area, start_type);
+       move_to_free_list(page, zone, current_order, start_type);
 }
 
 /*
@@ -3120,7 +3174,6 @@ EXPORT_SYMBOL_GPL(split_page);
 
 int __isolate_free_page(struct page *page, unsigned int order)
 {
-       struct free_area *area = &page_zone(page)->free_area[order];
        unsigned long watermark;
        struct zone *zone;
        int mt;
@@ -3146,7 +3199,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 
        /* Remove page from free list */
 
-       del_page_from_free_area(page, area);
+       del_page_from_free_list(page, zone, order);
 
        /*
         * Set the pageblock if the isolated page is at least half of a
@@ -3167,6 +3220,25 @@ int __isolate_free_page(struct page *page, unsigned int order)
        return 1UL << order;
 }
 
+/**
+ * __putback_isolated_page - Return a now-isolated page back where we got it
+ * @page: Page that was isolated
+ * @order: Order of the isolated page
+ *
+ * This function is meant to return a page pulled from the free lists via
+ * __isolate_free_page back to the free lists they were pulled from.
+ */
+void __putback_isolated_page(struct page *page, unsigned int order, int mt)
+{
+       struct zone *zone = page_zone(page);
+
+       /* zone lock should be held when this function is called */
+       lockdep_assert_held(&zone->lock);
+
+       /* Return isolated page to tail of freelist. */
+       __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+}
+
 /*
  * Update NUMA hit/miss statistics
  *
@@ -8713,7 +8785,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
                BUG_ON(!PageBuddy(page));
                order = page_order(page);
                offlined_pages += 1 << order;
-               del_page_from_free_area(page, &zone->free_area[order]);
+               del_page_from_free_list(page, zone, order);
                pfn += (1 << order);
        }
        spin_unlock_irqrestore(&zone->lock, flags);
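
The page_alloc.c hunks above replace the open-coded free-area manipulation with add_to_free_list()/del_page_from_free_list() helpers, move the "will this buddy merge soon?" heuristic into buddy_merge_likely(), and add __putback_isolated_page() (used by page isolation and the new page reporting code) to return an isolated page to the free lists under the zone lock. The pfn arithmetic behind the buddy heuristic is simple enough to show on its own; this userspace sketch mirrors the XOR calculation of __find_buddy_pfn() with an arbitrary example pfn.

#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	/* a block's buddy differs from it in exactly one pfn bit */
	return pfn ^ (1UL << order);
}

int main(void)
{
	unsigned long pfn = 0x1234;   /* hypothetical free block */
	unsigned int order = 2;       /* 4-page block */

	unsigned long buddy_pfn = find_buddy_pfn(pfn, order);
	unsigned long combined_pfn = buddy_pfn & pfn;

	printf("block %#lx order %u: buddy %#lx, merged block starts at %#lx\n",
	       pfn, order, buddy_pfn, combined_pfn);

	/* one level up: the buddy the merged block would pair with next */
	printf("order %u buddy of merged block: %#lx\n",
	       order + 1, find_buddy_pfn(combined_pfn, order + 1));
	return 0;
}
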
index 08ded03..a3616f7 100644 (file)
@@ -303,11 +303,8 @@ static int __meminit online_page_ext(unsigned long start_pfn,
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }
 
-       for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
-               if (!pfn_in_present_section(pfn))
-                       continue;
+       for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
                fail = init_section_page_ext(pfn, nid);
-       }
        if (!fail)
                return 0;
 
index a9fd7c7..2c11a38 100644 (file)
@@ -117,13 +117,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
                __mod_zone_freepage_state(zone, nr_pages, migratetype);
        }
        set_pageblock_migratetype(page, migratetype);
+       if (isolated_page)
+               __putback_isolated_page(page, order, migratetype);
        zone->nr_isolate_pageblock--;
 out:
        spin_unlock_irqrestore(&zone->lock, flags);
-       if (isolated_page) {
-               post_alloc_hook(page, order, __GFP_MOVABLE);
-               __free_pages(page, order);
-       }
 }
 
 static inline struct page *
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
new file mode 100644 (file)
index 0000000..3bbd471
--- /dev/null
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/page_reporting.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+
+#include "page_reporting.h"
+#include "internal.h"
+
+#define PAGE_REPORTING_DELAY   (2 * HZ)
+static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
+
+enum {
+       PAGE_REPORTING_IDLE = 0,
+       PAGE_REPORTING_REQUESTED,
+       PAGE_REPORTING_ACTIVE
+};
+
+/* request page reporting */
+static void
+__page_reporting_request(struct page_reporting_dev_info *prdev)
+{
+       unsigned int state;
+
+       /* Check to see if we are in desired state */
+       state = atomic_read(&prdev->state);
+       if (state == PAGE_REPORTING_REQUESTED)
+               return;
+
+       /*
+        *  If reporting is already active there is nothing we need to do.
+        *  Test against 0 as that represents PAGE_REPORTING_IDLE.
+        */
+       state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
+       if (state != PAGE_REPORTING_IDLE)
+               return;
+
+       /*
+        * Delay the start of work to allow a sizable queue to build. For
+        * now we are limiting this to running no more than once every
+        * couple of seconds.
+        */
+       schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+/* notify prdev of free page reporting request */
+void __page_reporting_notify(void)
+{
+       struct page_reporting_dev_info *prdev;
+
+       /*
+        * We use RCU to protect the pr_dev_info pointer. In almost all
+        * cases this should be present, however in the unlikely case of
+        * a shutdown this will be NULL and we should exit.
+        */
+       rcu_read_lock();
+       prdev = rcu_dereference(pr_dev_info);
+       if (likely(prdev))
+               __page_reporting_request(prdev);
+
+       rcu_read_unlock();
+}
+
+static void
+page_reporting_drain(struct page_reporting_dev_info *prdev,
+                    struct scatterlist *sgl, unsigned int nents, bool reported)
+{
+       struct scatterlist *sg = sgl;
+
+       /*
+        * Drain the now reported pages back into their respective
+        * free lists/areas. We assume at least one page is populated.
+        */
+       do {
+               struct page *page = sg_page(sg);
+               int mt = get_pageblock_migratetype(page);
+               unsigned int order = get_order(sg->length);
+
+               __putback_isolated_page(page, order, mt);
+
+               /* If the pages were not reported due to error skip flagging */
+               if (!reported)
+                       continue;
+
+               /*
+                * If page was not comingled with another page we can
+                * consider the result to be "reported" since the page
+                * hasn't been modified, otherwise we will need to
+                * report on the new larger page when we make our way
+                * up to that higher order.
+                */
+               if (PageBuddy(page) && page_order(page) == order)
+                       __SetPageReported(page);
+       } while ((sg = sg_next(sg)));
+
+       /* reinitialize scatterlist now that it is empty */
+       sg_init_table(sgl, nents);
+}
+
+/*
+ * The page reporting cycle consists of 4 stages, fill, report, drain, and
+ * idle. We will cycle through the first 3 stages until we cannot obtain a
+ * full scatterlist of pages, in that case we will switch to idle.
+ */
+static int
+page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
+                    unsigned int order, unsigned int mt,
+                    struct scatterlist *sgl, unsigned int *offset)
+{
+       struct free_area *area = &zone->free_area[order];
+       struct list_head *list = &area->free_list[mt];
+       unsigned int page_len = PAGE_SIZE << order;
+       struct page *page, *next;
+       long budget;
+       int err = 0;
+
+       /*
+        * Perform early check, if free area is empty there is
+        * nothing to process so we can skip this free_list.
+        */
+       if (list_empty(list))
+               return err;
+
+       spin_lock_irq(&zone->lock);
+
+       /*
+        * Limit how many calls we will be making to the page reporting
+        * device for this list. By doing this we avoid processing any
+        * given list for too long.
+        *
+        * The current value used allows us enough calls to process over a
+        * sixteenth of the current list plus one additional call to handle
+        * any pages that may have already been present from the previous
+        * list processed. This should result in us reporting all pages on
+        * an idle system in about 30 seconds.
+        *
+        * The division here should be cheap since PAGE_REPORTING_CAPACITY
+        * should always be a power of 2.
+        */
+       budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
+
+       /* loop through free list adding unreported pages to sg list */
+       list_for_each_entry_safe(page, next, list, lru) {
+               /* We are going to skip over the reported pages. */
+               if (PageReported(page))
+                       continue;
+
+               /*
+                * If we fully consumed our budget then update our
+                * state to indicate that we are requesting additional
+                * processing and exit this list.
+                */
+               if (budget < 0) {
+                       atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
+                       next = page;
+                       break;
+               }
+
+               /* Attempt to pull page from list and place in scatterlist */
+               if (*offset) {
+                       if (!__isolate_free_page(page, order)) {
+                               next = page;
+                               break;
+                       }
+
+                       /* Add page to scatter list */
+                       --(*offset);
+                       sg_set_page(&sgl[*offset], page, page_len, 0);
+
+                       continue;
+               }
+
+               /*
+                * Make the first non-reported page in the free list
+                * the new head of the free list before we release the
+                * zone lock.
+                */
+               if (&page->lru != list && !list_is_first(&page->lru, list))
+                       list_rotate_to_front(&page->lru, list);
+
+               /* release lock before waiting on report processing */
+               spin_unlock_irq(&zone->lock);
+
+               /* begin processing pages in local list */
+               err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
+
+               /* reset offset since the full list was reported */
+               *offset = PAGE_REPORTING_CAPACITY;
+
+               /* update budget to reflect call to report function */
+               budget--;
+
+               /* reacquire zone lock and resume processing */
+               spin_lock_irq(&zone->lock);
+
+               /* flush reported pages from the sg list */
+               page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
+
+               /*
+                * Reset next to first entry, the old next isn't valid
+                * since we dropped the lock to report the pages
+                */
+               next = list_first_entry(list, struct page, lru);
+
+               /* exit on error */
+               if (err)
+                       break;
+       }
+
+       /* Rotate any leftover pages to the head of the freelist */
+       if (&next->lru != list && !list_is_first(&next->lru, list))
+               list_rotate_to_front(&next->lru, list);
+
+       spin_unlock_irq(&zone->lock);
+
+       return err;
+}
+
+static int
+page_reporting_process_zone(struct page_reporting_dev_info *prdev,
+                           struct scatterlist *sgl, struct zone *zone)
+{
+       unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
+       unsigned long watermark;
+       int err = 0;
+
+       /* Generate minimum watermark to be able to guarantee progress */
+       watermark = low_wmark_pages(zone) +
+                   (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+
+       /*
+        * Cancel request if insufficient free memory or if we failed
+        * to allocate page reporting statistics for the zone.
+        */
+       if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
+               return err;
+
+       /* Process each free list starting from lowest order/mt */
+       for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+               for (mt = 0; mt < MIGRATE_TYPES; mt++) {
+                       /* We do not pull pages from the isolate free list */
+                       if (is_migrate_isolate(mt))
+                               continue;
+
+                       err = page_reporting_cycle(prdev, zone, order, mt,
+                                                  sgl, &offset);
+                       if (err)
+                               return err;
+               }
+       }
+
+       /* report the leftover pages before going idle */
+       leftover = PAGE_REPORTING_CAPACITY - offset;
+       if (leftover) {
+               sgl = &sgl[offset];
+               err = prdev->report(prdev, sgl, leftover);
+
+               /* flush any remaining pages out from the last report */
+               spin_lock_irq(&zone->lock);
+               page_reporting_drain(prdev, sgl, leftover, !err);
+               spin_unlock_irq(&zone->lock);
+       }
+
+       return err;
+}
+
+static void page_reporting_process(struct work_struct *work)
+{
+       struct delayed_work *d_work = to_delayed_work(work);
+       struct page_reporting_dev_info *prdev =
+               container_of(d_work, struct page_reporting_dev_info, work);
+       int err = 0, state = PAGE_REPORTING_ACTIVE;
+       struct scatterlist *sgl;
+       struct zone *zone;
+
+       /*
+        * Change the state to "Active" so that we can track whether anyone
+        * requests page reporting after we complete our pass. If
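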
+        * the state is not altered by the end of the pass we will switch
+        * to idle and quit scheduling reporting runs.
+        */
+       atomic_set(&prdev->state, state);
+
+       /* allocate scatterlist to store pages being reported on */
+       sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
+       if (!sgl)
+               goto err_out;
+
+       sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
+
+       for_each_zone(zone) {
+               err = page_reporting_process_zone(prdev, sgl, zone);
+               if (err)
+                       break;
+       }
+
+       kfree(sgl);
+err_out:
+       /*
+        * If the state has reverted back to requested then there may be
+        * additional pages to be processed. We will defer for 2s to allow
+        * more pages to accumulate.
+        */
+       state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
+       if (state == PAGE_REPORTING_REQUESTED)
+               schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+}
+
+static DEFINE_MUTEX(page_reporting_mutex);
+DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
+
+int page_reporting_register(struct page_reporting_dev_info *prdev)
+{
+       int err = 0;
+
+       mutex_lock(&page_reporting_mutex);
+
+       /* nothing to do if already in use */
+       if (rcu_access_pointer(pr_dev_info)) {
+               err = -EBUSY;
+               goto err_out;
+       }
+
+       /* initialize state and work structures */
+       atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
+       INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);
+
+       /* Begin initial flush of zones */
+       __page_reporting_request(prdev);
+
+       /* Assign device to allow notifications */
+       rcu_assign_pointer(pr_dev_info, prdev);
+
+       /* enable page reporting notification */
+       if (!static_key_enabled(&page_reporting_enabled)) {
+               static_branch_enable(&page_reporting_enabled);
+               pr_info("Free page reporting enabled\n");
+       }
+err_out:
+       mutex_unlock(&page_reporting_mutex);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(page_reporting_register);
+
+void page_reporting_unregister(struct page_reporting_dev_info *prdev)
+{
+       mutex_lock(&page_reporting_mutex);
+
+       if (rcu_access_pointer(pr_dev_info) == prdev) {
+               /* Disable page reporting notification */
+               RCU_INIT_POINTER(pr_dev_info, NULL);
+               synchronize_rcu();
+
+               /* Flush any existing work, and lock it out */
+               cancel_delayed_work_sync(&prdev->work);
+       }
+
+       mutex_unlock(&page_reporting_mutex);
+}
+EXPORT_SYMBOL_GPL(page_reporting_unregister);
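
mm/page_reporting.c drives everything from a small three-state machine: a free-page notification moves the device from idle to requested, the worker marks itself active, and at the end of a pass it only goes back to idle if nobody requested another run in the meantime. The userspace sketch below models just those transitions with C11 atomics; the scheduling side (delayed work, the 2 second delay) is reduced to printf calls.

#include <stdatomic.h>
#include <stdio.h>

enum { IDLE, REQUESTED, ACTIVE };

static atomic_int state = IDLE;

static void request_reporting(void)
{
	/* only the transition out of IDLE schedules new work */
	if (atomic_exchange(&state, REQUESTED) == IDLE)
		printf("work scheduled\n");
	else
		printf("already requested/active, nothing to do\n");
}

static void process_reporting(void)
{
	atomic_store(&state, ACTIVE);
	printf("reporting pass runs here\n");

	/* go idle unless someone requested another pass meanwhile */
	int expected = ACTIVE;
	if (!atomic_compare_exchange_strong(&state, &expected, IDLE))
		printf("new request arrived, rescheduling\n");
}

int main(void)
{
	request_reporting();   /* IDLE -> REQUESTED, schedules work */
	request_reporting();   /* no-op, already requested */
	process_reporting();   /* ACTIVE, then back to IDLE */
	return 0;
}
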
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
new file mode 100644 (file)
index 0000000..aa6d37f
--- /dev/null
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_PAGE_REPORTING_H
+#define _MM_PAGE_REPORTING_H
+
+#include <linux/mmzone.h>
+#include <linux/pageblock-flags.h>
+#include <linux/page-isolation.h>
+#include <linux/jump_label.h>
+#include <linux/slab.h>
+#include <asm/pgtable.h>
+#include <linux/scatterlist.h>
+
+#define PAGE_REPORTING_MIN_ORDER       pageblock_order
+
+#ifdef CONFIG_PAGE_REPORTING
+DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
+void __page_reporting_notify(void);
+
+static inline bool page_reported(struct page *page)
+{
+       return static_branch_unlikely(&page_reporting_enabled) &&
+              PageReported(page);
+}
+
+/**
+ * page_reporting_notify_free - Free page notification to start page processing
+ *
+ * This function is meant to act as a screener for __page_reporting_notify
+ * which will determine if a given zone has crossed over the high-water mark
+ * that will justify us beginning page treatment. If we have crossed that
+ * threshold then it will start the process of pulling some pages and
+ * placing them in the batch list for treatment.
+ */
+static inline void page_reporting_notify_free(unsigned int order)
+{
+       /* Called from hot path in __free_one_page() */
+       if (!static_branch_unlikely(&page_reporting_enabled))
+               return;
+
+       /* Determine if we have crossed reporting threshold */
+       if (order < PAGE_REPORTING_MIN_ORDER)
+               return;
+
+       /* This will add a few cycles, but should be called infrequently */
+       __page_reporting_notify();
+}
+#else /* CONFIG_PAGE_REPORTING */
+#define page_reported(_page)   false
+
+static inline void page_reporting_notify_free(unsigned int order)
+{
+}
+#endif /* CONFIG_PAGE_REPORTING */
+#endif /*_MM_PAGE_REPORTING_H */
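
Two throttles keep reporting cheap on the hot path: page_reporting_notify_free() above ignores anything below PAGE_REPORTING_MIN_ORDER, and page_reporting_cycle() gives each free list a budget of roughly one sixteenth of its length per pass. The arithmetic is easy to check in isolation; in the sketch below MIN_ORDER and CAPACITY are assumed example values (pageblock order 9 and a 32-entry scatterlist), not quotations of the kernel's constants.

#include <stdio.h>

#define MIN_ORDER 9    /* assumed: pageblock_order == 9, i.e. 2 MiB blocks */
#define CAPACITY  32   /* assumed scatterlist capacity per report call */

static long list_budget(unsigned long nr_free)
{
	/* DIV_ROUND_UP(nr_free, CAPACITY * 16) */
	return (nr_free + CAPACITY * 16 - 1) / (CAPACITY * 16);
}

int main(void)
{
	unsigned int order = 10;

	if (order < MIN_ORDER) {
		printf("order %u below threshold, no notification\n", order);
		return 0;
	}
	/* each call covers CAPACITY blocks, so 20 calls span ~1/16 of the list */
	printf("free list of 10000 blocks -> budget of %ld report calls\n",
	       list_budget(10000));
	return 0;
}
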
index 2df75a1..f79a206 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -275,19 +275,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
        struct anon_vma_chain *avc, *pavc;
        struct anon_vma *root = NULL;
-       struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
-
-       /*
-        * If parent share anon_vma with its vm_prev, keep this sharing in in
-        * child.
-        *
-        * 1. Parent has vm_prev, which implies we have vm_prev.
-        * 2. Parent and its vm_prev have the same anon_vma.
-        */
-       if (!dst->anon_vma && src->anon_vma &&
-           pprev && pprev->anon_vma == src->anon_vma)
-               dst->anon_vma = prev->anon_vma;
-
 
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;
@@ -946,7 +933,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                        set_pte_at(vma->vm_mm, address, pte, entry);
                        ret = 1;
                } else {
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        pmd_t *pmd = pvmw.pmd;
                        pmd_t entry;
 
@@ -1385,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        struct page *subpage;
        bool ret = true;
        struct mmu_notifier_range range;
-       enum ttu_flags flags = (enum ttu_flags)arg;
+       enum ttu_flags flags = (enum ttu_flags)(long)arg;
 
        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1515,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
@@ -1614,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /*
                         * No need to invalidate here it will synchronize on
@@ -1680,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                       if (pte_uffd_wp(pteval))
+                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                        /* Invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
index f47347c..d722eb8 100644 (file)
@@ -410,7 +410,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 #define SHMEM_HUGE_DENY                (-1)
 #define SHMEM_HUGE_FORCE       (-2)
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* ifdef here to avoid bloating shmem.o when not necessary */
 
 static int shmem_huge __read_mostly;
@@ -580,7 +580,7 @@ static long shmem_unused_huge_count(struct super_block *sb,
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        return READ_ONCE(sbinfo->shrinklist_len);
 }
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define shmem_huge SHMEM_HUGE_DENY
 
@@ -589,11 +589,11 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 {
        return 0;
 }
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
 {
-       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
            (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
            shmem_huge != SHMEM_HUGE_DENY)
                return true;
@@ -789,6 +789,32 @@ void shmem_unlock_mapping(struct address_space *mapping)
 }
 
 /*
+ * Check whether a hole-punch or truncation needs to split a huge page,
+ * returning true if no split was required, or the split has been successful.
+ *
+ * Eviction (or truncation to 0 size) should never need to split a huge page;
+ * but in rare cases might do so, if shmem_undo_range() failed to trylock on
+ * head, and then succeeded to trylock on tail.
+ *
+ * A split can only succeed when there are no additional references on the
+ * huge page: so the split below relies upon find_get_entries() having stopped
+ * when it found a subpage of the huge page, without getting further references.
+ */
+static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+{
+       if (!PageTransCompound(page))
+               return true;
+
+       /* Just proceed to delete a huge page wholly within the range punched */
+       if (PageHead(page) &&
+           page->index >= start && page->index + HPAGE_PMD_NR <= end)
+               return true;
+
+       /* Try to split huge page, so we can truly punch the hole or truncate */
+       return split_huge_page(page) >= 0;
+}
+
+/*
  * Remove range of pages and swap entries from page cache, and free them.
  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  */
@@ -838,31 +864,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                        if (!trylock_page(page))
                                continue;
 
-                       if (PageTransTail(page)) {
-                               /* Middle of THP: zero out the page */
-                               clear_highpage(page);
-                               unlock_page(page);
-                               continue;
-                       } else if (PageTransHuge(page)) {
-                               if (index == round_down(end, HPAGE_PMD_NR)) {
-                                       /*
-                                        * Range ends in the middle of THP:
-                                        * zero out the page
-                                        */
-                                       clear_highpage(page);
-                                       unlock_page(page);
-                                       continue;
-                               }
-                               index += HPAGE_PMD_NR - 1;
-                               i += HPAGE_PMD_NR - 1;
-                       }
-
-                       if (!unfalloc || !PageUptodate(page)) {
-                               VM_BUG_ON_PAGE(PageTail(page), page);
-                               if (page_mapping(page) == mapping) {
-                                       VM_BUG_ON_PAGE(PageWriteback(page), page);
+                       if ((!unfalloc || !PageUptodate(page)) &&
+                           page_mapping(page) == mapping) {
+                               VM_BUG_ON_PAGE(PageWriteback(page), page);
+                               if (shmem_punch_compound(page, start, end))
                                        truncate_inode_page(mapping, page);
-                               }
                        }
                        unlock_page(page);
                }
@@ -936,43 +942,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
                        lock_page(page);
 
-                       if (PageTransTail(page)) {
-                               /* Middle of THP: zero out the page */
-                               clear_highpage(page);
-                               unlock_page(page);
-                               /*
-                                * Partial thp truncate due 'start' in middle
-                                * of THP: don't need to look on these pages
-                                * again on !pvec.nr restart.
-                                */
-                               if (index != round_down(end, HPAGE_PMD_NR))
-                                       start++;
-                               continue;
-                       } else if (PageTransHuge(page)) {
-                               if (index == round_down(end, HPAGE_PMD_NR)) {
-                                       /*
-                                        * Range ends in the middle of THP:
-                                        * zero out the page
-                                        */
-                                       clear_highpage(page);
-                                       unlock_page(page);
-                                       continue;
-                               }
-                               index += HPAGE_PMD_NR - 1;
-                               i += HPAGE_PMD_NR - 1;
-                       }
-
                        if (!unfalloc || !PageUptodate(page)) {
-                               VM_BUG_ON_PAGE(PageTail(page), page);
-                               if (page_mapping(page) == mapping) {
-                                       VM_BUG_ON_PAGE(PageWriteback(page), page);
-                                       truncate_inode_page(mapping, page);
-                               } else {
+                               if (page_mapping(page) != mapping) {
                                        /* Page was replaced by swap: retry */
                                        unlock_page(page);
                                        index--;
                                        break;
                                }
+                               VM_BUG_ON_PAGE(PageWriteback(page), page);
+                               if (shmem_punch_compound(page, start, end))
+                                       truncate_inode_page(mapping, page);
+                               else {
+                                       /* Wipe the page and don't get stuck */
+                                       clear_highpage(page);
+                                       flush_dcache_page(page);
+                                       set_page_dirty(page);
+                                       if (index <
+                                           round_up(start, HPAGE_PMD_NR))
+                                               start = index + 1;
+                               }
                        }
                        unlock_page(page);
                }
@@ -1059,7 +1047,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
                         * Part of the huge page can be beyond i_size: subject
                         * to shrink under memory pressure.
                         */
-                       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
+                       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                                spin_lock(&sbinfo->shrinklist_lock);
                                /*
                                 * _careful to defend against unlocked access to
@@ -1472,9 +1460,6 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
        pgoff_t hindex;
        struct page *page;
 
-       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
-               return NULL;
-
        hindex = round_down(index, HPAGE_PMD_NR);
        if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
                                                                XA_PRESENT))
@@ -1486,6 +1471,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
        shmem_pseudo_vma_destroy(&pvma);
        if (page)
                prep_transhuge_page(page);
+       else
+               count_vm_event(THP_FILE_FALLBACK);
        return page;
 }
 
@@ -1511,7 +1498,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
        int nr;
        int err = -ENOSPC;
 
-       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                huge = false;
        nr = huge ? HPAGE_PMD_NR : 1;
 
@@ -1813,17 +1800,20 @@ repeat:
        if (shmem_huge == SHMEM_HUGE_FORCE)
                goto alloc_huge;
        switch (sbinfo->huge) {
-               loff_t i_size;
-               pgoff_t off;
        case SHMEM_HUGE_NEVER:
                goto alloc_nohuge;
-       case SHMEM_HUGE_WITHIN_SIZE:
+       case SHMEM_HUGE_WITHIN_SIZE: {
+               loff_t i_size;
+               pgoff_t off;
+
                off = round_up(index, HPAGE_PMD_NR);
                i_size = round_up(i_size_read(inode), PAGE_SIZE);
                if (i_size >= HPAGE_PMD_SIZE &&
                    i_size >> PAGE_SHIFT >= off)
                        goto alloc_huge;
-               /* fallthrough */
+
+               fallthrough;
+       }
        case SHMEM_HUGE_ADVISE:
                if (sgp_huge == SGP_HUGE)
                        goto alloc_huge;
@@ -1871,8 +1861,13 @@ alloc_nohuge:
 
        error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
                                            PageTransHuge(page));
-       if (error)
+       if (error) {
+               if (PageTransHuge(page)) {
+                       count_vm_event(THP_FILE_FALLBACK);
+                       count_vm_event(THP_FILE_FALLBACK_CHARGE);
+               }
                goto unacct;
+       }
        error = shmem_add_to_page_cache(page, mapping, hindex,
                                        NULL, gfp & GFP_RECLAIM_MASK);
        if (error) {
@@ -2089,7 +2084,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
        get_area = current->mm->get_unmapped_area;
        addr = get_area(file, uaddr, len, pgoff, flags);
 
-       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return addr;
        if (IS_ERR_VALUE(addr))
                return addr;
@@ -2228,7 +2223,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 
        file_accessed(file);
        vma->vm_ops = &shmem_vm_ops;
-       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
                        (vma->vm_end & HPAGE_PMD_MASK)) {
                khugepaged_enter(vma, vma->vm_flags);
@@ -3113,12 +3108,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 
        error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             shmem_initxattrs, NULL);
-       if (error) {
-               if (error != -EOPNOTSUPP) {
-                       iput(inode);
-                       return error;
-               }
-               error = 0;
+       if (error && error != -EOPNOTSUPP) {
+               iput(inode);
+               return error;
        }
 
        inode->i_size = len-1;
@@ -3455,7 +3447,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
        case Opt_huge:
                ctx->huge = result.uint_32;
                if (ctx->huge != SHMEM_HUGE_NEVER &&
-                   !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+                   !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                      has_transparent_hugepage()))
                        goto unsupported_parameter;
                ctx->seen |= SHMEM_SEEN_HUGE;
@@ -3601,7 +3593,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                                from_kgid_munged(&init_user_ns, sbinfo->gid));
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
        if (sbinfo->huge)
                seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
@@ -3846,7 +3838,7 @@ static const struct super_operations shmem_ops = {
        .evict_inode    = shmem_evict_inode,
        .drop_inode     = generic_delete_inode,
        .put_super      = shmem_put_super,
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        .nr_cached_objects      = shmem_unused_huge_count,
        .free_cached_objects    = shmem_unused_huge_scan,
 #endif
@@ -3908,7 +3900,7 @@ int __init shmem_init(void)
                goto out1;
        }
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
        else
@@ -3924,7 +3916,7 @@ out2:
        return error;
 }
 
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
 static ssize_t shmem_enabled_show(struct kobject *kobj,
                struct kobj_attribute *attr, char *buf)
 {
@@ -3976,9 +3968,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
 
 struct kobj_attribute shmem_enabled_attr =
        __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
 
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 bool shmem_huge_enabled(struct vm_area_struct *vma)
 {
        struct inode *inode = file_inode(vma->vm_file);
@@ -4004,7 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
                        if (i_size >= HPAGE_PMD_SIZE &&
                                        i_size >> PAGE_SHIFT >= off)
                                return true;
-                       /* fall through */
+                       fallthrough;
                case SHMEM_HUGE_ADVISE:
                        /* TODO: implement fadvise() hints */
                        return (vma->vm_flags & VM_HUGEPAGE);
@@ -4013,7 +4005,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
                        return false;
        }
 }
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #else /* !CONFIG_SHMEM */
 
@@ -4182,7 +4174,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
        vma->vm_file = file;
        vma->vm_ops = &shmem_vm_ops;
 
-       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
                        (vma->vm_end & HPAGE_PMD_MASK)) {
                khugepaged_enter(vma, vma->vm_flags);
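
The shmem.c rework replaces the old "zero out partial THP pages" logic with shmem_punch_compound(), whose comment above spells out the rule: a huge page wholly inside the punched range can be truncated as-is, anything else must be split first (or, if the split fails, wiped and marked dirty). The containment test itself is plain index arithmetic; here is a userspace sketch of it, with HPAGE_NR standing in for HPAGE_PMD_NR (512 on x86-64 with 4 KiB pages).

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_NR 512UL

/* head_index plays the role of page->index of the compound head */
static bool punch_covers_huge_page(unsigned long head_index,
				   unsigned long start, unsigned long end)
{
	return head_index >= start && head_index + HPAGE_NR <= end;
}

int main(void)
{
	/* punch pages [0, 1024): huge page at index 0 is fully covered */
	printf("%d\n", punch_covers_huge_page(0, 0, 1024));   /* 1: truncate whole */
	/* punch pages [0, 600): huge page at index 512 is only partly covered */
	printf("%d\n", punch_covers_huge_page(512, 0, 600));  /* 0: must split */
	return 0;
}
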
index c716059..44406d9 100644 (file)
@@ -183,11 +183,11 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat)
                shuffle_zone(z);
 }
 
-void add_to_free_area_random(struct page *page, struct free_area *area,
-               int migratetype)
+bool shuffle_pick_tail(void)
 {
        static u64 rand;
        static u8 rand_bits;
+       bool ret;
 
        /*
         * The lack of locking is deliberate. If 2 threads race to
@@ -198,10 +198,10 @@ void add_to_free_area_random(struct page *page, struct free_area *area,
                rand = get_random_u64();
        }
 
-       if (rand & 1)
-               add_to_free_area(page, area, migratetype);
-       else
-               add_to_free_area_tail(page, area, migratetype);
+       ret = rand & 1;
+
        rand_bits--;
        rand >>= 1;
+
+       return ret;
 }
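
shuffle_pick_tail() now returns the head/tail decision instead of doing the list insertion itself, and it keeps the existing trick of batching randomness: fetch 64 random bits once, then consume one bit per call. A userspace sketch of that batching, with random64() standing in for get_random_u64(); as the kernel comment notes, racing updates of the cached word are harmless and only re-fetch a batch early.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t random64(void)
{
	/* illustrative source of 64 random-ish bits */
	return ((uint64_t)rand() << 32) ^ (uint64_t)rand();
}

static bool pick_tail(void)
{
	static uint64_t rand_word;
	static uint8_t rand_bits;
	bool ret;

	if (rand_bits == 0) {
		rand_bits = 64;
		rand_word = random64();
	}

	ret = rand_word & 1;
	rand_bits--;
	rand_word >>= 1;
	return ret;
}

int main(void)
{
	int tails = 0;

	for (int i = 0; i < 1000; i++)
		tails += pick_tail();
	printf("tail placements: %d / 1000\n", tails);
	return 0;
}
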
index 777a257..4d79f03 100644 (file)
@@ -22,6 +22,7 @@ enum mm_shuffle_ctl {
 DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
 extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
 extern void __shuffle_free_memory(pg_data_t *pgdat);
+extern bool shuffle_pick_tail(void);
 static inline void shuffle_free_memory(pg_data_t *pgdat)
 {
        if (!static_branch_unlikely(&page_alloc_shuffle_key))
@@ -44,6 +45,11 @@ static inline bool is_shuffle_order(int order)
        return order >= SHUFFLE_ORDER;
 }
 #else
+static inline bool shuffle_pick_tail(void)
+{
+       return false;
+}
+
 static inline void shuffle_free_memory(pg_data_t *pgdat)
 {
 }
index 5282f88..93ec4a5 100644 (file)
@@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops slabinfo_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = slabinfo_open,
        .proc_read      = seq_read,
        .proc_write     = slabinfo_write,
index 3098e0c..332d4b4 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -449,6 +449,7 @@ static DEFINE_SPINLOCK(object_map_lock);
  * not vanish from under us.
  */
 static unsigned long *get_map(struct kmem_cache *s, struct page *page)
+       __acquires(&object_map_lock)
 {
        void *p;
        void *addr = page_address(page);
@@ -465,7 +466,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
        return object_map;
 }
 
-static void put_map(unsigned long *map)
+static void put_map(unsigned long *map) __releases(&object_map_lock)
 {
        VM_BUG_ON(map != object_map);
        lockdep_assert_held(&object_map_lock);
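
These slub.c hunks only add sparse lock-context annotations; the generated code is unchanged. For reference, the annotations are plain attribute macros along these lines (simplified from include/linux/compiler_types.h):

    #ifdef __CHECKER__
    /* sparse tracks the lock context named by x across the call */
    # define __acquires(x)  __attribute__((context(x, 0, 1)))
    # define __releases(x)  __attribute__((context(x, 1, 0)))
    #else
    /* no-ops for a normal compile */
    # define __acquires(x)
    # define __releases(x)
    #endif
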
index f1af4d4..1aee5a4 100644 (file)
@@ -209,6 +209,7 @@ static inline unsigned long first_present_section_nr(void)
        return next_present_section_nr(-1);
 }
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
 static void subsection_mask_set(unsigned long *map, unsigned long pfn,
                unsigned long nr_pages)
 {
@@ -243,6 +244,11 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
                nr_pages -= pfns;
        }
 }
+#else
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+}
+#endif
 
 /* Record a memory area against a node. */
 void __init memory_present(int nid, unsigned long start, unsigned long end)
@@ -660,6 +666,55 @@ static void free_map_bootmem(struct page *memmap)
 
        vmemmap_free(start, end, NULL);
 }
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+       DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+       DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+       struct mem_section *ms = __pfn_to_section(pfn);
+       unsigned long *subsection_map = ms->usage
+               ? &ms->usage->subsection_map[0] : NULL;
+
+       subsection_mask_set(map, pfn, nr_pages);
+       if (subsection_map)
+               bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+       if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+                               "section already deactivated (%#lx + %ld)\n",
+                               pfn, nr_pages))
+               return -EINVAL;
+
+       bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+       return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+       return bitmap_empty(&ms->usage->subsection_map[0],
+                           SUBSECTIONS_PER_SECTION);
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+       struct mem_section *ms = __pfn_to_section(pfn);
+       DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+       unsigned long *subsection_map;
+       int rc = 0;
+
+       subsection_mask_set(map, pfn, nr_pages);
+
+       subsection_map = &ms->usage->subsection_map[0];
+
+       if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+               rc = -EINVAL;
+       else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+               rc = -EEXIST;
+       else
+               bitmap_or(subsection_map, map, subsection_map,
+                               SUBSECTIONS_PER_SECTION);
+
+       return rc;
+}
 #else
 struct page * __meminit populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
@@ -703,48 +758,51 @@ static void free_map_bootmem(struct page *memmap)
                        put_page_bootmem(page);
        }
 }
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+       return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+       return true;
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+       return 0;
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+/*
+ * To deactivate a memory region, there are 3 cases to handle across
+ * two configurations (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1. deactivation of a partial hot-added section (only possible in
+ *    the SPARSEMEM_VMEMMAP=y case).
+ *      a) section was present at memory init.
+ *      b) section was hot-added post memory init.
+ * 2. deactivation of a complete hot-added section.
+ * 3. deactivation of a complete section from memory init.
+ *
+ * For 1, when the subsection_map is not empty we will not be freeing
+ * the usage map, but still need to free the vmemmap range.
+ *
+ * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified.
+ */
 static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
                struct vmem_altmap *altmap)
 {
-       DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
-       DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
        struct mem_section *ms = __pfn_to_section(pfn);
        bool section_is_early = early_section(ms);
        struct page *memmap = NULL;
        bool empty;
-       unsigned long *subsection_map = ms->usage
-               ? &ms->usage->subsection_map[0] : NULL;
-
-       subsection_mask_set(map, pfn, nr_pages);
-       if (subsection_map)
-               bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
 
-       if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
-                               "section already deactivated (%#lx + %ld)\n",
-                               pfn, nr_pages))
+       if (clear_subsection_map(pfn, nr_pages))
                return;
 
-       /*
-        * There are 3 cases to handle across two configurations
-        * (SPARSEMEM_VMEMMAP={y,n}):
-        *
-        * 1/ deactivation of a partial hot-added section (only possible
-        * in the SPARSEMEM_VMEMMAP=y case).
-        *    a/ section was present at memory init
-        *    b/ section was hot-added post memory init
-        * 2/ deactivation of a complete hot-added section
-        * 3/ deactivation of a complete section from memory init
-        *
-        * For 1/, when subsection_map does not empty we will not be
-        * freeing the usage map, but still need to free the vmemmap
-        * range.
-        *
-        * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
-        */
-       bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
-       empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+       empty = is_subsection_map_empty(ms);
        if (empty) {
                unsigned long section_nr = pfn_to_section_nr(pfn);
 
@@ -780,31 +838,19 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
                unsigned long nr_pages, struct vmem_altmap *altmap)
 {
-       DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
        struct mem_section *ms = __pfn_to_section(pfn);
        struct mem_section_usage *usage = NULL;
-       unsigned long *subsection_map;
        struct page *memmap;
        int rc = 0;
 
-       subsection_mask_set(map, pfn, nr_pages);
-
        if (!ms->usage) {
                usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
                if (!usage)
                        return ERR_PTR(-ENOMEM);
                ms->usage = usage;
        }
-       subsection_map = &ms->usage->subsection_map[0];
-
-       if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
-               rc = -EINVAL;
-       else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
-               rc = -EEXIST;
-       else
-               bitmap_or(subsection_map, map, subsection_map,
-                               SUBSECTIONS_PER_SECTION);
 
+       rc = fill_subsection_map(pfn, nr_pages);
        if (rc) {
                if (usage)
                        ms->usage = NULL;
@@ -840,6 +886,10 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
  *
  * This is only intended for hotplug.
  *
+ * Note that only VMEMMAP supports sub-section aligned hotplug;
+ * the proper alignment and size are gated by check_pfn_span().
+ *
  * Return:
  * * 0         - On success.
  * * -EEXIST   - Section has been present.
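
The sparse.c refactor above isolates all of the subsection_map bitmap arithmetic in clear_subsection_map(), is_subsection_map_empty() and fill_subsection_map(), so the !SPARSEMEM_VMEMMAP build can stub them out. The helper they all rely on, subsection_mask_set(), simply marks the subsections covered by a pfn range; a sketch consistent with the upstream helper (subsection_map_index() and bitmap_set() are assumed from the SPARSEMEM and bitmap code):

    static void subsection_mask_set(unsigned long *map, unsigned long pfn,
                                    unsigned long nr_pages)
    {
            int idx = subsection_map_index(pfn);
            int end = subsection_map_index(pfn + nr_pages - 1);

            /* one bit per subsection covered by [pfn, pfn + nr_pages) */
            bitmap_set(map, idx, end - idx + 1);
    }
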
index a4af8c9..bf9a79f 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -276,7 +276,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
                            void *arg)
 {
        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-               int file = page_is_file_cache(page);
+               int file = page_is_file_lru(page);
                int lru = page_lru_base_type(page);
 
                del_page_from_lru_list(page, lruvec, lru);
@@ -394,7 +394,7 @@ void mark_page_accessed(struct page *page)
                else
                        __lru_cache_activate_page(page);
                ClearPageReferenced(page);
-               if (page_is_file_cache(page))
+               if (page_is_file_lru(page))
                        workingset_activation(page);
        }
        if (page_is_idle(page))
@@ -515,7 +515,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
                return;
 
        active = PageActive(page);
-       file = page_is_file_cache(page);
+       file = page_is_file_lru(page);
        lru = page_lru_base_type(page);
 
        del_page_from_lru_list(page, lruvec, lru + active);
@@ -548,7 +548,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
 {
        if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-               int file = page_is_file_cache(page);
+               int file = page_is_file_lru(page);
                int lru = page_lru_base_type(page);
 
                del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
@@ -573,9 +573,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
                ClearPageActive(page);
                ClearPageReferenced(page);
                /*
-                * lazyfree pages are clean anonymous pages. They have
-                * SwapBacked flag cleared to distinguish normal anonymous
-                * pages
+                * Lazyfree pages are clean anonymous pages.  They have
+                * PG_swapbacked flag cleared, to distinguish them from normal
+                * anonymous pages
                 */
                ClearPageSwapBacked(page);
                add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
@@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 
        if (page_evictable(page)) {
                lru = page_lru(page);
-               update_page_reclaim_stat(lruvec, page_is_file_cache(page),
+               update_page_reclaim_stat(lruvec, page_is_file_lru(page),
                                         PageActive(page));
                if (was_unevictable)
                        count_vm_event(UNEVICTABLE_PGRESCUED);
@@ -1004,6 +1004,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
  * ascending indexes.  There may be holes in the indices due to
  * not-present entries.
  *
+ * Only one subpage of a Transparent Huge Page is returned in one call:
+ * allowing truncate_inode_pages_range() to evict the whole THP without
+ * cycling through a pagevec of extra references.
+ *
  * pagevec_lookup_entries() returns the number of entries which were
  * found.
  */
index 273a923..5871a2a 100644 (file)
@@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file)
 }
 
 static const struct proc_ops swaps_proc_ops = {
+       .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = swaps_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
index bd96855..512576e 100644 (file)
@@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
-                           struct page **pagep)
+                           struct page **pagep,
+                           bool wp_copy)
 {
        struct mem_cgroup *memcg;
        pte_t _dst_pte, *dst_pte;
@@ -99,9 +100,13 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
        if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
                goto out_release;
 
-       _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
-       if (dst_vma->vm_flags & VM_WRITE)
-               _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+       _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
+       if (dst_vma->vm_flags & VM_WRITE) {
+               if (wp_copy)
+                       _dst_pte = pte_mkuffd_wp(_dst_pte);
+               else
+                       _dst_pte = pte_mkwrite(_dst_pte);
+       }
 
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (dst_vma->vm_file) {
@@ -415,7 +420,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
                                                struct page **page,
-                                               bool zeropage)
+                                               bool zeropage,
+                                               bool wp_copy)
 {
        ssize_t err;
 
@@ -432,11 +438,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
        if (!(dst_vma->vm_flags & VM_SHARED)) {
                if (!zeropage)
                        err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
-                                              dst_addr, src_addr, page);
+                                              dst_addr, src_addr, page,
+                                              wp_copy);
                else
                        err = mfill_zeropage_pte(dst_mm, dst_pmd,
                                                 dst_vma, dst_addr);
        } else {
+               VM_WARN_ON_ONCE(wp_copy);
                if (!zeropage)
                        err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
                                                     dst_vma, dst_addr,
@@ -454,7 +462,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long src_start,
                                              unsigned long len,
                                              bool zeropage,
-                                             bool *mmap_changing)
+                                             bool *mmap_changing,
+                                             __u64 mode)
 {
        struct vm_area_struct *dst_vma;
        ssize_t err;
@@ -462,6 +471,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
        unsigned long src_addr, dst_addr;
        long copied;
        struct page *page;
+       bool wp_copy;
 
        /*
         * Sanitize the command parameters:
@@ -508,6 +518,14 @@ retry:
                goto out_unlock;
 
        /*
+        * validate 'mode' now that we know the dst_vma: don't allow
+        * a wrprotect copy if the userfaultfd didn't register as WP.
+        */
+       wp_copy = mode & UFFDIO_COPY_MODE_WP;
+       if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+               goto out_unlock;
+
+       /*
         * If this is a HUGETLB vma, pass off to appropriate routine
         */
        if (is_vm_hugetlb_page(dst_vma))
@@ -562,7 +580,7 @@ retry:
                BUG_ON(pmd_trans_huge(*dst_pmd));
 
                err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-                                      src_addr, &page, zeropage);
+                                      src_addr, &page, zeropage, wp_copy);
                cond_resched();
 
                if (unlikely(err == -ENOENT)) {
@@ -609,14 +627,68 @@ out:
 
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len,
-                    bool *mmap_changing)
+                    bool *mmap_changing, __u64 mode)
 {
        return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
-                             mmap_changing);
+                             mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len, bool *mmap_changing)
 {
-       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
+       return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
+}
+
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+                       unsigned long len, bool enable_wp, bool *mmap_changing)
+{
+       struct vm_area_struct *dst_vma;
+       pgprot_t newprot;
+       int err;
+
+       /*
+        * Sanitize the command parameters:
+        */
+       BUG_ON(start & ~PAGE_MASK);
+       BUG_ON(len & ~PAGE_MASK);
+
+       /* Does the address range wrap, or is the span zero-sized? */
+       BUG_ON(start + len <= start);
+
+       down_read(&dst_mm->mmap_sem);
+
+       /*
+        * If memory mappings are changing because of non-cooperative
+        * operation (e.g. mremap) running in parallel, bail out and
+        * request the user to retry later
+        */
+       err = -EAGAIN;
+       if (mmap_changing && READ_ONCE(*mmap_changing))
+               goto out_unlock;
+
+       err = -ENOENT;
+       dst_vma = find_dst_vma(dst_mm, start, len);
+       /*
+        * Make sure the vma is not shared, and that the dst range is
+        * both valid and fully within a single existing vma.
+        */
+       if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+               goto out_unlock;
+       if (!userfaultfd_wp(dst_vma))
+               goto out_unlock;
+       if (!vma_is_anonymous(dst_vma))
+               goto out_unlock;
+
+       if (enable_wp)
+               newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+       else
+               newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+       change_protection(dst_vma, start, start + len, newprot,
+                         enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+
+       err = 0;
+out_unlock:
+       up_read(&dst_mm->mmap_sem);
+       return err;
 }
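
mwriteprotect_range() is the kernel half of userfaultfd write-protection: it flips the uffd-wp protection over a range via change_protection(), while the new wp_copy/mode plumbing lets UFFDIO_COPY install the copied page already write-protected. From userspace the feature would be driven roughly as follows (ioctl and field names follow the uffd write-protect ABI introduced alongside this series; treat the exact spelling as an assumption to verify against linux/userfaultfd.h):

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    /* Sketch: write-protect (or un-protect) a registered range. */
    static int uffd_writeprotect(int uffd, void *addr, size_t len, int enable)
    {
            struct uffdio_writeprotect wp = {
                    .range = { .start = (unsigned long)addr, .len = len },
                    .mode  = enable ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
            };

            return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
    }

    /* Sketch: resolve a fault with a copy that stays write-protected. */
    static int uffd_copy_wp(int uffd, void *dst, void *src, size_t len)
    {
            struct uffdio_copy copy = {
                    .dst  = (unsigned long)dst,
                    .src  = (unsigned long)src,
                    .len  = len,
                    .mode = UFFDIO_COPY_MODE_WP,
            };

            return ioctl(uffd, UFFDIO_COPY, &copy);
    }
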
index 6b8eeb0..399f219 100644 (file)
@@ -3368,7 +3368,7 @@ retry:
                        goto overflow;
 
                /*
-                * If required width exeeds current VA block, move
+                * If required width exceeds current VA block, move
                 * base downwards and then recheck.
                 */
                if (base + end > va->va_end) {
index 2e8e690..b06868f 100644 (file)
@@ -919,7 +919,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 * exceptional entries and shadow exceptional entries in the
                 * same address_space.
                 */
-               if (reclaimed && page_is_file_cache(page) &&
+               if (reclaimed && page_is_file_lru(page) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_page_cache(page, shadow);
@@ -1043,7 +1043,7 @@ static void page_check_dirty_writeback(struct page *page,
         * Anonymous pages are not handled by flushers and must be written
         * from reclaim context. Do not stall reclaim based on them
         */
-       if (!page_is_file_cache(page) ||
+       if (!page_is_file_lru(page) ||
            (PageAnon(page) && !PageSwapBacked(page))) {
                *dirty = false;
                *writeback = false;
@@ -1315,7 +1315,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         * the rest of the LRU for clean pages and see
                         * the same dirty pages again (PageReclaim).
                         */
-                       if (page_is_file_cache(page) &&
+                       if (page_is_file_lru(page) &&
                            (!current_is_kswapd() || !PageReclaim(page) ||
                             !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
                                /*
@@ -1459,7 +1459,7 @@ activate_locked:
                        try_to_free_swap(page);
                VM_BUG_ON_PAGE(PageActive(page), page);
                if (!PageMlocked(page)) {
-                       int type = page_is_file_cache(page);
+                       int type = page_is_file_lru(page);
                        SetPageActive(page);
                        stat->nr_activate[type] += nr_pages;
                        count_memcg_page_event(page, PGACTIVATE);
@@ -1497,7 +1497,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
        LIST_HEAD(clean_pages);
 
        list_for_each_entry_safe(page, next, page_list, lru) {
-               if (page_is_file_cache(page) && !PageDirty(page) &&
+               if (page_is_file_lru(page) && !PageDirty(page) &&
                    !__PageMovable(page) && !PageUnevictable(page)) {
                        ClearPageActive(page);
                        list_move(&page->lru, &clean_pages);
@@ -2053,7 +2053,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
                         * IO, plus JVM can create lots of anon VM_EXEC pages,
                         * so we ignore them here.
                         */
-                       if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
+                       if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
                                list_add(&page->lru, &l_active);
                                continue;
                        }
index c9c0d71..96d21a7 100644 (file)
@@ -1256,9 +1256,12 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        "thp_fault_alloc",
        "thp_fault_fallback",
+       "thp_fault_fallback_charge",
        "thp_collapse_alloc",
        "thp_collapse_alloc_failed",
        "thp_file_alloc",
+       "thp_file_fallback",
+       "thp_file_fallback_charge",
        "thp_file_mapped",
        "thp_split_page",
        "thp_split_page_failed",
index 22d17ec..2f836a2 100644 (file)
@@ -424,7 +424,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle,
        case ZPOOL_MM_WO:
                zs_mm = ZS_MM_WO;
                break;
-       case ZPOOL_MM_RW: /* fall through */
+       case ZPOOL_MM_RW:
        default:
                zs_mm = ZS_MM_RW;
                break;
@@ -891,12 +891,12 @@ static inline int trypin_tag(unsigned long handle)
        return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
 
-static void pin_tag(unsigned long handle)
+static void pin_tag(unsigned long handle) __acquires(bitlock)
 {
        bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
 
-static void unpin_tag(unsigned long handle)
+static void unpin_tag(unsigned long handle) __releases(bitlock)
 {
        bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
@@ -1833,12 +1833,12 @@ static void migrate_lock_init(struct zspage *zspage)
        rwlock_init(&zspage->lock);
 }
 
-static void migrate_read_lock(struct zspage *zspage)
+static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
 {
        read_lock(&zspage->lock);
 }
 
-static void migrate_read_unlock(struct zspage *zspage)
+static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
 {
        read_unlock(&zspage->lock);
 }
index 55094e6..fbb7829 100644 (file)
@@ -77,8 +77,8 @@ static bool zswap_pool_reached_full;
 
 #define ZSWAP_PARAM_UNSET ""
 
-/* Enable/disable zswap (disabled by default) */
-static bool zswap_enabled;
+/* Enable/disable zswap */
+static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
 static int zswap_enabled_param_set(const char *,
                                   const struct kernel_param *);
 static struct kernel_param_ops zswap_enabled_param_ops = {
@@ -88,8 +88,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = {
 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
 
 /* Crypto compressor to use */
-#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
 static int zswap_compressor_param_set(const char *,
                                      const struct kernel_param *);
 static struct kernel_param_ops zswap_compressor_param_ops = {
@@ -101,8 +100,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
                &zswap_compressor, 0644);
 
 /* Compressed storage zpool to use */
-#define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
 static int zswap_zpool_param_set(const char *, const struct kernel_param *);
 static struct kernel_param_ops zswap_zpool_param_ops = {
        .set =          zswap_zpool_param_set,
@@ -599,11 +597,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
        bool has_comp, has_zpool;
 
        has_comp = crypto_has_comp(zswap_compressor, 0, 0);
-       if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
+       if (!has_comp && strcmp(zswap_compressor,
+                               CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
                pr_err("compressor %s not available, using default %s\n",
-                      zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+                      zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
                param_free_charp(&zswap_compressor);
-               zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+               zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
                has_comp = crypto_has_comp(zswap_compressor, 0, 0);
        }
        if (!has_comp) {
@@ -614,11 +613,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
        }
 
        has_zpool = zpool_has_pool(zswap_zpool_type);
-       if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+       if (!has_zpool && strcmp(zswap_zpool_type,
+                                CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
                pr_err("zpool %s not available, using default %s\n",
-                      zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+                      zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
                param_free_charp(&zswap_zpool_type);
-               zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+               zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
                has_zpool = zpool_has_pool(zswap_zpool_type);
        }
        if (!has_zpool) {
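
With these hunks the enabled, compressor and zpool defaults come from Kconfig (CONFIG_ZSWAP_DEFAULT_ON, CONFIG_ZSWAP_COMPRESSOR_DEFAULT, CONFIG_ZSWAP_ZPOOL_DEFAULT) rather than being hard-coded to off, "lzo" and "zbud". The module parameters registered by the module_param_cb() calls above still override the build-time choice, for example by booting with zswap.enabled=1 zswap.compressor=lz4 zswap.zpool=z3fold, or by writing to /sys/module/zswap/parameters/enabled at run time (parameter values are examples; any compressor and zpool built into the kernel can be named).
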
index 7cb992e..1344f23 100644 (file)
@@ -383,11 +383,11 @@ static int client_options_show(struct seq_file *s, void *p)
        return 0;
 }
 
-CEPH_DEFINE_SHOW_FUNC(monmap_show)
-CEPH_DEFINE_SHOW_FUNC(osdmap_show)
-CEPH_DEFINE_SHOW_FUNC(monc_show)
-CEPH_DEFINE_SHOW_FUNC(osdc_show)
-CEPH_DEFINE_SHOW_FUNC(client_options_show)
+DEFINE_SHOW_ATTRIBUTE(monmap);
+DEFINE_SHOW_ATTRIBUTE(osdmap);
+DEFINE_SHOW_ATTRIBUTE(monc);
+DEFINE_SHOW_ATTRIBUTE(osdc);
+DEFINE_SHOW_ATTRIBUTE(client_options);
 
 void __init ceph_debugfs_init(void)
 {
@@ -414,31 +414,31 @@ void ceph_debugfs_client_init(struct ceph_client *client)
                                                      0400,
                                                      client->debugfs_dir,
                                                      client,
-                                                     &monc_show_fops);
+                                                     &monc_fops);
 
        client->osdc.debugfs_file = debugfs_create_file("osdc",
                                                      0400,
                                                      client->debugfs_dir,
                                                      client,
-                                                     &osdc_show_fops);
+                                                     &osdc_fops);
 
        client->debugfs_monmap = debugfs_create_file("monmap",
                                        0400,
                                        client->debugfs_dir,
                                        client,
-                                       &monmap_show_fops);
+                                       &monmap_fops);
 
        client->debugfs_osdmap = debugfs_create_file("osdmap",
                                        0400,
                                        client->debugfs_dir,
                                        client,
-                                       &osdmap_show_fops);
+                                       &osdmap_fops);
 
        client->debugfs_options = debugfs_create_file("client_options",
                                        0400,
                                        client->debugfs_dir,
                                        client,
-                                       &client_options_show_fops);
+                                       &client_options_fops);
 }
 
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
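
The debugfs conversion above leans on the generic DEFINE_SHOW_ATTRIBUTE() helper from <linux/seq_file.h>, which derives the open routine and file_operations from a *_show() function, replacing ceph's private CEPH_DEFINE_SHOW_FUNC. Roughly, DEFINE_SHOW_ATTRIBUTE(monc) expands to something like the following (simplified; the real macro also generates the name via token pasting):

    static int monc_open(struct inode *inode, struct file *file)
    {
            return single_open(file, monc_show, inode->i_private);
    }

    static const struct file_operations monc_fops = {
            .owner   = THIS_MODULE,
            .open    = monc_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };
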
index 9d9e4e4..3d8c801 100644 (file)
@@ -467,7 +467,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
                                 struct ceph_msg *msg)
 {
        struct ceph_client *client = monc->client;
-       struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+       struct ceph_monmap *monmap;
        void *p, *end;
 
        mutex_lock(&monc->mutex);
@@ -484,13 +484,13 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
                goto out;
        }
 
-       if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+       if (ceph_check_fsid(client, &monmap->fsid) < 0) {
                kfree(monmap);
                goto out;
        }
 
-       client->monc.monmap = monmap;
-       kfree(old);
+       kfree(monc->monmap);
+       monc->monmap = monmap;
 
        __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
        client->have_fsid = true;
index af868d3..998e26b 100644 (file)
@@ -3483,9 +3483,6 @@ static int ceph_redirect_decode(void **p, void *end,
                goto e_inval;
        }
 
-       len = ceph_decode_32(p);
-       *p += len; /* skip osd_instructions */
-
        /* skip the rest */
        *p = struct_end;
 out:
@@ -5228,85 +5225,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
        ceph_msgpool_destroy(&osdc->msgpool_op_reply);
 }
 
-/*
- * Read some contiguous pages.  If we cross a stripe boundary, shorten
- * *plen.  Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-                       struct ceph_vino vino, struct ceph_file_layout *layout,
-                       u64 off, u64 *plen,
-                       u32 truncate_seq, u64 truncate_size,
-                       struct page **pages, int num_pages, int page_align)
-{
-       struct ceph_osd_request *req;
-       int rc = 0;
-
-       dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
-            vino.snap, off, *plen);
-       req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
-                                   CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-                                   NULL, truncate_seq, truncate_size,
-                                   false);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
-
-       /* it may be a short read due to an object boundary */
-       osd_req_op_extent_osd_data_pages(req, 0,
-                               pages, *plen, page_align, false, false);
-
-       dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
-            off, *plen, *plen, page_align);
-
-       rc = ceph_osdc_start_request(osdc, req, false);
-       if (!rc)
-               rc = ceph_osdc_wait_request(osdc, req);
-
-       ceph_osdc_put_request(req);
-       dout("readpages result %d\n", rc);
-       return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_readpages);
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
-                        struct ceph_file_layout *layout,
-                        struct ceph_snap_context *snapc,
-                        u64 off, u64 len,
-                        u32 truncate_seq, u64 truncate_size,
-                        struct timespec64 *mtime,
-                        struct page **pages, int num_pages)
-{
-       struct ceph_osd_request *req;
-       int rc = 0;
-       int page_align = off & ~PAGE_MASK;
-
-       req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
-                                   CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
-                                   snapc, truncate_seq, truncate_size,
-                                   true);
-       if (IS_ERR(req))
-               return PTR_ERR(req);
-
-       /* it may be a short write due to an object boundary */
-       osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
-                               false, false);
-       dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
-       req->r_mtime = *mtime;
-       rc = ceph_osdc_start_request(osdc, req, true);
-       if (!rc)
-               rc = ceph_osdc_wait_request(osdc, req);
-
-       ceph_osdc_put_request(req);
-       if (rc == 0)
-               rc = len;
-       dout("writepages result %d\n", rc);
-       return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_writepages);
-
 static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
                                     u64 src_snapid, u64 src_version,
                                     struct ceph_object_id *src_oid,
index ee060d5..25fbd8d 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_krb5.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/gss_err.h>
 #include <linux/workqueue.h>
@@ -1050,7 +1051,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
                goto err_put_mech;
        auth = &gss_auth->rpc_auth;
        auth->au_cslack = GSS_CRED_SLACK >> 2;
-       auth->au_rslack = GSS_VERF_SLACK >> 2;
+       auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2;
        auth->au_verfsize = GSS_VERF_SLACK >> 2;
        auth->au_ralign = GSS_VERF_SLACK >> 2;
        auth->au_flags = 0;
@@ -1724,8 +1725,9 @@ bad_mic:
        goto out;
 }
 
-static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-                             struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+                  struct rpc_task *task, struct xdr_stream *xdr)
 {
        struct rpc_rqst *rqstp = task->tk_rqstp;
        struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
@@ -1816,8 +1818,9 @@ out:
        return -EAGAIN;
 }
 
-static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
-                            struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+                 struct rpc_task *task, struct xdr_stream *xdr)
 {
        struct rpc_rqst *rqstp = task->tk_rqstp;
        struct xdr_buf  *snd_buf = &rqstp->rq_snd_buf;
@@ -1934,35 +1937,69 @@ gss_unwrap_resp_auth(struct rpc_cred *cred)
        return 0;
 }
 
-static int
+/*
+ * RFC 2203, Section 5.3.2.2
+ *
+ *     struct rpc_gss_integ_data {
+ *             opaque databody_integ<>;
+ *             opaque checksum<>;
+ *     };
+ *
+ *     struct rpc_gss_data_t {
+ *             unsigned int seq_num;
+ *             proc_req_arg_t arg;
+ *     };
+ */
+static noinline_for_stack int
 gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
                      struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
                      struct xdr_stream *xdr)
 {
-       struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
-       u32 data_offset, mic_offset, integ_len, maj_stat;
+       struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
        struct rpc_auth *auth = cred->cr_auth;
+       u32 len, offset, seqno, maj_stat;
        struct xdr_netobj mic;
-       __be32 *p;
+       int ret;
 
-       p = xdr_inline_decode(xdr, 2 * sizeof(*p));
-       if (unlikely(!p))
+       ret = -EIO;
+       mic.data = NULL;
+
+       /* opaque databody_integ<>; */
+       if (xdr_stream_decode_u32(xdr, &len))
                goto unwrap_failed;
-       integ_len = be32_to_cpup(p++);
-       if (integ_len & 3)
+       if (len & 3)
                goto unwrap_failed;
-       data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
-       mic_offset = integ_len + data_offset;
-       if (mic_offset > rcv_buf->len)
+       offset = rcv_buf->len - xdr_stream_remaining(xdr);
+       if (xdr_stream_decode_u32(xdr, &seqno))
                goto unwrap_failed;
-       if (be32_to_cpup(p) != rqstp->rq_seqno)
+       if (seqno != rqstp->rq_seqno)
                goto bad_seqno;
+       if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
+               goto unwrap_failed;
 
-       if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+       /*
+        * The xdr_stream now points to the beginning of the
+        * upper layer payload, to be passed below to
+        * rpcauth_unwrap_resp_decode(). The checksum, which
+        * follows the upper layer payload in @rcv_buf, is
+        * located and parsed without updating the xdr_stream.
+        */
+
+       /* opaque checksum<>; */
+       offset += len;
+       if (xdr_decode_word(rcv_buf, offset, &len))
+               goto unwrap_failed;
+       offset += sizeof(__be32);
+       if (offset + len > rcv_buf->len)
                goto unwrap_failed;
-       if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
+       mic.len = len;
+       mic.data = kmalloc(len, GFP_NOFS);
+       if (!mic.data)
+               goto unwrap_failed;
+       if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
                goto unwrap_failed;
-       maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+
+       maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic);
        if (maj_stat == GSS_S_CONTEXT_EXPIRED)
                clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
        if (maj_stat != GSS_S_COMPLETE)
@@ -1970,19 +2007,24 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
 
        auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
        auth->au_ralign = auth->au_verfsize + 2;
-       return 0;
+       ret = 0;
+
+out:
+       kfree(mic.data);
+       return ret;
+
 unwrap_failed:
        trace_rpcgss_unwrap_failed(task);
-       return -EIO;
+       goto out;
 bad_seqno:
-       trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
-       return -EIO;
+       trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
+       goto out;
 bad_mic:
        trace_rpcgss_verify_mic(task, maj_stat);
-       return -EIO;
+       goto out;
 }
 
-static int
+static noinline_for_stack int
 gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
                     struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
                     struct xdr_stream *xdr)
index 07992d3..325a085 100644 (file)
@@ -1099,8 +1099,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
                task->tk_msg.rpc_proc = msg->rpc_proc;
                task->tk_msg.rpc_argp = msg->rpc_argp;
                task->tk_msg.rpc_resp = msg->rpc_resp;
-               if (msg->rpc_cred != NULL)
-                       task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
+               task->tk_msg.rpc_cred = msg->rpc_cred;
+               if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+                       get_cred(task->tk_msg.rpc_cred);
        }
 }
 
@@ -1126,6 +1127,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 
        task = rpc_new_task(task_setup_data);
 
+       if (!RPC_IS_ASYNC(task))
+               task->tk_flags |= RPC_TASK_CRED_NOREF;
+
        rpc_task_set_client(task, task_setup_data->rpc_client);
        rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
 
index 55e9002..7eba20a 100644 (file)
@@ -204,10 +204,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
                struct rpc_task *task,
                unsigned char queue_priority)
 {
-       WARN_ON_ONCE(RPC_IS_QUEUED(task));
-       if (RPC_IS_QUEUED(task))
-               return;
-
        INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
        if (RPC_IS_PRIORITY(queue))
                __rpc_add_wait_queue_priority(queue, task, queue_priority);
@@ -382,7 +378,7 @@ static void rpc_make_runnable(struct workqueue_struct *wq,
  * NB: An RPC task will only receive interrupt-driven events as long
  * as it's on a wait queue.
  */
-static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q,
                struct rpc_task *task,
                unsigned char queue_priority)
 {
@@ -395,12 +391,23 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
 
 }
 
+static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+               struct rpc_task *task,
+               unsigned char queue_priority)
+{
+       if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+               return;
+       __rpc_do_sleep_on_priority(q, task, queue_priority);
+}
+
 static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
                struct rpc_task *task, unsigned long timeout,
                unsigned char queue_priority)
 {
+       if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+               return;
        if (time_is_after_jiffies(timeout)) {
-               __rpc_sleep_on_priority(q, task, queue_priority);
+               __rpc_do_sleep_on_priority(q, task, queue_priority);
                __rpc_add_timer(q, task, timeout);
        } else
                task->tk_status = -ETIMEDOUT;
@@ -1162,7 +1169,8 @@ static void rpc_release_resources_task(struct rpc_task *task)
 {
        xprt_release(task);
        if (task->tk_msg.rpc_cred) {
-               put_cred(task->tk_msg.rpc_cred);
+               if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+                       put_cred(task->tk_msg.rpc_cred);
                task->tk_msg.rpc_cred = NULL;
        }
        rpc_task_release_client(task);
index e5497dc..15b58c5 100644 (file)
@@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
 }
 EXPORT_SYMBOL_GPL(xdr_encode_word);
 
-/**
- * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
- * @buf: pointer to buffer containing a mic
- * @mic: on success, returns the address of the mic
- * @offset: the offset in buf where mic may be found
- *
- * This function may modify the xdr buf if the mic is found to be straddling
- * a boundary between head, pages, and tail.  On success the mic can be read
- * from the address returned.  There is no need to free the mic.
- *
- * Return: Success returns 0, otherwise an integer error.
- */
-int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
-{
-       struct xdr_buf subbuf;
-       unsigned int boundary;
-
-       if (xdr_decode_word(buf, offset, &mic->len))
-               return -EFAULT;
-       offset += 4;
-
-       /* Is the mic partially in the head? */
-       boundary = buf->head[0].iov_len;
-       if (offset < boundary && (offset + mic->len) > boundary)
-               xdr_shift_buf(buf, boundary - offset);
-
-       /* Is the mic partially in the pages? */
-       boundary += buf->page_len;
-       if (offset < boundary && (offset + mic->len) > boundary)
-               xdr_shrink_pagelen(buf, boundary - offset);
-
-       if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
-               return -EFAULT;
-
-       /* Is the mic contained entirely in the head? */
-       mic->data = subbuf.head[0].iov_base;
-       if (subbuf.head[0].iov_len == mic->len)
-               return 0;
-       /* ..or is the mic contained entirely in the tail? */
-       mic->data = subbuf.tail[0].iov_base;
-       if (subbuf.tail[0].iov_len == mic->len)
-               return 0;
-
-       /* Find a contiguous area in @buf to hold all of @mic */
-       if (mic->len > buf->buflen - buf->len)
-               return -ENOMEM;
-       if (buf->tail[0].iov_len != 0)
-               mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
-       else
-               mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
-       __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
-
 /* Returns 0 on success, or else a negative error code. */
 static int
 xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
index 1a0ae0c..c92c1aa 100644 (file)
@@ -44,10 +44,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
 size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
 {
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        size_t maxmsg;
 
-       maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
+       maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
        maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
        return maxmsg - RPCRDMA_HDRLEN_MIN;
 }
@@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
        if (rc < 0)
                goto failed_marshal;
 
-       if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+       if (rpcrdma_post_sends(r_xprt, req))
                goto drop_connection;
        return 0;
 
@@ -190,7 +190,7 @@ create_req:
        if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
                return NULL;
 
-       size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
+       size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
        req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
        if (!req)
                return NULL;
index 125297c..ef99788 100644 (file)
@@ -52,7 +52,7 @@
 
 /**
  * frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_init_mr
+ * @mr: MR allocated by frwr_mr_init
  *
  */
 void frwr_release_mr(struct rpcrdma_mr *mr)
@@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
 
        if (mr->mr_dir != DMA_NONE) {
                trace_xprtrdma_mr_unmap(mr);
-               ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+               ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
                                mr->mr_sg, mr->mr_nents, mr->mr_dir);
                mr->mr_dir = DMA_NONE;
        }
@@ -106,21 +106,22 @@ void frwr_reset(struct rpcrdma_req *req)
 }
 
 /**
- * frwr_init_mr - Initialize one MR
- * @ia: interface adapter
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
  * @mr: generic MR to prepare for FRWR
  *
  * Returns zero if successful. Otherwise a negative errno
  * is returned.
  */
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 {
-       unsigned int depth = ia->ri_max_frwr_depth;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
+       unsigned int depth = ep->re_max_fr_depth;
        struct scatterlist *sg;
        struct ib_mr *frmr;
        int rc;
 
-       frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+       frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
        if (IS_ERR(frmr))
                goto out_mr_err;
 
@@ -128,6 +129,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
        if (!sg)
                goto out_list_err;
 
+       mr->mr_xprt = r_xprt;
        mr->frwr.fr_mr = frmr;
        mr->mr_dir = DMA_NONE;
        INIT_LIST_HEAD(&mr->mr_list);
@@ -149,29 +151,24 @@ out_list_err:
 
 /**
  * frwr_query_device - Prepare a transport for use with FRWR
- * @r_xprt: controlling transport instance
+ * @ep: endpoint to fill in
  * @device: RDMA device to query
  *
  * On success, sets:
- *     ep->rep_attr
- *     ep->rep_max_requests
- *     ia->ri_max_rdma_segs
- *
- * And these FRWR-related fields:
- *     ia->ri_max_frwr_depth
- *     ia->ri_mrtype
+ *     ep->re_attr
+ *     ep->re_max_requests
+ *     ep->re_max_rdma_segs
+ *     ep->re_max_fr_depth
+ *     ep->re_mrtype
  *
  * Return values:
  *   On success, returns zero.
  *   %-EINVAL - the device does not support FRWR memory registration
  *   %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
  */
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
-                     const struct ib_device *device)
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
 {
        const struct ib_device_attr *attrs = &device->attrs;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
        int max_qp_wr, depth, delta;
        unsigned int max_sge;
 
@@ -188,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
                pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
                return -ENOMEM;
        }
-       ep->rep_attr.cap.max_send_sge = max_sge;
-       ep->rep_attr.cap.max_recv_sge = 1;
+       ep->re_attr.cap.max_send_sge = max_sge;
+       ep->re_attr.cap.max_recv_sge = 1;
 
-       ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+       ep->re_mrtype = IB_MR_TYPE_MEM_REG;
        if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
-               ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+               ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
 
        /* Quirk: Some devices advertise a large max_fast_reg_page_list_len
         * capability, but perform optimally when the MRs are not larger
         * than a page.
         */
        if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
-               ia->ri_max_frwr_depth = attrs->max_sge_rd;
+               ep->re_max_fr_depth = attrs->max_sge_rd;
        else
-               ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
-       if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
-               ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
+               ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+       if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+               ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
 
        /* Add room for frwr register and invalidate WRs.
         * 1. FRWR reg WR for head
@@ -220,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
        /* Calculate N if the device max FRWR depth is smaller than
         * RPCRDMA_MAX_DATA_SEGS.
         */
-       if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
-               delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
+       if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+               delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
                do {
                        depth += 2; /* FRWR reg + invalidate */
-                       delta -= ia->ri_max_frwr_depth;
+                       delta -= ep->re_max_fr_depth;
                } while (delta > 0);
        }
 
@@ -233,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
        max_qp_wr -= 1;
        if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
                return -ENOMEM;
-       if (ep->rep_max_requests > max_qp_wr)
-               ep->rep_max_requests = max_qp_wr;
-       ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
-       if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
-               ep->rep_max_requests = max_qp_wr / depth;
-               if (!ep->rep_max_requests)
+       if (ep->re_max_requests > max_qp_wr)
+               ep->re_max_requests = max_qp_wr;
+       ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+       if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+               ep->re_max_requests = max_qp_wr / depth;
+               if (!ep->re_max_requests)
                        return -ENOMEM;
-               ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
+               ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
        }
-       ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
-       ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
-       ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
-       ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
-       ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
-
-       ia->ri_max_rdma_segs =
-               DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+       ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+       ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+       ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+       ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+       ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+
+       ep->re_max_rdma_segs =
+               DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
        /* Reply chunks require segments for head and tail buffers */
-       ia->ri_max_rdma_segs += 2;
-       if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
-               ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+       ep->re_max_rdma_segs += 2;
+       if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+               ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
 
        /* Ensure the underlying device is capable of conveying the
         * largest r/wsize NFS will ask for. This guarantees that
         * failing over from one RDMA device to another will not
         * break NFS I/O.
         */
-       if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+       if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
                return -ENOMEM;
 
        return 0;
@@ -286,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
                                int nsegs, bool writing, __be32 xid,
                                struct rpcrdma_mr *mr)
 {
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct ib_reg_wr *reg_wr;
        int i, n, dma_nents;
        struct ib_mr *ibmr;
        u8 key;
 
-       if (nsegs > ia->ri_max_frwr_depth)
-               nsegs = ia->ri_max_frwr_depth;
+       if (nsegs > ep->re_max_fr_depth)
+               nsegs = ep->re_max_fr_depth;
        for (i = 0; i < nsegs;) {
                if (seg->mr_page)
                        sg_set_page(&mr->mr_sg[i],
@@ -306,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 
                ++seg;
                ++i;
-               if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
+               if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
                        continue;
                if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -315,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
        mr->mr_dir = rpcrdma_data_dir(writing);
        mr->mr_nents = i;
 
-       dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+       dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
                                  mr->mr_dir);
        if (!dma_nents)
                goto out_dmamap_err;
@@ -356,8 +353,8 @@ out_mapmr_err:
 
 /**
  * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq:        completion queue (ignored)
- * @wc:        completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
  *
  */
 static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
@@ -369,20 +366,25 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
        /* WARNING: Only wr_cqe and status are reliable at this point */
        trace_xprtrdma_wc_fastreg(wc, frwr);
        /* The MR will get recycled when the associated req is retransmitted */
+
+       rpcrdma_flush_disconnect(cq, wc);
 }
 
 /**
- * frwr_send - post Send WR containing the RPC Call message
- * @ia: interface adapter
- * @req: Prepared RPC Call
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
  *
  * For FRWR, chain any FastReg WRs to the Send WR. Only a
  * single ib_post_send call is needed to register memory
  * and then post the Send WR.
  *
- * Returns the result of ib_post_send.
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
  */
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
        struct ib_send_wr *post_wr;
        struct rpcrdma_mr *mr;
@@ -403,7 +405,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
                post_wr = &frwr->fr_regwr.wr;
        }
 
-       return ib_post_send(ia->ri_id->qp, post_wr, NULL);
+       return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
 }
 
 /**
@@ -419,7 +421,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
        list_for_each_entry(mr, mrs, mr_list)
                if (mr->mr_handle == rep->rr_inv_rkey) {
                        list_del_init(&mr->mr_list);
-                       trace_xprtrdma_mr_remoteinv(mr);
+                       trace_xprtrdma_mr_reminv(mr);
                        rpcrdma_mr_put(mr);
                        break;  /* only one invalidated MR per RPC */
                }
@@ -435,8 +437,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
 
 /**
  * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq:        completion queue (ignored)
- * @wc:        completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
  *
  */
 static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
@@ -449,12 +451,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
        /* WARNING: Only wr_cqe and status are reliable at this point */
        trace_xprtrdma_wc_li(wc, frwr);
        __frwr_release_mr(wc, mr);
+
+       rpcrdma_flush_disconnect(cq, wc);
 }
 
 /**
  * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq:        completion queue (ignored)
- * @wc:        completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
  *
  * Awaken anyone waiting for an MR to finish being fenced.
  */
@@ -469,6 +473,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
        trace_xprtrdma_wc_li_wake(wc, frwr);
        __frwr_release_mr(wc, mr);
        complete(&frwr->fr_linv_done);
+
+       rpcrdma_flush_disconnect(cq, wc);
 }
 
 /**
@@ -526,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
        /* Transport disconnect drains the receive CQ before it
         * replaces the QP. The RPC reply handler won't call us
-        * unless ri_id->qp is a valid pointer.
+        * unless re_id->qp is a valid pointer.
         */
        bad_wr = NULL;
-       rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+       rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
 
        /* The final LOCAL_INV WR in the chain is supposed to
         * do the wake. If it was never posted, the wake will
@@ -556,8 +562,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
 /**
  * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
- * @cq:        completion queue (ignored)
- * @wc:        completed WR
+ * @cq:        completion queue
+ * @wc:        WCE for a completed LocalInv WR
  *
  */
 static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -575,6 +581,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
        /* Ensure @rep is generated before __frwr_release_mr */
        smp_rmb();
        rpcrdma_complete_rqst(rep);
+
+       rpcrdma_flush_disconnect(cq, wc);
 }
 
 /**
@@ -629,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
        /* Transport disconnect drains the receive CQ before it
         * replaces the QP. The RPC reply handler won't call us
-        * unless ri_id->qp is a valid pointer.
+        * unless re_id->qp is a valid pointer.
         */
        bad_wr = NULL;
-       rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+       rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
        if (!rc)
                return;
 
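The frwr_send() hunks above keep the single-post design that the updated kernel-doc describes; only the way the QP is reached changes. A minimal sketch of the chaining pattern, reconstructed around the two lines visible in the hunk (the rl_registered and frwr field names used for the loop body are assumptions, not verbatim patch text):

static int frwr_send_sketch(struct rpcrdma_xprt *r_xprt,
                            struct rpcrdma_req *req)
{
        struct ib_send_wr *post_wr = &req->rl_wr;  /* the Send WR posts last */
        struct rpcrdma_mr *mr;

        /* Link each FastReg WR ahead of the chain built so far, so the
         * provider executes every registration before the Send itself.
         */
        list_for_each_entry(mr, &req->rl_registered, mr_list) {
                mr->frwr.fr_regwr.wr.next = post_wr;
                post_wr = &mr->frwr.fr_regwr.wr;
        }

        /* One verb call posts the whole chain on the connection's QP. */
        return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
}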
index 577513b..4a81e69 100644 (file)
@@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
 
 /**
  * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
- * @r_xprt: transport instance to initialize
+ * @ep: endpoint to initialize
  *
  * The max_inline fields contain the maximum size of an RPC message
  * so the marshaling code doesn't have to repeat this calculation
  * for every RPC.
  */
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
 {
-       unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+       unsigned int maxsegs = ep->re_max_rdma_segs;
 
-       ep->rep_max_inline_send =
-               ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
-       ep->rep_max_inline_recv =
-               ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+       ep->re_max_inline_send =
+               ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+       ep->re_max_inline_recv =
+               ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
 }
 
 /* The client can send a request inline as long as the RPCRDMA header
@@ -132,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
                                struct rpc_rqst *rqst)
 {
        struct xdr_buf *xdr = &rqst->rq_snd_buf;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        unsigned int count, remaining, offset;
 
-       if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
+       if (xdr->len > ep->re_max_inline_send)
                return false;
 
        if (xdr->page_len) {
@@ -145,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
                        remaining -= min_t(unsigned int,
                                           PAGE_SIZE - offset, remaining);
                        offset = 0;
-                       if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
+                       if (++count > ep->re_attr.cap.max_send_sge)
                                return false;
                }
        }
@@ -162,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
                                   struct rpc_rqst *rqst)
 {
-       return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+       return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
 }
 
 /* The client is required to provide a Reply chunk if the maximum
@@ -176,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
        const struct xdr_buf *buf = &rqst->rq_rcv_buf;
 
        return (buf->head[0].iov_len + buf->tail[0].iov_len) <
-               r_xprt->rx_ep.rep_max_inline_recv;
+               r_xprt->rx_ep->re_max_inline_recv;
 }
 
 /* Split @vec on page boundaries into SGEs. FMR registers pages, not
@@ -255,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
        /* When encoding a Read chunk, the tail iovec contains an
         * XDR pad and may be omitted.
         */
-       if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
+       if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
                goto out;
 
        /* When encoding a Write chunk, some servers need to see an
@@ -263,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
         * layer provides space in the tail iovec that may be used
         * for this purpose.
         */
-       if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
+       if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
                goto out;
 
        if (xdrbuf->tail[0].iov_len)
@@ -1450,8 +1450,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 
        if (credits == 0)
                credits = 1;    /* don't deadlock */
-       else if (credits > r_xprt->rx_ep.rep_max_requests)
-               credits = r_xprt->rx_ep.rep_max_requests;
+       else if (credits > r_xprt->rx_ep->re_max_requests)
+               credits = r_xprt->rx_ep->re_max_requests;
        if (buf->rb_credits != credits)
                rpcrdma_update_cwnd(r_xprt, credits);
        rpcrdma_post_recvs(r_xprt, false);
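Taken together, the rpc_rdma.c hunks above mean the marshaling code now reads both negotiated inline thresholds straight from the endpoint. A compact illustration of how those two values gate the chunking decisions (hypothetical helpers for illustration only; the real checks also count send SGEs and page alignment, as the hunks show):

static bool call_fits_inline(struct rpcrdma_ep *ep, struct rpc_rqst *rqst)
{
        /* re_max_inline_send already has the worst-case RPC-over-RDMA
         * call header subtracted by rpcrdma_set_max_header_sizes().
         */
        return rqst->rq_snd_buf.len <= ep->re_max_inline_send;
}

static bool reply_fits_inline(struct rpcrdma_ep *ep, struct rpc_rqst *rqst)
{
        /* Otherwise the client must provide a Reply chunk. */
        return rqst->rq_rcv_buf.buflen <= ep->re_max_inline_recv;
}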
index 3cfeba6..659da37 100644 (file)
@@ -240,9 +240,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
        struct rpc_xprt *xprt = &r_xprt->rx_xprt;
        int rc;
 
-       rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+       rc = rpcrdma_xprt_connect(r_xprt);
        xprt_clear_connecting(xprt);
-       if (r_xprt->rx_ep.rep_connected > 0) {
+       if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) {
+               xprt->connect_cookie++;
                xprt->stat.connect_count++;
                xprt->stat.connect_time += (long)jiffies -
                                           xprt->stat.connect_start;
@@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 
        trace_xprtrdma_op_inject_dsc(r_xprt);
-       rdma_disconnect(r_xprt->rx_ia.ri_id);
+       rdma_disconnect(r_xprt->rx_ep->re_id);
 }
 
 /**
@@ -284,9 +285,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 
        cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
 
-       rpcrdma_ep_destroy(r_xprt);
+       rpcrdma_xprt_disconnect(r_xprt);
        rpcrdma_buffer_destroy(&r_xprt->rx_buf);
-       rpcrdma_ia_close(&r_xprt->rx_ia);
 
        xprt_rdma_free_addresses(xprt);
        xprt_free(xprt);
@@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args)
        if (args->addrlen > sizeof(xprt->addr))
                return ERR_PTR(-EBADF);
 
+       if (!try_module_get(THIS_MODULE))
+               return ERR_PTR(-EIO);
+
        xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
                          xprt_rdma_slot_table_entries);
-       if (!xprt)
+       if (!xprt) {
+               module_put(THIS_MODULE);
                return ERR_PTR(-ENOMEM);
+       }
 
        xprt->timeout = &xprt_rdma_default_timeout;
        xprt->connect_timeout = xprt->timeout->to_initval;
@@ -347,23 +352,17 @@ xprt_setup_rdma(struct xprt_create *args)
        xprt_rdma_format_addresses(xprt, sap);
 
        new_xprt = rpcx_to_rdmax(xprt);
-       rc = rpcrdma_ia_open(new_xprt);
-       if (rc)
-               goto out1;
-
-       rc = rpcrdma_ep_create(new_xprt);
-       if (rc)
-               goto out2;
-
        rc = rpcrdma_buffer_create(new_xprt);
-       if (rc)
-               goto out3;
-
-       if (!try_module_get(THIS_MODULE))
-               goto out4;
+       if (rc) {
+               xprt_rdma_free_addresses(xprt);
+               xprt_free(xprt);
+               module_put(THIS_MODULE);
+               return ERR_PTR(rc);
+       }
 
        INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
                          xprt_rdma_connect_worker);
+
        xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
 
        dprintk("RPC:       %s: %s:%s\n", __func__,
@@ -371,19 +370,6 @@ xprt_setup_rdma(struct xprt_create *args)
                xprt->address_strings[RPC_DISPLAY_PORT]);
        trace_xprtrdma_create(new_xprt);
        return xprt;
-
-out4:
-       rpcrdma_buffer_destroy(&new_xprt->rx_buf);
-       rc = -ENODEV;
-out3:
-       rpcrdma_ep_destroy(new_xprt);
-out2:
-       rpcrdma_ia_close(&new_xprt->rx_ia);
-out1:
-       trace_xprtrdma_op_destroy(new_xprt);
-       xprt_rdma_free_addresses(xprt);
-       xprt_free(xprt);
-       return ERR_PTR(rc);
 }
 
 /**
@@ -398,26 +384,11 @@ out1:
 void xprt_rdma_close(struct rpc_xprt *xprt)
 {
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-       might_sleep();
 
        trace_xprtrdma_op_close(r_xprt);
 
-       /* Prevent marshaling and sending of new requests */
-       xprt_clear_connected(xprt);
-
-       if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
-               rpcrdma_ia_remove(ia);
-               goto out;
-       }
-
-       if (ep->rep_connected == -ENODEV)
-               return;
-       rpcrdma_ep_disconnect(ep, ia);
+       rpcrdma_xprt_disconnect(r_xprt);
 
-out:
        xprt->reestablish_timeout = 0;
        ++xprt->connect_cookie;
        xprt_disconnect_done(xprt);
@@ -517,10 +488,11 @@ static void
 xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 {
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        unsigned long delay;
 
        delay = 0;
-       if (r_xprt->rx_ep.rep_connected != 0) {
+       if (ep && ep->re_connect_status != 0) {
                delay = xprt_reconnect_delay(xprt);
                xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
        }
@@ -694,7 +666,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
                goto drop_connection;
        rqst->rq_xtime = ktime_get();
 
-       if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+       if (rpcrdma_post_sends(r_xprt, req))
                goto drop_connection;
 
        rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
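The transport.c call sites above switch from the per-IA/per-EP helpers to transport-level entry points, and endpoint creation now happens lazily inside the connect path (see the verbs.c hunks below). A rough mapping, written as an aid to reading the diff rather than as text from the patch:

/*
 *   rpcrdma_ia_open() + rpcrdma_ep_create() at setup time
 *       -> rpcrdma_ep_create() called from inside rpcrdma_xprt_connect()
 *   rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia)
 *       -> rpcrdma_xprt_connect(r_xprt)
 *   rpcrdma_ep_disconnect() / rpcrdma_ep_destroy() + rpcrdma_ia_close()
 *       -> rpcrdma_xprt_disconnect(r_xprt)
 *   rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)
 *       -> rpcrdma_post_sends(r_xprt, req)
 */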
index 353f61a..cdd84c0 100644 (file)
@@ -84,6 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
 static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep);
 static struct rpcrdma_regbuf *
 rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
                     gfp_t flags);
@@ -96,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
  */
 static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
 {
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rdma_cm_id *id = r_xprt->rx_ep->re_id;
 
        /* Flush Receives, then wait for deferred Reply work
         * to complete.
         */
-       ib_drain_rq(ia->ri_id->qp);
+       ib_drain_rq(id->qp);
 
        /* Deferred Reply processing might have scheduled
         * local invalidations.
         */
-       ib_drain_sq(ia->ri_id->qp);
+       ib_drain_sq(id->qp);
 }
 
 /**
@@ -115,26 +116,43 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
  * @context: ep that owns QP where event occurred
  *
  * Called from the RDMA provider (device driver) possibly in an interrupt
- * context.
+ * context. The QP is always destroyed before the ID, so the ID will be
+ * reliably available when this handler is invoked.
  */
-static void
-rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
 {
        struct rpcrdma_ep *ep = context;
-       struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
-                                                  rx_ep);
 
-       trace_xprtrdma_qp_event(r_xprt, event);
+       trace_xprtrdma_qp_event(ep, event);
+}
+
+/**
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @cq: completion queue
+ * @wc: work completion entry
+ *
+ * Must be called in process context.
+ */
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct rpcrdma_xprt *r_xprt = cq->cq_context;
+       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+       if (wc->status != IB_WC_SUCCESS &&
+           r_xprt->rx_ep->re_connect_status == 1) {
+               r_xprt->rx_ep->re_connect_status = -ECONNABORTED;
+               trace_xprtrdma_flush_dct(r_xprt, wc->status);
+               xprt_force_disconnect(xprt);
+       }
 }
 
 /**
  * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
  * @cq:        completion queue
- * @wc:        completed WR
+ * @wc:        WCE for a completed Send WR
  *
  */
-static void
-rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
        struct rpcrdma_sendctx *sc =
@@ -143,25 +161,25 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
        /* WARNING: Only wr_cqe and status are reliable at this point */
        trace_xprtrdma_wc_send(sc, wc);
        rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
+       rpcrdma_flush_disconnect(cq, wc);
 }
 
 /**
  * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
- * @cq:        completion queue (ignored)
- * @wc:        completed WR
+ * @cq:        completion queue
+ * @wc:        WCE for a completed Receive WR
  *
  */
-static void
-rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
        struct ib_cqe *cqe = wc->wr_cqe;
        struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
                                               rr_cqe);
-       struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+       struct rpcrdma_xprt *r_xprt = cq->cq_context;
 
        /* WARNING: Only wr_cqe and status are reliable at this point */
        trace_xprtrdma_wc_receive(wc);
-       --r_xprt->rx_ep.rep_receive_count;
+       --r_xprt->rx_ep->re_receive_count;
        if (wc->status != IB_WC_SUCCESS)
                goto out_flushed;
 
@@ -178,35 +196,35 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
        return;
 
 out_flushed:
+       rpcrdma_flush_disconnect(cq, wc);
        rpcrdma_rep_destroy(rep);
 }
 
-static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
                                      struct rdma_conn_param *param)
 {
        const struct rpcrdma_connect_private *pmsg = param->private_data;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
        unsigned int rsize, wsize;
 
        /* Default settings for RPC-over-RDMA Version One */
-       r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
+       ep->re_implicit_roundup = xprt_rdma_pad_optimize;
        rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
        wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 
        if (pmsg &&
            pmsg->cp_magic == rpcrdma_cmp_magic &&
            pmsg->cp_version == RPCRDMA_CMP_VERSION) {
-               r_xprt->rx_ia.ri_implicit_roundup = true;
+               ep->re_implicit_roundup = true;
                rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
                wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
        }
 
-       if (rsize < ep->rep_inline_recv)
-               ep->rep_inline_recv = rsize;
-       if (wsize < ep->rep_inline_send)
-               ep->rep_inline_send = wsize;
+       if (rsize < ep->re_inline_recv)
+               ep->re_inline_recv = rsize;
+       if (wsize < ep->re_inline_send)
+               ep->re_inline_send = wsize;
 
-       rpcrdma_set_max_header_sizes(r_xprt);
+       rpcrdma_set_max_header_sizes(ep);
 }
 
 /**
@@ -220,116 +238,103 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
 static int
 rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
-       struct rpcrdma_xprt *r_xprt = id->context;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+       struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+       struct rpcrdma_ep *ep = id->context;
+       struct rpc_xprt *xprt = ep->re_xprt;
 
        might_sleep();
 
-       trace_xprtrdma_cm_event(r_xprt, event);
        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
-               ia->ri_async_rc = 0;
-               complete(&ia->ri_done);
+               ep->re_async_rc = 0;
+               complete(&ep->re_done);
                return 0;
        case RDMA_CM_EVENT_ADDR_ERROR:
-               ia->ri_async_rc = -EPROTO;
-               complete(&ia->ri_done);
+               ep->re_async_rc = -EPROTO;
+               complete(&ep->re_done);
                return 0;
        case RDMA_CM_EVENT_ROUTE_ERROR:
-               ia->ri_async_rc = -ENETUNREACH;
-               complete(&ia->ri_done);
+               ep->re_async_rc = -ENETUNREACH;
+               complete(&ep->re_done);
                return 0;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-               pr_info("rpcrdma: removing device %s for %s:%s\n",
-                       ia->ri_id->device->name,
-                       rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
-#endif
-               init_completion(&ia->ri_remove_done);
-               set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
-               ep->rep_connected = -ENODEV;
+               pr_info("rpcrdma: removing device %s for %pISpc\n",
+                       ep->re_id->device->name, sap);
+               /* fall through */
+       case RDMA_CM_EVENT_ADDR_CHANGE:
+               ep->re_connect_status = -ENODEV;
                xprt_force_disconnect(xprt);
-               wait_for_completion(&ia->ri_remove_done);
-
-               ia->ri_id = NULL;
-               /* Return 1 to ensure the core destroys the id. */
-               return 1;
+               goto disconnected;
        case RDMA_CM_EVENT_ESTABLISHED:
-               ++xprt->connect_cookie;
-               ep->rep_connected = 1;
-               rpcrdma_update_cm_private(r_xprt, &event->param.conn);
-               trace_xprtrdma_inline_thresh(r_xprt);
-               wake_up_all(&ep->rep_connect_wait);
+               kref_get(&ep->re_kref);
+               ep->re_connect_status = 1;
+               rpcrdma_update_cm_private(ep, &event->param.conn);
+               trace_xprtrdma_inline_thresh(ep);
+               wake_up_all(&ep->re_connect_wait);
                break;
        case RDMA_CM_EVENT_CONNECT_ERROR:
-               ep->rep_connected = -ENOTCONN;
+               ep->re_connect_status = -ENOTCONN;
                goto disconnected;
        case RDMA_CM_EVENT_UNREACHABLE:
-               ep->rep_connected = -ENETUNREACH;
+               ep->re_connect_status = -ENETUNREACH;
                goto disconnected;
        case RDMA_CM_EVENT_REJECTED:
-               dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
-                       rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
-                       rdma_reject_msg(id, event->status));
-               ep->rep_connected = -ECONNREFUSED;
+               dprintk("rpcrdma: connection to %pISpc rejected: %s\n",
+                       sap, rdma_reject_msg(id, event->status));
+               ep->re_connect_status = -ECONNREFUSED;
                if (event->status == IB_CM_REJ_STALE_CONN)
-                       ep->rep_connected = -EAGAIN;
+                       ep->re_connect_status = -EAGAIN;
                goto disconnected;
        case RDMA_CM_EVENT_DISCONNECTED:
-               ep->rep_connected = -ECONNABORTED;
+               ep->re_connect_status = -ECONNABORTED;
 disconnected:
-               xprt_force_disconnect(xprt);
-               wake_up_all(&ep->rep_connect_wait);
-               break;
+               return rpcrdma_ep_destroy(ep);
        default:
                break;
        }
 
-       dprintk("RPC:       %s: %s:%s on %s/frwr: %s\n", __func__,
-               rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
-               ia->ri_id->device->name, rdma_event_msg(event->event));
+       dprintk("RPC:       %s: %pISpc on %s/frwr: %s\n", __func__, sap,
+               ep->re_id->device->name, rdma_event_msg(event->event));
        return 0;
 }
 
-static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+                                           struct rpcrdma_ep *ep)
 {
        unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
        struct rdma_cm_id *id;
        int rc;
 
-       init_completion(&ia->ri_done);
+       init_completion(&ep->re_done);
 
-       id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
-                           xprt, RDMA_PS_TCP, IB_QPT_RC);
+       id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+                           RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(id))
                return id;
 
-       ia->ri_async_rc = -ETIMEDOUT;
-       rc = rdma_resolve_addr(id, NULL,
-                              (struct sockaddr *)&xprt->rx_xprt.addr,
+       ep->re_async_rc = -ETIMEDOUT;
+       rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
                               RDMA_RESOLVE_TIMEOUT);
        if (rc)
                goto out;
-       rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+       rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
        if (rc < 0)
                goto out;
 
-       rc = ia->ri_async_rc;
+       rc = ep->re_async_rc;
        if (rc)
                goto out;
 
-       ia->ri_async_rc = -ETIMEDOUT;
+       ep->re_async_rc = -ETIMEDOUT;
        rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
        if (rc)
                goto out;
-       rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+       rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
        if (rc < 0)
                goto out;
-       rc = ia->ri_async_rc;
+       rc = ep->re_async_rc;
        if (rc)
                goto out;
 
@@ -340,356 +345,181 @@ out:
        return ERR_PTR(rc);
 }
 
-/*
- * Exported functions.
- */
-
-/**
- * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: transport with IA to (re)initialize
- *
- * Returns 0 on success, negative errno if an appropriate
- * Interface Adapter could not be found and opened.
- */
-int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
+static void rpcrdma_ep_put(struct kref *kref)
 {
-       struct rpcrdma_ia *ia = &xprt->rx_ia;
-       int rc;
+       struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
 
-       ia->ri_id = rpcrdma_create_id(xprt, ia);
-       if (IS_ERR(ia->ri_id)) {
-               rc = PTR_ERR(ia->ri_id);
-               goto out_err;
+       if (ep->re_id->qp) {
+               rdma_destroy_qp(ep->re_id);
+               ep->re_id->qp = NULL;
        }
 
-       ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
-       if (IS_ERR(ia->ri_pd)) {
-               rc = PTR_ERR(ia->ri_pd);
-               pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
-               goto out_err;
-       }
+       if (ep->re_attr.recv_cq)
+               ib_free_cq(ep->re_attr.recv_cq);
+       ep->re_attr.recv_cq = NULL;
+       if (ep->re_attr.send_cq)
+               ib_free_cq(ep->re_attr.send_cq);
+       ep->re_attr.send_cq = NULL;
 
-       return 0;
+       if (ep->re_pd)
+               ib_dealloc_pd(ep->re_pd);
+       ep->re_pd = NULL;
 
-out_err:
-       rpcrdma_ia_close(ia);
-       return rc;
+       kfree(ep);
+       module_put(THIS_MODULE);
 }
 
-/**
- * rpcrdma_ia_remove - Handle device driver unload
- * @ia: interface adapter being removed
- *
- * Divest transport H/W resources associated with this adapter,
- * but allow it to be restored later.
- *
- * Caller must hold the transport send lock.
+/* Returns:
+ *     %0 if @ep still has a positive kref count, or
+ *     %1 if @ep was destroyed successfully.
  */
-void
-rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep)
 {
-       struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
-                                                  rx_ia);
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
-       /* This is similar to rpcrdma_ep_destroy, but:
-        * - Don't cancel the connect worker.
-        * - Don't call rpcrdma_ep_disconnect, which waits
-        *   for another conn upcall, which will deadlock.
-        * - rdma_disconnect is unneeded, the underlying
-        *   connection is already gone.
-        */
-       if (ia->ri_id->qp) {
-               rpcrdma_xprt_drain(r_xprt);
-               rdma_destroy_qp(ia->ri_id);
-               ia->ri_id->qp = NULL;
-       }
-       ib_free_cq(ep->rep_attr.recv_cq);
-       ep->rep_attr.recv_cq = NULL;
-       ib_free_cq(ep->rep_attr.send_cq);
-       ep->rep_attr.send_cq = NULL;
-
-       /* The ULP is responsible for ensuring all DMA
-        * mappings and MRs are gone.
-        */
-       rpcrdma_reps_unmap(r_xprt);
-       rpcrdma_reqs_reset(r_xprt);
-       rpcrdma_mrs_destroy(r_xprt);
-       rpcrdma_sendctxs_destroy(r_xprt);
-       ib_dealloc_pd(ia->ri_pd);
-       ia->ri_pd = NULL;
-
-       /* Allow waiters to continue */
-       complete(&ia->ri_remove_done);
-
-       trace_xprtrdma_remove(r_xprt);
-}
-
-/**
- * rpcrdma_ia_close - Clean up/close an IA.
- * @ia: interface adapter to close
- *
- */
-void
-rpcrdma_ia_close(struct rpcrdma_ia *ia)
-{
-       if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
-               if (ia->ri_id->qp)
-                       rdma_destroy_qp(ia->ri_id);
-               rdma_destroy_id(ia->ri_id);
-       }
-       ia->ri_id = NULL;
-
-       /* If the pd is still busy, xprtrdma missed freeing a resource */
-       if (ia->ri_pd && !IS_ERR(ia->ri_pd))
-               ib_dealloc_pd(ia->ri_pd);
-       ia->ri_pd = NULL;
+       return kref_put(&ep->re_kref, rpcrdma_ep_put);
 }
 
-/**
- * rpcrdma_ep_create - Create unconnected endpoint
- * @r_xprt: transport to instantiate
- *
- * Returns zero on success, or a negative errno.
- */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 {
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
-       struct ib_cq *sendcq, *recvcq;
+       struct rpcrdma_connect_private *pmsg;
+       struct ib_device *device;
+       struct rdma_cm_id *id;
+       struct rpcrdma_ep *ep;
        int rc;
 
-       ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
-       ep->rep_inline_send = xprt_rdma_max_inline_write;
-       ep->rep_inline_recv = xprt_rdma_max_inline_read;
+       ep = kzalloc(sizeof(*ep), GFP_NOFS);
+       if (!ep)
+               return -EAGAIN;
+       ep->re_xprt = &r_xprt->rx_xprt;
+       kref_init(&ep->re_kref);
 
-       rc = frwr_query_device(r_xprt, ia->ri_id->device);
+       id = rpcrdma_create_id(r_xprt, ep);
+       if (IS_ERR(id)) {
+               rc = PTR_ERR(id);
+               goto out_free;
+       }
+       __module_get(THIS_MODULE);
+       device = id->device;
+       ep->re_id = id;
+
+       ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+       ep->re_inline_send = xprt_rdma_max_inline_write;
+       ep->re_inline_recv = xprt_rdma_max_inline_read;
+       rc = frwr_query_device(ep, device);
        if (rc)
-               return rc;
-       r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
+               goto out_destroy;
+
+       r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
 
-       ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
-       ep->rep_attr.qp_context = ep;
-       ep->rep_attr.srq = NULL;
-       ep->rep_attr.cap.max_inline_data = 0;
-       ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
-       ep->rep_attr.qp_type = IB_QPT_RC;
-       ep->rep_attr.port_num = ~0;
+       ep->re_attr.event_handler = rpcrdma_qp_event_handler;
+       ep->re_attr.qp_context = ep;
+       ep->re_attr.srq = NULL;
+       ep->re_attr.cap.max_inline_data = 0;
+       ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+       ep->re_attr.qp_type = IB_QPT_RC;
+       ep->re_attr.port_num = ~0;
 
        dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
                "iovs: send %d recv %d\n",
                __func__,
-               ep->rep_attr.cap.max_send_wr,
-               ep->rep_attr.cap.max_recv_wr,
-               ep->rep_attr.cap.max_send_sge,
-               ep->rep_attr.cap.max_recv_sge);
-
-       ep->rep_send_batch = ep->rep_max_requests >> 3;
-       ep->rep_send_count = ep->rep_send_batch;
-       init_waitqueue_head(&ep->rep_connect_wait);
-       ep->rep_receive_count = 0;
-
-       sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
-                                ep->rep_attr.cap.max_send_wr + 1,
-                                IB_POLL_WORKQUEUE);
-       if (IS_ERR(sendcq)) {
-               rc = PTR_ERR(sendcq);
-               goto out1;
+               ep->re_attr.cap.max_send_wr,
+               ep->re_attr.cap.max_recv_wr,
+               ep->re_attr.cap.max_send_sge,
+               ep->re_attr.cap.max_recv_sge);
+
+       ep->re_send_batch = ep->re_max_requests >> 3;
+       ep->re_send_count = ep->re_send_batch;
+       init_waitqueue_head(&ep->re_connect_wait);
+
+       ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+                                             ep->re_attr.cap.max_send_wr,
+                                             IB_POLL_WORKQUEUE);
+       if (IS_ERR(ep->re_attr.send_cq)) {
+               rc = PTR_ERR(ep->re_attr.send_cq);
+               goto out_destroy;
        }
 
-       recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
-                                ep->rep_attr.cap.max_recv_wr + 1,
-                                IB_POLL_WORKQUEUE);
-       if (IS_ERR(recvcq)) {
-               rc = PTR_ERR(recvcq);
-               goto out2;
+       ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+                                             ep->re_attr.cap.max_recv_wr,
+                                             IB_POLL_WORKQUEUE);
+       if (IS_ERR(ep->re_attr.recv_cq)) {
+               rc = PTR_ERR(ep->re_attr.recv_cq);
+               goto out_destroy;
        }
-
-       ep->rep_attr.send_cq = sendcq;
-       ep->rep_attr.recv_cq = recvcq;
+       ep->re_receive_count = 0;
 
        /* Initialize cma parameters */
-       memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
+       memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
 
        /* Prepare RDMA-CM private message */
+       pmsg = &ep->re_cm_private;
        pmsg->cp_magic = rpcrdma_cmp_magic;
        pmsg->cp_version = RPCRDMA_CMP_VERSION;
        pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
-       pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
-       pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
-       ep->rep_remote_cma.private_data = pmsg;
-       ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
+       pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+       pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+       ep->re_remote_cma.private_data = pmsg;
+       ep->re_remote_cma.private_data_len = sizeof(*pmsg);
 
        /* Client offers RDMA Read but does not initiate */
-       ep->rep_remote_cma.initiator_depth = 0;
-       ep->rep_remote_cma.responder_resources =
-               min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
+       ep->re_remote_cma.initiator_depth = 0;
+       ep->re_remote_cma.responder_resources =
+               min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
 
        /* Limit transport retries so client can detect server
         * GID changes quickly. RPC layer handles re-establishing
         * transport connection and retransmission.
         */
-       ep->rep_remote_cma.retry_count = 6;
+       ep->re_remote_cma.retry_count = 6;
 
        /* RPC-over-RDMA handles its own flow control. In addition,
         * make all RNR NAKs visible so we know that RPC-over-RDMA
         * flow control is working correctly (no NAKs should be seen).
         */
-       ep->rep_remote_cma.flow_control = 0;
-       ep->rep_remote_cma.rnr_retry_count = 0;
+       ep->re_remote_cma.flow_control = 0;
+       ep->re_remote_cma.rnr_retry_count = 0;
 
-       return 0;
-
-out2:
-       ib_free_cq(sendcq);
-out1:
-       return rc;
-}
-
-/**
- * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
- * @r_xprt: transport instance to shut down
- *
- */
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-       if (ia->ri_id && ia->ri_id->qp) {
-               rpcrdma_ep_disconnect(ep, ia);
-               rdma_destroy_qp(ia->ri_id);
-               ia->ri_id->qp = NULL;
-       }
-
-       if (ep->rep_attr.recv_cq)
-               ib_free_cq(ep->rep_attr.recv_cq);
-       if (ep->rep_attr.send_cq)
-               ib_free_cq(ep->rep_attr.send_cq);
-}
-
-/* Re-establish a connection after a device removal event.
- * Unlike a normal reconnection, a fresh PD and a new set
- * of MRs and buffers is needed.
- */
-static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
-                                   struct ib_qp_init_attr *qp_init_attr)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-       int rc, err;
-
-       trace_xprtrdma_reinsert(r_xprt);
-
-       rc = -EHOSTUNREACH;
-       if (rpcrdma_ia_open(r_xprt))
-               goto out1;
-
-       rc = -ENOMEM;
-       err = rpcrdma_ep_create(r_xprt);
-       if (err) {
-               pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
-               goto out2;
-       }
-       memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
-
-       rc = -ENETUNREACH;
-       err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
-       if (err) {
-               pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
-               goto out3;
-       }
-       return 0;
-
-out3:
-       rpcrdma_ep_destroy(r_xprt);
-out2:
-       rpcrdma_ia_close(ia);
-out1:
-       return rc;
-}
-
-static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
-                               struct ib_qp_init_attr *qp_init_attr)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rdma_cm_id *id, *old;
-       int err, rc;
-
-       rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
-
-       rc = -EHOSTUNREACH;
-       id = rpcrdma_create_id(r_xprt, ia);
-       if (IS_ERR(id))
-               goto out;
-
-       /* As long as the new ID points to the same device as the
-        * old ID, we can reuse the transport's existing PD and all
-        * previously allocated MRs. Also, the same device means
-        * the transport's previous DMA mappings are still valid.
-        *
-        * This is a sanity check only. There should be no way these
-        * point to two different devices here.
-        */
-       old = id;
-       rc = -ENETUNREACH;
-       if (ia->ri_id->device != id->device) {
-               pr_err("rpcrdma: can't reconnect on different device!\n");
+       ep->re_pd = ib_alloc_pd(device, 0);
+       if (IS_ERR(ep->re_pd)) {
+               rc = PTR_ERR(ep->re_pd);
                goto out_destroy;
        }
 
-       err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
-       if (err)
+       rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+       if (rc)
                goto out_destroy;
 
-       /* Atomically replace the transport's ID and QP. */
-       rc = 0;
-       old = ia->ri_id;
-       ia->ri_id = id;
-       rdma_destroy_qp(old);
+       r_xprt->rx_ep = ep;
+       return 0;
 
 out_destroy:
-       rdma_destroy_id(old);
-out:
+       rpcrdma_ep_destroy(ep);
+       rdma_destroy_id(id);
+out_free:
+       kfree(ep);
+       r_xprt->rx_ep = NULL;
        return rc;
 }
 
-/*
- * Connect unconnected endpoint.
+/**
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
+ *
+ * Returns 0 on success or a negative errno.
  */
-int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
 {
-       struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
-                                                  rx_ia);
        struct rpc_xprt *xprt = &r_xprt->rx_xprt;
-       struct ib_qp_init_attr qp_init_attr;
+       struct rpcrdma_ep *ep;
        int rc;
 
 retry:
-       memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
-       switch (ep->rep_connected) {
-       case 0:
-               rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
-               if (rc) {
-                       rc = -ENETUNREACH;
-                       goto out_noupdate;
-               }
-               break;
-       case -ENODEV:
-               rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
-               if (rc)
-                       goto out_noupdate;
-               break;
-       default:
-               rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
-               if (rc)
-                       goto out;
-       }
+       rpcrdma_xprt_disconnect(r_xprt);
+       rc = rpcrdma_ep_create(r_xprt);
+       if (rc)
+               return rc;
+       ep = r_xprt->rx_ep;
 
-       ep->rep_connected = 0;
+       ep->re_connect_status = 0;
        xprt_clear_connected(xprt);
 
        rpcrdma_reset_cwnd(r_xprt);
@@ -699,64 +529,68 @@ retry:
        if (rc)
                goto out;
 
-       rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+       rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
        if (rc)
                goto out;
 
        if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
                xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
-       wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
-       if (ep->rep_connected <= 0) {
-               if (ep->rep_connected == -EAGAIN)
+       wait_event_interruptible(ep->re_connect_wait,
+                                ep->re_connect_status != 0);
+       if (ep->re_connect_status <= 0) {
+               if (ep->re_connect_status == -EAGAIN)
                        goto retry;
-               rc = ep->rep_connected;
+               rc = ep->re_connect_status;
                goto out;
        }
 
        rc = rpcrdma_reqs_setup(r_xprt);
        if (rc) {
-               rpcrdma_ep_disconnect(ep, ia);
+               rpcrdma_xprt_disconnect(r_xprt);
                goto out;
        }
        rpcrdma_mrs_create(r_xprt);
 
 out:
        if (rc)
-               ep->rep_connected = rc;
-
-out_noupdate:
+               ep->re_connect_status = rc;
        trace_xprtrdma_connect(r_xprt, rc);
        return rc;
 }
 
 /**
- * rpcrdma_ep_disconnect - Disconnect underlying transport
- * @ep: endpoint to disconnect
- * @ia: associated interface adapter
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
  *
  * Caller serializes. Either the transport send lock is held,
  * or we're being called to destroy the transport.
+ *
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
  */
-void
-rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
 {
-       struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
-                                                  rx_ep);
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
+       struct rdma_cm_id *id;
        int rc;
 
-       /* returns without wait if ID is not connected */
-       rc = rdma_disconnect(ia->ri_id);
-       if (!rc)
-               wait_event_interruptible(ep->rep_connect_wait,
-                                                       ep->rep_connected != 1);
-       else
-               ep->rep_connected = rc;
+       if (!ep)
+               return;
+
+       id = ep->re_id;
+       rc = rdma_disconnect(id);
        trace_xprtrdma_disconnect(r_xprt, rc);
 
        rpcrdma_xprt_drain(r_xprt);
+       rpcrdma_reps_unmap(r_xprt);
        rpcrdma_reqs_reset(r_xprt);
        rpcrdma_mrs_destroy(r_xprt);
        rpcrdma_sendctxs_destroy(r_xprt);
+
+       if (rpcrdma_ep_destroy(ep))
+               rdma_destroy_id(id);
+
+       r_xprt->rx_ep = NULL;
 }
 
 /* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -793,7 +627,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
 {
        struct rpcrdma_sendctx *sc;
 
-       sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
+       sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
                     GFP_KERNEL);
        if (!sc)
                return NULL;
@@ -813,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
         * the ->send_request call to fail temporarily before too many
         * Sends are posted.
         */
-       i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+       i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
        buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
        if (!buf->rb_sc_ctxs)
                return -ENOMEM;
 
        buf->rb_sc_last = i - 1;
        for (i = 0; i <= buf->rb_sc_last; i++) {
-               sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
+               sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
                if (!sc)
                        return -ENOMEM;
 
@@ -924,10 +758,10 @@ static void
 rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        unsigned int count;
 
-       for (count = 0; count < ia->ri_max_rdma_segs; count++) {
+       for (count = 0; count < ep->re_max_rdma_segs; count++) {
                struct rpcrdma_mr *mr;
                int rc;
 
@@ -935,14 +769,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
                if (!mr)
                        break;
 
-               rc = frwr_init_mr(ia, mr);
+               rc = frwr_mr_init(r_xprt, mr);
                if (rc) {
                        kfree(mr);
                        break;
                }
 
-               mr->mr_xprt = r_xprt;
-
                spin_lock(&buf->rb_lock);
                rpcrdma_mr_push(mr, &buf->rb_mrs);
                list_add(&mr->mr_all, &buf->rb_all_mrs);
@@ -973,12 +805,12 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
 void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
 
-       /* If there is no underlying device, it's no use to
-        * wake the refresh worker.
+       /* If there is no underlying connection, it's no use
+        * to wake the refresh worker.
         */
-       if (ep->rep_connected != -ENODEV) {
+       if (ep->re_connect_status == 1) {
                /* The work is scheduled on a WQ_MEM_RECLAIM
                 * workqueue in order to prevent MR allocation
                 * from recursing into NFS during direct reclaim.
@@ -1042,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
        /* Compute maximum header buffer size in bytes */
        maxhdrsize = rpcrdma_fixed_maxsz + 3 +
-                    r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+                    r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
        maxhdrsize *= sizeof(__be32);
        rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
                                  DMA_TO_DEVICE, GFP_KERNEL);
@@ -1120,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
        if (rep == NULL)
                goto out;
 
-       rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
+       rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
                                               DMA_FROM_DEVICE, GFP_KERNEL);
        if (!rep->rr_rdmabuf)
                goto out_free;
@@ -1345,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr)
 
        if (mr->mr_dir != DMA_NONE) {
                trace_xprtrdma_mr_unmap(mr);
-               ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+               ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
                                mr->mr_sg, mr->mr_nents, mr->mr_dir);
                mr->mr_dir = DMA_NONE;
        }
@@ -1463,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
 bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
                              struct rpcrdma_regbuf *rb)
 {
-       struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+       struct ib_device *device = r_xprt->rx_ep->re_id->device;
 
        if (rb->rg_direction == DMA_NONE)
                return false;
@@ -1476,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
        }
 
        rb->rg_device = device;
-       rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
+       rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
        return true;
 }
 
@@ -1502,31 +1334,28 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
 }
 
 /**
- * rpcrdma_ep_post - Post WRs to a transport's Send Queue
- * @ia: transport's device information
- * @ep: transport's RDMA endpoint information
+ * rpcrdma_post_sends - Post WRs to a transport's Send Queue
+ * @r_xprt: controlling transport instance
  * @req: rpcrdma_req containing the Send WR to post
  *
  * Returns 0 if the post was successful, otherwise -ENOTCONN
  * is returned.
  */
-int
-rpcrdma_ep_post(struct rpcrdma_ia *ia,
-               struct rpcrdma_ep *ep,
-               struct rpcrdma_req *req)
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
        struct ib_send_wr *send_wr = &req->rl_wr;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        int rc;
 
-       if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
+       if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
                send_wr->send_flags |= IB_SEND_SIGNALED;
-               ep->rep_send_count = ep->rep_send_batch;
+               ep->re_send_count = ep->re_send_batch;
        } else {
                send_wr->send_flags &= ~IB_SEND_SIGNALED;
-               --ep->rep_send_count;
+               --ep->re_send_count;
        }
 
-       rc = frwr_send(ia, req);
+       rc = frwr_send(r_xprt, req);
        trace_xprtrdma_post_send(req, rc);
        if (rc)
                return -ENOTCONN;
@@ -1542,7 +1371,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+       struct rpcrdma_ep *ep = r_xprt->rx_ep;
        struct ib_recv_wr *wr, *bad_wr;
        struct rpcrdma_rep *rep;
        int needed, count, rc;
@@ -1551,9 +1380,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
        count = 0;
 
        needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
-       if (likely(ep->rep_receive_count > needed))
+       if (likely(ep->re_receive_count > needed))
                goto out;
-       needed -= ep->rep_receive_count;
+       needed -= ep->re_receive_count;
        if (!temp)
                needed += RPCRDMA_MAX_RECV_BATCH;
 
@@ -1579,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
        if (!wr)
                goto out;
 
-       rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+       rc = ib_post_recv(ep->re_id->qp, wr,
                          (const struct ib_recv_wr **)&bad_wr);
 out:
        trace_xprtrdma_post_recvs(r_xprt, count, rc);
@@ -1593,6 +1422,6 @@ out:
                        --count;
                }
        }
-       ep->rep_receive_count += count;
+       ep->re_receive_count += count;
        return;
 }
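The largest behavioral change in verbs.c is that the endpoint becomes a reference-counted, connection-scoped object: rpcrdma_ep_create() starts the count with kref_init(), the ESTABLISHED CM event takes an extra reference, and the disconnect/removal events and rpcrdma_xprt_disconnect() each drop one via rpcrdma_ep_destroy(), with rpcrdma_ep_put() freeing the QP, CQs and PD on the final put. A stand-alone sketch of that ownership pattern, using generic names rather than the patch's own:

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ep {
        struct kref ref;
        /* ... hardware resources ... */
};

static void demo_ep_release(struct kref *kref)
{
        struct demo_ep *ep = container_of(kref, struct demo_ep, ref);

        /* free CQs, PD, QP here, then the structure itself */
        kfree(ep);
}

static struct demo_ep *demo_ep_create(void)
{
        struct demo_ep *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

        if (ep)
                kref_init(&ep->ref);    /* the creator holds one reference */
        return ep;
}

/* Connection established: the live connection now pins the endpoint. */
static void demo_ep_connected(struct demo_ep *ep)
{
        kref_get(&ep->ref);
}

/* Each path that is finished with the endpoint drops exactly one
 * reference; the last put runs demo_ep_release().
 */
static int demo_ep_put(struct demo_ep *ep)
{
        return kref_put(&ep->ref, demo_ep_release);
}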
index 37d5080..0a16fdb 100644 (file)
 #define RPCRDMA_IDLE_DISC_TO   (5U * 60 * HZ)
 
 /*
- * Interface Adapter -- one per transport instance
+ * RDMA Endpoint -- connection endpoint details
  */
-struct rpcrdma_ia {
-       struct rdma_cm_id       *ri_id;
-       struct ib_pd            *ri_pd;
-       int                     ri_async_rc;
-       unsigned int            ri_max_rdma_segs;
-       unsigned int            ri_max_frwr_depth;
-       bool                    ri_implicit_roundup;
-       enum ib_mr_type         ri_mrtype;
-       unsigned long           ri_flags;
-       struct completion       ri_done;
-       struct completion       ri_remove_done;
-};
-
-enum {
-       RPCRDMA_IAF_REMOVING = 0,
-};
-
-/*
- * RDMA Endpoint -- one per transport instance
- */
-
 struct rpcrdma_ep {
-       unsigned int            rep_send_count;
-       unsigned int            rep_send_batch;
-       unsigned int            rep_max_inline_send;
-       unsigned int            rep_max_inline_recv;
-       int                     rep_connected;
-       struct ib_qp_init_attr  rep_attr;
-       wait_queue_head_t       rep_connect_wait;
-       struct rpcrdma_connect_private  rep_cm_private;
-       struct rdma_conn_param  rep_remote_cma;
-       unsigned int            rep_max_requests;       /* depends on device */
-       unsigned int            rep_inline_send;        /* negotiated */
-       unsigned int            rep_inline_recv;        /* negotiated */
-       int                     rep_receive_count;
+       struct kref             re_kref;
+       struct rdma_cm_id       *re_id;
+       struct ib_pd            *re_pd;
+       unsigned int            re_max_rdma_segs;
+       unsigned int            re_max_fr_depth;
+       bool                    re_implicit_roundup;
+       enum ib_mr_type         re_mrtype;
+       struct completion       re_done;
+       unsigned int            re_send_count;
+       unsigned int            re_send_batch;
+       unsigned int            re_max_inline_send;
+       unsigned int            re_max_inline_recv;
+       int                     re_async_rc;
+       int                     re_connect_status;
+       struct ib_qp_init_attr  re_attr;
+       wait_queue_head_t       re_connect_wait;
+       struct rpc_xprt         *re_xprt;
+       struct rpcrdma_connect_private
+                               re_cm_private;
+       struct rdma_conn_param  re_remote_cma;
+       int                     re_receive_count;
+       unsigned int            re_max_requests; /* depends on device */
+       unsigned int            re_inline_send; /* negotiated */
+       unsigned int            re_inline_recv; /* negotiated */
 };
 
 /* Pre-allocate extra Work Requests for handling backward receives
@@ -422,8 +412,7 @@ struct rpcrdma_stats {
  */
 struct rpcrdma_xprt {
        struct rpc_xprt         rx_xprt;
-       struct rpcrdma_ia       rx_ia;
-       struct rpcrdma_ep       rx_ep;
+       struct rpcrdma_ep       *rx_ep;
        struct rpcrdma_buffer   rx_buf;
        struct delayed_work     rx_connect_worker;
        struct rpc_timeout      rx_timeout;
@@ -455,22 +444,13 @@ extern int xprt_rdma_pad_optimize;
 extern unsigned int xprt_rdma_memreg_strategy;
 
 /*
- * Interface Adapter calls - xprtrdma/verbs.c
- */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
-void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
-void rpcrdma_ia_close(struct rpcrdma_ia *);
-
-/*
  * Endpoint calls - xprtrdma/verbs.c
  */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
 
-int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
-                               struct rpcrdma_req *);
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
 
 /*
@@ -536,15 +516,14 @@ rpcrdma_data_dir(bool writing)
 /* Memory registration calls xprtrdma/frwr_ops.c
  */
 void frwr_reset(struct rpcrdma_req *req);
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
-                     const struct ib_device *device);
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
 void frwr_release_mr(struct rpcrdma_mr *mr);
 struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
                                struct rpcrdma_mr_seg *seg,
                                int nsegs, bool writing, __be32 xid,
                                struct rpcrdma_mr *mr);
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
@@ -569,7 +548,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
                              enum rpcrdma_chunktype rtype);
 void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
 int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
 void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
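
The header change above folds the old per-transport rpcrdma_ia into a single rpcrdma_ep whose lifetime is now governed by re_kref, so the endpoint's teardown can be decoupled from the rpc_xprt that created it. As a rough userspace analogue of that refcount pattern (illustrative names only, C11 atomics standing in for the kernel's kref, not the xprtrdma API):

#include <stdatomic.h>
#include <stdlib.h>

/* Illustrative stand-in for a kref-managed endpoint: the last put frees. */
struct endpoint {
	atomic_int refcount;
	/* ...connection state that used to be split across two structs... */
};

static struct endpoint *endpoint_new(void)
{
	struct endpoint *ep = calloc(1, sizeof(*ep));

	if (ep)
		atomic_init(&ep->refcount, 1);
	return ep;
}

static void endpoint_get(struct endpoint *ep)
{
	atomic_fetch_add(&ep->refcount, 1);
}

static void endpoint_put(struct endpoint *ep)
{
	/* fetch_sub returns the old value; 1 means we dropped it to zero */
	if (atomic_fetch_sub(&ep->refcount, 1) == 1)
		free(ep);
}

int main(void)
{
	struct endpoint *ep = endpoint_new();

	endpoint_get(ep);	/* e.g. a connect worker takes a reference */
	endpoint_put(ep);	/* worker done */
	endpoint_put(ep);	/* owner drops the last reference: freed here */
	return 0;
}

The last put frees the object regardless of who holds it longest, which is the property re_kref now gives rpcrdma_ep.
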
index 17cb902..0bda8a7 100644 (file)
@@ -1861,7 +1861,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
        struct rpc_xprt *xprt = &transport->xprt;
        struct file *filp;
        struct socket *sock;
-       int status = -EIO;
+       int status;
 
        status = __sock_create(xprt->xprt_net, AF_LOCAL,
                                        SOCK_STREAM, 0, &sock, 1);
index c585047..418c46f 100644 (file)
@@ -23,7 +23,7 @@
 
 struct perf_event * __percpu *sample_hbp;
 
-static char ksym_name[KSYM_NAME_LEN] = "pid_max";
+static char ksym_name[KSYM_NAME_LEN] = "jiffies";
 module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
 MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
                        " write operations on the kernel symbol");
@@ -41,11 +41,15 @@ static int __init hw_break_module_init(void)
 {
        int ret;
        struct perf_event_attr attr;
+       void *addr = __symbol_get(ksym_name);
+
+       if (!addr)
+               return -ENXIO;
 
        hw_breakpoint_init(&attr);
-       attr.bp_addr = kallsyms_lookup_name(ksym_name);
+       attr.bp_addr = (unsigned long)addr;
        attr.bp_len = HW_BREAKPOINT_LEN_4;
-       attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+       attr.bp_type = HW_BREAKPOINT_W;
 
        sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
        if (IS_ERR((void __force *)sample_hbp)) {
@@ -66,6 +70,7 @@ fail:
 static void __exit hw_break_module_exit(void)
 {
        unregister_wide_hw_breakpoint(sample_hbp);
+       symbol_put(ksym_name);
        printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
 }
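
The sample now resolves the watched symbol with __symbol_get(), failing cleanly with -ENXIO when the symbol cannot be resolved, and arms a write-only breakpoint on "jiffies" instead of a read/write one. The same hardware facility is reachable from userspace through perf_event_open(); a minimal sketch, assuming 4-byte watchpoints and permission to open perf events (run as root or with a permissive perf_event_paranoid):

/* Watch writes to a variable via a perf hardware breakpoint. */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile int watched;

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_W;		/* write-only, like the sample */
	attr.bp_addr = (uintptr_t)&watched;
	attr.bp_len = HW_BREAKPOINT_LEN_4;
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	watched = 42;				/* this store trips the watchpoint */

	read(fd, &count, sizeof(count));
	printf("write breakpoint fired %lld time(s)\n", count);
	close(fd);
	return 0;
}
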
 
index 019771b..5b15bc4 100644 (file)
@@ -1,16 +1,26 @@
 # SPDX-License-Identifier: GPL-2.0
 ifdef CONFIG_UBSAN
+
+ifdef CONFIG_UBSAN_ALIGNMENT
+      CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment)
+endif
+
+ifdef CONFIG_UBSAN_BOUNDS
+      CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds)
+endif
+
+ifdef CONFIG_UBSAN_MISC
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=shift)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow)
-      CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool)
       CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum)
+endif
 
-ifdef CONFIG_UBSAN_ALIGNMENT
-      CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment)
+ifdef CONFIG_UBSAN_TRAP
+      CFLAGS_UBSAN += $(call cc-option, -fsanitize-undefined-trap-on-error)
 endif
 
       # -fsanitize=* options makes GCC less smart than usual and
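
The Makefile now splits the instrumentation into CONFIG_UBSAN_BOUNDS (the array-bounds check on its own), CONFIG_UBSAN_MISC (the remaining checks), and CONFIG_UBSAN_TRAP (trap instead of calling the report handlers). The same compiler switches work for ordinary userspace builds; a small sketch you can compile with `gcc -fsanitize=bounds,shift demo.c` to see the runtime reports these options produce:

/* demo.c: trips the bounds and shift sanitizers at runtime. */
#include <stdio.h>

int main(void)
{
	int a[4] = { 0, 1, 2, 3 };
	volatile int idx = 4;	/* out of bounds, reported by -fsanitize=bounds */
	volatile int sh = 40;	/* oversized shift, reported by -fsanitize=shift */

	printf("%d\n", a[idx]);
	printf("%d\n", 1 << sh);
	return 0;
}

Adding -fsanitize-undefined-trap-on-error makes the same violations trap instead of printing a report, which is what CONFIG_UBSAN_TRAP selects for the kernel build.
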
index a63380c..d64c67b 100755 (executable)
@@ -64,6 +64,7 @@ my $color = "auto";
 my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE
 # git output parsing needs US English output, so first set backtick child process LANGUAGE
 my $git_command ='export LANGUAGE=en_US.UTF-8; git';
+my $tabsize = 8;
 
 sub help {
        my ($exitcode) = @_;
@@ -98,6 +99,7 @@ Options:
   --show-types               show the specific message type in the output
   --max-line-length=n        set the maximum line length, if exceeded, warn
   --min-conf-desc-length=n   set the min description length, if shorter, warn
+  --tab-size=n               set the number of spaces for tab (default 8)
   --root=PATH                PATH to the kernel tree root
   --no-summary               suppress the per-file summary
   --mailback                 only produce a report in case of warnings/errors
@@ -215,6 +217,7 @@ GetOptions(
        'list-types!'   => \$list_types,
        'max-line-length=i' => \$max_line_length,
        'min-conf-desc-length=i' => \$min_conf_desc_length,
+       'tab-size=i'    => \$tabsize,
        'root=s'        => \$root,
        'summary!'      => \$summary,
        'mailback!'     => \$mailback,
@@ -267,6 +270,9 @@ if ($color =~ /^[01]$/) {
        die "Invalid color mode: $color\n";
 }
 
+# skip TAB size 1 to avoid additional checks on $tabsize - 1
+die "Invalid TAB size: $tabsize\n" if ($tabsize < 2);
+
 sub hash_save_array_words {
        my ($hashRef, $arrayRef) = @_;
 
@@ -804,12 +810,12 @@ sub build_types {
                  }x;
        $Type   = qr{
                        $NonptrType
-                       (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)?
+                       (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+){0,4}
                        (?:\s+$Inline|\s+$Modifier)*
                  }x;
        $TypeMisordered = qr{
                        $NonptrTypeMisordered
-                       (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)?
+                       (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+){0,4}
                        (?:\s+$Inline|\s+$Modifier)*
                  }x;
        $Declare        = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type};
@@ -1118,6 +1124,7 @@ sub parse_email {
        my ($formatted_email) = @_;
 
        my $name = "";
+       my $name_comment = "";
        my $address = "";
        my $comment = "";
 
@@ -1150,6 +1157,10 @@ sub parse_email {
 
        $name = trim($name);
        $name =~ s/^\"|\"$//g;
+       $name =~ s/(\s*\([^\)]+\))\s*//;
+       if (defined($1)) {
+               $name_comment = trim($1);
+       }
        $address = trim($address);
        $address =~ s/^\<|\>$//g;
 
@@ -1158,7 +1169,7 @@ sub parse_email {
                $name = "\"$name\"";
        }
 
-       return ($name, $address, $comment);
+       return ($name, $name_comment, $address, $comment);
 }
 
 sub format_email {
@@ -1184,6 +1195,23 @@ sub format_email {
        return $formatted_email;
 }
 
+sub reformat_email {
+       my ($email) = @_;
+
+       my ($email_name, $name_comment, $email_address, $comment) = parse_email($email);
+       return format_email($email_name, $email_address);
+}
+
+sub same_email_addresses {
+       my ($email1, $email2) = @_;
+
+       my ($email1_name, $name1_comment, $email1_address, $comment1) = parse_email($email1);
+       my ($email2_name, $name2_comment, $email2_address, $comment2) = parse_email($email2);
+
+       return $email1_name eq $email2_name &&
+              $email1_address eq $email2_address;
+}
+
 sub which {
        my ($bin) = @_;
 
@@ -1217,7 +1245,7 @@ sub expand_tabs {
                if ($c eq "\t") {
                        $res .= ' ';
                        $n++;
-                       for (; ($n % 8) != 0; $n++) {
+                       for (; ($n % $tabsize) != 0; $n++) {
                                $res .= ' ';
                        }
                        next;
@@ -2230,7 +2258,7 @@ sub string_find_replace {
 sub tabify {
        my ($leading) = @_;
 
-       my $source_indent = 8;
+       my $source_indent = $tabsize;
        my $max_spaces_before_tab = $source_indent - 1;
        my $spaces_to_tab = " " x $source_indent;
 
@@ -2272,6 +2300,19 @@ sub pos_last_openparen {
        return length(expand_tabs(substr($line, 0, $last_openparen))) + 1;
 }
 
+sub get_raw_comment {
+       my ($line, $rawline) = @_;
+       my $comment = '';
+
+       for my $i (0 .. (length($line) - 1)) {
+               if (substr($line, $i, 1) eq "$;") {
+                       $comment .= substr($rawline, $i, 1);
+               }
+       }
+
+       return $comment;
+}
+
 sub process {
        my $filename = shift;
 
@@ -2294,6 +2335,7 @@ sub process {
        my $is_binding_patch = -1;
        my $in_header_lines = $file ? 0 : 1;
        my $in_commit_log = 0;          #Scanning lines before patch
+       my $has_patch_separator = 0;    #Found a --- line
        my $has_commit_log = 0;         #Encountered lines before patch
        my $commit_log_lines = 0;       #Number of commit log lines
        my $commit_log_possible_stack_dump = 0;
@@ -2433,6 +2475,7 @@ sub process {
                $sline =~ s/$;/ /g;     #with comments as spaces
 
                my $rawline = $rawlines[$linenr - 1];
+               my $raw_comment = get_raw_comment($line, $rawline);
 
 # check if it's a mode change, rename or start of a patch
                if (!$in_commit_log &&
@@ -2604,21 +2647,26 @@ sub process {
                        $author = $1;
                        $author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i);
                        $author =~ s/"//g;
+                       $author = reformat_email($author);
                }
 
 # Check the patch for a signoff:
-               if ($line =~ /^\s*signed-off-by:/i) {
+               if ($line =~ /^\s*signed-off-by:\s*(.*)/i) {
                        $signoff++;
                        $in_commit_log = 0;
                        if ($author ne '') {
-                               my $l = $line;
-                               $l =~ s/"//g;
-                               if ($l =~ /^\s*signed-off-by:\s*\Q$author\E/i) {
-                                   $authorsignoff = 1;
+                               if (same_email_addresses($1, $author)) {
+                                       $authorsignoff = 1;
                                }
                        }
                }
 
+# Check for patch separator
+               if ($line =~ /^---$/) {
+                       $has_patch_separator = 1;
+                       $in_commit_log = 0;
+               }
+
 # Check if MAINTAINERS is being updated.  If so, there's probably no need to
 # emit the "does MAINTAINERS need updating?" message on file add/move/delete
                if ($line =~ /^\s*MAINTAINERS\s*\|/) {
@@ -2664,7 +2712,7 @@ sub process {
                                }
                        }
 
-                       my ($email_name, $email_address, $comment) = parse_email($email);
+                       my ($email_name, $name_comment, $email_address, $comment) = parse_email($email);
                        my $suggested_email = format_email(($email_name, $email_address));
                        if ($suggested_email eq "") {
                                ERROR("BAD_SIGN_OFF",
@@ -2675,9 +2723,7 @@ sub process {
                                $dequoted =~ s/" </ </;
                                # Don't force email to have quotes
                                # Allow just an angle bracketed address
-                               if ("$dequoted$comment" ne $email &&
-                                   "<$email_address>$comment" ne $email &&
-                                   "$suggested_email$comment" ne $email) {
+                               if (!same_email_addresses($email, $suggested_email)) {
                                        WARN("BAD_SIGN_OFF",
                                             "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr);
                                }
@@ -2720,10 +2766,10 @@ sub process {
                             "A patch subject line should describe the change not the tool that found it\n" . $herecurr);
                }
 
-# Check for unwanted Gerrit info
-               if ($in_commit_log && $line =~ /^\s*change-id:/i) {
+# Check for Gerrit Change-Ids not in any patch context
+               if ($realfile eq '' && !$has_patch_separator && $line =~ /^\s*change-id:/i) {
                        ERROR("GERRIT_CHANGE_ID",
-                             "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr);
+                             "Remove Gerrit Change-Id's before submitting upstream\n" . $herecurr);
                }
 
 # Check if the commit log is in a possible stack dump
@@ -2761,7 +2807,7 @@ sub process {
 
 # Check for git id commit length and improperly formed commit descriptions
                if ($in_commit_log && !$commit_log_possible_stack_dump &&
-                   $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink):/i &&
+                   $line !~ /^\s*(?:Link|Patchwork|http|https|BugLink|base-commit):/i &&
                    $line !~ /^This reverts commit [0-9a-f]{7,40}/ &&
                    ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
                     ($line =~ /(?:\s|^)[0-9a-f]{12,40}(?:[\s"'\(\[]|$)/i &&
@@ -3087,7 +3133,7 @@ sub process {
                                        $comment = '/*';
                                } elsif ($realfile =~ /\.(c|dts|dtsi)$/) {
                                        $comment = '//';
-                               } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) {
+                               } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc|yaml)$/) {
                                        $comment = '#';
                                } elsif ($realfile =~ /\.rst$/) {
                                        $comment = '..';
@@ -3111,6 +3157,17 @@ sub process {
                                                WARN("SPDX_LICENSE_TAG",
                                                     "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr);
                                        }
+                                       if ($realfile =~ m@^Documentation/devicetree/bindings/@ &&
+                                           not $spdx_license =~ /GPL-2\.0.*BSD-2-Clause/) {
+                                               my $msg_level = \&WARN;
+                                               $msg_level = \&CHK if ($file);
+                                               if (&{$msg_level}("SPDX_LICENSE_TAG",
+
+                                                                 "DT binding documents should be licensed (GPL-2.0-only OR BSD-2-Clause)\n" . $herecurr) &&
+                                                   $fix) {
+                                                       $fixed[$fixlinenr] =~ s/SPDX-License-Identifier: .*/SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)/;
+                                               }
+                                       }
                                }
                        }
                }
@@ -3198,7 +3255,7 @@ sub process {
                next if ($realfile !~ /\.(h|c|pl|dtsi|dts)$/);
 
 # at the beginning of a line any tabs must come first and anything
-# more than 8 must use tabs.
+# more than $tabsize must use tabs.
                if ($rawline =~ /^\+\s* \t\s*\S/ ||
                    $rawline =~ /^\+\s*        \s*/) {
                        my $herevet = "$here\n" . cat_vet($rawline) . "\n";
@@ -3217,7 +3274,7 @@ sub process {
                                "please, no space before tabs\n" . $herevet) &&
                            $fix) {
                                while ($fixed[$fixlinenr] =~
-                                          s/(^\+.*) {8,8}\t/$1\t\t/) {}
+                                          s/(^\+.*) {$tabsize,$tabsize}\t/$1\t\t/) {}
                                while ($fixed[$fixlinenr] =~
                                           s/(^\+.*) +\t/$1\t/) {}
                        }
@@ -3239,11 +3296,11 @@ sub process {
                if ($perl_version_ok &&
                    $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) {
                        my $indent = length($1);
-                       if ($indent % 8) {
+                       if ($indent % $tabsize) {
                                if (WARN("TABSTOP",
                                         "Statements should start on a tabstop\n" . $herecurr) &&
                                    $fix) {
-                                       $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/8)@e;
+                                       $fixed[$fixlinenr] =~ s@(^\+\t+) +@$1 . "\t" x ($indent/$tabsize)@e;
                                }
                        }
                }
@@ -3261,8 +3318,8 @@ sub process {
                                my $newindent = $2;
 
                                my $goodtabindent = $oldindent .
-                                       "\t" x ($pos / 8) .
-                                       " "  x ($pos % 8);
+                                       "\t" x ($pos / $tabsize) .
+                                       " "  x ($pos % $tabsize);
                                my $goodspaceindent = $oldindent . " "  x $pos;
 
                                if ($newindent ne $goodtabindent &&
@@ -3733,11 +3790,11 @@ sub process {
                        #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
 
                        if ($check && $s ne '' &&
-                           (($sindent % 8) != 0 ||
+                           (($sindent % $tabsize) != 0 ||
                             ($sindent < $indent) ||
                             ($sindent == $indent &&
                              ($s !~ /^\s*(?:\}|\{|else\b)/)) ||
-                            ($sindent > $indent + 8))) {
+                            ($sindent > $indent + $tabsize))) {
                                WARN("SUSPECT_CODE_INDENT",
                                     "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
                        }
@@ -4014,7 +4071,7 @@ sub process {
                }
 
 # check for function declarations without arguments like "int foo()"
-               if ($line =~ /(\b$Type\s+$Ident)\s*\(\s*\)/) {
+               if ($line =~ /(\b$Type\s*$Ident)\s*\(\s*\)/) {
                        if (ERROR("FUNCTION_WITHOUT_ARGS",
                                  "Bad function definition - $1() should probably be $1(void)\n" . $herecurr) &&
                            $fix) {
@@ -4582,7 +4639,7 @@ sub process {
                                            ($op eq '>' &&
                                             $ca =~ /<\S+\@\S+$/))
                                        {
-                                               $ok = 1;
+                                               $ok = 1;
                                        }
 
                                        # for asm volatile statements
@@ -4917,7 +4974,7 @@ sub process {
                        # conditional.
                        substr($s, 0, length($c), '');
                        $s =~ s/\n.*//g;
-                       $s =~ s/$;//g;  # Remove any comments
+                       $s =~ s/$;//g;  # Remove any comments
                        if (length($c) && $s !~ /^\s*{?\s*\\*\s*$/ &&
                            $c !~ /}\s*while\s*/)
                        {
@@ -4956,7 +5013,7 @@ sub process {
 # if and else should not have general statements after it
                if ($line =~ /^.\s*(?:}\s*)?else\b(.*)/) {
                        my $s = $1;
-                       $s =~ s/$;//g;  # Remove any comments
+                       $s =~ s/$;//g;  # Remove any comments
                        if ($s !~ /^\s*(?:\sif|(?:{|)\s*\\?\s*$)/) {
                                ERROR("TRAILING_STATEMENTS",
                                      "trailing statements should be on next line\n" . $herecurr);
@@ -5132,7 +5189,7 @@ sub process {
                        {
                        }
 
-                       # Flatten any obvious string concatentation.
+                       # Flatten any obvious string concatenation.
                        while ($dstat =~ s/($String)\s*$Ident/$1/ ||
                               $dstat =~ s/$Ident\s*($String)/$1/)
                        {
@@ -6230,13 +6287,17 @@ sub process {
                }
 
 # check for function declarations that have arguments without identifier names
+# while avoiding uninitialized_var(x)
                if (defined $stat &&
-                   $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s &&
-                   $1 ne "void") {
-                       my $args = trim($1);
+                   $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:($Ident)|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s &&
+                   (!defined($1) ||
+                    (defined($1) && $1 ne "uninitialized_var")) &&
+                    $2 ne "void") {
+                       my $args = trim($2);
                        while ($args =~ m/\s*($Type\s*(?:$Ident|\(\s*\*\s*$Ident?\s*\)\s*$balanced_parens)?)/g) {
                                my $arg = trim($1);
-                               if ($arg =~ /^$Type$/ && $arg !~ /enum\s+$Ident$/) {
+                               if ($arg =~ /^$Type$/ &&
+                                       $arg !~ /enum\s+$Ident$/) {
                                        WARN("FUNCTION_ARGUMENTS",
                                             "function definition argument '$arg' should also have an identifier name\n" . $herecurr);
                                }
@@ -6389,6 +6450,28 @@ sub process {
                        }
                }
 
+# check for /* fallthrough */ like comment, prefer fallthrough;
+               my @fallthroughs = (
+                       'fallthrough',
+                       '@fallthrough@',
+                       'lint -fallthrough[ \t]*',
+                       'intentional(?:ly)?[ \t]*fall(?:(?:s | |-)[Tt]|t)hr(?:ough|u|ew)',
+                       '(?:else,?\s*)?FALL(?:S | |-)?THR(?:OUGH|U|EW)[ \t.!]*(?:-[^\n\r]*)?',
+                       'Fall(?:(?:s | |-)[Tt]|t)hr(?:ough|u|ew)[ \t.!]*(?:-[^\n\r]*)?',
+                       'fall(?:s | |-)?thr(?:ough|u|ew)[ \t.!]*(?:-[^\n\r]*)?',
+                   );
+               if ($raw_comment ne '') {
+                       foreach my $ft (@fallthroughs) {
+                               if ($raw_comment =~ /$ft/) {
+                                       my $msg_level = \&WARN;
+                                       $msg_level = \&CHK if ($file);
+                                       &{$msg_level}("PREFER_FALLTHROUGH",
+                                                     "Prefer 'fallthrough;' over fallthrough comment\n" . $herecurr);
+                                       last;
+                               }
+                       }
+               }
+
 # check for switch/default statements without a break;
                if ($perl_version_ok &&
                    defined $stat &&
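
Among the checkpatch additions above, PREFER_FALLTHROUGH flags the many spellings of /* fall through */ comments and suggests the pseudo-keyword instead. In the kernel, fallthrough; expands to __attribute__((__fallthrough__)) on compilers that support it; a self-contained userspace sketch of the same idea (the macro below is a stand-in, not the kernel header):

#include <stdio.h>

#if defined(__has_attribute)
# if __has_attribute(__fallthrough__)
#  define fallthrough __attribute__((__fallthrough__))
# endif
#endif
#ifndef fallthrough
# define fallthrough do {} while (0)	/* fallback for old compilers */
#endif

static const char *describe(int n)
{
	switch (n) {
	case 0:
	case 1:
		return "small";
	case 2:
		printf("two is also small\n");
		fallthrough;	/* explicit, checkable with -Wimplicit-fallthrough */
	case 3:
		return "smallish";
	default:
		return "large";
	}
}

int main(void)
{
	printf("%s\n", describe(2));
	return 0;
}

Unlike a comment, the attribute is visible to -Wimplicit-fallthrough, so the compiler can verify that every fall-through is intentional.
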
index 2548ff8..06ac7bd 100644 (file)
@@ -497,7 +497,7 @@ struct rb_node *rb_next(const struct rb_node *node)
        if (node->rb_right) {
                node = node->rb_right;
                while (node->rb_left)
-                       node=node->rb_left;
+                       node = node->rb_left;
                return (struct rb_node *)node;
        }
 
@@ -528,7 +528,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
        if (node->rb_left) {
                node = node->rb_left;
                while (node->rb_right)
-                       node=node->rb_right;
+                       node = node->rb_right;
                return (struct rb_node *)node;
        }
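The two hunks above are whitespace-only, but the loops they touch are the textbook in-order successor/predecessor walk: step into the sibling subtree and descend all the way the other direction, otherwise climb through parent pointers until coming up from the near side. A plain-C sketch of the successor half, using a generic node with a parent pointer rather than the kernel's rb_node:

struct node {
	struct node *left, *right, *parent;
};

/* In-order successor: if there is a right subtree, its leftmost node is
 * next; otherwise climb until we arrive at a parent from its left child. */
static struct node *node_next(const struct node *n)
{
	struct node *parent;

	if (n->right) {
		n = n->right;
		while (n->left)
			n = n->left;
		return (struct node *)n;
	}
	while ((parent = n->parent) && n == parent->right)
		n = parent;
	return parent;
}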
 
index dbebf05..47f9cc9 100644 (file)
@@ -21,8 +21,8 @@ DRIVERS := ../../../drivers
 NVDIMM_SRC := $(DRIVERS)/nvdimm
 ACPI_SRC := $(DRIVERS)/acpi/nfit
 DAX_SRC := $(DRIVERS)/dax
-ccflags-y := -I$(src)/$(NVDIMM_SRC)/
-ccflags-y += -I$(src)/$(ACPI_SRC)/
+ccflags-y := -I$(srctree)/drivers/nvdimm/
+ccflags-y += -I$(srctree)/drivers/acpi/nfit/
 
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
index fb3c3d7..75baebf 100644 (file)
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y := -I$(src)/../../../../drivers/nvdimm/
-ccflags-y += -I$(src)/../../../../drivers/acpi/nfit/
+ccflags-y := -I$(srctree)/drivers/nvdimm/
+ccflags-y += -I$(srctree)/drivers/acpi/nfit/
 
 obj-m += nfit_test.o
 obj-m += nfit_test_iomap.o
index bf6422a..a8ee5c4 100644 (file)
@@ -3164,7 +3164,9 @@ static __init int nfit_test_init(void)
        mcsafe_test();
        dax_pmem_test();
        dax_pmem_core_test();
+#ifdef CONFIG_DEV_DAX_PMEM_COMPAT
        dax_pmem_compat_test();
+#endif
 
        nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
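dax_pmem_compat_test() only exists when CONFIG_DEV_DAX_PMEM_COMPAT is built, so the call site above gains an #ifdef guard. The other common way to handle an optional call like this in kernel code is a static-inline stub behind the config symbol, so callers stay unconditional; a sketch with a made-up CONFIG_FOO to show the shape:

/* optional_call.c: the stub pattern, with a hypothetical CONFIG_FOO.
 * Build the "feature on" variant with: cc -DCONFIG_FOO optional_call.c */
#include <stdio.h>

#ifdef CONFIG_FOO
static void foo_selftest(void)
{
	puts("foo self-test ran");
}
#else
/* compiled out: callers still build, the call folds away */
static inline void foo_selftest(void) { }
#endif

int main(void)
{
	foo_selftest();		/* no #ifdef needed at the call site */
	return 0;
}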
 
index 37a04da..11eee0b 100644 (file)
@@ -7,13 +7,14 @@
 #include <pthread.h>
 #include <sys/epoll.h>
 #include <sys/socket.h>
+#include <sys/eventfd.h>
 #include "../../kselftest_harness.h"
 
 struct epoll_mtcontext
 {
        int efd[3];
        int sfd[4];
-       int count;
+       volatile int count;
 
        pthread_t main;
        pthread_t waiter;
@@ -3071,4 +3072,68 @@ TEST(epoll58)
        close(ctx.sfd[3]);
 }
 
+static void *epoll59_thread(void *ctx_)
+{
+       struct epoll_mtcontext *ctx = ctx_;
+       struct epoll_event e;
+       int i;
+
+       for (i = 0; i < 100000; i++) {
+               while (ctx->count == 0)
+                       ;
+
+               e.events = EPOLLIN | EPOLLERR | EPOLLET;
+               epoll_ctl(ctx->efd[0], EPOLL_CTL_MOD, ctx->sfd[0], &e);
+               ctx->count = 0;
+       }
+
+       return NULL;
+}
+
+/*
+ *        t0
+ *      (p) \
+ *           e0
+ *     (et) /
+ *        e0
+ *
+ * Based on https://bugzilla.kernel.org/show_bug.cgi?id=205933
+ */
+TEST(epoll59)
+{
+       pthread_t emitter;
+       struct pollfd pfd;
+       struct epoll_event e;
+       struct epoll_mtcontext ctx = { 0 };
+       int i, ret;
+
+       signal(SIGUSR1, signal_handler);
+
+       ctx.efd[0] = epoll_create1(0);
+       ASSERT_GE(ctx.efd[0], 0);
+
+       ctx.sfd[0] = eventfd(1, 0);
+       ASSERT_GE(ctx.sfd[0], 0);
+
+       e.events = EPOLLIN | EPOLLERR | EPOLLET;
+       ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+       ASSERT_EQ(pthread_create(&emitter, NULL, epoll59_thread, &ctx), 0);
+
+       for (i = 0; i < 100000; i++) {
+               ret = epoll_wait(ctx.efd[0], &e, 1, 1000);
+               ASSERT_GT(ret, 0);
+
+               while (ctx.count != 0)
+                       ;
+               ctx.count = 1;
+       }
+       if (pthread_tryjoin_np(emitter, NULL) < 0) {
+               pthread_kill(emitter, SIGUSR1);
+               pthread_join(emitter, NULL);
+       }
+       close(ctx.efd[0]);
+       close(ctx.sfd[0]);
+}
+
 TEST_HARNESS_MAIN
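
epoll59 hammers the race between an edge-triggered wakeup and a concurrent EPOLL_CTL_MOD that re-arms the same eventfd: the waiter must never miss the event the modifier just re-posted. The single-threaded sketch below (assuming Linux) shows the two primitives the test leans on, an eventfd that starts out readable and EPOLL_CTL_MOD re-reporting a still-ready fd even under EPOLLET:

#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = epoll_create1(0);
	int evt = eventfd(1, EFD_NONBLOCK);	/* counter starts at 1: readable */
	struct epoll_event e = { .events = EPOLLIN | EPOLLERR | EPOLLET };

	if (efd < 0 || evt < 0 || epoll_ctl(efd, EPOLL_CTL_ADD, evt, &e) < 0)
		return 1;

	for (int i = 0; i < 3; i++) {
		struct epoll_event out;
		int n = epoll_wait(efd, &out, 1, 1000);

		printf("round %d: %d event(s)\n", i, n);

		/* Edge-triggered: without a new edge there is no new wakeup.
		 * Re-issuing EPOLL_CTL_MOD while the fd is still readable must
		 * queue one, which is what epoll59 does from a second thread. */
		epoll_ctl(efd, EPOLL_CTL_MOD, evt, &e);
	}
	close(evt);
	close(efd);
	return 0;
}
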
index f988d2f..8a8d0f4 100755 (executable)
@@ -41,6 +41,11 @@ for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
                continue;
        fi
 
+       if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
+               echo "$dev, Skipped: ahci doesn't support recovery"
+               continue
+       fi
+
        # Don't inject errors into an already-frozen PE. This happens with
        # PEs that contain multiple PCI devices (e.g. multi-function cards)
        # and injecting new errors during the recovery process will probably
index 0b0db8d..5881e97 100644 (file)
@@ -25,6 +25,7 @@ $(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -
 $(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64
 $(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64
 $(OUTPUT)/tm-signal-pagefault: CFLAGS += -pthread -m64
+$(OUTPUT)/tm-poison: CFLAGS += -m64
 
 SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS))
 $(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S
index d336277..61e5cfe 100644 (file)
@@ -54,6 +54,7 @@
 #include <linux/userfaultfd.h>
 #include <setjmp.h>
 #include <stdbool.h>
+#include <assert.h>
 
 #include "../kselftest.h"
 
@@ -76,6 +77,8 @@ static int test_type;
 #define ALARM_INTERVAL_SECS 10
 static volatile bool test_uffdio_copy_eexist = true;
 static volatile bool test_uffdio_zeropage_eexist = true;
+/* Whether to test uffd write-protection */
+static bool test_uffdio_wp = false;
 
 static bool map_shared;
 static int huge_fd;
@@ -86,6 +89,13 @@ static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
 static char *zeropage;
 pthread_attr_t attr;
 
+/* Userfaultfd test statistics */
+struct uffd_stats {
+       int cpu;
+       unsigned long missing_faults;
+       unsigned long wp_faults;
+};
+
 /* pthread_mutex_t starts at page offset 0 */
 #define area_mutex(___area, ___nr)                                     \
        ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
@@ -125,6 +135,37 @@ static void usage(void)
        exit(1);
 }
 
+static void uffd_stats_reset(struct uffd_stats *uffd_stats,
+                            unsigned long n_cpus)
+{
+       int i;
+
+       for (i = 0; i < n_cpus; i++) {
+               uffd_stats[i].cpu = i;
+               uffd_stats[i].missing_faults = 0;
+               uffd_stats[i].wp_faults = 0;
+       }
+}
+
+static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
+{
+       int i;
+       unsigned long long miss_total = 0, wp_total = 0;
+
+       for (i = 0; i < n_cpus; i++) {
+               miss_total += stats[i].missing_faults;
+               wp_total += stats[i].wp_faults;
+       }
+
+       printf("userfaults: %llu missing (", miss_total);
+       for (i = 0; i < n_cpus; i++)
+               printf("%lu+", stats[i].missing_faults);
+       printf("\b), %llu wp (", wp_total);
+       for (i = 0; i < n_cpus; i++)
+               printf("%lu+", stats[i].wp_faults);
+       printf("\b)\n");
+}
+
 static int anon_release_pages(char *rel_area)
 {
        int ret = 0;
@@ -245,10 +286,15 @@ struct uffd_test_ops {
        void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
 };
 
-#define ANON_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
+#define SHMEM_EXPECTED_IOCTLS          ((1 << _UFFDIO_WAKE) | \
                                         (1 << _UFFDIO_COPY) | \
                                         (1 << _UFFDIO_ZEROPAGE))
 
+#define ANON_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
+                                        (1 << _UFFDIO_COPY) | \
+                                        (1 << _UFFDIO_ZEROPAGE) | \
+                                        (1 << _UFFDIO_WRITEPROTECT))
+
 static struct uffd_test_ops anon_uffd_test_ops = {
        .expected_ioctls = ANON_EXPECTED_IOCTLS,
        .allocate_area  = anon_allocate_area,
@@ -257,7 +303,7 @@ static struct uffd_test_ops anon_uffd_test_ops = {
 };
 
 static struct uffd_test_ops shmem_uffd_test_ops = {
-       .expected_ioctls = ANON_EXPECTED_IOCTLS,
+       .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
        .allocate_area  = shmem_allocate_area,
        .release_pages  = shmem_release_pages,
        .alias_mapping = noop_alias_mapping,
@@ -281,6 +327,21 @@ static int my_bcmp(char *str1, char *str2, size_t n)
        return 0;
 }
 
+static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+       struct uffdio_writeprotect prms = { 0 };
+
+       /* Write protection page faults */
+       prms.range.start = start;
+       prms.range.len = len;
+       /* Undo write-protect, do wakeup after that */
+       prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+       if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+               fprintf(stderr, "clear WP failed for address 0x%Lx\n",
+                       start), exit(1);
+}
+
 static void *locking_thread(void *arg)
 {
        unsigned long cpu = (unsigned long) arg;
@@ -419,7 +480,10 @@ static int __copy_page(int ufd, unsigned long offset, bool retry)
        uffdio_copy.dst = (unsigned long) area_dst + offset;
        uffdio_copy.src = (unsigned long) area_src + offset;
        uffdio_copy.len = page_size;
-       uffdio_copy.mode = 0;
+       if (test_uffdio_wp)
+               uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+       else
+               uffdio_copy.mode = 0;
        uffdio_copy.copy = 0;
        if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
                /* real retval in ufdio_copy.copy */
@@ -467,8 +531,8 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg)
        return 0;
 }
 
-/* Return 1 if page fault handled by us; otherwise 0 */
-static int uffd_handle_page_fault(struct uffd_msg *msg)
+static void uffd_handle_page_fault(struct uffd_msg *msg,
+                                  struct uffd_stats *stats)
 {
        unsigned long offset;
 
@@ -476,25 +540,32 @@ static int uffd_handle_page_fault(struct uffd_msg *msg)
                fprintf(stderr, "unexpected msg event %u\n",
                        msg->event), exit(1);
 
-       if (bounces & BOUNCE_VERIFY &&
-           msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
-               fprintf(stderr, "unexpected write fault\n"), exit(1);
+       if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+               wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+               stats->wp_faults++;
+       } else {
+               /* Missing page faults */
+               if (bounces & BOUNCE_VERIFY &&
+                   msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+                       fprintf(stderr, "unexpected write fault\n"), exit(1);
 
-       offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
-       offset &= ~(page_size-1);
+               offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+               offset &= ~(page_size-1);
 
-       return copy_page(uffd, offset);
+               if (copy_page(uffd, offset))
+                       stats->missing_faults++;
+       }
 }
 
 static void *uffd_poll_thread(void *arg)
 {
-       unsigned long cpu = (unsigned long) arg;
+       struct uffd_stats *stats = (struct uffd_stats *)arg;
+       unsigned long cpu = stats->cpu;
        struct pollfd pollfd[2];
        struct uffd_msg msg;
        struct uffdio_register uffd_reg;
        int ret;
        char tmp_chr;
-       unsigned long userfaults = 0;
 
        pollfd[0].fd = uffd;
        pollfd[0].events = POLLIN;
@@ -524,7 +595,7 @@ static void *uffd_poll_thread(void *arg)
                                msg.event), exit(1);
                        break;
                case UFFD_EVENT_PAGEFAULT:
-                       userfaults += uffd_handle_page_fault(&msg);
+                       uffd_handle_page_fault(&msg, stats);
                        break;
                case UFFD_EVENT_FORK:
                        close(uffd);
@@ -543,50 +614,67 @@ static void *uffd_poll_thread(void *arg)
                        break;
                }
        }
-       return (void *)userfaults;
+
+       return NULL;
 }
 
 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 static void *uffd_read_thread(void *arg)
 {
-       unsigned long *this_cpu_userfaults;
+       struct uffd_stats *stats = (struct uffd_stats *)arg;
        struct uffd_msg msg;
 
-       this_cpu_userfaults = (unsigned long *) arg;
-       *this_cpu_userfaults = 0;
-
        pthread_mutex_unlock(&uffd_read_mutex);
        /* from here cancellation is ok */
 
        for (;;) {
                if (uffd_read_msg(uffd, &msg))
                        continue;
-               (*this_cpu_userfaults) += uffd_handle_page_fault(&msg);
+               uffd_handle_page_fault(&msg, stats);
        }
-       return (void *)NULL;
+
+       return NULL;
 }
 
 static void *background_thread(void *arg)
 {
        unsigned long cpu = (unsigned long) arg;
-       unsigned long page_nr;
+       unsigned long page_nr, start_nr, mid_nr, end_nr;
+
+       start_nr = cpu * nr_pages_per_cpu;
+       end_nr = (cpu+1) * nr_pages_per_cpu;
+       mid_nr = (start_nr + end_nr) / 2;
 
-       for (page_nr = cpu * nr_pages_per_cpu;
-            page_nr < (cpu+1) * nr_pages_per_cpu;
-            page_nr++)
+       /* Copy the first half of the pages */
+       for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
+               copy_page_retry(uffd, page_nr * page_size);
+
+       /*
+        * If we need to test uffd-wp, set it up now.  Then we'll have
+        * at least the first half of the pages mapped already which
+        * can be write-protected for testing
+        */
+       if (test_uffdio_wp)
+               wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
+                       nr_pages_per_cpu * page_size, true);
+
+       /*
+        * Continue the 2nd half of the page copying, handling write
+        * protection faults if any
+        */
+       for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
                copy_page_retry(uffd, page_nr * page_size);
 
        return NULL;
 }
 
-static int stress(unsigned long *userfaults)
+static int stress(struct uffd_stats *uffd_stats)
 {
        unsigned long cpu;
        pthread_t locking_threads[nr_cpus];
        pthread_t uffd_threads[nr_cpus];
        pthread_t background_threads[nr_cpus];
-       void **_userfaults = (void **) userfaults;
 
        finished = 0;
        for (cpu = 0; cpu < nr_cpus; cpu++) {
@@ -595,12 +683,13 @@ static int stress(unsigned long *userfaults)
                        return 1;
                if (bounces & BOUNCE_POLL) {
                        if (pthread_create(&uffd_threads[cpu], &attr,
-                                          uffd_poll_thread, (void *)cpu))
+                                          uffd_poll_thread,
+                                          (void *)&uffd_stats[cpu]))
                                return 1;
                } else {
                        if (pthread_create(&uffd_threads[cpu], &attr,
                                           uffd_read_thread,
-                                          &_userfaults[cpu]))
+                                          (void *)&uffd_stats[cpu]))
                                return 1;
                        pthread_mutex_lock(&uffd_read_mutex);
                }
@@ -637,7 +726,8 @@ static int stress(unsigned long *userfaults)
                                fprintf(stderr, "pipefd write error\n");
                                return 1;
                        }
-                       if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
+                       if (pthread_join(uffd_threads[cpu],
+                                        (void *)&uffd_stats[cpu]))
                                return 1;
                } else {
                        if (pthread_cancel(uffd_threads[cpu]))
@@ -735,17 +825,31 @@ static int faulting_process(int signal_test)
        }
 
        for (nr = 0; nr < split_nr_pages; nr++) {
+               int steps = 1;
+               unsigned long offset = nr * page_size;
+
                if (signal_test) {
                        if (sigsetjmp(*sigbuf, 1) != 0) {
-                               if (nr == lastnr) {
+                               if (steps == 1 && nr == lastnr) {
                                        fprintf(stderr, "Signal repeated\n");
                                        return 1;
                                }
 
                                lastnr = nr;
                                if (signal_test == 1) {
-                                       if (copy_page(uffd, nr * page_size))
-                                               signalled++;
+                                       if (steps == 1) {
+                                               /* This is a MISSING request */
+                                               steps++;
+                                               if (copy_page(uffd, offset))
+                                                       signalled++;
+                                       } else {
+                                               /* This is a WP request */
+                                               assert(steps == 2);
+                                               wp_range(uffd,
+                                                        (__u64)area_dst +
+                                                        offset,
+                                                        page_size, false);
+                                       }
                                } else {
                                        signalled++;
                                        continue;
@@ -758,8 +862,13 @@ static int faulting_process(int signal_test)
                        fprintf(stderr,
                                "nr %lu memory corruption %Lu %Lu\n",
                                nr, count,
-                               count_verify[nr]), exit(1);
-               }
+                               count_verify[nr]);
+               }
+               /*
+                * Trigger write protection if there is any by writing
+                * the same value back.
+                */
+               *area_count(area_dst, nr) = count;
        }
 
        if (signal_test)
@@ -781,6 +890,11 @@ static int faulting_process(int signal_test)
                                nr, count,
                                count_verify[nr]), exit(1);
                }
+               /*
+                * Trigger write protection if there is any by writing
+                * the same value back.
+                */
+               *area_count(area_dst, nr) = count;
        }
 
        if (uffd_test_ops->release_pages(area_dst))
@@ -884,6 +998,8 @@ static int userfaultfd_zeropage_test(void)
        uffdio_register.range.start = (unsigned long) area_dst;
        uffdio_register.range.len = nr_pages * page_size;
        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (test_uffdio_wp)
+               uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                fprintf(stderr, "register failure\n"), exit(1);
 
@@ -908,11 +1024,11 @@ static int userfaultfd_events_test(void)
 {
        struct uffdio_register uffdio_register;
        unsigned long expected_ioctls;
-       unsigned long userfaults;
        pthread_t uffd_mon;
        int err, features;
        pid_t pid;
        char c;
+       struct uffd_stats stats = { 0 };
 
        printf("testing events (fork, remap, remove): ");
        fflush(stdout);
@@ -929,6 +1045,8 @@ static int userfaultfd_events_test(void)
        uffdio_register.range.start = (unsigned long) area_dst;
        uffdio_register.range.len = nr_pages * page_size;
        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (test_uffdio_wp)
+               uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                fprintf(stderr, "register failure\n"), exit(1);
 
@@ -939,7 +1057,7 @@ static int userfaultfd_events_test(void)
                        "unexpected missing ioctl for anon memory\n"),
                        exit(1);
 
-       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
                perror("uffd_poll_thread create"), exit(1);
 
        pid = fork();
@@ -955,13 +1073,14 @@ static int userfaultfd_events_test(void)
 
        if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
                perror("pipe write"), exit(1);
-       if (pthread_join(uffd_mon, (void **)&userfaults))
+       if (pthread_join(uffd_mon, NULL))
                return 1;
 
        close(uffd);
-       printf("userfaults: %ld\n", userfaults);
 
-       return userfaults != nr_pages;
+       uffd_stats_report(&stats, 1);
+
+       return stats.missing_faults != nr_pages;
 }
 
 static int userfaultfd_sig_test(void)
@@ -973,6 +1092,7 @@ static int userfaultfd_sig_test(void)
        int err, features;
        pid_t pid;
        char c;
+       struct uffd_stats stats = { 0 };
 
        printf("testing signal delivery: ");
        fflush(stdout);
@@ -988,6 +1108,8 @@ static int userfaultfd_sig_test(void)
        uffdio_register.range.start = (unsigned long) area_dst;
        uffdio_register.range.len = nr_pages * page_size;
        uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (test_uffdio_wp)
+               uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                fprintf(stderr, "register failure\n"), exit(1);
 
@@ -1004,7 +1126,7 @@ static int userfaultfd_sig_test(void)
        if (uffd_test_ops->release_pages(area_dst))
                return 1;
 
-       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
+       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
                perror("uffd_poll_thread create"), exit(1);
 
        pid = fork();
@@ -1030,6 +1152,7 @@ static int userfaultfd_sig_test(void)
        close(uffd);
        return userfaults != 0;
 }
+
 static int userfaultfd_stress(void)
 {
        void *area;
@@ -1038,7 +1161,7 @@ static int userfaultfd_stress(void)
        struct uffdio_register uffdio_register;
        unsigned long cpu;
        int err;
-       unsigned long userfaults[nr_cpus];
+       struct uffd_stats uffd_stats[nr_cpus];
 
        uffd_test_ops->allocate_area((void **)&area_src);
        if (!area_src)
@@ -1119,6 +1242,8 @@ static int userfaultfd_stress(void)
                uffdio_register.range.start = (unsigned long) area_dst;
                uffdio_register.range.len = nr_pages * page_size;
                uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+               if (test_uffdio_wp)
+                       uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
                if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
                        fprintf(stderr, "register failure\n");
                        return 1;
@@ -1167,10 +1292,17 @@ static int userfaultfd_stress(void)
                if (uffd_test_ops->release_pages(area_dst))
                        return 1;
 
+               uffd_stats_reset(uffd_stats, nr_cpus);
+
                /* bounce pass */
-               if (stress(userfaults))
+               if (stress(uffd_stats))
                        return 1;
 
+               /* Clear all the write protections if there is any */
+               if (test_uffdio_wp)
+                       wp_range(uffd, (unsigned long)area_dst,
+                                nr_pages * page_size, false);
+
                /* unregister */
                if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
                        fprintf(stderr, "unregister failure\n");
@@ -1209,10 +1341,7 @@ static int userfaultfd_stress(void)
                area_src_alias = area_dst_alias;
                area_dst_alias = tmp_area;
 
-               printf("userfaults:");
-               for (cpu = 0; cpu < nr_cpus; cpu++)
-                       printf(" %lu", userfaults[cpu]);
-               printf("\n");
+               uffd_stats_report(uffd_stats, nr_cpus);
        }
 
        if (err)
@@ -1252,6 +1381,8 @@ static void set_test_type(const char *type)
        if (!strcmp(type, "anon")) {
                test_type = TEST_ANON;
                uffd_test_ops = &anon_uffd_test_ops;
+               /* Only enable write-protect test for anonymous test */
+               test_uffdio_wp = true;
        } else if (!strcmp(type, "hugetlb")) {
                test_type = TEST_HUGETLB;
                uffd_test_ops = &hugetlb_uffd_test_ops;
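
The selftest changes above register anonymous areas with UFFDIO_REGISTER_MODE_WP, write-protect ranges with UFFDIO_WRITEPROTECT, and resolve UFFD_PAGEFAULT_FLAG_WP faults by clearing the protection again. A stripped-down sketch of that flow for a single anonymous page, assuming a 5.7+ kernel with uffd-wp support and with error handling mostly omitted (this mirrors wp_range() and the WP branch of the handler, it is not the selftest itself):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int uffd;
static long page_size;

static void wp_range(unsigned long start, unsigned long len, int wp)
{
	struct uffdio_writeprotect prms = {
		.range = { .start = start, .len = len },
		.mode  = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	ioctl(uffd, UFFDIO_WRITEPROTECT, &prms);
}

static void *fault_handler(void *arg)
{
	struct uffd_msg msg;

	/* One blocking read is enough for this single-page demo. */
	read(uffd, &msg, sizeof(msg));
	if (msg.event == UFFD_EVENT_PAGEFAULT &&
	    (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
		printf("wp fault at %llx, resolving\n",
		       (unsigned long long)msg.arg.pagefault.address);
		/* Drop the protection; mode 0 also wakes the faulting thread. */
		wp_range(msg.arg.pagefault.address & ~(page_size - 1),
			 page_size, 0);
	}
	return NULL;
}

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	pthread_t thr;
	char *area;

	page_size = sysconf(_SC_PAGESIZE);
	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	ioctl(uffd, UFFDIO_API, &api);

	area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	area[0] = 1;			/* populate the page first */

	reg.range.start = (unsigned long)area;
	reg.range.len = page_size;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP;
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	wp_range((unsigned long)area, page_size, 1);	/* write-protect it */
	pthread_create(&thr, NULL, fault_handler, NULL);

	area[0] = 2;			/* faults; handler un-protects; resumes */
	pthread_join(thr, NULL);
	printf("value after resume: %d\n", area[0]);
	return 0;
}
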
index 83ec6e4..7eb3216 100644 (file)
@@ -46,7 +46,7 @@ static void   start_daemon_mode(void);
 
 pthread_t event_tid;
 pthread_mutex_t input_lock;
-void usage()
+void usage(void)
 {
        printf("Usage: tmon [OPTION...]\n");
        printf("  -c, --control         cooling device in control\n");
@@ -62,7 +62,7 @@ void usage()
        exit(0);
 }
 
-void version()
+void version(void)
 {
        printf("TMON version %s\n", VERSION);
        exit(EXIT_SUCCESS);
@@ -70,7 +70,6 @@ void version()
 
 static void tmon_cleanup(void)
 {
-
        syslog(LOG_INFO, "TMON exit cleanup\n");
        fflush(stdout);
        refresh();
@@ -96,7 +95,6 @@ static void tmon_cleanup(void)
        exit(1);
 }
 
-
 static void tmon_sig_handler(int sig)
 {
        syslog(LOG_INFO, "TMON caught signal %d\n", sig);
@@ -120,7 +118,6 @@ static void tmon_sig_handler(int sig)
        tmon_exit = true;
 }
 
-
 static void start_syslog(void)
 {
        if (debug_on)
@@ -167,7 +164,6 @@ static void prepare_logging(void)
                return;
        }
 
-
        fprintf(tmon_log, "#----------- THERMAL SYSTEM CONFIG -------------\n");
        for (i = 0; i < ptdata.nr_tz_sensor; i++) {
                char binding_str[33]; /* size of long + 1 */
@@ -175,7 +171,7 @@ static void prepare_logging(void)
 
                memset(binding_str, 0, sizeof(binding_str));
                for (j = 0; j < 32; j++)
-                       binding_str[j] = (ptdata.tzi[i].cdev_binding & 1<<j) ?
+                       binding_str[j] = (ptdata.tzi[i].cdev_binding & (1 << j)) ?
                                '1' : '0';
 
                fprintf(tmon_log, "#thermal zone %s%02d cdevs binding: %32s\n",
@@ -187,7 +183,6 @@ static void prepare_logging(void)
                                trip_type_name[ptdata.tzi[i].tp[j].type],
                                ptdata.tzi[i].tp[j].temp);
                }
-
        }
 
        for (i = 0; i < ptdata.nr_cooling_dev; i++)
@@ -219,7 +214,6 @@ static struct option opts[] = {
        { 0, 0, NULL, 0 }
 };
 
-
 int main(int argc, char **argv)
 {
        int err = 0;
@@ -283,7 +277,7 @@ int main(int argc, char **argv)
        if (signal(SIGINT, tmon_sig_handler) == SIG_ERR)
                syslog(LOG_DEBUG, "Cannot handle SIGINT\n");
        if (signal(SIGTERM, tmon_sig_handler) == SIG_ERR)
-               syslog(LOG_DEBUG, "Cannot handle SIGINT\n");
+               syslog(LOG_DEBUG, "Cannot handle SIGTERM\n");
 
        if (probe_thermal_sysfs()) {
                pthread_mutex_destroy(&input_lock);
@@ -328,8 +322,7 @@ int main(int argc, char **argv)
                        show_cooling_device();
                }
                time_elapsed += ticktime;
-               controller_handler(trec[0].temp[target_tz_index] / 1000,
-                               &yk);
+               controller_handler(trec[0].temp[target_tz_index] / 1000, &yk);
                trec[0].pid_out_pct = yk;
                if (!dialogue_on)
                        show_control_w();
@@ -340,14 +333,15 @@ int main(int argc, char **argv)
        return 0;
 }
 
-static void start_daemon_mode()
+static void start_daemon_mode(void)
 {
        daemon_mode = 1;
        /* fork */
        pid_t   sid, pid = fork();
-       if (pid < 0) {
+
+       if (pid < 0)
                exit(EXIT_FAILURE);
-       else if (pid > 0)
+       else if (pid > 0)
                /* kill parent */
                exit(EXIT_SUCCESS);
 
@@ -366,11 +360,9 @@ static void start_daemon_mode()
        if ((chdir("/")) < 0)
                exit(EXIT_FAILURE);
 
-
        sleep(10);
 
        close(STDIN_FILENO);
        close(STDOUT_FILENO);
        close(STDERR_FILENO);
-
 }
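
Besides the (void) prototypes and whitespace cleanups, the hunk above untangles the fork error path in start_daemon_mode(), which follows the usual daemonization recipe. A standalone sketch of that recipe (generic, not tmon's exact code):

#include <stdlib.h>
#include <unistd.h>

/* Classic daemonization: fork and let the parent exit, start a new
 * session, move to a safe cwd, and detach the standard descriptors. */
static void daemonize(void)
{
	pid_t pid = fork();

	if (pid < 0)
		exit(EXIT_FAILURE);
	if (pid > 0)		/* parent goes away */
		exit(EXIT_SUCCESS);

	if (setsid() < 0)	/* become session leader, lose the tty */
		exit(EXIT_FAILURE);
	if (chdir("/") < 0)
		exit(EXIT_FAILURE);

	close(STDIN_FILENO);
	close(STDOUT_FILENO);
	close(STDERR_FILENO);
}

int main(void)
{
	daemonize();
	/* ... daemon work would go here ... */
	return 0;
}
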
index 8e2a908..f33f32f 100644 (file)
@@ -8,7 +8,32 @@ CFLAGS += -g -O2 -Werror -Wall -I. -I../include/ -I ../../usr/include/ -Wno-poin
 vpath %.c ../../drivers/virtio ../../drivers/vhost
 mod:
        ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test V=${V}
-.PHONY: all test mod clean
+
+#oot: build vhost as an out of tree module for a distro kernel
+#no effort is taken to make it actually build or work, but tends to mostly work
+#if the distro kernel is very close to upstream
+#unsupported! this is a development tool only, don't use the
+#resulting modules in production!
+OOT_KSRC=/lib/modules/$$(uname -r)/build
+OOT_VHOST=`pwd`/../../drivers/vhost
+#Everyone depends on vhost
+#Tweak the below to enable more modules
+OOT_CONFIGS=\
+       CONFIG_VHOST=m \
+       CONFIG_VHOST_NET=n \
+       CONFIG_VHOST_SCSI=n \
+       CONFIG_VHOST_VSOCK=n
+OOT_BUILD=KCFLAGS="-I "${OOT_VHOST} ${MAKE} -C ${OOT_KSRC} V=${V}
+oot-build:
+       echo "UNSUPPORTED! Don't use the resulting modules in production!"
+       ${OOT_BUILD} M=`pwd`/vhost_test
+       ${OOT_BUILD} M=${OOT_VHOST} ${OOT_CONFIGS}
+
+oot-clean: oot-build
+oot: oot-build
+oot-clean: OOT_BUILD+=clean
+
+.PHONY: all test mod clean vhost oot oot-clean oot-build
 clean:
        ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
               vhost_test/Module.symvers vhost_test/modules.order *.d