
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
author     Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 22 Nov 2013 17:57:35 +0000 (09:57 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 22 Nov 2013 17:57:35 +0000 (09:57 -0800)
Pull networking fixes from David Miller:

 1) Fix memory leaks and other issues in mwifiex driver, from Amitkumar
    Karwar.

 2) skb_segment() can choke on packets using frag lists, fix from
    Herbert Xu with help from Eric Dumazet and others.

 3) IPv4 output cached route instantiation properly handles races
    involving two threads trying to install the same route, but we
    forgot to propagate this logic to input routes as well.  Fix from
    Alexei Starovoitov.

 4) Put protections in place to make sure that recvmsg() paths never
    accidentally copy uninitialized memory back into userspace and also
    make sure that we never try to use more than sockaddr_storage for
    building the on-kernel-stack copy of a sockaddr.  Fixes from Hannes
    Frederic Sowa.

 5) R8152 driver transmit flow bug fixes from Hayes Wang.

 6) Fix some minor fallouts from genetlink changes, from Johannes Berg
    and Michael Opdenacker.

 7) AF_PACKET sendmsg path can race with netdevice unregister notifier,
    fix by using RCU to make sure the network device doesn't go away
    from under us.  Fix from Daniel Borkmann.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (43 commits)
  gso: handle new frag_list of frags GRO packets
  genetlink: fix genl_set_err() group ID
  genetlink: fix genlmsg_multicast() bug
  packet: fix use after free race in send path when dev is released
  xen-netback: stop the VIF thread before unbinding IRQs
  wimax: remove dead code
  net/phy: Add the autocross feature for forced links on VSC82x4
  net/phy: Add VSC8662 support
  net/phy: Add VSC8574 support
  net/phy: Add VSC8234 support
  net: add BUG_ON if kernel advertises msg_namelen > sizeof(struct sockaddr_storage)
  net: rework recvmsg handler msg_name and msg_namelen logic
  bridge: flush br's address entry in fdb when remove the bridge device
  net: core: Always propagate flag changes to interfaces
  ipv4: fix race in concurrent ip_route_input_slow()
  r8152: fix incorrect type in assignment
  r8152: support stopping/waking tx queue
  r8152: modify the tx flow
  r8152: fix tx/rx memory overflow
  netfilter: ebt_ip6: fix source and destination matching
  ...

468 files changed:
Documentation/assoc_array.txt [new file with mode: 0644]
Documentation/devicetree/bindings/dma/atmel-dma.txt
Documentation/devicetree/bindings/i2c/trivial-devices.txt
Documentation/devicetree/bindings/powerpc/fsl/dma.txt
Documentation/dmatest.txt
Documentation/filesystems/btrfs.txt
Documentation/kernel-parameters.txt
Documentation/power/runtime_pm.txt
Documentation/security/00-INDEX
Documentation/security/IMA-templates.txt [new file with mode: 0644]
Documentation/security/keys.txt
Documentation/vm/split_page_table_lock
MAINTAINERS
arch/alpha/Kconfig
arch/alpha/include/asm/machvec.h
arch/alpha/include/asm/pal.h
arch/alpha/include/asm/rtc.h
arch/alpha/include/asm/string.h
arch/alpha/include/uapi/asm/pal.h
arch/alpha/kernel/Makefile
arch/alpha/kernel/alpha_ksyms.c
arch/alpha/kernel/irq_alpha.c
arch/alpha/kernel/machvec_impl.h
arch/alpha/kernel/perf_event.c
arch/alpha/kernel/process.c
arch/alpha/kernel/proto.h
arch/alpha/kernel/rtc.c [new file with mode: 0644]
arch/alpha/kernel/setup.c
arch/alpha/kernel/smp.c
arch/alpha/kernel/sys_jensen.c
arch/alpha/kernel/sys_marvel.c
arch/alpha/kernel/time.c
arch/alpha/kernel/traps.c
arch/alpha/lib/csum_partial_copy.c
arch/alpha/lib/ev6-memset.S
arch/alpha/lib/memset.S
arch/arm/Kconfig
arch/arm/common/edma.c
arch/arm/include/asm/hardware/iop3xx-adma.h
arch/arm/include/asm/hardware/iop_adma.h
arch/arm/include/asm/memory.h
arch/arm/kernel/head.S
arch/arm/kernel/traps.c
arch/arm/kvm/mmu.c
arch/arm/lib/bitops.h
arch/arm/mach-iop13xx/include/mach/adma.h
arch/arm/mm/mmu.c
arch/arm/mm/nommu.c
arch/arm/mm/proc-v7.S
arch/avr32/boot/u-boot/head.S
arch/avr32/include/asm/kprobes.h
arch/avr32/include/uapi/asm/Kbuild
arch/avr32/include/uapi/asm/auxvec.h
arch/avr32/include/uapi/asm/bitsperlong.h [deleted file]
arch/avr32/include/uapi/asm/byteorder.h
arch/avr32/include/uapi/asm/cachectl.h
arch/avr32/include/uapi/asm/errno.h [deleted file]
arch/avr32/include/uapi/asm/fcntl.h [deleted file]
arch/avr32/include/uapi/asm/ioctl.h [deleted file]
arch/avr32/include/uapi/asm/ioctls.h [deleted file]
arch/avr32/include/uapi/asm/ipcbuf.h [deleted file]
arch/avr32/include/uapi/asm/kvm_para.h [deleted file]
arch/avr32/include/uapi/asm/mman.h [deleted file]
arch/avr32/include/uapi/asm/msgbuf.h
arch/avr32/include/uapi/asm/poll.h [deleted file]
arch/avr32/include/uapi/asm/posix_types.h
arch/avr32/include/uapi/asm/resource.h [deleted file]
arch/avr32/include/uapi/asm/sembuf.h
arch/avr32/include/uapi/asm/setup.h
arch/avr32/include/uapi/asm/shmbuf.h
arch/avr32/include/uapi/asm/sigcontext.h
arch/avr32/include/uapi/asm/siginfo.h [deleted file]
arch/avr32/include/uapi/asm/signal.h
arch/avr32/include/uapi/asm/socket.h
arch/avr32/include/uapi/asm/sockios.h
arch/avr32/include/uapi/asm/stat.h
arch/avr32/include/uapi/asm/statfs.h [deleted file]
arch/avr32/include/uapi/asm/swab.h
arch/avr32/include/uapi/asm/termbits.h
arch/avr32/include/uapi/asm/termios.h
arch/avr32/include/uapi/asm/types.h
arch/avr32/include/uapi/asm/unistd.h
arch/avr32/kernel/entry-avr32b.S
arch/avr32/kernel/head.S
arch/ia64/hp/common/sba_iommu.c
arch/ia64/include/asm/pci.h
arch/ia64/kernel/perfmon.c
arch/ia64/pci/pci.c
arch/ia64/sn/kernel/io_acpi_init.c
arch/parisc/include/asm/socket.h [new file with mode: 0644]
arch/parisc/include/asm/uaccess.h
arch/parisc/include/uapi/asm/socket.h
arch/parisc/lib/memcpy.c
arch/parisc/mm/fault.c
arch/powerpc/Makefile
arch/powerpc/boot/dts/fsl/b4si-post.dtsi
arch/powerpc/boot/dts/fsl/elo3-dma-0.dtsi [new file with mode: 0644]
arch/powerpc/boot/dts/fsl/elo3-dma-1.dtsi [new file with mode: 0644]
arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
arch/powerpc/configs/pseries_le_defconfig [new file with mode: 0644]
arch/powerpc/include/asm/elf.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/plpar_wrappers.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/thread_info.h
arch/powerpc/kernel/eeh.c
arch/powerpc/kernel/eeh_event.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/signal_32.c
arch/powerpc/kernel/signal_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/kernel/vdso64/sigtramp.S
arch/powerpc/kernel/vio.c
arch/powerpc/mm/gup.c
arch/powerpc/mm/slice.c
arch/powerpc/platforms/Kconfig.cputype
arch/powerpc/platforms/powernv/rng.c
arch/powerpc/platforms/pseries/eeh_pseries.c
arch/powerpc/platforms/pseries/lpar.c
arch/powerpc/platforms/pseries/rng.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/platforms/wsp/chroma.c
arch/powerpc/platforms/wsp/h8.c
arch/powerpc/platforms/wsp/ics.c
arch/powerpc/platforms/wsp/opb_pic.c
arch/powerpc/platforms/wsp/psr2.c
arch/powerpc/platforms/wsp/scom_wsp.c
arch/powerpc/platforms/wsp/wsp.c
arch/x86/include/asm/pci.h
arch/x86/include/uapi/asm/msr-index.h
arch/x86/kvm/mmu_audit.c
arch/x86/mm/pgtable.c
arch/x86/pci/acpi.c
block/blk-mq.c
block/partitions/efi.c
crypto/Kconfig
crypto/Makefile
crypto/asymmetric_keys/Kconfig
crypto/asymmetric_keys/asymmetric_type.c
crypto/asymmetric_keys/public_key.c
crypto/asymmetric_keys/public_key.h
crypto/asymmetric_keys/rsa.c
crypto/asymmetric_keys/x509_cert_parser.c
crypto/asymmetric_keys/x509_parser.h
crypto/asymmetric_keys/x509_public_key.c
crypto/async_tx/async_memcpy.c
crypto/async_tx/async_pq.c
crypto/async_tx/async_raid6_recov.c
crypto/async_tx/async_tx.c
crypto/async_tx/async_xor.c
crypto/async_tx/raid6test.c
crypto/hash_info.c [new file with mode: 0644]
drivers/acpi/Kconfig
drivers/acpi/ac.c
drivers/acpi/acpi_lpss.c
drivers/acpi/acpi_platform.c
drivers/acpi/blacklist.c
drivers/acpi/device_pm.c
drivers/acpi/ec.c
drivers/acpi/glue.c
drivers/acpi/pci_root.c
drivers/acpi/scan.c
drivers/acpi/video.c
drivers/ata/libata-acpi.c
drivers/ata/pata_arasan_cf.c
drivers/base/platform.c
drivers/base/power/main.c
drivers/block/null_blk.c
drivers/block/virtio_blk.c
drivers/char/tpm/Kconfig
drivers/char/tpm/Makefile
drivers/char/tpm/tpm-interface.c [moved from drivers/char/tpm/tpm.c with 93% similarity]
drivers/char/tpm/tpm.h
drivers/char/tpm/tpm_atmel.c
drivers/char/tpm/tpm_eventlog.c
drivers/char/tpm/tpm_i2c_atmel.c [new file with mode: 0644]
drivers/char/tpm/tpm_i2c_infineon.c
drivers/char/tpm/tpm_i2c_nuvoton.c [new file with mode: 0644]
drivers/char/tpm/tpm_i2c_stm_st33.c
drivers/char/tpm/tpm_ibmvtpm.c
drivers/char/tpm/tpm_ppi.c
drivers/char/tpm/tpm_tis.c
drivers/char/tpm/xen-tpmfront.c
drivers/cpufreq/cpufreq_conservative.c
drivers/cpufreq/cpufreq_governor.c
drivers/cpufreq/omap-cpufreq.c
drivers/dma/Kconfig
drivers/dma/amba-pl08x.c
drivers/dma/at_hdmac.c
drivers/dma/coh901318.c
drivers/dma/cppi41.c
drivers/dma/dma-jz4740.c
drivers/dma/dmaengine.c
drivers/dma/dmatest.c
drivers/dma/dw/core.c
drivers/dma/edma.c
drivers/dma/ep93xx_dma.c
drivers/dma/fsldma.c
drivers/dma/fsldma.h
drivers/dma/imx-dma.c
drivers/dma/imx-sdma.c
drivers/dma/intel_mid_dma.c
drivers/dma/ioat/dma.c
drivers/dma/ioat/dma.h
drivers/dma/ioat/dma_v2.c
drivers/dma/ioat/dma_v2.h
drivers/dma/ioat/dma_v3.c
drivers/dma/ioat/pci.c
drivers/dma/iop-adma.c
drivers/dma/ipu/ipu_idmac.c
drivers/dma/k3dma.c
drivers/dma/mmp_pdma.c
drivers/dma/mmp_tdma.c
drivers/dma/mv_xor.c
drivers/dma/mv_xor.h
drivers/dma/mxs-dma.c
drivers/dma/omap-dma.c
drivers/dma/pl330.c
drivers/dma/ppc4xx/adma.c
drivers/dma/sa11x0-dma.c
drivers/dma/sh/shdma-base.c
drivers/dma/sh/shdmac.c
drivers/dma/ste_dma40.c
drivers/dma/tegra20-apb-dma.c
drivers/dma/timb_dma.c
drivers/dma/txx9dmac.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/i915/intel_acpi.c
drivers/gpu/drm/i915/intel_opregion.c
drivers/gpu/drm/nouveau/core/subdev/mxm/base.c
drivers/gpu/drm/nouveau/nouveau_acpi.c
drivers/gpu/drm/radeon/radeon_acpi.c
drivers/gpu/drm/radeon/radeon_atpx_handler.c
drivers/gpu/drm/radeon/radeon_bios.c
drivers/hid/i2c-hid/i2c-hid.c
drivers/i2c/i2c-core.c
drivers/ide/ide-acpi.c
drivers/idle/intel_idle.c
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid1.h
drivers/md/raid10.c
drivers/md/raid5.c
drivers/md/raid5.h
drivers/media/platform/m2m-deinterlace.c
drivers/media/platform/timblogiw.c
drivers/misc/carma/carma-fpga.c
drivers/mmc/core/sdio_bus.c
drivers/mtd/nand/atmel_nand.c
drivers/mtd/nand/fsmc_nand.c
drivers/net/ethernet/micrel/ks8842.c
drivers/ntb/ntb_transport.c
drivers/pci/hotplug/acpi_pcihp.c
drivers/pci/hotplug/acpiphp.h
drivers/pci/hotplug/pciehp_acpi.c
drivers/pci/hotplug/sgi_hotplug.c
drivers/pci/ioapic.c
drivers/pci/pci-acpi.c
drivers/pci/pci-label.c
drivers/platform/x86/apple-gmux.c
drivers/pnp/pnpacpi/core.c
drivers/rtc/Kconfig
drivers/rtc/rtc-at91rm9200.c
drivers/spi/spi-dw-mid.c
drivers/spi/spi.c
drivers/tty/serial/sh-sci.c
drivers/usb/core/hub.c
drivers/usb/core/usb-acpi.c
drivers/xen/pci.c
fs/9p/vfs_dentry.c
fs/aio.c
fs/bio.c
fs/btrfs/Kconfig
fs/btrfs/async-thread.c
fs/btrfs/check-integrity.c
fs/btrfs/ctree.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/scrub.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/configfs/dir.c
fs/coredump.c
fs/dcache.c
fs/efivarfs/super.c
fs/exec.c
fs/gfs2/glock.c
fs/gfs2/inode.c
fs/gfs2/lock_dlm.c
fs/gfs2/quota.c
fs/gfs2/rgrp.c
fs/hostfs/hostfs_kern.c
fs/libfs.c
fs/namei.c
fs/nfsd/nfs4xdr.c
fs/nfsd/vfs.c
fs/proc/base.c
fs/proc/generic.c
fs/proc/namespaces.c
fs/squashfs/Kconfig
fs/squashfs/Makefile
fs/squashfs/block.c
fs/squashfs/cache.c
fs/squashfs/decompressor.c
fs/squashfs/decompressor.h
fs/squashfs/decompressor_multi.c [new file with mode: 0644]
fs/squashfs/decompressor_multi_percpu.c [new file with mode: 0644]
fs/squashfs/decompressor_single.c [new file with mode: 0644]
fs/squashfs/file.c
fs/squashfs/file_cache.c [new file with mode: 0644]
fs/squashfs/file_direct.c [new file with mode: 0644]
fs/squashfs/lzo_wrapper.c
fs/squashfs/page_actor.c [new file with mode: 0644]
fs/squashfs/page_actor.h [new file with mode: 0644]
fs/squashfs/squashfs.h
fs/squashfs/squashfs_fs_sb.h
fs/squashfs/super.c
fs/squashfs/xz_wrapper.c
fs/squashfs/zlib_wrapper.c
fs/xfs/xfs_bmap.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_trans_inode.c
fs/xfs/xfs_trans_resv.c
include/acpi/acpi_bus.h
include/crypto/hash_info.h [new file with mode: 0644]
include/crypto/public_key.h
include/keys/big_key-type.h [new file with mode: 0644]
include/keys/keyring-type.h
include/keys/system_keyring.h [new file with mode: 0644]
include/linux/acpi.h
include/linux/assoc_array.h [new file with mode: 0644]
include/linux/assoc_array_priv.h [new file with mode: 0644]
include/linux/audit.h
include/linux/blkdev.h
include/linux/device.h
include/linux/dmaengine.h
include/linux/fs.h
include/linux/hugetlb.h
include/linux/key-type.h
include/linux/key.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/pci-acpi.h
include/linux/platform_data/edma.h
include/linux/security.h
include/linux/seqlock.h
include/linux/slab.h
include/linux/slab_def.h
include/linux/slub_def.h
include/linux/user_namespace.h
include/linux/wait.h
include/trace/events/btrfs.h
include/uapi/linux/audit.h
include/uapi/linux/hash_info.h [new file with mode: 0644]
include/uapi/linux/keyctl.h
include/uapi/linux/raid/md_p.h
init/Kconfig
init/main.c
ipc/shm.c
kernel/Makefile
kernel/audit.c
kernel/audit.h
kernel/auditfilter.c
kernel/auditsc.c
kernel/cgroup.c
kernel/modsign_certificate.S [deleted file]
kernel/modsign_pubkey.c [deleted file]
kernel/module-internal.h
kernel/module_signing.c
kernel/power/snapshot.c
kernel/power/user.c
kernel/system_certificates.S [new file with mode: 0644]
kernel/system_keyring.c [new file with mode: 0644]
kernel/user.c
kernel/user_namespace.c
lib/Kconfig
lib/Makefile
lib/assoc_array.c [new file with mode: 0644]
lib/mpi/mpiutil.c
mm/hugetlb.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/slab.c
mm/slub.c
mm/swap.c
net/Kconfig
net/ipv4/tcp.c
net/sunrpc/rpc_pipe.c
scripts/asn1_compiler.c
scripts/checkpatch.pl
security/Makefile
security/apparmor/audit.c
security/apparmor/capability.c
security/apparmor/domain.c
security/apparmor/include/audit.h
security/apparmor/include/capability.h
security/apparmor/include/ipc.h
security/apparmor/ipc.c
security/apparmor/lsm.c
security/capability.c
security/integrity/digsig.c
security/integrity/digsig_asymmetric.c
security/integrity/evm/evm_main.c
security/integrity/evm/evm_posix_acl.c
security/integrity/iint.c
security/integrity/ima/Kconfig
security/integrity/ima/Makefile
security/integrity/ima/ima.h
security/integrity/ima/ima_api.c
security/integrity/ima/ima_appraise.c
security/integrity/ima/ima_crypto.c
security/integrity/ima/ima_fs.c
security/integrity/ima/ima_init.c
security/integrity/ima/ima_main.c
security/integrity/ima/ima_policy.c
security/integrity/ima/ima_queue.c
security/integrity/ima/ima_template.c [new file with mode: 0644]
security/integrity/ima/ima_template_lib.c [new file with mode: 0644]
security/integrity/ima/ima_template_lib.h [new file with mode: 0644]
security/integrity/integrity.h
security/keys/Kconfig
security/keys/Makefile
security/keys/big_key.c [new file with mode: 0644]
security/keys/compat.c
security/keys/gc.c
security/keys/internal.h
security/keys/key.c
security/keys/keyctl.c
security/keys/keyring.c
security/keys/persistent.c [new file with mode: 0644]
security/keys/proc.c
security/keys/process_keys.c
security/keys/request_key.c
security/keys/request_key_auth.c
security/keys/sysctl.c
security/keys/user_defined.c
security/lsm_audit.c
security/security.c
security/selinux/hooks.c
security/selinux/include/objsec.h
security/selinux/include/security.h
security/selinux/include/xfrm.h
security/selinux/netlabel.c
security/selinux/netnode.c
security/selinux/nlmsgtab.c
security/selinux/selinuxfs.c
security/selinux/ss/ebitmap.c
security/selinux/ss/ebitmap.h
security/selinux/ss/mls.c
security/selinux/ss/mls_types.h
security/selinux/ss/policydb.c
security/selinux/ss/services.c
security/selinux/xfrm.c
security/smack/smack.h
security/smack/smack_access.c
security/smack/smack_lsm.c
security/smack/smackfs.c
sound/soc/davinci/davinci-pcm.c
tools/power/x86/turbostat/turbostat.c
virt/kvm/kvm_main.c

diff --git a/Documentation/assoc_array.txt b/Documentation/assoc_array.txt
new file mode 100644 (file)
index 0000000..f4faec0
--- /dev/null
@@ -0,0 +1,574 @@
+                  ========================================
+                  GENERIC ASSOCIATIVE ARRAY IMPLEMENTATION
+                  ========================================
+
+Contents:
+
+ - Overview.
+
+ - The public API.
+   - Edit script.
+   - Operations table.
+   - Manipulation functions.
+   - Access functions.
+   - Index key form.
+
+ - Internal workings.
+   - Basic internal tree layout.
+   - Shortcuts.
+   - Splitting and collapsing nodes.
+   - Non-recursive iteration.
+   - Simultaneous alteration and iteration.
+
+
+========
+OVERVIEW
+========
+
+This associative array implementation is an object container with the following
+properties:
+
+ (1) Objects are opaque pointers.  The implementation does not care where they
+     point (if anywhere) or what they point to (if anything).
+
+     [!] NOTE: Pointers to objects _must_ be zero in the least significant bit.
+
+ (2) Objects do not need to contain linkage blocks for use by the array.  This
+     permits an object to be located in multiple arrays simultaneously.
+     Rather, the array is made up of metadata blocks that point to objects.
+
+ (3) Objects require index keys to locate them within the array.
+
+ (4) Index keys must be unique.  Inserting an object with the same key as one
+     already in the array will replace the old object.
+
+ (5) Index keys can be of any length and can be of different lengths.
+
+ (6) Index keys should encode the length early on, before any variation due to
+     length is seen.
+
+ (7) Index keys can include a hash to scatter objects throughout the array.
+
+ (8) The array can be iterated over.  The objects will not necessarily come
+     out in key order.
+
+ (9) The array can be iterated over whilst it is being modified, provided the
+     RCU readlock is being held by the iterator.  Note, however, under these
+     circumstances, some objects may be seen more than once.  If this is a
+     problem, the iterator should lock against modification.  Objects will not
+     be missed, however, unless deleted.
+
+(10) Objects in the array can be looked up by means of their index key.
+
+(11) Objects can be looked up whilst the array is being modified, provided the
+     RCU readlock is being held by the thread doing the look up.
+
+The implementation uses a tree of 16-pointer nodes internally that are indexed
+on each level by nibbles from the index key in the same manner as in a radix
+tree.  To improve memory efficiency, shortcuts can be emplaced to skip over
+what would otherwise be a series of single-occupancy nodes.  Further, nodes
+pack leaf object pointers into spare space in the node rather than making an
+extra branch until such time as an object needs to be added to a full node.
+
+
+==============
+THE PUBLIC API
+==============
+
+The public API can be found in <linux/assoc_array.h>.  The associative array is
+rooted on the following structure:
+
+       struct assoc_array {
+               ...
+       };
+
+The code is selected by enabling CONFIG_ASSOCIATIVE_ARRAY.
+
+
+EDIT SCRIPT
+-----------
+
+The insertion and deletion functions produce an 'edit script' that can later be
+applied to effect the changes without risking ENOMEM.  This retains the
+preallocated metadata blocks that will be installed in the internal tree and
+keeps track of the metadata blocks that will be removed from the tree when the
+script is applied.
+
+This is also used to keep track of dead blocks and dead objects after the
+script has been applied so that they can be freed later.  The freeing is done
+after an RCU grace period has passed - thus allowing access functions to
+proceed under the RCU read lock.
+
+Outside of the API, the script appears as a pointer of the type:
+
+       struct assoc_array_edit;
+
+There are two functions for dealing with the script:
+
+ (1) Apply an edit script.
+
+       void assoc_array_apply_edit(struct assoc_array_edit *edit);
+
+     This will perform the edit functions, interpolating various write barriers
+     to permit accesses under the RCU read lock to continue.  The edit script
+     will then be passed to call_rcu() to free it and any dead stuff it points
+     to.
+
+ (2) Cancel an edit script.
+
+       void assoc_array_cancel_edit(struct assoc_array_edit *edit);
+
+     This frees the edit script and all preallocated memory immediately.  If
+     this was for insertion, the new object is _not_ released by this function,
+     but must rather be released by the caller.
+
+These functions are guaranteed not to fail.
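+
+For example, a caller might prepare an insertion (see the manipulation
+functions below) and then either commit or abandon it.  This is a sketch, not
+canonical usage: it assumes the usual ERR_PTR() convention for the -ENOMEM
+case, and 'array', 'ops', 'index_key', 'new_obj' and 'still_wanted' are
+hypothetical:
+
+       struct assoc_array_edit *edit;
+
+       edit = assoc_array_insert(&array, &ops, &index_key, new_obj);
+       if (IS_ERR(edit))
+               return PTR_ERR(edit);           /* typically -ENOMEM */
+       if (!still_wanted) {
+               assoc_array_cancel_edit(edit);  /* frees the script only */
+               kfree(new_obj);                 /* caller must free the object */
+               return 0;
+       }
+       assoc_array_apply_edit(edit);           /* commits; cannot fail */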
+
+
+OPERATIONS TABLE
+----------------
+
+Various functions take a table of operations:
+
+       struct assoc_array_ops {
+               ...
+       };
+
+This points to a number of methods, all of which need to be provided (a
+filled-in sketch follows the list):
+
+ (1) Get a chunk of index key from caller data:
+
+       unsigned long (*get_key_chunk)(const void *index_key, int level);
+
+     This should return a chunk of caller-supplied index key starting at the
+     *bit* position given by the level argument.  The level argument will be a
+     multiple of ASSOC_ARRAY_KEY_CHUNK_SIZE and the function should return
+     ASSOC_ARRAY_KEY_CHUNK_SIZE bits.  No error is possible.
+
+
+ (2) Get a chunk of an object's index key.
+
+       unsigned long (*get_object_key_chunk)(const void *object, int level);
+
+     Like the previous function, but this gets its data from an object in the
+     array rather than from a caller-supplied index key.
+
+
+ (3) See if this is the object we're looking for.
+
+       bool (*compare_object)(const void *object, const void *index_key);
+
+     Compare the object against an index key and return true if it matches and
+     false if it doesn't.
+
+
+ (4) Diff the index keys of two objects.
+
+       int (*diff_objects)(const void *a, const void *b);
+
+     Return the bit position at which the index keys of two objects differ or
+     -1 if they are the same.
+
+
+ (5) Free an object.
+
+       void (*free_object)(void *object);
+
+     Free the specified object.  Note that this may be called an RCU grace
+     period after assoc_array_apply_edit() was called, so synchronize_rcu() may
+     be necessary on module unloading.
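+
+As an illustration, here is a minimal operations table for objects keyed by a
+single word.  This is a sketch under the assumption that the structure members
+carry the same names as the prototypes above; 'struct my_obj' and the my_*()
+helpers are hypothetical:
+
+       struct my_obj {
+               unsigned long   key;
+               /* payload */
+       };
+
+       static unsigned long my_get_key_chunk(const void *index_key, int level)
+       {
+               /* A one-word key has a single chunk; deeper levels are zero. */
+               return level == 0 ? *(const unsigned long *)index_key : 0;
+       }
+
+       static unsigned long my_get_object_key_chunk(const void *object,
+                                                    int level)
+       {
+               return level == 0 ? ((const struct my_obj *)object)->key : 0;
+       }
+
+       static bool my_compare_object(const void *object, const void *index_key)
+       {
+               return ((const struct my_obj *)object)->key ==
+                       *(const unsigned long *)index_key;
+       }
+
+       static int my_diff_objects(const void *a, const void *b)
+       {
+               unsigned long diff = ((const struct my_obj *)a)->key ^
+                                    ((const struct my_obj *)b)->key;
+
+               /* Bit position of the first difference, or -1 if identical. */
+               return diff ? __ffs(diff) : -1;
+       }
+
+       static void my_free_object(void *object)
+       {
+               kfree(object);
+       }
+
+       static const struct assoc_array_ops my_ops = {
+               .get_key_chunk          = my_get_key_chunk,
+               .get_object_key_chunk   = my_get_object_key_chunk,
+               .compare_object         = my_compare_object,
+               .diff_objects           = my_diff_objects,
+               .free_object            = my_free_object,
+       };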
+
+
+MANIPULATION FUNCTIONS
+----------------------
+
+There are a number of functions for manipulating an associative array:
+
+ (1) Initialise an associative array.
+
+       void assoc_array_init(struct assoc_array *array);
+
+     This initialises the base structure for an associative array.  It can't
+     fail.
+
+
+ (2) Insert/replace an object in an associative array.
+
+       struct assoc_array_edit *
+       assoc_array_insert(struct assoc_array *array,
+                          const struct assoc_array_ops *ops,
+                          const void *index_key,
+                          void *object);
+
+     This inserts the given object into the array.  Note that the least
+     significant bit of the pointer must be zero as it's used to type-mark
+     pointers internally.
+
+     If an object already exists for that key then it will be replaced with the
+     new object and the old one will be freed automatically.
+
+     The index_key argument should hold index key information and is
+     passed to the methods in the ops table when they are called.
+
+     This function makes no alteration to the array itself, but rather returns
+     an edit script that must be applied.  -ENOMEM is returned in the case of
+     an out-of-memory error.
+
+     The caller should lock exclusively against other modifiers of the array.
+
+
+ (3) Delete an object from an associative array.
+
+       struct assoc_array_edit *
+       assoc_array_delete(struct assoc_array *array,
+                          const struct assoc_array_ops *ops,
+                          const void *index_key);
+
+     This deletes an object that matches the specified data from the array.
+
+     The index_key argument should hold index key information and is
+     passed to the methods in the ops table when they are called.
+
+     This function makes no alteration to the array itself, but rather returns
+     an edit script that must be applied.  -ENOMEM is returned in the case of
+     an out-of-memory error.  NULL will be returned if the specified object is
+     not found within the array.
+
+     The caller should lock exclusively against other modifiers of the array.
+
+
+ (4) Delete all objects from an associative array.
+
+       struct assoc_array_edit *
+       assoc_array_clear(struct assoc_array *array,
+                         const struct assoc_array_ops *ops);
+
+     This deletes all the objects from an associative array and leaves it
+     completely empty.
+
+     This function makes no alteration to the array itself, but rather returns
+     an edit script that must be applied.  -ENOMEM is returned in the case of
+     an out-of-memory error.
+
+     The caller should lock exclusively against other modifiers of the array.
+
+
+ (5) Destroy an associative array, deleting all objects.
+
+       void assoc_array_destroy(struct assoc_array *array,
+                                const struct assoc_array_ops *ops);
+
+     This destroys the contents of the associative array and leaves it
+     completely empty.  It is not permitted for another thread to be traversing
+     the array under the RCU read lock at the same time as this function is
+     destroying it as no RCU deferral is performed on memory release -
+     something that would require memory to be allocated.
+
+     The caller should lock exclusively against other modifiers and accessors
+     of the array.
+
+
+ (6) Garbage collect an associative array.
+
+       int assoc_array_gc(struct assoc_array *array,
+                          const struct assoc_array_ops *ops,
+                          bool (*iterator)(void *object, void *iterator_data),
+                          void *iterator_data);
+
+     This iterates over the objects in an associative array and passes each one
+     to iterator().  If iterator() returns true, the object is kept; before
+     returning true, it must perform any appropriate refcount incrementing on
+     the object.  If iterator() returns false, the object will be freed.
+
+     The internal tree will be packed down if possible as part of the iteration
+     to reduce the number of nodes in it.
+
+     The iterator_data is passed directly to iterator() and is otherwise
+     ignored by the function.
+
+     The function will return 0 if successful and -ENOMEM if there wasn't
+     enough memory.
+
+     It is possible for other threads to iterate over or search the array under
+     the RCU read lock whilst this function is in progress.  The caller should
+     lock exclusively against other modifiers of the array.
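+
+     For example (a sketch; 'my_obj' is assumed to carry a jiffies-based
+     'expiry' field, and 'my_ops' is the hypothetical table above), a pass
+     that discards expired objects:
+
+       static bool keep_unexpired(void *object, void *iterator_data)
+       {
+               struct my_obj *obj = object;
+
+               if (time_after(jiffies, obj->expiry))
+                       return false;   /* freed via the free_object() method */
+               /* Keep: take any reference the object type requires. */
+               return true;
+       }
+
+       int ret = assoc_array_gc(&array, &my_ops, keep_unexpired, NULL);
+       /* ret is 0 on success or -ENOMEM on failure */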
+
+
+ACCESS FUNCTIONS
+----------------
+
+There are two functions for accessing an associative array:
+
+ (1) Iterate over all the objects in an associative array.
+
+       int assoc_array_iterate(const struct assoc_array *array,
+                               int (*iterator)(const void *object,
+                                               void *iterator_data),
+                               void *iterator_data);
+
+     This passes each object in the array to the iterator callback function.
+     iterator_data is private data for that function.  (A sketch of such a
+     callback follows this list.)
+
+     This may be used on an array at the same time as the array is being
+     modified, provided the RCU read lock is held.  Under such circumstances,
+     it is possible for the iteration function to see some objects twice.  If
+     this is a problem, then modification should be locked against.  The
+     iteration algorithm should not, however, miss any objects.
+
+     The function will return 0 if no objects were in the array or else it will
+     return the result of the last iterator function called.  Iteration stops
+     immediately if any call to the iteration function results in a non-zero
+     return.
+
+
+ (2) Find an object in an associative array.
+
+       void *assoc_array_find(const struct assoc_array *array,
+                              const struct assoc_array_ops *ops,
+                              const void *index_key);
+
+     This walks through the array's internal tree directly to the object
+     specified by the index key.
+
+     This may be used on an array at the same time as the array is being
+     modified, provided the RCU read lock is held.
+
+     The function will return the object if found or NULL if the object was
+     not found.
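+
+     A combined sketch of both access functions (reusing the hypothetical
+     'my_ops' and 'struct my_obj' from above; the key value is arbitrary):
+
+       static int count_object(const void *object, void *iterator_data)
+       {
+               (*(size_t *)iterator_data)++;
+               return 0;       /* a non-zero return would stop the iteration */
+       }
+
+       size_t count = 0;
+       unsigned long key = 0x1138;
+       struct my_obj *obj;
+
+       rcu_read_lock();
+       assoc_array_iterate(&array, count_object, &count);
+       obj = assoc_array_find(&array, &my_ops, &key);
+       /* 'obj' may only be dereferenced while the read lock is held */
+       rcu_read_unlock();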
+
+
+INDEX KEY FORM
+--------------
+
+The index key can be of any form, but since the algorithms aren't told how long
+the key is, it is strongly recommended that the index key includes its length
+very early on before any variation due to the length would have an effect on
+comparisons.
+
+This will cause leaves with different length keys to scatter away from each
+other - and those with the same length keys to cluster together.
+
+It is also recommended that the index key begin with a hash of the rest of the
+key to maximise scattering throughout keyspace.
+
+The better the scattering, the wider and lower the internal tree will be.
+
+Poor scattering isn't too much of a problem as there are shortcuts and nodes
+can contain mixtures of leaves and metadata pointers.
+
+The index key is read in chunks of one machine word.  Each chunk is subdivided
+into one nibble (4 bits) per level, so on a 32-bit CPU this is good for 8
+levels and on a 64-bit CPU, 16 levels.  Unless the scattering is really poor,
+it is unlikely that more than one word of any particular index key will have
+to be used.
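+
+For example, a caller with variable-length keys might form the first word of
+its index key like this (a sketch - the hash function and the exact field
+split are the caller's choice):
+
+       /* level-0 chunk: hash in the high bits, length in the low byte */
+       chunk0 = (hash_of_key << 8) | (key_length & 0xff);
+
+With the length in the low byte, two keys of different lengths differ in the
+very first chunk and scatter apart immediately.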
+
+
+=================
+INTERNAL WORKINGS
+=================
+
+The associative array data structure has an internal tree.  This tree is
+constructed of two types of metadata blocks: nodes and shortcuts.
+
+A node is an array of slots.  Each slot can contain one of four things:
+
+ (*) A NULL pointer, indicating that the slot is empty.
+
+ (*) A pointer to an object (a leaf).
+
+ (*) A pointer to a node at the next level.
+
+ (*) A pointer to a shortcut.
+
+
+BASIC INTERNAL TREE LAYOUT
+--------------------------
+
+Ignoring shortcuts for the moment, the nodes form a multilevel tree.  The index
+key space is strictly subdivided by the nodes in the tree and nodes occur on
+fixed levels.  For example:
+
+ Level:        0               1               2               3
+       =============== =============== =============== ===============
+                                                       NODE D
+                       NODE B          NODE C  +------>+---+
+               +------>+---+   +------>+---+   |       | 0 |
+       NODE A  |       | 0 |   |       | 0 |   |       +---+
+       +---+   |       +---+   |       +---+   |       :   :
+       | 0 |   |       :   :   |       :   :   |       +---+
+       +---+   |       +---+   |       +---+   |       | f |
+       | 1 |---+       | 3 |---+       | 7 |---+       +---+
+       +---+           +---+           +---+
+       :   :           :   :           | 8 |---+
+       +---+           +---+           +---+   |       NODE E
+       | e |---+       | f |           :   :   +------>+---+
+       +---+   |       +---+           +---+           | 0 |
+       | f |   |                       | f |           +---+
+       +---+   |                       +---+           :   :
+               |       NODE F                          +---+
+               +------>+---+                           | f |
+                       | 0 |           NODE G          +---+
+                       +---+   +------>+---+
+                       :   :   |       | 0 |
+                       +---+   |       +---+
+                       | 6 |---+       :   :
+                       +---+           +---+
+                       :   :           | f |
+                       +---+           +---+
+                       | f |
+                       +---+
+
+In the above example, there are 7 nodes (A-G), each with 16 slots (0-f).
+Assuming no other metadata nodes in the tree, the key space is divided as
+follows:
+
+       KEY PREFIX      NODE
+       ==========      ====
+       137*            D
+       138*            E
+       13[0-69-f]*     C
+       1[0-24-f]*      B
+       e6*             G
+       e[0-57-f]*      F
+       [02-df]*        A
+
+So, for instance, keys with the following example index keys will be found in
+the appropriate nodes:
+
+       INDEX KEY       PREFIX  NODE
+       =============== ======= ====
+       13694892892489  13      C
+       13795289025897  137     D
+       13889dde88793   138     E
+       138bbb89003093  138     E
+       1394879524789   13      C
+       1458952489      1       B
+       9431809de993ba  -       A
+       b4542910809cd   -       A
+       e5284310def98   e       F
+       e68428974237    e6      G
+       e7fffcbd443     e       F
+       f3842239082     -       A
+
+To save memory, if a node can hold all the leaves in its portion of keyspace,
+then the node will have all those leaves in it and will not have any metadata
+pointers - even if some of those leaves would like to be in the same slot.
+
+A node can contain a heterogeneous mix of leaves and metadata pointers.
+Metadata pointers must be in the slots that match their subdivisions of key
+space.  The leaves can be in any slot not occupied by a metadata pointer.  It
+is guaranteed that none of the leaves in a node will match a slot occupied by a
+metadata pointer.  If the metadata pointer is there, any leaf whose key matches
+the metadata key prefix must be in the subtree that the metadata pointer points
+to.
+
+In the above example list of index keys, node A will contain:
+
+       SLOT    CONTENT         INDEX KEY (PREFIX)
+       ====    =============== ==================
+       1       PTR TO NODE B   1*
+       any     LEAF            9431809de993ba
+       any     LEAF            b4542910809cd
+       e       PTR TO NODE F   e*
+       any     LEAF            f3842239082
+
+and node B:
+
+       3       PTR TO NODE C   13*
+       any     LEAF            1458952489
+
+
+SHORTCUTS
+---------
+
+Shortcuts are metadata records that jump over a piece of keyspace.  A shortcut
+is a replacement for a series of single-occupancy nodes ascending through the
+levels.  Shortcuts exist to save memory and to speed up traversal.
+
+It is possible for the root of the tree to be a shortcut - say, for example,
+the tree contains at least 17 leaves all with key prefix '1111'.  The insertion
+algorithm will insert a shortcut to skip over the '1111' keyspace in a single
+bound and get to the fourth level where these actually become different.
+
+
+SPLITTING AND COLLAPSING NODES
+------------------------------
+
+Each node has a maximum capacity of 16 leaves and metadata pointers.  If the
+insertion algorithm finds that it is trying to insert a 17th object into a
+node, that node will be split such that at least two leaves that have a common
+key segment at that level end up in a separate node rooted on that slot for
+that common key segment.
+
+If the leaves in a full node and the leaf that is being inserted are
+sufficiently similar, then a shortcut will be inserted into the tree.
+
+When the number of objects in the subtree rooted at a node falls to 16 or
+fewer, then the subtree will be collapsed down to a single node - and this will
+ripple towards the root if possible.
+
+
+NON-RECURSIVE ITERATION
+-----------------------
+
+Each node and shortcut contains a back pointer to its parent and the number of
+the slot in that parent that points to it.  Non-recursive iteration uses these
+to proceed rootwards through the tree, moving to slot N + 1 in the parent node
+to make sure progress is made without the need for a stack.
+
+The backpointers, however, make simultaneous alteration and iteration tricky.
+
+
+SIMULTANEOUS ALTERATION AND ITERATION
+-------------------------------------
+
+There are a number of cases to consider:
+
+ (1) Simple insert/replace.  This involves simply replacing a NULL or old
+     matching leaf pointer with the pointer to the new leaf after a barrier.
+     The metadata blocks don't change otherwise.  An old leaf won't be freed
+     until after the RCU grace period.
+
+ (2) Simple delete.  This involves just clearing an old matching leaf.  The
+     metadata blocks don't change otherwise.  The old leaf won't be freed until
+     after the RCU grace period.
+
+ (3) Insertion replacing part of a subtree that we haven't yet entered.  This
+     may involve replacement of part of that subtree - but that won't affect
+     the iteration as we won't have reached the pointer to it yet and the
+     ancestry blocks are not replaced (the layout of those does not change).
+
+ (4) Insertion replacing nodes that we're actively processing.  This isn't a
+     problem as we've passed the anchoring pointer and won't switch onto the
+     new layout until we follow the back pointers - at which point we've
+     already examined the leaves in the replaced node (we iterate over all the
+     leaves in a node before following any of its metadata pointers).
+
+     We might, however, re-see some leaves that have been split out into a new
+     branch that's in a slot further along than we were at.
+
+ (5) Insertion replacing nodes that we're processing a dependent branch of.
+     This won't affect us until we follow the back pointers.  Similar to (4).
+
+ (6) Deletion collapsing a branch under us.  This doesn't affect us because the
+     back pointers will get us back to the parent of the new node before we
+     could see the new node.  The entire collapsed subtree is thrown away
+     unchanged - and will still be rooted on the same slot, so we shouldn't
+     process it a second time as we'll go back to slot + 1.
+
+Note:
+
+ (*) Under some circumstances, we need to simultaneously change the parent
+     pointer and the parent slot pointer on a node (say, for example, we
+     inserted another node before it and moved it up a level).  We cannot do
+     this without locking against a read - so we have to replace that node too.
+
+     However, when we're changing a shortcut into a node this isn't a problem
+     as shortcuts only have one slot and so the parent slot number isn't used
+     when traversing backwards over one.  This means that it's okay to change
+     the slot number first - provided suitable barriers are used to make sure
+     the parent slot number is read after the back pointer.
+
+Obsolete blocks and leaves are freed up after an RCU grace period has passed,
+so as long as anyone walking or iterating over the tree holds the RCU read
+lock, the old superstructure should not go away on them.
diff --git a/Documentation/devicetree/bindings/dma/atmel-dma.txt b/Documentation/devicetree/bindings/dma/atmel-dma.txt
index e1f343c..f69bcf5 100644 (file)
@@ -28,7 +28,7 @@ The three cells in order are:
 dependent:
   - bit 7-0: peripheral identifier for the hardware handshaking interface. The
   identifier can be different for tx and rx.
-  - bit 11-8: FIFO configuration. 0 for half FIFO, 1 for ALAP, 1 for ASAP.
+  - bit 11-8: FIFO configuration. 0 for half FIFO, 1 for ALAP, 2 for ASAP.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/i2c/trivial-devices.txt b/Documentation/devicetree/bindings/i2c/trivial-devices.txt
index ad6a738..f1fb26e 100644 (file)
@@ -15,6 +15,7 @@ adi,adt7461           +/-1C TDM Extended Temp Range I.C
 adt7461                        +/-1C TDM Extended Temp Range I.C
 at,24c08               i2c serial eeprom  (24cxx)
 atmel,24c02            i2c serial eeprom  (24cxx)
+atmel,at97sc3204t      i2c trusted platform module (TPM)
 catalyst,24c32         i2c serial eeprom
 dallas,ds1307          64 x 8, Serial, I2C Real-Time Clock
 dallas,ds1338          I2C RTC with 56-Byte NV RAM
@@ -44,6 +45,7 @@ mc,rv3029c2           Real Time Clock Module with I2C-Bus
 national,lm75          I2C TEMP SENSOR
 national,lm80          Serial Interface ACPI-Compatible Microprocessor System Hardware Monitor
 national,lm92          ±0.33°C Accurate, 12-Bit + Sign Temperature Sensor and Thermal Window Comparator with Two-Wire Interface
+nuvoton,npct501                i2c trusted platform module (TPM)
 nxp,pca9556            Octal SMBus and I2C registered interface
 nxp,pca9557            8-bit I2C-bus and SMBus I/O port with reset
 nxp,pcf8563            Real-time clock/calendar
@@ -61,3 +63,4 @@ taos,tsl2550          Ambient Light Sensor with SMBUS/Two Wire Serial Interface
 ti,tsc2003             I2C Touch-Screen Controller
 ti,tmp102              Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
 ti,tmp275              Digital Temperature Sensor
+winbond,wpct301                i2c trusted platform module (TPM)
diff --git a/Documentation/devicetree/bindings/powerpc/fsl/dma.txt b/Documentation/devicetree/bindings/powerpc/fsl/dma.txt
index 2a4b4bc..7fc1b01 100644 (file)
@@ -1,33 +1,30 @@
-* Freescale 83xx DMA Controller
+* Freescale DMA Controllers
 
-Freescale PowerPC 83xx have on chip general purpose DMA controllers.
+** Freescale Elo DMA Controller
+   This is a little-endian 4-channel DMA controller, used in Freescale mpc83xx
+   series chips such as mpc8315, mpc8349, mpc8379 etc.
 
 Required properties:
 
-- compatible        : compatible list, contains 2 entries, first is
-                "fsl,CHIP-dma", where CHIP is the processor
-                (mpc8349, mpc8360, etc.) and the second is
-                "fsl,elo-dma"
-- reg               : <registers mapping for DMA general status reg>
-- ranges               : Should be defined as specified in 1) to describe the
-                 DMA controller channels.
+- compatible        : must include "fsl,elo-dma"
+- reg               : DMA General Status Register, i.e. DGSR which contains
+                      status for all the 4 DMA channels
+- ranges            : describes the mapping between the address space of the
+                      DMA channels and the address space of the DMA controller
 - cell-index        : controller index.  0 for controller @ 0x8100
-- interrupts        : <interrupt mapping for DMA IRQ>
+- interrupts        : interrupt specifier for DMA IRQ
 - interrupt-parent  : optional, if needed for interrupt mapping
 
-
 - DMA channel nodes:
-        - compatible        : compatible list, contains 2 entries, first is
-                        "fsl,CHIP-dma-channel", where CHIP is the processor
-                        (mpc8349, mpc8350, etc.) and the second is
-                        "fsl,elo-dma-channel". However, see note below.
-        - reg               : <registers mapping for channel>
-        - cell-index        : dma channel index starts at 0.
+        - compatible        : must include "fsl,elo-dma-channel"
+                              However, see note below.
+        - reg               : DMA channel specific registers
+        - cell-index        : DMA channel index starts at 0.
 
 Optional properties:
-        - interrupts        : <interrupt mapping for DMA channel IRQ>
-                         (on 83xx this is expected to be identical to
-                          the interrupts property of the parent node)
+        - interrupts        : interrupt specifier for DMA channel IRQ
+                              (on 83xx this is expected to be identical to
+                              the interrupts property of the parent node)
         - interrupt-parent  : optional, if needed for interrupt mapping
 
 Example:
@@ -70,30 +67,27 @@ Example:
                };
        };
 
-* Freescale 85xx/86xx DMA Controller
-
-Freescale PowerPC 85xx/86xx have on chip general purpose DMA controllers.
+** Freescale EloPlus DMA Controller
+   This is a 4-channel DMA controller with extended addresses and chaining,
+   mainly used in Freescale mpc85xx/86xx, Pxxx and BSC series chips, such as
+   mpc8540, mpc8641, p4080, bsc9131, etc.
 
 Required properties:
 
-- compatible        : compatible list, contains 2 entries, first is
-                "fsl,CHIP-dma", where CHIP is the processor
-                (mpc8540, mpc8540, etc.) and the second is
-                "fsl,eloplus-dma"
-- reg               : <registers mapping for DMA general status reg>
+- compatible        : must include "fsl,eloplus-dma"
+- reg               : DMA General Status Register, i.e. DGSR which contains
+                      status for all the 4 DMA channels
 - cell-index        : controller index.  0 for controller @ 0x21000,
                                          1 for controller @ 0xc000
-- ranges               : Should be defined as specified in 1) to describe the
-                 DMA controller channels.
+- ranges            : describes the mapping between the address space of the
+                      DMA channels and the address space of the DMA controller
 
 - DMA channel nodes:
-        - compatible        : compatible list, contains 2 entries, first is
-                        "fsl,CHIP-dma-channel", where CHIP is the processor
-                        (mpc8540, mpc8560, etc.) and the second is
-                        "fsl,eloplus-dma-channel". However, see note below.
-        - cell-index        : dma channel index starts at 0.
-        - reg               : <registers mapping for channel>
-        - interrupts        : <interrupt mapping for DMA channel IRQ>
+        - compatible        : must include "fsl,eloplus-dma-channel"
+                              However, see note below.
+        - cell-index        : DMA channel index starts at 0.
+        - reg               : DMA channel specific registers
+        - interrupts        : interrupt specifier for DMA channel IRQ
         - interrupt-parent  : optional, if needed for interrupt mapping
 
 Example:
@@ -134,6 +128,76 @@ Example:
                };
        };
 
+** Freescale Elo3 DMA Controller
+   This DMA controller has the same function as EloPlus except that Elo3 has
+   8 channels while EloPlus has only 4.  It is used in Freescale Txxx and
+   Bxxx series chips, such as t1040, t4240, b4860.
+
+Required properties:
+
+- compatible        : must include "fsl,elo3-dma"
+- reg               : contains two entries for DMA General Status Registers,
+                      i.e. DGSR0 which includes status for channel 1~4, and
+                      DGSR1 for channel 5~8
+- ranges            : describes the mapping between the address space of the
+                      DMA channels and the address space of the DMA controller
+
+- DMA channel nodes:
+        - compatible        : must include "fsl,eloplus-dma-channel"
+        - reg               : DMA channel specific registers
+        - interrupts        : interrupt specifier for DMA channel IRQ
+        - interrupt-parent  : optional, if needed for interrupt mapping
+
+Example:
+dma@100300 {
+       #address-cells = <1>;
+       #size-cells = <1>;
+       compatible = "fsl,elo3-dma";
+       reg = <0x100300 0x4>,
+             <0x100600 0x4>;
+       ranges = <0x0 0x100100 0x500>;
+       dma-channel@0 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x0 0x80>;
+               interrupts = <28 2 0 0>;
+       };
+       dma-channel@80 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x80 0x80>;
+               interrupts = <29 2 0 0>;
+       };
+       dma-channel@100 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x100 0x80>;
+               interrupts = <30 2 0 0>;
+       };
+       dma-channel@180 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x180 0x80>;
+               interrupts = <31 2 0 0>;
+       };
+       dma-channel@300 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x300 0x80>;
+               interrupts = <76 2 0 0>;
+       };
+       dma-channel@380 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x380 0x80>;
+               interrupts = <77 2 0 0>;
+       };
+       dma-channel@400 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x400 0x80>;
+               interrupts = <78 2 0 0>;
+       };
+       dma-channel@480 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x480 0x80>;
+               interrupts = <79 2 0 0>;
+       };
+};
+
 Note on DMA channel compatible properties: The compatible property must say
 "fsl,elo-dma-channel" or "fsl,eloplus-dma-channel" to be used by the Elo DMA
 driver (fsldma).  Any DMA channel used by fsldma cannot be used by another
diff --git a/Documentation/dmatest.txt b/Documentation/dmatest.txt
index a2b5663..dd77a81 100644 (file)
@@ -15,39 +15,48 @@ be built as module or inside kernel. Let's consider those cases.
 
        Part 2 - When dmatest is built as a module...
 
-After mounting debugfs and loading the module, the /sys/kernel/debug/dmatest
-folder with nodes will be created. There are two important files located. First
-is the 'run' node that controls run and stop phases of the test, and the second
-one, 'results', is used to get the test case results.
-
-Note that in this case test will not run on load automatically.
-
 Example of usage:
+       % modprobe dmatest channel=dma0chan0 timeout=2000 iterations=1 run=1
+
+...or:
+       % modprobe dmatest
        % echo dma0chan0 > /sys/module/dmatest/parameters/channel
        % echo 2000 > /sys/module/dmatest/parameters/timeout
        % echo 1 > /sys/module/dmatest/parameters/iterations
-       % echo 1 > /sys/kernel/debug/dmatest/run
+       % echo 1 > /sys/module/dmatest/parameters/run
+
+...or on the kernel command line:
+
+       dmatest.channel=dma0chan0 dmatest.timeout=2000 dmatest.iterations=1 dmatest.run=1
 
 Hint: available channel list could be extracted by running the following
 command:
        % ls -1 /sys/class/dma/
 
-After a while you will start to get messages about current status or error like
-in the original code.
+Once started, a message like "dmatest: Started 1 threads using dma0chan0" is
+emitted.  After that only test failure messages are reported until the test
+stops.
 
 Note that running a new test will not stop any in progress test.
 
-The following command should return actual state of the test.
-       % cat /sys/kernel/debug/dmatest/run
-
-To wait for test done the user may perform a busy loop that checks the state.
-
-       % while [ $(cat /sys/kernel/debug/dmatest/run) = "Y" ]
-       > do
-       >       echo -n "."
-       >       sleep 1
-       > done
-       > echo
+The following command returns the state of the test.
+       % cat /sys/module/dmatest/parameters/run
+
+To wait for test completion, userspace can poll 'run' until it is false, or use
+the wait parameter.  Specifying 'wait=1' when loading the module causes module
+initialization to pause until a test run has completed, while reading
+/sys/module/dmatest/parameters/wait waits for any running test to complete
+before returning.  For example, the following scripts wait for 42 tests
+to complete before exiting.  Note that if 'iterations' is set to 'infinite' then
+waiting is disabled.
+
+Example:
+       % modprobe dmatest run=1 iterations=42 wait=1
+       % modprobe -r dmatest
+...or:
+       % modprobe dmatest run=1 iterations=42
+       % cat /sys/module/dmatest/parameters/wait
+       % modprobe -r dmatest
 
        Part 3 - When built-in in the kernel...
 
@@ -62,21 +71,22 @@ case. You always could check them at run-time by running
 
        Part 4 - Gathering the test results
 
-The module provides a storage for the test results in the memory. The gathered
-data could be used after test is done.
+Test results are printed to the kernel log buffer with the format:
 
-The special file 'results' in the debugfs represents gathered data of the in
-progress test. The messages collected are printed to the kernel log as well.
+"dmatest: result <channel>: <test id>: '<error msg>' with src_off=<val> dst_off=<val> len=<val> (<err code>)"
 
 Example of output:
-       % cat /sys/kernel/debug/dmatest/results
-       dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0)
+       % dmesg | tail -n 1
+       dmatest: result dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0)
 
 The message format is unified across the different types of errors. A number in
 the parens represents additional information, e.g. error code, error counter,
-or status.
+or status.  A test thread also emits a summary line at completion listing the
+number of tests executed, number that failed, and a result code.
 
-Comparison between buffers is stored to the dedicated structure.
+Example:
+       % dmesg | tail -n 1
+       dmatest: dma0chan0-copy0: summary 1 test, 0 failures 1000 iops 100000 KB/s (0)
 
-Note that the verify result is now accessible only via file 'results' in the
-debugfs.
+The details of a data miscompare error are also emitted, but do not follow the
+above format.
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
index 9dae594..5dd282d 100644 (file)
@@ -70,6 +70,12 @@ Unless otherwise specified, all options default to off.
 
        See comments at the top of fs/btrfs/check-integrity.c for more info.
 
+  commit=<seconds>
+       Set the interval of periodic commit, 30 seconds by default. Higher
+       values defer data being synced to permanent storage with obvious
+       consequences when the system crashes. The upper bound is not forced,
+       but a warning is printed if it's more than 300 seconds (5 minutes).
+
   compress
   compress=<type>
   compress-force
@@ -154,7 +160,11 @@ Unless otherwise specified, all options default to off.
        Currently this scans a list of several previous tree roots and tries to 
        use the first readable.
 
- skip_balance
+  rescan_uuid_tree
+       Force check and rebuild procedure of the UUID tree. This should not
+       normally be needed.
+
+  skip_balance
        Skip automatic resume of interrupted balance operation after mount.
        May be resumed with "btrfs balance resume."
 
@@ -234,24 +244,14 @@ available from the git repository at the following location:
 
 These include the following tools:
 
-mkfs.btrfs: create a filesystem
-
-btrfsctl: control program to create snapshots and subvolumes:
+* mkfs.btrfs: create a filesystem
 
-       mount /dev/sda2 /mnt
-       btrfsctl -s new_subvol_name /mnt
-       btrfsctl -s snapshot_of_default /mnt/default
-       btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
-       btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
-       ls /mnt
-       default snapshot_of_a_snapshot snapshot_of_new_subvol
-       new_subvol_name snapshot_of_default
+* btrfs: a single tool to manage the filesystems, refer to the manpage for more details
 
-       Snapshots and subvolumes cannot be deleted right now, but you can
-       rm -rf all the files and directories inside them.
+* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem
 
-btrfsck: do a limited check of the FS extent trees.
+Other tools for specific tasks:
 
-btrfs-debug-tree: print all of the FS metadata in text form.  Example:
+* btrfs-convert: in-place conversion from ext2/3/4 filesystems
 
-       btrfs-debug-tree /dev/sda2 >& big_output_file
+* btrfs-image: dump filesystem metadata for debugging
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9ca3e74..50680a5 100644 (file)
@@ -1190,15 +1190,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        owned by uid=0.
 
        ima_hash=       [IMA]
-                       Format: { "sha1" | "md5" }
+                       Format: { md5 | sha1 | rmd160 | sha256 | sha384
+                                  | sha512 | ... }
                        default: "sha1"
 
+                       The list of supported hash algorithms is defined
+                       in crypto/hash_info.h.
+
        ima_tcb         [IMA]
                        Load a policy which meets the needs of the Trusted
                        Computing Base.  This means IMA will measure all
                        programs exec'd, files mmap'd for exec, and all files
                        opened for read by uid=0.
 
+       ima_template=   [IMA]
+                       Select one of defined IMA measurements template formats.
+                       Formats: { "ima" | "ima-ng" }
+                       Default: "ima-ng"
+
        init=           [KNL]
                        Format: <full_path>
                        Run specified binary instead of /sbin/init as init
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 0f54333..b6ce00b 100644 (file)
@@ -547,13 +547,11 @@ helper functions described in Section 4.  In that case, pm_runtime_resume()
 should be used.  Of course, for this purpose the device's runtime PM has to be
 enabled earlier by calling pm_runtime_enable().
 
-If the device bus type's or driver's ->probe() callback runs
-pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts,
-they will fail returning -EAGAIN, because the device's usage counter is
-incremented by the driver core before executing ->probe().  Still, it may be
-desirable to suspend the device as soon as ->probe() has finished, so the driver
-core uses pm_runtime_put_sync() to invoke the subsystem-level idle callback for
-the device at that time.
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time.  A driver that makes use of the runtime autosuspend feature may want to
+update the last busy mark before returning from ->probe().
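+
+As a rough sketch (assuming a hypothetical foo driver and the standard
+helpers from include/linux/pm_runtime.h), a probe routine using autosuspend
+might end like this:
+
+  static int foo_probe(struct platform_device *pdev)
+  {
+          /* ... hardware setup ... */
+
+          pm_runtime_set_autosuspend_delay(&pdev->dev, 1000);
+          pm_runtime_use_autosuspend(&pdev->dev);
+          pm_runtime_set_active(&pdev->dev);
+          pm_runtime_enable(&pdev->dev);
+
+          /* Update the last busy mark so that the autosuspend delay is
+             measured from the end of ->probe(), not from device init. */
+          pm_runtime_mark_last_busy(&pdev->dev);
+          return 0;
+  }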
 
 Moreover, the driver core prevents runtime PM callbacks from racing with the bus
 notifier callback in __device_release_driver(), which is necessary, because the
@@ -656,7 +654,7 @@ out the following operations:
     __pm_runtime_disable() with 'false' as the second argument for every device
     right before executing the subsystem-level .suspend_late() callback for it.
 
-  * During system resume it calls pm_runtime_enable() and pm_runtime_put_sync()
+  * During system resume it calls pm_runtime_enable() and pm_runtime_put()
     for every device right after executing the subsystem-level .resume_early()
     callback and right after executing the subsystem-level .resume() callback
     for it, respectively.
index 414235c..45c82fd 100644 (file)
@@ -22,3 +22,5 @@ keys.txt
        - description of the kernel key retention service.
 tomoyo.txt
        - documentation on the TOMOYO Linux Security Module.
+IMA-templates.txt
+       - documentation on the template management mechanism for IMA.
diff --git a/Documentation/security/IMA-templates.txt b/Documentation/security/IMA-templates.txt
new file mode 100644 (file)
index 0000000..a777e5f
--- /dev/null
@@ -0,0 +1,87 @@
+                       IMA Template Management Mechanism
+
+
+==== INTRODUCTION ====
+
+The original 'ima' template is fixed length, containing the filedata hash
+and pathname. The filedata hash is limited to 20 bytes (md5/sha1).
+The pathname is a null-terminated string, limited to 255 characters.
+To overcome these limitations and to add additional file metadata, it is
+necessary to extend the current version of IMA by defining additional
+templates. For example, information that could be reported includes
+the inode UID/GID or the LSM labels of either the inode or the process
+that is accessing it.
+
+However, the main problem with introducing this feature is that, each
+time a new template is defined, the functions that generate and display
+the measurement list would have to include code for handling the new
+format and, thus, would grow significantly over time.
+
+The proposed solution solves this problem by separating the template
+management from the remaining IMA code. The core of this solution is the
+definition of two new data structures: a template descriptor, to determine
+which information should be included in the measurement list, and a
+template field, to generate and display data of a given type.
+
+Managing templates with these structures is very simple. To support
+a new data type, developers define the field identifier and implement
+two functions, init() and show(), to generate and display measurement
+entries, respectively. Defining a new template descriptor requires
+specifying the template format, a string of field identifiers separated
+by the '|' character. While in the current implementation it is possible
+to define new template descriptors only by adding their definition to the
+template-specific code (ima_template.c), in a future version it will be
+possible to register a new template on a running kernel by supplying the
+desired format string to IMA. In this version, IMA initializes all defined
+template descriptors at boot time by translating the format into an array
+of template field structures taken from the set of supported ones.
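+
+As a purely illustrative sketch (these structure and member names are
+hypothetical, not the exact ones used in ima_template.c), the two data
+structures could be shaped like this:
+
+  struct sketch_template_field {
+          const char *field_id;                     /* e.g. "d-ng" */
+          int (*init)(void *event, void **data,
+                      u32 *len);                    /* generate field data */
+          void (*show)(struct seq_file *m,
+                       void *data, u32 len);        /* display field data */
+  };
+
+  struct sketch_template_desc {
+          const char *name;                         /* e.g. "ima-ng" */
+          const char *fmt;                          /* e.g. "d-ng|n-ng" */
+          struct sketch_template_field **fields;    /* parsed from fmt */
+          int num_fields;
+  };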
+
+After the initialization step, IMA will call ima_alloc_init_template()
+(new function defined within the patches for the new template management
+mechanism) to generate a new measurement entry by using the template
+descriptor chosen through the kernel configuration or through the newly
+introduced 'ima_template=' kernel command line parameter. It is during this
+phase that the advantages of the new architecture are clearly shown:
+the latter function does not contain specific code to handle a given template
+but, instead, simply calls the init() method of the template fields
+associated with the chosen template descriptor and stores the result (pointer
+to allocated data and data length) in the measurement entry structure.
+
+The same mechanism is employed to display measurement entries.
+The functions ima[_ascii]_measurements_show() retrieve, for each entry,
+the template descriptor used to produce that entry and call the show()
+method for each item of the array of template field structures.
+
+
+
+==== SUPPORTED TEMPLATE FIELDS AND DESCRIPTORS ====
+
+The following is the list of supported template fields
+('<identifier>': description) that can be used to define new template
+descriptors by adding their identifier to the format string
+(support for more data types will be added later):
+
+ - 'd': the digest of the event (i.e. the digest of a measured file),
+        calculated with the SHA1 or MD5 hash algorithm;
+ - 'n': the name of the event (i.e. the file name), with size up to 255 bytes;
+ - 'd-ng': the digest of the event, calculated with an arbitrary hash
+           algorithm (field format: [<hash algo>:]digest, where the digest
+           prefix is shown only if the hash algorithm is not SHA1 or MD5);
+ - 'n-ng': the name of the event, without size limitations.
+
+
+Below is the list of defined template descriptors:
+ - "ima": its format is 'd|n';
+ - "ima-ng" (default): its format is 'd-ng|n-ng'.
+
+
+
+==== USE ====
+
+To specify the template descriptor to be used to generate measurement entries,
+the following methods are currently supported:
+
+ - select a template descriptor among those supported in the kernel
+   configuration ('ima-ng' is the default choice);
+ - specify a template descriptor name from the kernel command line through
+   the 'ima_template=' parameter.
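+
+For example, booting with 'ima_template=ima' on the kernel command line
+selects the original fixed-length template instead of the default 'ima-ng'.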
index 7b4145d..a4c33f1 100644 (file)
@@ -865,15 +865,14 @@ encountered:
      calling processes has a searchable link to the key from one of its
      keyrings. There are three functions for dealing with these:
 
-       key_ref_t make_key_ref(const struct key *key,
-                              unsigned long possession);
+       key_ref_t make_key_ref(const struct key *key, bool possession);
 
        struct key *key_ref_to_ptr(const key_ref_t key_ref);
 
-       unsigned long is_key_possessed(const key_ref_t key_ref);
+       bool is_key_possessed(const key_ref_t key_ref);
 
      The first function constructs a key reference from a key pointer and
-     possession information (which must be 0 or 1 and not any other value).
+     possession information (which must be true or false).
 
      The second function retrieves the key pointer from a reference and the
      third retrieves the possession flag.
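+
+     As a minimal sketch built only from the three declarations above
+     (assuming 'key' is a valid key pointer):
+
+	key_ref_t ref = make_key_ref(key, true);   /* possessed reference */
+	struct key *k = key_ref_to_ptr(ref);       /* recover the pointer */
+	bool possessed = is_key_possessed(ref);    /* recover the flag */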
@@ -961,14 +960,17 @@ payload contents" for more information.
     the argument will not be parsed.
 
 
-(*) Extra references can be made to a key by calling the following function:
+(*) Extra references can be made to a key by calling one of the following
+    functions:
 
+       struct key *__key_get(struct key *key);
        struct key *key_get(struct key *key);
 
-    These need to be disposed of by calling key_put() when they've been
-    finished with. The key pointer passed in will be returned. If the pointer
-    is NULL or CONFIG_KEYS is not set then the key will not be dereferenced and
-    no increment will take place.
+    Keys so referenced will need to be disposed of by calling key_put() when
+    they've been finished with.  The key pointer passed in will be returned.
+
+    In the case of key_get(), if the pointer is NULL or CONFIG_KEYS is not set
+    then the key will not be dereferenced and no increment will take place.
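+
+    A minimal usage sketch, assuming 'key' is a valid key pointer:
+
+	struct key *extra = key_get(key);  /* take an extra reference */
+	/* ... use the key ... */
+	key_put(extra);                    /* dispose of the reference */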
 
 
 (*) A key's serial number can be obtained by calling:
index 7521d36..6dea4fd 100644 (file)
@@ -63,9 +63,9 @@ levels.
 PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table
 allocation and pgtable_pmd_page_dtor() on freeing.
 
-Allocation usually happens in pmd_alloc_one(), freeing in pmd_free(), but
-make sure you cover all PMD table allocation / freeing paths: i.e X86_PAE
-preallocate few PMDs on pgd_alloc().
+Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and
+pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing
+paths: e.g. X86_PAE preallocates a few PMDs in pgd_alloc().
 
 With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK.
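+
+As a sketch of the pattern (modelled on a typical implementation, not
+copied from any particular architecture), the ctor/dtor calls pair with
+allocation and freeing like this:
+
+	pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+	{
+		struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
+
+		if (!page)
+			return NULL;
+		if (!pgtable_pmd_page_ctor(page)) {  /* init the split lock */
+			__free_pages(page, 0);
+			return NULL;
+		}
+		return (pmd_t *)page_address(page);
+	}
+
+	void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+	{
+		struct page *page = virt_to_page(pmd);
+
+		pgtable_pmd_page_dtor(page);         /* release the split lock */
+		__free_pages(page, 0);
+	}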
 
index 63f3048..8285ed4 100644 (file)
@@ -4065,6 +4065,7 @@ F:        arch/x86/include/uapi/asm/hyperv.h
 F:     arch/x86/kernel/cpu/mshyperv.c
 F:     drivers/hid/hid-hyperv.c
 F:     drivers/hv/
+F:     drivers/input/serio/hyperv-keyboard.c
 F:     drivers/net/hyperv/
 F:     drivers/scsi/storvsc_drv.c
 F:     drivers/video/hyperv_fb.c
@@ -7515,9 +7516,10 @@ SELINUX SECURITY MODULE
 M:     Stephen Smalley <sds@tycho.nsa.gov>
 M:     James Morris <james.l.morris@oracle.com>
 M:     Eric Paris <eparis@parisplace.org>
+M:     Paul Moore <paul@paul-moore.com>
 L:     selinux@tycho.nsa.gov (subscribers-only, general discussion)
 W:     http://selinuxproject.org
-T:     git git://git.infradead.org/users/eparis/selinux.git
+T:     git git://git.infradead.org/users/pcmoore/selinux
 S:     Supported
 F:     include/linux/selinux*
 F:     security/selinux/
@@ -8664,6 +8666,7 @@ F:        drivers/media/usb/tm6000/
 TPM DEVICE DRIVER
 M:     Leonidas Da Silva Barbosa <leosilva@linux.vnet.ibm.com>
 M:     Ashley Lai <ashley@ashleylai.com>
+M:     Peter Huewe <peterhuewe@gmx.de>
 M:     Rajiv Andrade <mail@srajiv.net>
 W:     http://tpmdd.sourceforge.net
 M:     Marcel Selhorst <tpmdd@selhorst.net>
@@ -9522,8 +9525,8 @@ F:        drivers/xen/*swiotlb*
 
 XFS FILESYSTEM
 P:     Silicon Graphics Inc
+M:     Dave Chinner <dchinner@fromorbit.com>
 M:     Ben Myers <bpm@sgi.com>
-M:     Alex Elder <elder@kernel.org>
 M:     xfs@oss.sgi.com
 L:     xfs@oss.sgi.com
 W:     http://oss.sgi.com/projects/xfs
index 135c674..d39dc9b 100644 (file)
@@ -16,8 +16,8 @@ config ALPHA
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+       select GENERIC_CLOCKEVENTS
        select GENERIC_SMP_IDLE_THREAD
-       select GENERIC_CMOS_UPDATE
        select GENERIC_STRNCPY_FROM_USER
        select GENERIC_STRNLEN_USER
        select HAVE_MOD_ARCH_SPECIFIC
@@ -488,6 +488,20 @@ config VGA_HOSE
          which always have multiple hoses, and whose consoles support it.
 
 
+config ALPHA_QEMU
+       bool "Run under QEMU emulation"
+       depends on !ALPHA_GENERIC
+       ---help---
+         Assume the presence of special features supported by QEMU PALcode
+         that reduce the overhead of system emulation.
+
+         Generic kernels will auto-detect QEMU.  But when building a
+         system-specific kernel, the assumption is that we want to
+         eliminate as many runtime tests as possible.
+
+         If unsure, say N.
+
+
 config ALPHA_SRM
        bool "Use SRM as bootloader" if ALPHA_CABRIOLET || ALPHA_AVANTI_CH || ALPHA_EB64P || ALPHA_PC164 || ALPHA_TAKARA || ALPHA_EB164 || ALPHA_ALCOR || ALPHA_MIATA || ALPHA_LX164 || ALPHA_SX164 || ALPHA_NAUTILUS || ALPHA_NONAME
        depends on TTY
@@ -572,6 +586,30 @@ config NUMA
          Access).  This option is for configuring high-end multiprocessor
          server machines.  If in doubt, say N.
 
+config ALPHA_WTINT
+       bool "Use WTINT" if ALPHA_SRM || ALPHA_GENERIC
+       default y if ALPHA_QEMU
+       default n if ALPHA_EV5 || ALPHA_EV56 || (ALPHA_EV4 && !ALPHA_LCA)
+       default n if !ALPHA_SRM && !ALPHA_GENERIC
+       default y if SMP
+       ---help---
+         The Wait for Interrupt (WTINT) PALcall attempts to put the CPU
+         to sleep until the next interrupt.  This may reduce the power
+         consumed and the heat produced by the computer.  However, it has
+         the side effect of making the cycle counter unreliable as a timing
+         device across the sleep.
+
+         For emulation under QEMU, definitely say Y here, as we have other
+         mechanisms for measuring time than the cycle counter.
+
+         For EV4 (but not LCA), EV5 and EV56 systems, or for systems running
+         MILO, sleep mode is not supported so you might as well say N here.
+
+         For SMP systems we cannot use the cycle counter for timing anyway,
+         so you might as well say Y here.
+
+         If unsure, say N.
+
 config NODES_SHIFT
        int
        default "7"
@@ -613,9 +651,41 @@ config VERBOSE_MCHECK_ON
 
          Take the default (1) unless you want more control or more info.
 
+choice
+       prompt "Timer interrupt frequency (HZ)?"
+       default HZ_128 if ALPHA_QEMU
+       default HZ_1200 if ALPHA_RAWHIDE
+       default HZ_1024
+       ---help---
+         The frequency at which timer interrupts occur.  A high frequency
+         minimizes latency, whereas a low frequency minimizes overhead of
+         process accounting.  The latter effect is especially significant
+         when running under QEMU.
+
+         Note that some Alpha hardware cannot change the interrupt frequency
+         of the timer.  If unsure, say 1024 (or 1200 for Rawhide).
+
+       config HZ_32
+               bool "32 Hz"
+       config HZ_64
+               bool "64 Hz"
+       config HZ_128
+               bool "128 Hz"
+       config HZ_256
+               bool "256 Hz"
+       config HZ_1024
+               bool "1024 Hz"
+       config HZ_1200
+               bool "1200 Hz"
+endchoice
+
 config HZ
-       int
-       default 1200 if ALPHA_RAWHIDE
+       int
+       default 32 if HZ_32
+       default 64 if HZ_64
+       default 128 if HZ_128
+       default 256 if HZ_256
+       default 1200 if HZ_1200
        default 1024
 
 source "drivers/pci/Kconfig"
index 72dbf23..75cb364 100644 (file)
@@ -33,6 +33,7 @@ struct alpha_machine_vector
 
        int nr_irqs;
        int rtc_port;
+       int rtc_boot_cpu_only;
        unsigned int max_asn;
        unsigned long max_isa_dma_address;
        unsigned long irq_probe_mask;
@@ -95,9 +96,6 @@ struct alpha_machine_vector
 
        struct _alpha_agp_info *(*agp_info)(void);
 
-       unsigned int (*rtc_get_time)(struct rtc_time *);
-       int (*rtc_set_time)(struct rtc_time *);
-
        const char *vector_name;
 
        /* NUMA information */
@@ -126,13 +124,19 @@ extern struct alpha_machine_vector alpha_mv;
 
 #ifdef CONFIG_ALPHA_GENERIC
 extern int alpha_using_srm;
+extern int alpha_using_qemu;
 #else
-#ifdef CONFIG_ALPHA_SRM
-#define alpha_using_srm 1
-#else
-#define alpha_using_srm 0
-#endif
+# ifdef CONFIG_ALPHA_SRM
+#  define alpha_using_srm 1
+# else
+#  define alpha_using_srm 0
+# endif
+# ifdef CONFIG_ALPHA_QEMU
+#  define alpha_using_qemu 1
+# else
+#  define alpha_using_qemu 0
+# endif
 #endif /* GENERIC */
 
-#endif
+#endif /* __KERNEL__ */
 #endif /* __ALPHA_MACHVEC_H */
index 6fcd2b5..5422a47 100644 (file)
@@ -89,6 +89,7 @@ __CALL_PAL_W1(wrmces, unsigned long);
 __CALL_PAL_RW2(wrperfmon, unsigned long, unsigned long, unsigned long);
 __CALL_PAL_W1(wrusp, unsigned long);
 __CALL_PAL_W1(wrvptptr, unsigned long);
+__CALL_PAL_RW1(wtint, unsigned long, unsigned long);
 
 /*
  * TB routines..
@@ -111,5 +112,75 @@ __CALL_PAL_W1(wrvptptr, unsigned long);
 #define tbiap()                __tbi(-1, /* no second argument */)
 #define tbia()         __tbi(-2, /* no second argument */)
 
+/*
+ * QEMU Cserv routines..
+ */
+
+static inline unsigned long
+qemu_get_walltime(void)
+{
+       register unsigned long v0 __asm__("$0");
+       register unsigned long a0 __asm__("$16") = 3;
+
+       asm("call_pal %2 # cserve get_time"
+           : "=r"(v0), "+r"(a0)
+           : "i"(PAL_cserve)
+           : "$17", "$18", "$19", "$20", "$21");
+
+       return v0;
+}
+
+static inline unsigned long
+qemu_get_alarm(void)
+{
+       register unsigned long v0 __asm__("$0");
+       register unsigned long a0 __asm__("$16") = 4;
+
+       asm("call_pal %2 # cserve get_alarm"
+           : "=r"(v0), "+r"(a0)
+           : "i"(PAL_cserve)
+           : "$17", "$18", "$19", "$20", "$21");
+
+       return v0;
+}
+
+static inline void
+qemu_set_alarm_rel(unsigned long expire)
+{
+       register unsigned long a0 __asm__("$16") = 5;
+       register unsigned long a1 __asm__("$17") = expire;
+
+       asm volatile("call_pal %2 # cserve set_alarm_rel"
+                    : "+r"(a0), "+r"(a1)
+                    : "i"(PAL_cserve)
+                    : "$0", "$18", "$19", "$20", "$21");
+}
+
+static inline void
+qemu_set_alarm_abs(unsigned long expire)
+{
+       register unsigned long a0 __asm__("$16") = 6;
+       register unsigned long a1 __asm__("$17") = expire;
+
+       asm volatile("call_pal %2 # cserve set_alarm_abs"
+                    : "+r"(a0), "+r"(a1)
+                    : "i"(PAL_cserve)
+                    : "$0", "$18", "$19", "$20", "$21");
+}
+
+static inline unsigned long
+qemu_get_vmtime(void)
+{
+       register unsigned long v0 __asm__("$0");
+       register unsigned long a0 __asm__("$16") = 7;
+
+       asm("call_pal %2 # cserve get_time"
+           : "=r"(v0), "+r"(a0)
+           : "i"(PAL_cserve)
+           : "$17", "$18", "$19", "$20", "$21");
+
+       return v0;
+}
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __ALPHA_PAL_H */
index d70408d..f71c3b0 100644 (file)
@@ -1,12 +1 @@
-#ifndef _ALPHA_RTC_H
-#define _ALPHA_RTC_H
-
-#if defined(CONFIG_ALPHA_MARVEL) && defined(CONFIG_SMP) \
- || defined(CONFIG_ALPHA_GENERIC)
-# define get_rtc_time          alpha_mv.rtc_get_time
-# define set_rtc_time          alpha_mv.rtc_set_time
-#endif
-
 #include <asm-generic/rtc.h>
-
-#endif
index b02b8a2..c2911f5 100644 (file)
@@ -22,15 +22,27 @@ extern void * __memcpy(void *, const void *, size_t);
 
 #define __HAVE_ARCH_MEMSET
 extern void * __constant_c_memset(void *, unsigned long, size_t);
+extern void * ___memset(void *, int, size_t);
 extern void * __memset(void *, int, size_t);
 extern void * memset(void *, int, size_t);
 
-#define memset(s, c, n)                                                            \
-(__builtin_constant_p(c)                                                   \
- ? (__builtin_constant_p(n) && (c) == 0                                            \
-    ? __builtin_memset((s),0,(n))                                          \
-    : __constant_c_memset((s),0x0101010101010101UL*(unsigned char)(c),(n))) \
- : __memset((s),(c),(n)))
+/* For gcc 3.x, we cannot have the inline function named "memset" because
+   the __builtin_memset will attempt to resolve to the inline as well,
+   leading to a "sorry" about unimplemented recursive inlining.  */
+extern inline void *__memset(void *s, int c, size_t n)
+{
+       if (__builtin_constant_p(c)) {
+               if (__builtin_constant_p(n)) {
+                       return __builtin_memset(s, c, n);
+               } else {
+                       unsigned long c8 = (c & 0xff) * 0x0101010101010101UL;
+                       return __constant_c_memset(s, c8, n);
+               }
+       }
+       return ___memset(s, c, n);
+}
+
+#define memset __memset
 
 #define __HAVE_ARCH_STRCPY
 extern char * strcpy(char *,const char *);
index 3c0ce08..dfc8140 100644 (file)
@@ -46,6 +46,7 @@
 #define PAL_rdusp      58
 #define PAL_whami      60
 #define PAL_retsys     61
+#define PAL_wtint      62
 #define PAL_rti                63
 
 
index 84ec46b..0d54650 100644 (file)
@@ -16,6 +16,7 @@ obj-$(CONFIG_PCI)     += pci.o pci_iommu.o pci-sysfs.o
 obj-$(CONFIG_SRM_ENV)  += srm_env.o
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_RTC_DRV_ALPHA) += rtc.o
 
 ifdef CONFIG_ALPHA_GENERIC
 
index 89566b3..f4c7ab6 100644 (file)
@@ -40,6 +40,7 @@ EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(___memset);
 EXPORT_SYMBOL(__memsetw);
 EXPORT_SYMBOL(__constant_c_memset);
 EXPORT_SYMBOL(copy_page);
index 28e4429..1c8625c 100644 (file)
@@ -66,21 +66,7 @@ do_entInt(unsigned long type, unsigned long vector,
                break;
        case 1:
                old_regs = set_irq_regs(regs);
-#ifdef CONFIG_SMP
-         {
-               long cpu;
-
-               smp_percpu_timer_interrupt(regs);
-               cpu = smp_processor_id();
-               if (cpu != boot_cpuid) {
-                       kstat_incr_irqs_this_cpu(RTC_IRQ, irq_to_desc(RTC_IRQ));
-               } else {
-                       handle_irq(RTC_IRQ);
-               }
-         }
-#else
                handle_irq(RTC_IRQ);
-#endif
                set_irq_regs(old_regs);
                return;
        case 2:
@@ -228,7 +214,7 @@ process_mcheck_info(unsigned long vector, unsigned long la_ptr,
  */
 
 struct irqaction timer_irqaction = {
-       .handler        = timer_interrupt,
+       .handler        = rtc_timer_interrupt,
        .name           = "timer",
 };
 
index 7fa6248..f54bdf6 100644 (file)
 #define CAT1(x,y)  x##y
 #define CAT(x,y)   CAT1(x,y)
 
-#define DO_DEFAULT_RTC \
-       .rtc_port = 0x70, \
-       .rtc_get_time = common_get_rtc_time, \
-       .rtc_set_time = common_set_rtc_time
+#define DO_DEFAULT_RTC                 .rtc_port = 0x70
 
 #define DO_EV4_MMU                                                     \
        .max_asn =                      EV4_MAX_ASN,                    \
index d821b17..c52e7f0 100644 (file)
@@ -83,6 +83,8 @@ struct alpha_pmu_t {
        long pmc_left[3];
         /* Subroutine for allocation of PMCs.  Enforces constraints. */
        int (*check_constraints)(struct perf_event **, unsigned long *, int);
+       /* Subroutine for checking validity of a raw event for this PMU. */
+       int (*raw_event_valid)(u64 config);
 };
 
 /*
@@ -203,6 +205,12 @@ success:
 }
 
 
+static int ev67_raw_event_valid(u64 config)
+{
+       return config >= EV67_CYCLES && config < EV67_LAST_ET;
+}
+
+
 static const struct alpha_pmu_t ev67_pmu = {
        .event_map = ev67_perfmon_event_map,
        .max_events = ARRAY_SIZE(ev67_perfmon_event_map),
@@ -211,7 +219,8 @@ static const struct alpha_pmu_t ev67_pmu = {
        .pmc_count_mask = {EV67_PCTR_0_COUNT_MASK,  EV67_PCTR_1_COUNT_MASK,  0},
        .pmc_max_period = {(1UL<<20) - 1, (1UL<<20) - 1, 0},
        .pmc_left = {16, 4, 0},
-       .check_constraints = ev67_check_constraints
+       .check_constraints = ev67_check_constraints,
+       .raw_event_valid = ev67_raw_event_valid,
 };
 
 
@@ -609,7 +618,9 @@ static int __hw_perf_event_init(struct perf_event *event)
        } else if (attr->type == PERF_TYPE_HW_CACHE) {
                return -EOPNOTSUPP;
        } else if (attr->type == PERF_TYPE_RAW) {
-               ev = attr->config & 0xff;
+               if (!alpha_pmu->raw_event_valid(attr->config))
+                       return -EINVAL;
+               ev = attr->config;
        } else {
                return -EOPNOTSUPP;
        }
index f2360a7..1941a07 100644 (file)
 void (*pm_power_off)(void) = machine_power_off;
 EXPORT_SYMBOL(pm_power_off);
 
+#ifdef CONFIG_ALPHA_WTINT
+/*
+ * Sleep the CPU.
+ * EV6, LCA45 and QEMU know how to power down, skipping N timer interrupts.
+ */
+void arch_cpu_idle(void)
+{
+       wtint(0);
+       local_irq_enable();
+}
+
+void arch_cpu_idle_dead(void)
+{
+       wtint(INT_MAX);
+}
+#endif /* ALPHA_WTINT */
+
 struct halt_info {
        int mode;
        char *restart_cmd;
index d3e52d3..da2d6ec 100644 (file)
@@ -135,17 +135,15 @@ extern void unregister_srm_console(void);
 /* smp.c */
 extern void setup_smp(void);
 extern void handle_ipi(struct pt_regs *);
-extern void smp_percpu_timer_interrupt(struct pt_regs *);
 
 /* bios32.c */
 /* extern void reset_for_srm(void); */
 
 /* time.c */
-extern irqreturn_t timer_interrupt(int irq, void *dev);
+extern irqreturn_t rtc_timer_interrupt(int irq, void *dev);
+extern void init_clockevent(void);
 extern void common_init_rtc(void);
 extern unsigned long est_cycle_freq;
-extern unsigned int common_get_rtc_time(struct rtc_time *time);
-extern int common_set_rtc_time(struct rtc_time *time);
 
 /* smc37c93x.c */
 extern void SMC93x_Init(void);
diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c
new file mode 100644 (file)
index 0000000..c8d284d
--- /dev/null
@@ -0,0 +1,323 @@
+/*
+ *  linux/arch/alpha/kernel/rtc.c
+ *
+ *  Copyright (C) 1991, 1992, 1995, 1999, 2000  Linus Torvalds
+ *
+ * This file contains date handling.
+ */
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/platform_device.h>
+
+#include <asm/rtc.h>
+
+#include "proto.h"
+
+
+/*
+ * Support for the RTC device.
+ *
+ * We don't want to use the rtc-cmos driver, because we don't want to support
+ * alarms, as that would be indistinguishable from timer interrupts.
+ *
+ * Further, generic code is really, really tied to a 1900 epoch.  This is
+ * true in __get_rtc_time as well as in the users of struct rtc_time,
+ * e.g. rtc_tm_to_time.  Thankfully all of the other epochs in use are
+ * later than 1900, and so it's easy to adjust.
+ */
+
+static unsigned long rtc_epoch;
+
+static int __init
+specify_epoch(char *str)
+{
+       unsigned long epoch = simple_strtoul(str, NULL, 0);
+       if (epoch < 1900)
+               printk(KERN_WARNING
+                      "Ignoring invalid user specified epoch %lu\n", epoch);
+       else
+               rtc_epoch = epoch;
+       return 1;
+}
+__setup("epoch=", specify_epoch);
+
+static void __init
+init_rtc_epoch(void)
+{
+       int epoch, year, ctrl;
+
+       if (rtc_epoch != 0) {
+               /* The epoch was specified on the command-line.  */
+               return;
+       }
+
+       /* Detect the epoch in use on this computer.  */
+       ctrl = CMOS_READ(RTC_CONTROL);
+       year = CMOS_READ(RTC_YEAR);
+       if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
+               year = bcd2bin(year);
+
+       /* PC-like is standard; used for year >= 70 */
+       epoch = 1900;
+       if (year < 20) {
+               epoch = 2000;
+       } else if (year >= 20 && year < 48) {
+               /* NT epoch */
+               epoch = 1980;
+       } else if (year >= 48 && year < 70) {
+               /* Digital UNIX epoch */
+               epoch = 1952;
+       }
+       rtc_epoch = epoch;
+
+       printk(KERN_INFO "Using epoch %d for rtc year %d\n", epoch, year);
+}
+
+static int
+alpha_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       __get_rtc_time(tm);
+
+       /* Adjust for non-default epochs.  It's easier to depend on the
+          generic __get_rtc_time and adjust the epoch here than create
+          a copy of __get_rtc_time with the edits we need.  */
+       if (rtc_epoch != 1900) {
+               int year = tm->tm_year;
+               /* Undo the century adjustment made in __get_rtc_time.  */
+               if (year >= 100)
+                       year -= 100;
+               year += rtc_epoch - 1900;
+               /* Redo the century adjustment with the epoch in place.  */
+               if (year <= 69)
+                       year += 100;
+               tm->tm_year = year;
+       }
+
+       return rtc_valid_tm(tm);
+}
+
+static int
+alpha_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct rtc_time xtm;
+
+       if (rtc_epoch != 1900) {
+               xtm = *tm;
+               xtm.tm_year -= rtc_epoch - 1900;
+               tm = &xtm;
+       }
+
+       return __set_rtc_time(tm);
+}
+
+static int
+alpha_rtc_set_mmss(struct device *dev, unsigned long nowtime)
+{
+       int retval = 0;
+       int real_seconds, real_minutes, cmos_minutes;
+       unsigned char save_control, save_freq_select;
+
+       /* Note: This code only updates minutes and seconds.  Comments
+          indicate this was to avoid messing with unknown time zones,
+          and with the epoch nonsense described above.  In order for
+          this to work, the existing clock cannot be off by more than
+          15 minutes.
+
+          ??? This choice may be out of date.  The x86 port does
+          not have problems with timezones, and the epoch processing has
+          now been fixed in alpha_rtc_set_time.
+
+          In either case, one can always force a full rtc update with
+          the userland hwclock program, so surely 15 minute accuracy
+          is no real burden.  */
+
+       /* In order to set the CMOS clock precisely, we have to be called
+          500 ms after the second nowtime has started, because when
+          nowtime is written into the registers of the CMOS clock, it will
+          jump to the next second precisely 500 ms later. Check the Motorola
+          MC146818A or Dallas DS12887 data sheet for details.  */
+
+       /* irq are locally disabled here */
+       spin_lock(&rtc_lock);
+       /* Tell the clock it's being set */
+       save_control = CMOS_READ(RTC_CONTROL);
+       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
+
+       /* Stop and reset prescaler */
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
+
+       cmos_minutes = CMOS_READ(RTC_MINUTES);
+       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
+               cmos_minutes = bcd2bin(cmos_minutes);
+
+       real_seconds = nowtime % 60;
+       real_minutes = nowtime / 60;
+       if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) {
+               /* correct for half hour time zone */
+               real_minutes += 30;
+       }
+       real_minutes %= 60;
+
+       if (abs(real_minutes - cmos_minutes) < 30) {
+               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
+                       real_seconds = bin2bcd(real_seconds);
+                       real_minutes = bin2bcd(real_minutes);
+               }
+               CMOS_WRITE(real_seconds,RTC_SECONDS);
+               CMOS_WRITE(real_minutes,RTC_MINUTES);
+       } else {
+               printk_once(KERN_NOTICE
+                           "set_rtc_mmss: can't update from %d to %d\n",
+                           cmos_minutes, real_minutes);
+               retval = -1;
+       }
+
+       /* The following flags have to be released exactly in this order,
+        * otherwise the DS12887 (popular MC146818A clone with integrated
+        * battery and quartz) will not reset the oscillator and will not
+        * update precisely 500 ms later. You won't find this mentioned in
+        * the Dallas Semiconductor data sheets, but who believes data
+        * sheets anyway ...                           -- Markus Kuhn
+        */
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+       spin_unlock(&rtc_lock);
+
+       return retval;
+}
+
+static int
+alpha_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+       switch (cmd) {
+       case RTC_EPOCH_READ:
+               return put_user(rtc_epoch, (unsigned long __user *)arg);
+       case RTC_EPOCH_SET:
+               if (arg < 1900)
+                       return -EINVAL;
+               rtc_epoch = arg;
+               return 0;
+       default:
+               return -ENOIOCTLCMD;
+       }
+}
+
+static const struct rtc_class_ops alpha_rtc_ops = {
+       .read_time = alpha_rtc_read_time,
+       .set_time = alpha_rtc_set_time,
+       .set_mmss = alpha_rtc_set_mmss,
+       .ioctl = alpha_rtc_ioctl,
+};
+
+/*
+ * Similarly, except do the actual CMOS access on the boot cpu only.
+ * This requires marshalling the data across an interprocessor call.
+ */
+
+#if defined(CONFIG_SMP) && \
+    (defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_MARVEL))
+# define HAVE_REMOTE_RTC 1
+
+union remote_data {
+       struct rtc_time *tm;
+       unsigned long now;
+       long retval;
+};
+
+static void
+do_remote_read(void *data)
+{
+       union remote_data *x = data;
+       x->retval = alpha_rtc_read_time(NULL, x->tm);
+}
+
+static int
+remote_read_time(struct device *dev, struct rtc_time *tm)
+{
+       union remote_data x;
+       if (smp_processor_id() != boot_cpuid) {
+               x.tm = tm;
+               smp_call_function_single(boot_cpuid, do_remote_read, &x, 1);
+               return x.retval;
+       }
+       return alpha_rtc_read_time(NULL, tm);
+}
+
+static void
+do_remote_set(void *data)
+{
+       union remote_data *x = data;
+       x->retval = alpha_rtc_set_time(NULL, x->tm);
+}
+
+static int
+remote_set_time(struct device *dev, struct rtc_time *tm)
+{
+       union remote_data x;
+       if (smp_processor_id() != boot_cpuid) {
+               x.tm = tm;
+               smp_call_function_single(boot_cpuid, do_remote_set, &x, 1);
+               return x.retval;
+       }
+       return alpha_rtc_set_time(NULL, tm);
+}
+
+static void
+do_remote_mmss(void *data)
+{
+       union remote_data *x = data;
+       x->retval = alpha_rtc_set_mmss(NULL, x->now);
+}
+
+static int
+remote_set_mmss(struct device *dev, unsigned long now)
+{
+       union remote_data x;
+       if (smp_processor_id() != boot_cpuid) {
+               x.now = now;
+               smp_call_function_single(boot_cpuid, do_remote_mmss, &x, 1);
+               return x.retval;
+       }
+       return alpha_rtc_set_mmss(NULL, now);
+}
+
+static const struct rtc_class_ops remote_rtc_ops = {
+       .read_time = remote_read_time,
+       .set_time = remote_set_time,
+       .set_mmss = remote_set_mmss,
+       .ioctl = alpha_rtc_ioctl,
+};
+#endif
+
+static int __init
+alpha_rtc_init(void)
+{
+       const struct rtc_class_ops *ops;
+       struct platform_device *pdev;
+       struct rtc_device *rtc;
+       const char *name;
+
+       init_rtc_epoch();
+       name = "rtc-alpha";
+       ops = &alpha_rtc_ops;
+
+#ifdef HAVE_REMOTE_RTC
+       if (alpha_mv.rtc_boot_cpu_only)
+               ops = &remote_rtc_ops;
+#endif
+
+       pdev = platform_device_register_simple(name, -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       rtc = devm_rtc_device_register(&pdev->dev, name, ops, THIS_MODULE);
+       if (IS_ERR(rtc))
+               return PTR_ERR(rtc);
+
+       platform_set_drvdata(pdev, rtc);
+       return 0;
+}
+device_initcall(alpha_rtc_init);
index 9e3107c..b20af76 100644 (file)
@@ -115,10 +115,17 @@ unsigned long alpha_agpgart_size = DEFAULT_AGP_APER_SIZE;
 
 #ifdef CONFIG_ALPHA_GENERIC
 struct alpha_machine_vector alpha_mv;
+#endif
+
+#ifndef alpha_using_srm
 int alpha_using_srm;
 EXPORT_SYMBOL(alpha_using_srm);
 #endif
 
+#ifndef alpha_using_qemu
+int alpha_using_qemu;
+#endif
+
 static struct alpha_machine_vector *get_sysvec(unsigned long, unsigned long,
                                               unsigned long);
 static struct alpha_machine_vector *get_sysvec_byname(const char *);
@@ -529,11 +536,15 @@ setup_arch(char **cmdline_p)
        atomic_notifier_chain_register(&panic_notifier_list,
                        &alpha_panic_block);
 
-#ifdef CONFIG_ALPHA_GENERIC
+#ifndef alpha_using_srm
        /* Assume that we've booted from SRM if we haven't booted from MILO.
           Detect the later by looking for "MILO" in the system serial nr.  */
        alpha_using_srm = strncmp((const char *)hwrpb->ssn, "MILO", 4) != 0;
 #endif
+#ifndef alpha_using_qemu
+       /* Similarly, look for QEMU.  */
+       alpha_using_qemu = strstr((const char *)hwrpb->ssn, "QEMU") != 0;
+#endif
 
        /* If we are using SRM, we want to allow callbacks
           as early as possible, so do this NOW, and then
@@ -1207,6 +1218,7 @@ show_cpuinfo(struct seq_file *f, void *slot)
        char *systype_name;
        char *sysvariation_name;
        int nr_processors;
+       unsigned long timer_freq;
 
        cpu_index = (unsigned) (cpu->type - 1);
        cpu_name = "Unknown";
@@ -1218,6 +1230,12 @@ show_cpuinfo(struct seq_file *f, void *slot)
 
        nr_processors = get_nr_processors(cpu, hwrpb->nr_processors);
 
+#if CONFIG_HZ == 1024 || CONFIG_HZ == 1200
+       timer_freq = (100UL * hwrpb->intr_freq) / 4096;
+#else
+       timer_freq = 100UL * CONFIG_HZ;
+#endif
+
        seq_printf(f, "cpu\t\t\t: Alpha\n"
                      "cpu model\t\t: %s\n"
                      "cpu variation\t\t: %ld\n"
@@ -1243,8 +1261,7 @@ show_cpuinfo(struct seq_file *f, void *slot)
                       (char*)hwrpb->ssn,
                       est_cycle_freq ? : hwrpb->cycle_freq,
                       est_cycle_freq ? "est." : "",
-                      hwrpb->intr_freq / 4096,
-                      (100 * hwrpb->intr_freq / 4096) % 100,
+                      timer_freq / 100, timer_freq % 100,
                       hwrpb->pagesize,
                       hwrpb->pa_bits,
                       hwrpb->max_asn,
index 9dbbcb3..99ac36d 100644 (file)
@@ -138,9 +138,11 @@ smp_callin(void)
 
        /* Get our local ticker going. */
        smp_setup_percpu_timer(cpuid);
+       init_clockevent();
 
        /* Call platform-specific callin, if specified */
-       if (alpha_mv.smp_callin) alpha_mv.smp_callin();
+       if (alpha_mv.smp_callin)
+               alpha_mv.smp_callin();
 
        /* All kernel threads share the same mm context.  */
        atomic_inc(&init_mm.mm_count);
@@ -498,35 +500,6 @@ smp_cpus_done(unsigned int max_cpus)
               ((bogosum + 2500) / (5000/HZ)) % 100);
 }
 
-\f
-void
-smp_percpu_timer_interrupt(struct pt_regs *regs)
-{
-       struct pt_regs *old_regs;
-       int cpu = smp_processor_id();
-       unsigned long user = user_mode(regs);
-       struct cpuinfo_alpha *data = &cpu_data[cpu];
-
-       old_regs = set_irq_regs(regs);
-
-       /* Record kernel PC.  */
-       profile_tick(CPU_PROFILING);
-
-       if (!--data->prof_counter) {
-               /* We need to make like a normal interrupt -- otherwise
-                  timer interrupts ignore the global interrupt lock,
-                  which would be a Bad Thing.  */
-               irq_enter();
-
-               update_process_times(user);
-
-               data->prof_counter = data->prof_multiplier;
-
-               irq_exit();
-       }
-       set_irq_regs(old_regs);
-}
-
 int
 setup_profiling_timer(unsigned int multiplier)
 {
index 5a0af11..608f2a7 100644 (file)
@@ -224,8 +224,6 @@ struct alpha_machine_vector jensen_mv __initmv = {
        .machine_check          = jensen_machine_check,
        .max_isa_dma_address    = ALPHA_MAX_ISA_DMA_ADDRESS,
        .rtc_port               = 0x170,
-       .rtc_get_time           = common_get_rtc_time,
-       .rtc_set_time           = common_set_rtc_time,
 
        .nr_irqs                = 16,
        .device_interrupt       = jensen_device_interrupt,
index c92e389..f21d61f 100644 (file)
@@ -22,7 +22,6 @@
 #include <asm/hwrpb.h>
 #include <asm/tlbflush.h>
 #include <asm/vga.h>
-#include <asm/rtc.h>
 
 #include "proto.h"
 #include "err_impl.h"
@@ -400,57 +399,6 @@ marvel_init_rtc(void)
        init_rtc_irq();
 }
 
-struct marvel_rtc_time {
-       struct rtc_time *time;
-       int retval;
-};
-
-#ifdef CONFIG_SMP
-static void
-smp_get_rtc_time(void *data)
-{
-       struct marvel_rtc_time *mrt = data;
-       mrt->retval = __get_rtc_time(mrt->time);
-}
-
-static void
-smp_set_rtc_time(void *data)
-{
-       struct marvel_rtc_time *mrt = data;
-       mrt->retval = __set_rtc_time(mrt->time);
-}
-#endif
-
-static unsigned int
-marvel_get_rtc_time(struct rtc_time *time)
-{
-#ifdef CONFIG_SMP
-       struct marvel_rtc_time mrt;
-
-       if (smp_processor_id() != boot_cpuid) {
-               mrt.time = time;
-               smp_call_function_single(boot_cpuid, smp_get_rtc_time, &mrt, 1);
-               return mrt.retval;
-       }
-#endif
-       return __get_rtc_time(time);
-}
-
-static int
-marvel_set_rtc_time(struct rtc_time *time)
-{
-#ifdef CONFIG_SMP
-       struct marvel_rtc_time mrt;
-
-       if (smp_processor_id() != boot_cpuid) {
-               mrt.time = time;
-               smp_call_function_single(boot_cpuid, smp_set_rtc_time, &mrt, 1);
-               return mrt.retval;
-       }
-#endif
-       return __set_rtc_time(time);
-}
-
 static void
 marvel_smp_callin(void)
 {
@@ -492,8 +440,7 @@ struct alpha_machine_vector marvel_ev7_mv __initmv = {
        .vector_name            = "MARVEL/EV7",
        DO_EV7_MMU,
        .rtc_port               = 0x70,
-       .rtc_get_time           = marvel_get_rtc_time,
-       .rtc_set_time           = marvel_set_rtc_time,
+       .rtc_boot_cpu_only      = 1,
        DO_MARVEL_IO,
        .machine_check          = marvel_machine_check,
        .max_isa_dma_address    = ALPHA_MAX_ISA_DMA_ADDRESS,
index ea33950..ee39cee 100644 (file)
@@ -3,13 +3,7 @@
  *
  *  Copyright (C) 1991, 1992, 1995, 1999, 2000  Linus Torvalds
  *
- * This file contains the PC-specific time handling details:
- * reading the RTC at bootup, etc..
- * 1994-07-02    Alan Modra
- *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
- * 1995-03-26    Markus Kuhn
- *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
- *      precision CMOS clock update
+ * This file contains the clocksource time handling.
  * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
  *             "A Kernel Model for Precision Timekeeping" by Dave Mills
  * 1997-01-09    Adrian Sun
@@ -21,9 +15,6 @@
  * 1999-04-16  Thorsten Kranzkowski (dl8bcu@gmx.net)
  *     fixed algorithm in do_gettimeofday() for calculating the precise time
  *     from processor cycle counter (now taking lost_ticks into account)
- * 2000-08-13  Jan-Benedict Glaw <jbglaw@lug-owl.de>
- *     Fixed time_init to be aware of epoches != 1900. This prevents
- *     booting up in 2048 for me;) Code is stolen from rtc.c.
  * 2003-06-03  R. Scott Bailey <scott.bailey@eds.com>
  *     Tighten sanity in time_init from 1% (10,000 PPM) to 250 PPM
  */
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/hwrpb.h>
-#include <asm/rtc.h>
 
 #include <linux/mc146818rtc.h>
 #include <linux/time.h>
 #include <linux/timex.h>
 #include <linux/clocksource.h>
+#include <linux/clockchips.h>
 
 #include "proto.h"
 #include "irq_impl.h"
 
-static int set_rtc_mmss(unsigned long);
-
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
-#define TICK_SIZE (tick_nsec / 1000)
-
-/*
- * Shift amount by which scaled_ticks_per_cycle is scaled.  Shifting
- * by 48 gives us 16 bits for HZ while keeping the accuracy good even
- * for large CPU clock rates.
- */
-#define FIX_SHIFT      48
-
-/* lump static variables together for more efficient access: */
-static struct {
-       /* cycle counter last time it got invoked */
-       __u32 last_time;
-       /* ticks/cycle * 2^48 */
-       unsigned long scaled_ticks_per_cycle;
-       /* partial unused tick */
-       unsigned long partial_tick;
-} state;
-
 unsigned long est_cycle_freq;
 
 #ifdef CONFIG_IRQ_WORK
@@ -108,109 +78,156 @@ static inline __u32 rpcc(void)
        return __builtin_alpha_rpcc();
 }
 
-int update_persistent_clock(struct timespec now)
-{
-       return set_rtc_mmss(now.tv_sec);
-}
 
-void read_persistent_clock(struct timespec *ts)
+\f
+/*
+ * The RTC as a clock_event_device primitive.
+ */
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ce);
+
+irqreturn_t
+rtc_timer_interrupt(int irq, void *dev)
 {
-       unsigned int year, mon, day, hour, min, sec, epoch;
-
-       sec = CMOS_READ(RTC_SECONDS);
-       min = CMOS_READ(RTC_MINUTES);
-       hour = CMOS_READ(RTC_HOURS);
-       day = CMOS_READ(RTC_DAY_OF_MONTH);
-       mon = CMOS_READ(RTC_MONTH);
-       year = CMOS_READ(RTC_YEAR);
-
-       if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-               sec = bcd2bin(sec);
-               min = bcd2bin(min);
-               hour = bcd2bin(hour);
-               day = bcd2bin(day);
-               mon = bcd2bin(mon);
-               year = bcd2bin(year);
-       }
+       int cpu = smp_processor_id();
+       struct clock_event_device *ce = &per_cpu(cpu_ce, cpu);
 
-       /* PC-like is standard; used for year >= 70 */
-       epoch = 1900;
-       if (year < 20)
-               epoch = 2000;
-       else if (year >= 20 && year < 48)
-               /* NT epoch */
-               epoch = 1980;
-       else if (year >= 48 && year < 70)
-               /* Digital UNIX epoch */
-               epoch = 1952;
+       /* Don't run the hook for UNUSED or SHUTDOWN.  */
+       if (likely(ce->mode == CLOCK_EVT_MODE_PERIODIC))
+               ce->event_handler(ce);
 
-       printk(KERN_INFO "Using epoch = %d\n", epoch);
+       if (test_irq_work_pending()) {
+               clear_irq_work_pending();
+               irq_work_run();
+       }
 
-       if ((year += epoch) < 1970)
-               year += 100;
+       return IRQ_HANDLED;
+}
 
-       ts->tv_sec = mktime(year, mon, day, hour, min, sec);
-       ts->tv_nsec = 0;
+static void
+rtc_ce_set_mode(enum clock_event_mode mode, struct clock_event_device *ce)
+{
+       /* The mode member of CE is updated in generic code.
+          Since we only support periodic events, nothing to do.  */
+}
+
+static int
+rtc_ce_set_next_event(unsigned long evt, struct clock_event_device *ce)
+{
+       /* This hook is for oneshot mode, which we don't support.  */
+       return -EINVAL;
 }
 
+static void __init
+init_rtc_clockevent(void)
+{
+       int cpu = smp_processor_id();
+       struct clock_event_device *ce = &per_cpu(cpu_ce, cpu);
+
+       *ce = (struct clock_event_device){
+               .name = "rtc",
+               .features = CLOCK_EVT_FEAT_PERIODIC,
+               .rating = 100,
+               .cpumask = cpumask_of(cpu),
+               .set_mode = rtc_ce_set_mode,
+               .set_next_event = rtc_ce_set_next_event,
+       };
 
+       clockevents_config_and_register(ce, CONFIG_HZ, 0, 0);
+}
 
+\f
 /*
- * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "xtime_update()" routine every clocktick
+ * The QEMU clock as a clocksource primitive.
  */
-irqreturn_t timer_interrupt(int irq, void *dev)
+
+static cycle_t
+qemu_cs_read(struct clocksource *cs)
 {
-       unsigned long delta;
-       __u32 now;
-       long nticks;
+       return qemu_get_vmtime();
+}
 
-#ifndef CONFIG_SMP
-       /* Not SMP, do kernel PC profiling here.  */
-       profile_tick(CPU_PROFILING);
-#endif
+static struct clocksource qemu_cs = {
+       .name                   = "qemu",
+       .rating                 = 400,
+       .read                   = qemu_cs_read,
+       .mask                   = CLOCKSOURCE_MASK(64),
+       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
+       .max_idle_ns            = LONG_MAX
+};
 
-       /*
-        * Calculate how many ticks have passed since the last update,
-        * including any previous partial leftover.  Save any resulting
-        * fraction for the next pass.
-        */
-       now = rpcc();
-       delta = now - state.last_time;
-       state.last_time = now;
-       delta = delta * state.scaled_ticks_per_cycle + state.partial_tick;
-       state.partial_tick = delta & ((1UL << FIX_SHIFT) - 1); 
-       nticks = delta >> FIX_SHIFT;
 
-       if (nticks)
-               xtime_update(nticks);
+/*
+ * The QEMU alarm as a clock_event_device primitive.
+ */
 
-       if (test_irq_work_pending()) {
-               clear_irq_work_pending();
-               irq_work_run();
-       }
+static void
+qemu_ce_set_mode(enum clock_event_mode mode, struct clock_event_device *ce)
+{
+       /* The mode member of CE is updated for us in generic code.
+          Just make sure that the event is disabled.  */
+       qemu_set_alarm_abs(0);
+}
 
-#ifndef CONFIG_SMP
-       while (nticks--)
-               update_process_times(user_mode(get_irq_regs()));
-#endif
+static int
+qemu_ce_set_next_event(unsigned long evt, struct clock_event_device *ce)
+{
+       qemu_set_alarm_rel(evt);
+       return 0;
+}
 
+static irqreturn_t
+qemu_timer_interrupt(int irq, void *dev)
+{
+       int cpu = smp_processor_id();
+       struct clock_event_device *ce = &per_cpu(cpu_ce, cpu);
+
+       ce->event_handler(ce);
        return IRQ_HANDLED;
 }
 
+static void __init
+init_qemu_clockevent(void)
+{
+       int cpu = smp_processor_id();
+       struct clock_event_device *ce = &per_cpu(cpu_ce, cpu);
+
+       *ce = (struct clock_event_device){
+               .name = "qemu",
+               .features = CLOCK_EVT_FEAT_ONESHOT,
+               .rating = 400,
+               .cpumask = cpumask_of(cpu),
+               .set_mode = qemu_ce_set_mode,
+               .set_next_event = qemu_ce_set_next_event,
+       };
+
+       clockevents_config_and_register(ce, NSEC_PER_SEC, 1000, LONG_MAX);
+}
+
+\f
 void __init
 common_init_rtc(void)
 {
-       unsigned char x;
+       unsigned char x, sel = 0;
 
        /* Reset periodic interrupt frequency.  */
-       x = CMOS_READ(RTC_FREQ_SELECT) & 0x3f;
-        /* Test includes known working values on various platforms
-           where 0x26 is wrong; we refuse to change those. */
-       if (x != 0x26 && x != 0x25 && x != 0x19 && x != 0x06) {
-               printk("Setting RTC_FREQ to 1024 Hz (%x)\n", x);
-               CMOS_WRITE(0x26, RTC_FREQ_SELECT);
+#if CONFIG_HZ == 1024 || CONFIG_HZ == 1200
+       x = CMOS_READ(RTC_FREQ_SELECT) & 0x3f;
+       /* Test includes known working values on various platforms
+          where 0x26 is wrong; we refuse to change those. */
+       if (x != 0x26 && x != 0x25 && x != 0x19 && x != 0x06) {
+               sel = RTC_REF_CLCK_32KHZ + 6;
        }
+#elif CONFIG_HZ == 256 || CONFIG_HZ == 128 || CONFIG_HZ == 64 || CONFIG_HZ == 32
+       sel = RTC_REF_CLCK_32KHZ + __builtin_ffs(32768 / CONFIG_HZ);
+#else
+# error "Unknown HZ from arch/alpha/Kconfig"
+#endif
+       if (sel) {
+               printk(KERN_INFO "Setting RTC_FREQ to %d Hz (%x)\n",
+                      CONFIG_HZ, sel);
+               CMOS_WRITE(sel, RTC_FREQ_SELECT);
+       }
 
        /* Turn on periodic interrupts.  */
        x = CMOS_READ(RTC_CONTROL);
@@ -233,16 +250,37 @@ common_init_rtc(void)
        init_rtc_irq();
 }
 
-unsigned int common_get_rtc_time(struct rtc_time *time)
-{
-       return __get_rtc_time(time);
-}
+\f
+#ifndef CONFIG_ALPHA_WTINT
+/*
+ * The RPCC as a clocksource primitive.
+ *
+ * While we have free-running timecounters running on all CPUs, and we make
+ * a half-hearted attempt in init_rtc_rpcc_info to sync the timecounter
+ * with the wall clock, that initialization isn't kept up-to-date across
+ * different time counters in SMP mode.  Therefore we can only use this
+ * method when there's only one CPU enabled.
+ *
+ * When using the WTINT PALcall, the RPCC may shift to a lower frequency,
+ * or stop altogether, while waiting for the interrupt.  Therefore we cannot
+ * use this method when WTINT is in use.
+ */
 
-int common_set_rtc_time(struct rtc_time *time)
+static cycle_t read_rpcc(struct clocksource *cs)
 {
-       return __set_rtc_time(time);
+       return rpcc();
 }
 
+static struct clocksource clocksource_rpcc = {
+       .name                   = "rpcc",
+       .rating                 = 300,
+       .read                   = read_rpcc,
+       .mask                   = CLOCKSOURCE_MASK(32),
+       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS
+};
+#endif /* ALPHA_WTINT */
+
+\f
 /* Validate a computed cycle counter result against the known bounds for
    the given processor core.  There's too much brokenness in the way of
    timing hardware for any one method to work everywhere.  :-(
@@ -353,33 +391,6 @@ rpcc_after_update_in_progress(void)
        return rpcc();
 }
 
-#ifndef CONFIG_SMP
-/* Until and unless we figure out how to get cpu cycle counters
-   in sync and keep them there, we can't use the rpcc.  */
-static cycle_t read_rpcc(struct clocksource *cs)
-{
-       cycle_t ret = (cycle_t)rpcc();
-       return ret;
-}
-
-static struct clocksource clocksource_rpcc = {
-       .name                   = "rpcc",
-       .rating                 = 300,
-       .read                   = read_rpcc,
-       .mask                   = CLOCKSOURCE_MASK(32),
-       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS
-};
-
-static inline void register_rpcc_clocksource(long cycle_freq)
-{
-       clocksource_register_hz(&clocksource_rpcc, cycle_freq);
-}
-#else /* !CONFIG_SMP */
-static inline void register_rpcc_clocksource(long cycle_freq)
-{
-}
-#endif /* !CONFIG_SMP */
-
 void __init
 time_init(void)
 {
@@ -387,6 +398,15 @@ time_init(void)
        unsigned long cycle_freq, tolerance;
        long diff;
 
+       if (alpha_using_qemu) {
+               clocksource_register_hz(&qemu_cs, NSEC_PER_SEC);
+               init_qemu_clockevent();
+
+               timer_irqaction.handler = qemu_timer_interrupt;
+               init_rtc_irq();
+               return;
+       }
+
        /* Calibrate CPU clock -- attempt #1.  */
        if (!est_cycle_freq)
                est_cycle_freq = validate_cc_value(calibrate_cc_with_pit());
@@ -421,100 +441,25 @@ time_init(void)
                       "and unable to estimate a proper value!\n");
        }
 
-       /* From John Bowman <bowman@math.ualberta.ca>: allow the values
-          to settle, as the Update-In-Progress bit going low isn't good
-          enough on some hardware.  2ms is our guess; we haven't found 
-          bogomips yet, but this is close on a 500Mhz box.  */
-       __delay(1000000);
-
-
-       if (HZ > (1<<16)) {
-               extern void __you_loose (void);
-               __you_loose();
-       }
-
-       register_rpcc_clocksource(cycle_freq);
-
-       state.last_time = cc1;
-       state.scaled_ticks_per_cycle
-               = ((unsigned long) HZ << FIX_SHIFT) / cycle_freq;
-       state.partial_tick = 0L;
+       /* See above for restrictions on using clocksource_rpcc.  */
+#ifndef CONFIG_ALPHA_WTINT
+       if (hwrpb->nr_processors == 1)
+               clocksource_register_hz(&clocksource_rpcc, cycle_freq);
+#endif
 
        /* Startup the timer source. */
        alpha_mv.init_rtc();
+       init_rtc_clockevent();
 }
 
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
- * called 500 ms after the second nowtime has started, because when
- * nowtime is written into the registers of the CMOS clock, it will
- * jump to the next second precisely 500 ms later. Check the Motorola
- * MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- *      sets the minutes. Usually you won't notice until after reboot!
- */
-
-
-static int
-set_rtc_mmss(unsigned long nowtime)
+/* Initialize the clock_event_device for secondary cpus.  */
+#ifdef CONFIG_SMP
+void __init
+init_clockevent(void)
 {
-       int retval = 0;
-       int real_seconds, real_minutes, cmos_minutes;
-       unsigned char save_control, save_freq_select;
-
-       /* irq are locally disabled here */
-       spin_lock(&rtc_lock);
-       /* Tell the clock it's being set */
-       save_control = CMOS_READ(RTC_CONTROL);
-       CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
-       /* Stop and reset prescaler */
-       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
-       CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
-       cmos_minutes = CMOS_READ(RTC_MINUTES);
-       if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-               cmos_minutes = bcd2bin(cmos_minutes);
-
-       /*
-        * since we're only adjusting minutes and seconds,
-        * don't interfere with hour overflow. This avoids
-        * messing with unknown time zones but requires your
-        * RTC not to be off by more than 15 minutes
-        */
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-       if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) {
-               /* correct for half hour time zone */
-               real_minutes += 30;
-       }
-       real_minutes %= 60;
-
-       if (abs(real_minutes - cmos_minutes) < 30) {
-               if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-                       real_seconds = bin2bcd(real_seconds);
-                       real_minutes = bin2bcd(real_minutes);
-               }
-               CMOS_WRITE(real_seconds,RTC_SECONDS);
-               CMOS_WRITE(real_minutes,RTC_MINUTES);
-       } else {
-               printk_once(KERN_NOTICE
-                      "set_rtc_mmss: can't update from %d to %d\n",
-                      cmos_minutes, real_minutes);
-               retval = -1;
-       }
-
-       /* The following flags have to be released exactly in this order,
-        * otherwise the DS12887 (popular MC146818A clone with integrated
-        * battery and quartz) will not reset the oscillator and will not
-        * update precisely 500 ms later. You won't find this mentioned in
-        * the Dallas Semiconductor data sheets, but who believes data
-        * sheets anyway ...                           -- Markus Kuhn
-        */
-       CMOS_WRITE(save_control, RTC_CONTROL);
-       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-       spin_unlock(&rtc_lock);
-
-       return retval;
+       if (alpha_using_qemu)
+               init_qemu_clockevent();
+       else
+               init_rtc_clockevent();
 }
+#endif
index bd0665c..9c4c189 100644 (file)
@@ -241,6 +241,21 @@ do_entIF(unsigned long type, struct pt_regs *regs)
                               (const char *)(data[1] | (long)data[2] << 32), 
                               data[0]);
                }
+#ifdef CONFIG_ALPHA_WTINT
+               if (type == 4) {
+                       /* If CALL_PAL WTINT is totally unsupported by the
+                          PALcode, e.g. MILO, "emulate" it by overwriting
+                          the insn.  */
+                       unsigned int *pinsn
+                         = (unsigned int *) regs->pc - 1;
+                       if (*pinsn == PAL_wtint) {
+                               *pinsn = 0x47e01400; /* mov 0,$0 */
+                               imb();
+                               regs->r0 = 0;
+                               return;
+                       }
+               }
+#endif /* ALPHA_WTINT */
                die_if_kernel((type == 1 ? "Kernel Bug" : "Instruction fault"),
                              regs, type, NULL);
        }
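
For context on the rewrite above: when the PALcode lacks WTINT, the handler patches the faulting instruction in place with "mov 0,$0" and resumes. A minimal user-space sketch of that in-place patching follows; the WTINT word (0x3e) is an assumption standing in for PAL_wtint, and the array stands in for the faulting text page.

    #include <inttypes.h>
    #include <stdio.h>

    #define PAL_WTINT_WORD 0x0000003eu  /* assumed CALL_PAL WTINT encoding */

    int main(void)
    {
            uint32_t text[1] = { PAL_WTINT_WORD };
            uint32_t *pinsn = &text[0];         /* regs->pc - 1 in the hunk */

            if (*pinsn == PAL_WTINT_WORD) {
                    *pinsn = 0x47e01400;        /* mov 0,$0: pretend WTINT returned 0 */
                    /* the kernel additionally issues imb() to flush the icache */
            }
            printf("patched insn: %#" PRIx32 "\n", text[0]);
            return 0;
    }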
index ffb19b7..ff3c107 100644 (file)
@@ -130,7 +130,7 @@ csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst,
                *dst = word | tmp;
                checksum += carry;
        }
-       if (err) *errp = err;
+       if (err && errp) *errp = err;
        return checksum;
 }
 
@@ -185,7 +185,7 @@ csum_partial_cfu_dest_aligned(const unsigned long __user *src,
                *dst = word | tmp;
                checksum += carry;
        }
-       if (err) *errp = err;
+       if (err && errp) *errp = err;
        return checksum;
 }
 
@@ -242,7 +242,7 @@ csum_partial_cfu_src_aligned(const unsigned long __user *src,
        stq_u(partial_dest | second_dest, dst);
 out:
        checksum += carry;
-       if (err) *errp = err;
+       if (err && errp) *errp = err;
        return checksum;
 }
 
@@ -325,7 +325,7 @@ csum_partial_cfu_unaligned(const unsigned long __user * src,
                stq_u(partial_dest | word | second_dest, dst);
                checksum += carry;
        }
-       if (err) *errp = err;
+       if (err && errp) *errp = err;
        return checksum;
 }
 
@@ -339,7 +339,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len,
 
        if (len) {
                if (!access_ok(VERIFY_READ, src, len)) {
-                       *errp = -EFAULT;
+                       if (errp) *errp = -EFAULT;
                        memset(dst, 0, len);
                        return sum;
                }
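
The checksum hunks above all make the same change: tolerate a NULL error pointer instead of unconditionally writing through it. A standalone sketch of the convention; do_copy_sum and the trivial main are hypothetical, not kernel code:

    #include <stddef.h>
    #include <string.h>

    static unsigned long do_copy_sum(char *dst, const char *src,
                                     size_t len, int *errp)
    {
            int err = 0;

            memcpy(dst, src, len);      /* the real code fault-checks each load */

            if (err && errp)            /* write back only when the caller cares */
                    *errp = err;
            return len;
    }

    int main(void)
    {
            char dst[4];

            /* NULL opts out of error reporting, which previously could
               dereference a NULL errp on the error path. */
            return (int)do_copy_sum(dst, "abc", 4, NULL) != 4;
    }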
index d8b94e1..356bb2f 100644 (file)
        .set noat
        .set noreorder
 .text
+       .globl memset
        .globl __memset
+       .globl ___memset
        .globl __memsetw
        .globl __constant_c_memset
-       .globl memset
 
-       .ent __memset
+       .ent ___memset
 .align 5
-__memset:
+___memset:
        .frame $30,0,$26,0
        .prologue 0
 
@@ -227,7 +228,7 @@ end_b:
        nop
        nop
        ret $31,($26),1         # L0 :
-       .end __memset
+       .end ___memset
 
        /*
         * This is the original body of code, prior to replication and
@@ -594,4 +595,5 @@ end_w:
 
        .end __memsetw
 
-memset = __memset
+memset = ___memset
+__memset = ___memset
index 311b8cf..76ccc6d 100644 (file)
 .text
        .globl memset
        .globl __memset
+       .globl ___memset
        .globl __memsetw
        .globl __constant_c_memset
-       .ent __memset
+
+       .ent ___memset
 .align 5
-__memset:
+___memset:
        .frame $30,0,$26,0
        .prologue 0
 
@@ -103,7 +105,7 @@ within_one_quad:
 
 end:
        ret $31,($26),1         /* E1 */
-       .end __memset
+       .end ___memset
 
        .align 5
        .ent __memsetw
@@ -121,4 +123,5 @@ __memsetw:
 
        .end __memsetw
 
-memset = __memset
+memset = ___memset
+__memset = ___memset
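
The "memset = ___memset" / "__memset = ___memset" lines in both memset.S variants are assembler symbol aliases: one body now answers to several names, so the exported and internal entry points stay in sync. Roughly the same effect in C, sketched with GCC's alias attribute (all names here are illustrative):

    #include <stddef.h>

    void body_impl(void *dst, int c, size_t n)  /* the one shared body */
    {
            char *p = dst;

            while (n--)
                    *p++ = (char)c;
    }

    /* Extra names resolving to the same code, like the ".S" alias lines. */
    void alias_memset(void *, int, size_t) __attribute__((alias("body_impl")));
    void alias___memset(void *, int, size_t) __attribute__((alias("body_impl")));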
index 214b698..c1f1a7e 100644 (file)
@@ -25,7 +25,7 @@ config ARM
        select HARDIRQS_SW_RESEND
        select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
        select HAVE_ARCH_KGDB
-       select HAVE_ARCH_SECCOMP_FILTER
+       select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
        select HAVE_ARCH_TRACEHOOK
        select HAVE_BPF_JIT
        select HAVE_CONTEXT_TRACKING
@@ -1496,6 +1496,7 @@ config HAVE_ARM_ARCH_TIMER
        bool "Architected timer support"
        depends on CPU_V7
        select ARM_ARCH_TIMER
+       select GENERIC_CLOCKEVENTS
        help
          This option enables support for the ARM architected timer
 
@@ -1719,7 +1720,6 @@ config AEABI
 config OABI_COMPAT
        bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)"
        depends on AEABI && !THUMB2_KERNEL
-       default y
        help
          This option preserves the old syscall interface along with the
          new (ARM EABI) one. It also provides a compatibility layer to
@@ -1727,11 +1727,16 @@ config OABI_COMPAT
          in memory differs between the legacy ABI and the new ARM EABI
          (only for non "thumb" binaries). This option adds a tiny
          overhead to all syscalls and produces a slightly larger kernel.
+
+         The seccomp filter system will not be available when this is
+         selected, since there is no way yet to sensibly distinguish
+         between calling conventions during filtering.
+
          If you know you'll be using only pure EABI user space then you
          can say N here. If this option is not selected and you attempt
          to execute a legacy ABI binary then the result will be
          UNPREDICTABLE (in fact it can be predicted that it won't work
-         at all). If in doubt say Y.
+         at all). If in doubt say N.
 
 config ARCH_HAS_HOLES_MEMORYMODEL
        bool
index 8e1a024..41bca32 100644 (file)
@@ -404,7 +404,7 @@ static irqreturn_t dma_irq_handler(int irq, void *data)
                                        BIT(slot));
                        if (edma_cc[ctlr]->intr_data[channel].callback)
                                edma_cc[ctlr]->intr_data[channel].callback(
-                                       channel, DMA_COMPLETE,
+                                       channel, EDMA_DMA_COMPLETE,
                                        edma_cc[ctlr]->intr_data[channel].data);
                }
        } while (sh_ipr);
@@ -459,7 +459,7 @@ static irqreturn_t dma_ccerr_handler(int irq, void *data)
                                                                callback) {
                                                edma_cc[ctlr]->intr_data[k].
                                                callback(k,
-                                               DMA_CC_ERROR,
+                                               EDMA_DMA_CC_ERROR,
                                                edma_cc[ctlr]->intr_data
                                                [k].data);
                                        }
index 9b28f12..240b29e 100644 (file)
@@ -393,36 +393,6 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt,
        return slot_cnt;
 }
 
-static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
-{
-       return 0;
-}
-
-static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
-                                       struct iop_adma_chan *chan)
-{
-       union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, };
-
-       switch (chan->device->id) {
-       case DMA0_ID:
-       case DMA1_ID:
-               return hw_desc.dma->dest_addr;
-       case AAU_ID:
-               return hw_desc.aau->dest_addr;
-       default:
-               BUG();
-       }
-       return 0;
-}
-
-
-static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
-                                         struct iop_adma_chan *chan)
-{
-       BUG();
-       return 0;
-}
-
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan)
 {
index 122f86d..250760e 100644 (file)
@@ -82,8 +82,6 @@ struct iop_adma_chan {
  * @slot_cnt: total slots used in a transaction (group of operations)
  * @slots_per_op: number of slots per operation
  * @idx: pool index
- * @unmap_src_cnt: number of xor sources
- * @unmap_len: transaction bytecount
  * @tx_list: list of descriptors that are associated with one operation
  * @async_tx: support for the async_tx api
  * @group_list: list of slots that make up a multi-descriptor transaction
@@ -99,8 +97,6 @@ struct iop_adma_desc_slot {
        u16 slot_cnt;
        u16 slots_per_op;
        u16 idx;
-       u16 unmap_src_cnt;
-       size_t unmap_len;
        struct list_head tx_list;
        struct dma_async_tx_descriptor async_tx;
        union {
index 4dd2145..9ecccc8 100644 (file)
@@ -226,7 +226,14 @@ static inline phys_addr_t __virt_to_phys(unsigned long x)
 static inline unsigned long __phys_to_virt(phys_addr_t x)
 {
        unsigned long t;
-       __pv_stub(x, t, "sub", __PV_BITS_31_24);
+
+       /*
+        * The 'unsigned long' cast discards the upper word when
+        * phys_addr_t is 64-bit, and makes sure that the inline
+        * assembler expression receives a 32-bit argument in the
+        * place where a 32-bit 'r' operand is expected.
+        */
+       __pv_stub((unsigned long) x, t, "sub", __PV_BITS_31_24);
        return t;
 }
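
A quick host-side model of what the new cast does on an LPAE build, where phys_addr_t is 64-bit but the "r" operand holds only 32 bits. Here uint32_t models the 32-bit unsigned long of an ARM kernel; this is a sketch, not kernel code:

    #include <inttypes.h>
    #include <stdio.h>

    typedef uint64_t phys_addr_t;       /* as on ARM with LPAE enabled */

    int main(void)
    {
            phys_addr_t x = 0x1ffff0000ULL;     /* upper word in use */
            uint32_t arg = (uint32_t)x;         /* models the cast in the hunk */

            printf("operand value: %#" PRIx32 "\n", arg);   /* 0xffff0000 */
            return 0;
    }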
 
index 7801866..11d59b3 100644 (file)
@@ -508,6 +508,7 @@ __fixup_smp:
        teq     r0, #0x0                @ '0' on actual UP A9 hardware
        beq     __fixup_smp_on_up       @ So it's an A9 UP
        ldr     r0, [r0, #4]            @ read SCU Config
+ARM_BE8(rev    r0, r0)                 @ byteswap if big endian
        and     r0, r0, #0x3            @ number of CPUs
        teq     r0, #0x0                @ is 1?
        movne   pc, lr
@@ -644,7 +645,11 @@ ARM_BE8(rev16      ip, ip)
        bcc     1b
        bx      lr
 #else
+#ifdef CONFIG_CPU_ENDIAN_BE8
+       moveq   r0, #0x00004000 @ set bit 22, mov to mvn instruction
+#else
        moveq   r0, #0x400000   @ set bit 22, mov to mvn instruction
+#endif
        b       2f
 1:     ldr     ip, [r7, r3]
 #ifdef CONFIG_CPU_ENDIAN_BE8
@@ -653,7 +658,7 @@ ARM_BE8(rev16       ip, ip)
        tst     ip, #0x000f0000 @ check the rotation field
        orrne   ip, ip, r6, lsl #24 @ mask in offset bits 31-24
        biceq   ip, ip, #0x00004000 @ clear bit 22
-       orreq   ip, ip, r0, lsl #24 @ mask in offset bits 7-0
+       orreq   ip, ip, r0      @ mask in offset bits 7-0
 #else
        bic     ip, ip, #0x000000ff
        tst     ip, #0xf00      @ check the rotation field
index 6125f25..dbf0923 100644 (file)
@@ -856,7 +856,7 @@ static void __init kuser_init(void *vectors)
                memcpy(vectors + 0xfe0, vectors + 0xfe8, 4);
 }
 #else
-static void __init kuser_init(void *vectors)
+static inline void __init kuser_init(void *vectors)
 {
 }
 #endif
index 3719583..5809069 100644 (file)
@@ -334,6 +334,17 @@ out:
        return err;
 }
 
+static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
+{
+       if (!is_vmalloc_addr(kaddr)) {
+               BUG_ON(!virt_addr_valid(kaddr));
+               return __pa(kaddr);
+       } else {
+               return page_to_phys(vmalloc_to_page(kaddr)) +
+                      offset_in_page(kaddr);
+       }
+}
+
 /**
  * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
  * @from:      The virtual kernel start address of the range
@@ -345,16 +356,27 @@ out:
  */
 int create_hyp_mappings(void *from, void *to)
 {
-       unsigned long phys_addr = virt_to_phys(from);
+       phys_addr_t phys_addr;
+       unsigned long virt_addr;
        unsigned long start = KERN_TO_HYP((unsigned long)from);
        unsigned long end = KERN_TO_HYP((unsigned long)to);
 
-       /* Check for a valid kernel memory mapping */
-       if (!virt_addr_valid(from) || !virt_addr_valid(to - 1))
-               return -EINVAL;
+       start = start & PAGE_MASK;
+       end = PAGE_ALIGN(end);
 
-       return __create_hyp_mappings(hyp_pgd, start, end,
-                                    __phys_to_pfn(phys_addr), PAGE_HYP);
+       for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+               int err;
+
+               phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
+               err = __create_hyp_mappings(hyp_pgd, virt_addr,
+                                           virt_addr + PAGE_SIZE,
+                                           __phys_to_pfn(phys_addr),
+                                           PAGE_HYP);
+               if (err)
+                       return err;
+       }
+
+       return 0;
 }
 
 /**
index e0c68d5..52886b8 100644 (file)
@@ -10,7 +10,7 @@ UNWIND(       .fnstart        )
        and     r3, r0, #31             @ Get bit offset
        mov     r0, r0, lsr #5
        add     r1, r1, r0, lsl #2      @ Get word offset
-#if __LINUX_ARM_ARCH__ >= 7
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
        .arch_extension mp
        ALT_SMP(W(pldw) [r1])
        ALT_UP(W(nop))
index 6d3782d..a86fd0e 100644 (file)
@@ -218,20 +218,6 @@ iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op)
 #define iop_chan_pq_slot_count iop_chan_xor_slot_count
 #define iop_chan_pq_zero_sum_slot_count iop_chan_xor_slot_count
 
-static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc,
-                                       struct iop_adma_chan *chan)
-{
-       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-       return hw_desc->dest_addr;
-}
-
-static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc,
-                                         struct iop_adma_chan *chan)
-{
-       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-       return hw_desc->q_dest_addr;
-}
-
 static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc,
                                        struct iop_adma_chan *chan)
 {
@@ -350,18 +336,6 @@ iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt,
        hw_desc->desc_ctrl = u_desc_ctrl.value;
 }
 
-static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc)
-{
-       struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc;
-       union {
-               u32 value;
-               struct iop13xx_adma_desc_ctrl field;
-       } u_desc_ctrl;
-
-       u_desc_ctrl.value = hw_desc->desc_ctrl;
-       return u_desc_ctrl.field.pq_xfer_en;
-}
-
 static inline void
 iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt,
                          unsigned long flags)
index 78eeeca..580ef2d 100644 (file)
@@ -558,8 +558,8 @@ static void __init build_mem_type_table(void)
                mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WB;
                break;
        }
-       printk("Memory policy: ECC %sabled, Data cache %s\n",
-               ecc_mask ? "en" : "dis", cp->policy);
+       pr_info("Memory policy: %sData cache %s\n",
+               ecc_mask ? "ECC enabled, " : "", cp->policy);
 
        for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
                struct mem_type *t = &mem_types[i];
index 5c668b7..55764a7 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/mach/arch.h>
 #include <asm/cputype.h>
 #include <asm/mpu.h>
+#include <asm/procinfo.h>
 
 #include "mm.h"
 
index 60920f6..bd17819 100644 (file)
@@ -92,7 +92,7 @@ ENDPROC(cpu_v7_dcache_clean_area)
 
 /* Suspend/resume support: derived from arch/arm/mach-s5pv210/sleep.S */
 .globl cpu_v7_suspend_size
-.equ   cpu_v7_suspend_size, 4 * 8
+.equ   cpu_v7_suspend_size, 4 * 9
 #ifdef CONFIG_ARM_CPU_SUSPEND
 ENTRY(cpu_v7_do_suspend)
        stmfd   sp!, {r4 - r10, lr}
@@ -101,13 +101,17 @@ ENTRY(cpu_v7_do_suspend)
        stmia   r0!, {r4 - r5}
 #ifdef CONFIG_MMU
        mrc     p15, 0, r6, c3, c0, 0   @ Domain ID
+#ifdef CONFIG_ARM_LPAE
+       mrrc    p15, 1, r5, r7, c2      @ TTB 1
+#else
        mrc     p15, 0, r7, c2, c0, 1   @ TTB 1
+#endif
        mrc     p15, 0, r11, c2, c0, 2  @ TTB control register
 #endif
        mrc     p15, 0, r8, c1, c0, 0   @ Control register
        mrc     p15, 0, r9, c1, c0, 1   @ Auxiliary control register
        mrc     p15, 0, r10, c1, c0, 2  @ Co-processor access control
-       stmia   r0, {r6 - r11}
+       stmia   r0, {r5 - r11}
        ldmfd   sp!, {r4 - r10, pc}
 ENDPROC(cpu_v7_do_suspend)
 
@@ -118,16 +122,19 @@ ENTRY(cpu_v7_do_resume)
        ldmia   r0!, {r4 - r5}
        mcr     p15, 0, r4, c13, c0, 0  @ FCSE/PID
        mcr     p15, 0, r5, c13, c0, 3  @ User r/o thread ID
-       ldmia   r0, {r6 - r11}
+       ldmia   r0, {r5 - r11}
 #ifdef CONFIG_MMU
        mcr     p15, 0, ip, c8, c7, 0   @ invalidate TLBs
        mcr     p15, 0, r6, c3, c0, 0   @ Domain ID
-#ifndef CONFIG_ARM_LPAE
+#ifdef CONFIG_ARM_LPAE
+       mcrr    p15, 0, r1, ip, c2      @ TTB 0
+       mcrr    p15, 1, r5, r7, c2      @ TTB 1
+#else
        ALT_SMP(orr     r1, r1, #TTB_FLAGS_SMP)
        ALT_UP(orr      r1, r1, #TTB_FLAGS_UP)
-#endif
        mcr     p15, 0, r1, c2, c0, 0   @ TTB 0
        mcr     p15, 0, r7, c2, c0, 1   @ TTB 1
+#endif
        mcr     p15, 0, r11, c2, c0, 2  @ TTB control register
        ldr     r4, =PRRR               @ PRRR
        ldr     r5, =NMRR               @ NMRR
index 4488fa2..2ffc298 100644 (file)
@@ -8,6 +8,8 @@
  * published by the Free Software Foundation.
  */
 #include <asm/setup.h>
+#include <asm/thread_info.h>
+#include <asm/sysreg.h>
 
        /*
         * The kernel is loaded where we want it to be and all caches
        .section .init.text,"ax"
        .global _start
 _start:
-       /* Check if the boot loader actually provided a tag table */
-       lddpc   r0, magic_number
-       cp.w    r12, r0
-       brne    no_tag_table
-
        /* Initialize .bss */
        lddpc   r2, bss_start_addr
        lddpc   r3, end_addr
@@ -34,6 +31,25 @@ _start:
        cp      r2, r3
        brlo    1b
 
+       /* Initialize status register */
+       lddpc   r0, init_sr
+       mtsr    SYSREG_SR, r0
+
+       /* Set initial stack pointer */
+       lddpc   sp, stack_addr
+       sub     sp, -THREAD_SIZE
+
+#ifdef CONFIG_FRAME_POINTER
+       /* Mark last stack frame */
+       mov     lr, 0
+       mov     r7, 0
+#endif
+
+       /* Check if the boot loader actually provided a tag table */
+       lddpc   r0, magic_number
+       cp.w    r12, r0
+       brne    no_tag_table
+
        /*
         * Save the tag table address for later use. This must be done
         * _after_ .bss has been initialized...
@@ -53,8 +69,15 @@ bss_start_addr:
        .long   __bss_start
 end_addr:
        .long   _end
+init_sr:
+       .long   0x007f0000      /* Supervisor mode, everything masked */
+stack_addr:
+       .long   init_thread_union
+panic_addr:
+       .long   panic
 
 no_tag_table:
        sub     r12, pc, (. - 2f)
-       bral    panic
+       /* branch to panic(), which this construct can reach even if it is far away */
+       lddpc   pc, panic_addr
 2:     .asciz  "Boot loader didn't provide correct magic number\n"
index 996cb65..45f563e 100644 (file)
@@ -16,6 +16,7 @@
 typedef u16    kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION 0xd673  /* breakpoint */
 #define MAX_INSN_SIZE          2
+#define MAX_STACK_SIZE         64      /* 32 would probably be OK */
 
 #define kretprobe_blacklist_size 0
 
@@ -26,6 +27,19 @@ struct arch_specific_insn {
        kprobe_opcode_t insn[MAX_INSN_SIZE];
 };
 
+struct prev_kprobe {
+       struct kprobe *kp;
+       unsigned int status;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+       unsigned int kprobe_status;
+       struct prev_kprobe prev_kprobe;
+       struct pt_regs jprobe_saved_regs;
+       char jprobes_stack[MAX_STACK_SIZE];
+};
+
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_exceptions_notify(struct notifier_block *self,
                                    unsigned long val, void *data);
index 3b85ead..08d8a3d 100644 (file)
@@ -2,35 +2,35 @@
 include include/uapi/asm-generic/Kbuild.asm
 
 header-y += auxvec.h
-header-y += bitsperlong.h
 header-y += byteorder.h
 header-y += cachectl.h
-header-y += errno.h
-header-y += fcntl.h
-header-y += ioctl.h
-header-y += ioctls.h
-header-y += ipcbuf.h
-header-y += kvm_para.h
-header-y += mman.h
 header-y += msgbuf.h
 header-y += param.h
-header-y += poll.h
 header-y += posix_types.h
 header-y += ptrace.h
-header-y += resource.h
 header-y += sembuf.h
 header-y += setup.h
 header-y += shmbuf.h
 header-y += sigcontext.h
-header-y += siginfo.h
 header-y += signal.h
 header-y += socket.h
 header-y += sockios.h
 header-y += stat.h
-header-y += statfs.h
 header-y += swab.h
 header-y += termbits.h
 header-y += termios.h
 header-y += types.h
 header-y += unistd.h
+generic-y += bitsperlong.h
+generic-y += errno.h
+generic-y += fcntl.h
+generic-y += ioctl.h
+generic-y += ioctls.h
+generic-y += ipcbuf.h
+generic-y += kvm_para.h
+generic-y += mman.h
 generic-y += param.h
+generic-y += poll.h
+generic-y += resource.h
+generic-y += siginfo.h
+generic-y += statfs.h
index d5dd435..4f02da3 100644 (file)
@@ -1,4 +1,4 @@
-#ifndef __ASM_AVR32_AUXVEC_H
-#define __ASM_AVR32_AUXVEC_H
+#ifndef _UAPI__ASM_AVR32_AUXVEC_H
+#define _UAPI__ASM_AVR32_AUXVEC_H
 
-#endif /* __ASM_AVR32_AUXVEC_H */
+#endif /* _UAPI__ASM_AVR32_AUXVEC_H */
diff --git a/arch/avr32/include/uapi/asm/bitsperlong.h b/arch/avr32/include/uapi/asm/bitsperlong.h
deleted file mode 100644 (file)
index 6dc0bb0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bitsperlong.h>
index 50abc21..71242f0 100644 (file)
@@ -1,9 +1,9 @@
 /*
  * AVR32 endian-conversion functions.
  */
-#ifndef __ASM_AVR32_BYTEORDER_H
-#define __ASM_AVR32_BYTEORDER_H
+#ifndef _UAPI__ASM_AVR32_BYTEORDER_H
+#define _UAPI__ASM_AVR32_BYTEORDER_H
 
 #include <linux/byteorder/big_endian.h>
 
-#endif /* __ASM_AVR32_BYTEORDER_H */
+#endif /* _UAPI__ASM_AVR32_BYTEORDER_H */
index 4faf1ce..573a958 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_CACHECTL_H
-#define __ASM_AVR32_CACHECTL_H
+#ifndef _UAPI__ASM_AVR32_CACHECTL_H
+#define _UAPI__ASM_AVR32_CACHECTL_H
 
 /*
  * Operations that can be performed through the cacheflush system call
@@ -8,4 +8,4 @@
 /* Clean the data cache, then invalidate the icache */
 #define CACHE_IFLUSH   0
 
-#endif /* __ASM_AVR32_CACHECTL_H */
+#endif /* _UAPI__ASM_AVR32_CACHECTL_H */
diff --git a/arch/avr32/include/uapi/asm/errno.h b/arch/avr32/include/uapi/asm/errno.h
deleted file mode 100644 (file)
index 558a724..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_ERRNO_H
-#define __ASM_AVR32_ERRNO_H
-
-#include <asm-generic/errno.h>
-
-#endif /* __ASM_AVR32_ERRNO_H */
diff --git a/arch/avr32/include/uapi/asm/fcntl.h b/arch/avr32/include/uapi/asm/fcntl.h
deleted file mode 100644 (file)
index 14c0c44..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_FCNTL_H
-#define __ASM_AVR32_FCNTL_H
-
-#include <asm-generic/fcntl.h>
-
-#endif /* __ASM_AVR32_FCNTL_H */
diff --git a/arch/avr32/include/uapi/asm/ioctl.h b/arch/avr32/include/uapi/asm/ioctl.h
deleted file mode 100644 (file)
index c8472c1..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_IOCTL_H
-#define __ASM_AVR32_IOCTL_H
-
-#include <asm-generic/ioctl.h>
-
-#endif /* __ASM_AVR32_IOCTL_H */
diff --git a/arch/avr32/include/uapi/asm/ioctls.h b/arch/avr32/include/uapi/asm/ioctls.h
deleted file mode 100644 (file)
index 909cf66..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_IOCTLS_H
-#define __ASM_AVR32_IOCTLS_H
-
-#include <asm-generic/ioctls.h>
-
-#endif /* __ASM_AVR32_IOCTLS_H */
diff --git a/arch/avr32/include/uapi/asm/ipcbuf.h b/arch/avr32/include/uapi/asm/ipcbuf.h
deleted file mode 100644 (file)
index 84c7e51..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/avr32/include/uapi/asm/kvm_para.h b/arch/avr32/include/uapi/asm/kvm_para.h
deleted file mode 100644 (file)
index 14fab8f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kvm_para.h>
diff --git a/arch/avr32/include/uapi/asm/mman.h b/arch/avr32/include/uapi/asm/mman.h
deleted file mode 100644 (file)
index 8eebf89..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/mman.h>
index ac18bc4..9eae6ef 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_MSGBUF_H
-#define __ASM_AVR32_MSGBUF_H
+#ifndef _UAPI__ASM_AVR32_MSGBUF_H
+#define _UAPI__ASM_AVR32_MSGBUF_H
 
 /*
  * The msqid64_ds structure for i386 architecture.
@@ -28,4 +28,4 @@ struct msqid64_ds {
        unsigned long  __unused5;
 };
 
-#endif /* __ASM_AVR32_MSGBUF_H */
+#endif /* _UAPI__ASM_AVR32_MSGBUF_H */
diff --git a/arch/avr32/include/uapi/asm/poll.h b/arch/avr32/include/uapi/asm/poll.h
deleted file mode 100644 (file)
index c98509d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
index 9ba9e74..5b813a8 100644 (file)
@@ -5,8 +5,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#ifndef __ASM_AVR32_POSIX_TYPES_H
-#define __ASM_AVR32_POSIX_TYPES_H
+#ifndef _UAPI__ASM_AVR32_POSIX_TYPES_H
+#define _UAPI__ASM_AVR32_POSIX_TYPES_H
 
 /*
  * This file is generally used by user-level software, so you need to
@@ -34,4 +34,4 @@ typedef unsigned short  __kernel_old_dev_t;
 
 #include <asm-generic/posix_types.h>
 
-#endif /* __ASM_AVR32_POSIX_TYPES_H */
+#endif /* _UAPI__ASM_AVR32_POSIX_TYPES_H */
diff --git a/arch/avr32/include/uapi/asm/resource.h b/arch/avr32/include/uapi/asm/resource.h
deleted file mode 100644 (file)
index c6dd101..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_RESOURCE_H
-#define __ASM_AVR32_RESOURCE_H
-
-#include <asm-generic/resource.h>
-
-#endif /* __ASM_AVR32_RESOURCE_H */
index e472216..6c6f7cf 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_SEMBUF_H
-#define __ASM_AVR32_SEMBUF_H
+#ifndef _UAPI__ASM_AVR32_SEMBUF_H
+#define _UAPI__ASM_AVR32_SEMBUF_H
 
 /*
 * The semid64_ds structure for AVR32 architecture.
@@ -22,4 +22,4 @@ struct semid64_ds {
         unsigned long   __unused4;
 };
 
-#endif /* __ASM_AVR32_SEMBUF_H */
+#endif /* _UAPI__ASM_AVR32_SEMBUF_H */
index e58aa93..a654df7 100644 (file)
@@ -13,5 +13,4 @@
 
 #define COMMAND_LINE_SIZE 256
 
-
 #endif /* _UAPI__ASM_AVR32_SETUP_H__ */
index c62fba4..b94cf8b 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_SHMBUF_H
-#define __ASM_AVR32_SHMBUF_H
+#ifndef _UAPI__ASM_AVR32_SHMBUF_H
+#define _UAPI__ASM_AVR32_SHMBUF_H
 
 /*
  * The shmid64_ds structure for i386 architecture.
@@ -39,4 +39,4 @@ struct shminfo64 {
        unsigned long   __unused4;
 };
 
-#endif /* __ASM_AVR32_SHMBUF_H */
+#endif /* _UAPI__ASM_AVR32_SHMBUF_H */
index e04062b..27e56bf 100644 (file)
@@ -5,8 +5,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#ifndef __ASM_AVR32_SIGCONTEXT_H
-#define __ASM_AVR32_SIGCONTEXT_H
+#ifndef _UAPI__ASM_AVR32_SIGCONTEXT_H
+#define _UAPI__ASM_AVR32_SIGCONTEXT_H
 
 struct sigcontext {
        unsigned long   oldmask;
@@ -31,4 +31,4 @@ struct sigcontext {
        unsigned long   r0;
 };
 
-#endif /* __ASM_AVR32_SIGCONTEXT_H */
+#endif /* _UAPI__ASM_AVR32_SIGCONTEXT_H */
diff --git a/arch/avr32/include/uapi/asm/siginfo.h b/arch/avr32/include/uapi/asm/siginfo.h
deleted file mode 100644 (file)
index 5ee93f4..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _AVR32_SIGINFO_H
-#define _AVR32_SIGINFO_H
-
-#include <asm-generic/siginfo.h>
-
-#endif
index 1b77a93..ffe8c77 100644 (file)
@@ -118,5 +118,4 @@ typedef struct sigaltstack {
        size_t ss_size;
 } stack_t;
 
-
 #endif /* _UAPI__ASM_AVR32_SIGNAL_H */
index 4399364..cbf902e 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_SOCKET_H
-#define __ASM_AVR32_SOCKET_H
+#ifndef _UAPI__ASM_AVR32_SOCKET_H
+#define _UAPI__ASM_AVR32_SOCKET_H
 
 #include <asm/sockios.h>
 
@@ -78,4 +78,4 @@
 
 #define SO_MAX_PACING_RATE     47
 
-#endif /* __ASM_AVR32_SOCKET_H */
+#endif /* _UAPI__ASM_AVR32_SOCKET_H */
index 0802d74..d047854 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_SOCKIOS_H
-#define __ASM_AVR32_SOCKIOS_H
+#ifndef _UAPI__ASM_AVR32_SOCKIOS_H
+#define _UAPI__ASM_AVR32_SOCKIOS_H
 
 /* Socket-level I/O control calls. */
 #define FIOSETOWN      0x8901
@@ -10,4 +10,4 @@
 #define SIOCGSTAMP     0x8906          /* Get stamp (timeval) */
 #define SIOCGSTAMPNS   0x8907          /* Get stamp (timespec) */
 
-#endif /* __ASM_AVR32_SOCKIOS_H */
+#endif /* _UAPI__ASM_AVR32_SOCKIOS_H */
index e72881e..c06acef 100644 (file)
@@ -5,8 +5,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#ifndef __ASM_AVR32_STAT_H
-#define __ASM_AVR32_STAT_H
+#ifndef _UAPI__ASM_AVR32_STAT_H
+#define _UAPI__ASM_AVR32_STAT_H
 
 struct __old_kernel_stat {
         unsigned short st_dev;
@@ -76,4 +76,4 @@ struct stat64 {
        unsigned long   __unused2;
 };
 
-#endif /* __ASM_AVR32_STAT_H */
+#endif /* _UAPI__ASM_AVR32_STAT_H */
diff --git a/arch/avr32/include/uapi/asm/statfs.h b/arch/avr32/include/uapi/asm/statfs.h
deleted file mode 100644 (file)
index 2961bd1..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_AVR32_STATFS_H
-#define __ASM_AVR32_STATFS_H
-
-#include <asm-generic/statfs.h>
-
-#endif /* __ASM_AVR32_STATFS_H */
index 14cc737..1a03549 100644 (file)
@@ -1,8 +1,8 @@
 /*
  * AVR32 byteswapping functions.
  */
-#ifndef __ASM_AVR32_SWAB_H
-#define __ASM_AVR32_SWAB_H
+#ifndef _UAPI__ASM_AVR32_SWAB_H
+#define _UAPI__ASM_AVR32_SWAB_H
 
 #include <linux/types.h>
 #include <linux/compiler.h>
@@ -32,4 +32,4 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
 #define __arch_swab32 __arch_swab32
 #endif
 
-#endif /* __ASM_AVR32_SWAB_H */
+#endif /* _UAPI__ASM_AVR32_SWAB_H */
index 366adc5..32789cc 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __ASM_AVR32_TERMBITS_H
-#define __ASM_AVR32_TERMBITS_H
+#ifndef _UAPI__ASM_AVR32_TERMBITS_H
+#define _UAPI__ASM_AVR32_TERMBITS_H
 
 #include <linux/posix_types.h>
 
@@ -193,4 +193,4 @@ struct ktermios {
 #define        TCSADRAIN       1
 #define        TCSAFLUSH       2
 
-#endif /* __ASM_AVR32_TERMBITS_H */
+#endif /* _UAPI__ASM_AVR32_TERMBITS_H */
index b8ef8ea..c8a0081 100644 (file)
@@ -46,5 +46,4 @@ struct termio {
 
 /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
 
-
 #endif /* _UAPI__ASM_AVR32_TERMIOS_H */
index bb34ad3..7c986c4 100644 (file)
@@ -5,4 +5,9 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#ifndef _UAPI__ASM_AVR32_TYPES_H
+#define _UAPI__ASM_AVR32_TYPES_H
+
 #include <asm-generic/int-ll64.h>
+
+#endif /* _UAPI__ASM_AVR32_TYPES_H */
index 3eaa687..8822bf4 100644 (file)
 #define __NR_eventfd           281
 #define __NR_setns             283
 
-
 #endif /* _UAPI__ASM_AVR32_UNISTD_H */
index 9899d3c..7301f48 100644 (file)
@@ -401,9 +401,10 @@ handle_critical:
        /* We should never get here... */
 bad_return:
        sub     r12, pc, (. - 1f)
-       bral    panic
+       lddpc   pc, 2f
        .align  2
 1:     .asciz  "Return from critical exception!"
+2:     .long   panic
 
        .align  1
 do_bus_error_write:
index 6163bd0..59eae6d 100644 (file)
 #include <linux/linkage.h>
 
 #include <asm/page.h>
-#include <asm/thread_info.h>
-#include <asm/sysreg.h>
 
        .section .init.text,"ax"
        .global kernel_entry
 kernel_entry:
-       /* Initialize status register */
-       lddpc   r0, init_sr
-       mtsr    SYSREG_SR, r0
-
-       /* Set initial stack pointer */
-       lddpc   sp, stack_addr
-       sub     sp, -THREAD_SIZE
-
-#ifdef CONFIG_FRAME_POINTER
-       /* Mark last stack frame */
-       mov     lr, 0
-       mov     r7, 0
-#endif
-
        /* Start the show */
        lddpc   pc, kernel_start_addr
 
        .align  2
-init_sr:
-       .long   0x007f0000      /* Supervisor mode, everything masked */
-stack_addr:
-       .long   init_thread_union
 kernel_start_addr:
        .long   start_kernel
index d43daf1..4c530a8 100644 (file)
@@ -1992,7 +1992,7 @@ sba_connect_bus(struct pci_bus *bus)
        if (PCI_CONTROLLER(bus)->iommu)
                return;
 
-       handle = PCI_CONTROLLER(bus)->acpi_handle;
+       handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
        if (!handle)
                return;
 
index 80775f5..71fbaaa 100644 (file)
@@ -95,7 +95,7 @@ struct iospace_resource {
 };
 
 struct pci_controller {
-       void *acpi_handle;
+       struct acpi_device *companion;
        void *iommu;
        int segment;
        int node;               /* nearest node with memory or -1 for global allocation */
index 5a9ff1c..cb59277 100644 (file)
@@ -2166,12 +2166,6 @@ static const struct file_operations pfm_file_ops = {
        .flush          = pfm_flush
 };
 
-static int
-pfmfs_delete_dentry(const struct dentry *dentry)
-{
-       return 1;
-}
-
 static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
        return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]",
@@ -2179,7 +2173,7 @@ static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen)
 }
 
 static const struct dentry_operations pfmfs_dentry_operations = {
-       .d_delete = pfmfs_delete_dentry,
+       .d_delete = always_delete_dentry,
        .d_dname = pfmfs_dname,
 };
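
always_delete_dentry() is the shared helper this hunk switches to; like the removed local hook, it is a d_delete implementation that returns 1 so the dentry is never kept in the cache once unused. A sketch of that contract, with struct dentry left opaque here:

    struct dentry;                      /* real type: <linux/dcache.h> */

    /* Returning 1 from a d_delete hook means "drop, never cache, this dentry". */
    int always_delete_dentry_sketch(const struct dentry *dentry)
    {
            (void)dentry;
            return 1;
    }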
 
index 2326790..9e4938d 100644 (file)
@@ -436,9 +436,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
        if (!controller)
                return NULL;
 
-       controller->acpi_handle = device->handle;
+       controller->companion = device;
 
-       pxm = acpi_get_pxm(controller->acpi_handle);
+       pxm = acpi_get_pxm(device->handle);
 #ifdef CONFIG_NUMA
        if (pxm >= 0)
                controller->node = pxm_to_node(pxm);
@@ -489,7 +489,7 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
 {
        struct pci_controller *controller = bridge->bus->sysdata;
 
-       ACPI_HANDLE_SET(&bridge->dev, controller->acpi_handle);
+       ACPI_COMPANION_SET(&bridge->dev, controller->companion);
        return 0;
 }
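
The pattern across these ia64 hunks (and the sn/ ones below) is the same: store the struct acpi_device companion rather than a raw handle, and derive the handle on demand. A pared-down sketch of the accessor shape; these are not the in-tree definitions:

    #include <stddef.h>

    struct acpi_device_sketch {
            void *handle;               /* the raw ACPI handle */
    };

    static inline void *device_handle(struct acpi_device_sketch *adev)
    {
            return adev ? adev->handle : NULL;  /* NULL-safe, like the helper */
    }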
 
index b172539..0640739 100644 (file)
@@ -132,7 +132,7 @@ sn_get_bussoft_ptr(struct pci_bus *bus)
        struct acpi_resource_vendor_typed *vendor;
 
 
-       handle = PCI_CONTROLLER(bus)->acpi_handle;
+       handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
        status = acpi_get_vendor_resource(handle, METHOD_NAME__CRS,
                                          &sn_uuid, &buffer);
        if (ACPI_FAILURE(status)) {
@@ -360,7 +360,7 @@ sn_acpi_get_pcidev_info(struct pci_dev *dev, struct pcidev_info **pcidev_info,
        acpi_status status;
        struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 
-       rootbus_handle = PCI_CONTROLLER(dev)->acpi_handle;
+       rootbus_handle = acpi_device_handle(PCI_CONTROLLER(dev)->companion);
         status = acpi_evaluate_integer(rootbus_handle, METHOD_NAME__SEG, NULL,
                                        &segment);
         if (ACPI_SUCCESS(status)) {
diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h
new file mode 100644 (file)
index 0000000..748016c
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _ASM_SOCKET_H
+#define _ASM_SOCKET_H
+
+#include <uapi/asm/socket.h>
+
+/* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
+ * have to define SOCK_NONBLOCK to a different value here.
+ */
+#define SOCK_NONBLOCK  0x40000000
+
+#endif /* _ASM_SOCKET_H */
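
Background for the comment above: userspace ORs SOCK_NONBLOCK into socket(2)'s type argument, and on parisc O_NONBLOCK's value would collide with the low bits that carry the socket type, hence the private 0x40000000. A sketch of how the flag rides alongside the type (the 0xf mask mirrors what net/socket.c uses; treat the exact values as assumptions):

    #include <stdio.h>

    #define SOCK_TYPE_MASK          0xf             /* low bits: SOCK_STREAM, ... */
    #define PA_SOCK_NONBLOCK        0x40000000      /* the value defined above */

    int main(void)
    {
            int type = 1 /* SOCK_STREAM */ | PA_SOCK_NONBLOCK;

            printf("type=%d nonblock=%d\n",
                   type & SOCK_TYPE_MASK,
                   !!(type & PA_SOCK_NONBLOCK));
            return 0;
    }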
index 63f4dd0..4006964 100644 (file)
@@ -4,14 +4,11 @@
 /*
  * User space memory access functions
  */
-#include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
 #include <asm-generic/uaccess-unaligned.h>
 
-#include <linux/sched.h>
-
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
 
@@ -36,43 +33,12 @@ extern int __get_user_bad(void);
 extern int __put_kernel_bad(void);
 extern int __put_user_bad(void);
 
-
-/*
- * Test whether a block of memory is a valid user space address.
- * Returns 0 if the range is valid, nonzero otherwise.
- */
-static inline int __range_not_ok(unsigned long addr, unsigned long size,
-                                unsigned long limit)
+static inline long access_ok(int type, const void __user * addr,
+               unsigned long size)
 {
-       unsigned long __newaddr = addr + size;
-       return (__newaddr < addr || __newaddr > limit || size > limit);
+       return 1;
 }
 
-/**
- * access_ok: - Checks if a user space pointer is valid
- * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE.  Note that
- *        %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
- *        to write to a block, it is always safe to read from it.
- * @addr: User space pointer to start of block to check
- * @size: Size of block to check
- *
- * Context: User context only.  This function may sleep.
- *
- * Checks if a pointer to a block of memory in user space is valid.
- *
- * Returns true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
- *
- * Note that, depending on architecture, this function probably just
- * checks that the pointer is in the user space range - after calling
- * this function, memory access functions may still return -EFAULT.
- */
-#define access_ok(type, addr, size)                                    \
-(      __chk_user_ptr(addr),                                           \
-       !__range_not_ok((unsigned long) (__force void *) (addr),        \
-                       size, user_addr_max())                          \
-)
-
 #define put_user __put_user
 #define get_user __get_user
 
@@ -253,11 +219,7 @@ extern long lstrnlen_user(const char __user *,long);
 /*
  * Complex access routines -- macros
  */
-#ifdef CONFIG_COMPAT
-#define user_addr_max() (TASK_SIZE)
-#else
-#define user_addr_max() (DEFAULT_TASK_SIZE)
-#endif
+#define user_addr_max() (~0UL)
 
 #define strnlen_user lstrnlen_user
 #define strlen_user(str) lstrnlen_user(str, 0x7fffffffL)
index 7c614d0..f33113a 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _ASM_SOCKET_H
-#define _ASM_SOCKET_H
+#ifndef _UAPI_ASM_SOCKET_H
+#define _UAPI_ASM_SOCKET_H
 
 #include <asm/sockios.h>
 
@@ -77,9 +77,4 @@
 
 #define SO_MAX_PACING_RATE     0x4048
 
-/* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
- * have to define SOCK_NONBLOCK to a different value here.
- */
-#define SOCK_NONBLOCK   0x40000000
-
-#endif /* _ASM_SOCKET_H */
+#endif /* _UAPI_ASM_SOCKET_H */
index b5507ec..413dc17 100644 (file)
@@ -161,7 +161,7 @@ static inline void prefetch_dst(const void *addr)
 /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
  * per loop.  This code is derived from glibc. 
  */
-static inline unsigned long copy_dstaligned(unsigned long dst,
+static noinline unsigned long copy_dstaligned(unsigned long dst,
                                        unsigned long src, unsigned long len)
 {
        /* gcc complains that a2 and a3 may be uninitialized, but actually
@@ -276,7 +276,7 @@ handle_store_error:
 /* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
  * In case of an access fault the faulty address can be read from the per_cpu
  * exception data struct. */
-static unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
+static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
                                        unsigned long len)
 {
        register unsigned long src, dst, t1, t2, t3;
@@ -529,7 +529,7 @@ long probe_kernel_read(void *dst, const void *src, size_t size)
 {
        unsigned long addr = (unsigned long)src;
 
-       if (size < 0 || addr < PAGE_SIZE)
+       if (addr < PAGE_SIZE)
                return -EFAULT;
 
        /* check for I/O space F_EXTEND(0xfff00000) access as well? */
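
The dropped "size < 0" test was dead code: size is a size_t, which is unsigned, so the comparison can never be true (most compilers warn about it). A two-line demonstration:

    #include <stddef.h>

    int size_is_negative(size_t size)
    {
            return size < 0;    /* always 0: size_t is unsigned */
    }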
index 7584a5d..9d08c71 100644 (file)
@@ -282,16 +282,34 @@ bad_area:
 #endif
                switch (code) {
                case 15:        /* Data TLB miss fault/Data page fault */
+                       /* send SIGSEGV when outside of vma */
+                       if (!vma ||
+                           address < vma->vm_start || address > vma->vm_end) {
+                               si.si_signo = SIGSEGV;
+                               si.si_code = SEGV_MAPERR;
+                               break;
+                       }
+
+                       /* send SIGSEGV for wrong permissions */
+                       if ((vma->vm_flags & acc_type) != acc_type) {
+                               si.si_signo = SIGSEGV;
+                               si.si_code = SEGV_ACCERR;
+                               break;
+                       }
+
+                       /* the address is probably past the end of a mapped file */
+                       /* fall through */
                case 17:        /* NA data TLB miss / page fault */
                case 18:        /* Unaligned access - PCXS only */
                        si.si_signo = SIGBUS;
-                       si.si_code = BUS_ADRERR;
+                       si.si_code = (code == 18) ? BUS_ADRALN : BUS_ADRERR;
                        break;
                case 16:        /* Non-access instruction TLB miss fault */
                case 26:        /* PCXL: Data memory access rights trap */
                default:
                        si.si_signo = SIGSEGV;
-                       si.si_code = SEGV_MAPERR;
+                       si.si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR;
+                       break;
                }
                si.si_errno = 0;
                si.si_addr = (void __user *) address;
index 607acf5..8a24636 100644 (file)
@@ -111,6 +111,7 @@ endif
 endif
 
 CFLAGS-$(CONFIG_PPC64) := -mtraceback=no -mcall-aixdesc
+CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1)
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,-mminimal-toc)
 CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions)
 CFLAGS-$(CONFIG_PPC32) := -ffixed-r2 $(MULTIPLEWORD)
index 4c617bf..4f6e482 100644 (file)
                reg = <0xe2000 0x1000>;
        };
 
-/include/ "qoriq-dma-0.dtsi"
+/include/ "elo3-dma-0.dtsi"
        dma@100300 {
                fsl,iommu-parent = <&pamu0>;
                fsl,liodn-reg = <&guts 0x580>; /* DMA1LIODNR */
        };
 
-/include/ "qoriq-dma-1.dtsi"
+/include/ "elo3-dma-1.dtsi"
        dma@101300 {
                fsl,iommu-parent = <&pamu0>;
                fsl,liodn-reg = <&guts 0x584>; /* DMA2LIODNR */
diff --git a/arch/powerpc/boot/dts/fsl/elo3-dma-0.dtsi b/arch/powerpc/boot/dts/fsl/elo3-dma-0.dtsi
new file mode 100644 (file)
index 0000000..3c210e0
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * QorIQ Elo3 DMA device tree stub [ controller @ offset 0x100000 ]
+ *
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+dma0: dma@100300 {
+       #address-cells = <1>;
+       #size-cells = <1>;
+       compatible = "fsl,elo3-dma";
+       reg = <0x100300 0x4>,
+             <0x100600 0x4>;
+       ranges = <0x0 0x100100 0x500>;
+       dma-channel@0 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x0 0x80>;
+               interrupts = <28 2 0 0>;
+       };
+       dma-channel@80 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x80 0x80>;
+               interrupts = <29 2 0 0>;
+       };
+       dma-channel@100 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x100 0x80>;
+               interrupts = <30 2 0 0>;
+       };
+       dma-channel@180 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x180 0x80>;
+               interrupts = <31 2 0 0>;
+       };
+       dma-channel@300 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x300 0x80>;
+               interrupts = <76 2 0 0>;
+       };
+       dma-channel@380 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x380 0x80>;
+               interrupts = <77 2 0 0>;
+       };
+       dma-channel@400 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x400 0x80>;
+               interrupts = <78 2 0 0>;
+       };
+       dma-channel@480 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x480 0x80>;
+               interrupts = <79 2 0 0>;
+       };
+};
diff --git a/arch/powerpc/boot/dts/fsl/elo3-dma-1.dtsi b/arch/powerpc/boot/dts/fsl/elo3-dma-1.dtsi
new file mode 100644 (file)
index 0000000..cccf3bb
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * QorIQ Elo3 DMA device tree stub [ controller @ offset 0x101000 ]
+ *
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+dma1: dma@101300 {
+       #address-cells = <1>;
+       #size-cells = <1>;
+       compatible = "fsl,elo3-dma";
+       reg = <0x101300 0x4>,
+             <0x101600 0x4>;
+       ranges = <0x0 0x101100 0x500>;
+       dma-channel@0 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x0 0x80>;
+               interrupts = <32 2 0 0>;
+       };
+       dma-channel@80 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x80 0x80>;
+               interrupts = <33 2 0 0>;
+       };
+       dma-channel@100 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x100 0x80>;
+               interrupts = <34 2 0 0>;
+       };
+       dma-channel@180 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x180 0x80>;
+               interrupts = <35 2 0 0>;
+       };
+       dma-channel@300 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x300 0x80>;
+               interrupts = <80 2 0 0>;
+       };
+       dma-channel@380 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x380 0x80>;
+               interrupts = <81 2 0 0>;
+       };
+       dma-channel@400 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x400 0x80>;
+               interrupts = <82 2 0 0>;
+       };
+       dma-channel@480 {
+               compatible = "fsl,eloplus-dma-channel";
+               reg = <0x480 0x80>;
+               interrupts = <83 2 0 0>;
+       };
+};
index 510afa3..4143a97 100644 (file)
                reg        = <0xea000 0x4000>;
        };
 
-/include/ "qoriq-dma-0.dtsi"
-/include/ "qoriq-dma-1.dtsi"
+/include/ "elo3-dma-0.dtsi"
+/include/ "elo3-dma-1.dtsi"
 
 /include/ "qoriq-espi-0.dtsi"
        spi@110000 {
diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig
new file mode 100644 (file)
index 0000000..62771e0
--- /dev/null
@@ -0,0 +1,352 @@
+CONFIG_PPC64=y
+CONFIG_ALTIVEC=y
+CONFIG_VSX=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2048
+CONFIG_CPU_LITTLE_ENDIAN=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_AUDITSYSCALL=y
+CONFIG_IRQ_DOMAIN_DEBUG=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_PPC_SPLPAR=y
+CONFIG_SCANLOG=m
+CONFIG_PPC_SMLPAR=y
+CONFIG_DTL=y
+# CONFIG_PPC_PMAC is not set
+CONFIG_RTAS_FLASH=m
+CONFIG_IBMEBUS=y
+CONFIG_HZ_100=y
+CONFIG_BINFMT_MISC=m
+CONFIG_PPC_TRANSACTIONAL_MEM=y
+CONFIG_KEXEC=y
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_CMA=y
+CONFIG_PPC_64K_PAGES=y
+CONFIG_PPC_SUBPAGE_PROT=y
+CONFIG_SCHED_SMT=y
+CONFIG_HOTPLUG_PCI=y
+CONFIG_HOTPLUG_PCI_RPA=m
+CONFIG_HOTPLUG_PCI_RPA_DLPAR=m
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=m
+CONFIG_NET_KEY=m
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_NET_IPIP=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_IPV6 is not set
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_UDPLITE=m
+CONFIG_NF_CONNTRACK_FTP=m
+CONFIG_NF_CONNTRACK_IRC=m
+CONFIG_NF_CONNTRACK_TFTP=m
+CONFIG_NF_CT_NETLINK=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFLOG=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_DSCP=m
+CONFIG_NETFILTER_XT_MATCH_ESP=m
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
+CONFIG_NETFILTER_XT_MATCH_OWNER=m
+CONFIG_NETFILTER_XT_MATCH_POLICY=m
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_QUOTA=m
+CONFIG_NETFILTER_XT_MATCH_RATEEST=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_RECENT=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+CONFIG_NETFILTER_XT_MATCH_TIME=m
+CONFIG_NETFILTER_XT_MATCH_U32=m
+CONFIG_NF_CONNTRACK_IPV4=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_AH=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_PROC_DEVICETREE=y
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_BLK_DEV_FD=m
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_VIRTIO_BLK=m
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDECD=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_BLK_DEV_AMD74XX=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_CHR_DEV_ST=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_CXGB3_ISCSI=m
+CONFIG_SCSI_CXGB4_ISCSI=m
+CONFIG_SCSI_BNX2_ISCSI=m
+CONFIG_BE2ISCSI=m
+CONFIG_SCSI_MPT2SAS=m
+CONFIG_SCSI_IBMVSCSI=y
+CONFIG_SCSI_IBMVFC=m
+CONFIG_SCSI_SYM53C8XX_2=y
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_IPR=y
+CONFIG_SCSI_QLA_FC=m
+CONFIG_SCSI_QLA_ISCSI=m
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_VIRTIO=m
+CONFIG_SCSI_DH=m
+CONFIG_SCSI_DH_RDAC=m
+CONFIG_SCSI_DH_ALUA=m
+CONFIG_ATA=y
+# CONFIG_ATA_SFF is not set
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=y
+CONFIG_MD_RAID0=y
+CONFIG_MD_RAID1=y
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID456=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_QL=m
+CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_UEVENT=y
+CONFIG_BONDING=m
+CONFIG_DUMMY=m
+CONFIG_NETCONSOLE=y
+CONFIG_NETPOLL_TRAP=y
+CONFIG_TUN=m
+CONFIG_VIRTIO_NET=m
+CONFIG_VORTEX=y
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+CONFIG_PCNET32=y
+CONFIG_TIGON3=y
+CONFIG_CHELSIO_T1=m
+CONFIG_BE2NET=m
+CONFIG_S2IO=m
+CONFIG_IBMVETH=y
+CONFIG_EHEA=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_IXGB=m
+CONFIG_IXGBE=m
+CONFIG_MLX4_EN=m
+CONFIG_MYRI10GE=m
+CONFIG_QLGE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_PPP=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPPOE=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=m
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_PCSPKR=m
+# CONFIG_SERIO_SERPORT is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_ICOM=m
+CONFIG_SERIAL_JSM=m
+CONFIG_HVC_CONSOLE=y
+CONFIG_HVC_RTAS=y
+CONFIG_HVCS=m
+CONFIG_VIRTIO_CONSOLE=m
+CONFIG_IBM_BSR=m
+CONFIG_GEN_RTC=y
+CONFIG_RAW_DRIVER=y
+CONFIG_MAX_RAW_DEVS=1024
+CONFIG_FB=y
+CONFIG_FIRMWARE_EDID=y
+CONFIG_FB_OF=y
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+CONFIG_FB_RADEON=y
+CONFIG_FB_IBM_GXT4500=y
+CONFIG_LCD_PLATFORM=m
+# CONFIG_VGA_CONSOLE is not set
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_LOGO=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB=y
+CONFIG_USB_MON=m
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_HCD_PPC_OF is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_STORAGE=m
+CONFIG_INFINIBAND=m
+CONFIG_INFINIBAND_USER_MAD=m
+CONFIG_INFINIBAND_USER_ACCESS=m
+CONFIG_INFINIBAND_MTHCA=m
+CONFIG_INFINIBAND_EHCA=m
+CONFIG_INFINIBAND_CXGB3=m
+CONFIG_INFINIBAND_CXGB4=m
+CONFIG_MLX4_INFINIBAND=m
+CONFIG_INFINIBAND_IPOIB=m
+CONFIG_INFINIBAND_IPOIB_CM=y
+CONFIG_INFINIBAND_SRP=m
+CONFIG_INFINIBAND_ISER=m
+CONFIG_VIRTIO_PCI=m
+CONFIG_VIRTIO_BALLOON=m
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT2_FS_XIP=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_REISERFS_FS=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_POSIX_ACL=y
+CONFIG_NILFS2_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=m
+CONFIG_ISO9660_FS=y
+CONFIG_UDF_FS=m
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_CRAMFS=m
+CONFIG_SQUASHFS=m
+CONFIG_SQUASHFS_XATTR=y
+CONFIG_SQUASHFS_LZO=y
+CONFIG_SQUASHFS_XZ=y
+CONFIG_PSTORE=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_CIFS=m
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_CRC_T10DIF=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_LATENCYTOP=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_CODE_PATCHING_SELFTEST=y
+CONFIG_FTR_FIXUP_SELFTEST=y
+CONFIG_MSI_BITMAP_SELFTEST=y
+CONFIG_XMON=y
+CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_SALSA20=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_LZO=m
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_CRYPTO_DEV_NX=y
+CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
index cc0655a..935b5e7 100644 (file)
@@ -31,6 +31,8 @@
 extern unsigned long randomize_et_dyn(unsigned long base);
 #define ELF_ET_DYN_BASE                (randomize_et_dyn(0x20000000))
 
+#define ELF_CORE_EFLAGS (is_elf2_task() ? 2 : 0)
+
 /*
  * Our registers are always unsigned longs, whether we're a 32 bit
  * process or 64 bit, on either a 64 bit or 32 bit kernel.
@@ -86,6 +88,8 @@ typedef elf_vrregset_t elf_fpxregset_t;
 #ifdef __powerpc64__
 # define SET_PERSONALITY(ex)                                   \
 do {                                                           \
+       if (((ex).e_flags & 0x3) == 2)                          \
+               set_thread_flag(TIF_ELF2ABI);                   \
        if ((ex).e_ident[EI_CLASS] == ELFCLASS32)               \
                set_thread_flag(TIF_32BIT);                     \
        else                                                    \
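
Aside: the (ex).e_flags & 0x3 test above reads the ppc64 ABI version field
of the ELF header (the EF_PPC64_ABI bits); a value of 2 marks an ELFv2
binary. A minimal userspace sketch of the same check, assuming an LP64
host with <elf.h>; the binary path is just an example:

    #include <elf.h>
    #include <stdio.h>

    int main(void)
    {
            Elf64_Ehdr ehdr;
            FILE *f = fopen("/bin/true", "rb");   /* any ppc64 binary */

            if (!f || fread(&ehdr, sizeof(ehdr), 1, f) != 1)
                    return 1;
            /* No e_ident/e_machine validation here, for brevity. */
            /* e_flags & 0x3: 2 => ELFv2, 0 or 1 => ELFv1/unspecified */
            printf("ppc64 ABI version: %u\n",
                   (unsigned)(ehdr.e_flags & 0x3));
            fclose(f);
            return 0;
    }
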
index 0c7f2bf..d8b600b 100644 (file)
@@ -403,6 +403,8 @@ static inline unsigned long cmo_get_page_size(void)
 extern long pSeries_enable_reloc_on_exc(void);
 extern long pSeries_disable_reloc_on_exc(void);
 
+extern long pseries_big_endian_exceptions(void);
+
 #else
 
 #define pSeries_enable_reloc_on_exc()  do {} while (0)
index a63b045..12c32c5 100644 (file)
@@ -287,6 +287,32 @@ static inline long disable_reloc_on_exceptions(void) {
        return plpar_set_mode(0, 3, 0, 0);
 }
 
+/*
+ * Take exceptions in big endian mode on this partition
+ *
+ * Note: this call has a partition-wide scope and can take a while to complete.
+ * If it returns H_LONG_BUSY_* it should be retried periodically until it
+ * returns H_SUCCESS.
+ */
+static inline long enable_big_endian_exceptions(void)
+{
+       /* mflags = 0: big endian exceptions */
+       return plpar_set_mode(0, 4, 0, 0);
+}
+
+/*
+ * Take exceptions in little endian mode on this partition
+ *
+ * Note: this call has a partition-wide scope and can take a while to complete.
+ * If it returns H_LONG_BUSY_* it should be retried periodically until it
+ * returns H_SUCCESS.
+ */
+static inline long enable_little_endian_exceptions(void)
+{
+       /* mflags = 1: little endian exceptions */
+       return plpar_set_mode(1, 4, 0, 0);
+}
+
 static inline long plapr_set_ciabr(unsigned long ciabr)
 {
        return plpar_set_mode(0, 1, ciabr, 0);
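
Both wrappers above share the retry contract spelled out in their comments.
A sketch of that pattern (the helper name is hypothetical; the real callers
added later in this merge, pseries_big_endian_exceptions() and its little
endian counterpart, have the same shape):

    static long set_mode_retry(long (*fn)(void))
    {
            long rc;

            for (;;) {
                    rc = fn();
                    if (!H_IS_LONG_BUSY(rc))
                            return rc;      /* H_SUCCESS or a hard error */
                    mdelay(get_longbusy_msecs(rc));
            }
    }
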
index 98da78e..084e080 100644 (file)
@@ -33,6 +33,7 @@ extern int boot_cpuid;
 extern int spinning_secondaries;
 
 extern void cpu_die(void);
+extern int cpu_to_chip_id(int cpu);
 
 #ifdef CONFIG_SMP
 
@@ -112,7 +113,6 @@ static inline struct cpumask *cpu_core_mask(int cpu)
 }
 
 extern int cpu_to_core_id(int cpu);
-extern int cpu_to_chip_id(int cpu);
 
 /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers.
  *
index 8fd6cf6..9854c56 100644 (file)
@@ -105,6 +105,9 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_EMULATE_STACK_STORE        16      /* Is an instruction emulation
                                                for stack store? */
 #define TIF_MEMDIE             17      /* is terminating due to OOM killer */
+#if defined(CONFIG_PPC64)
+#define TIF_ELF2ABI            18      /* function descriptors must die! */
+#endif
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
@@ -183,6 +186,12 @@ static inline bool test_thread_local_flags(unsigned int flags)
 #define is_32bit_task()        (1)
 #endif
 
+#if defined(CONFIG_PPC64)
+#define is_elf2_task() (test_thread_flag(TIF_ELF2ABI))
+#else
+#define is_elf2_task() (0)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
index 6713020..4bd687d 100644 (file)
@@ -686,6 +686,15 @@ void eeh_save_bars(struct eeh_dev *edev)
 
        for (i = 0; i < 16; i++)
                eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
+
+       /*
+        * For PCI bridges, including the root port, we need to enable
+        * bus mastering explicitly. Otherwise, the bridge can't fetch
+        * IODA table entries correctly. So we cache the bit in advance
+        * so that we can restore it after reset, either PHB or PE range.
+        */
+       if (edev->mode & EEH_DEV_BRIDGE)
+               edev->config_space[1] |= PCI_COMMAND_MASTER;
 }
 
 /**
index d27c5af..72d748b 100644 (file)
@@ -74,8 +74,13 @@ static int eeh_event_handler(void * dummy)
                pe = event->pe;
                if (pe) {
                        eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-                       pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
-                                pe->phb->global_number, pe->addr);
+                       if (pe->type & EEH_PE_PHB)
+                               pr_info("EEH: Detected error on PHB#%d\n",
+                                        pe->phb->global_number);
+                       else
+                               pr_info("EEH: Detected PCI bus error on "
+                                       "PHB#%d-PE#%x\n",
+                                       pe->phb->global_number, pe->addr);
                        eeh_handle_event(pe);
                        eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
                } else {
index 75c2d10..3386d8a 100644 (file)
@@ -858,17 +858,21 @@ void show_regs(struct pt_regs * regs)
        printk("MSR: "REG" ", regs->msr);
        printbits(regs->msr, msr_bits);
        printk("  CR: %08lx  XER: %08lx\n", regs->ccr, regs->xer);
-#ifdef CONFIG_PPC64
-       printk("SOFTE: %ld\n", regs->softe);
-#endif
        trap = TRAP(regs);
        if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR))
-               printk("CFAR: "REG"\n", regs->orig_gpr3);
-       if (trap == 0x300 || trap == 0x600)
+               printk("CFAR: "REG" ", regs->orig_gpr3);
+       if (trap == 0x200 || trap == 0x300 || trap == 0x600)
 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-               printk("DEAR: "REG", ESR: "REG"\n", regs->dar, regs->dsisr);
+               printk("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
 #else
-               printk("DAR: "REG", DSISR: %08lx\n", regs->dar, regs->dsisr);
+               printk("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
+#endif
+#ifdef CONFIG_PPC64
+       printk("SOFTE: %ld ", regs->softe);
+#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+       if (MSR_TM_ACTIVE(regs->msr))
+               printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch);
 #endif
 
        for (i = 0;  i < 32;  i++) {
@@ -887,9 +891,6 @@ void show_regs(struct pt_regs * regs)
        printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip);
        printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link);
 #endif
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-       printk("PACATMSCRATCH [%llx]\n", get_paca()->tm_scratch);
-#endif
        show_stack(current, (unsigned long *) regs->gpr[1]);
        if (!user_mode(regs))
                show_instructions(regs);
@@ -1086,25 +1087,45 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
        regs->msr = MSR_USER;
 #else
        if (!is_32bit_task()) {
-               unsigned long entry, toc;
+               unsigned long entry;
 
-               /* start is a relocated pointer to the function descriptor for
-                * the elf _start routine.  The first entry in the function
-                * descriptor is the entry address of _start and the second
-                * entry is the TOC value we need to use.
-                */
-               __get_user(entry, (unsigned long __user *)start);
-               __get_user(toc, (unsigned long __user *)start+1);
+               if (is_elf2_task()) {
+                       /* Look ma, no function descriptors! */
+                       entry = start;
 
-               /* Check whether the e_entry function descriptor entries
-                * need to be relocated before we can use them.
-                */
-               if (load_addr != 0) {
-                       entry += load_addr;
-                       toc   += load_addr;
+                       /*
+                        * Ulrich says:
+                        *   The latest iteration of the ABI requires that when
+                        *   calling a function (at its global entry point),
+                        *   the caller must ensure r12 holds the entry point
+                        *   address (so that the function can quickly
+                        *   establish addressability).
+                        */
+                       regs->gpr[12] = start;
+                       /* Make sure that's restored on entry to userspace. */
+                       set_thread_flag(TIF_RESTOREALL);
+               } else {
+                       unsigned long toc;
+
+                       /* start is a relocated pointer to the function
+                        * descriptor for the elf _start routine.  The first
+                        * entry in the function descriptor is the entry
+                        * address of _start and the second entry is the TOC
+                        * value we need to use.
+                        */
+                       __get_user(entry, (unsigned long __user *)start);
+                       __get_user(toc, (unsigned long __user *)start+1);
+
+                       /* Check whether the e_entry function descriptor entries
+                        * need to be relocated before we can use them.
+                        */
+                       if (load_addr != 0) {
+                               entry += load_addr;
+                               toc   += load_addr;
+                       }
+                       regs->gpr[2] = toc;
                }
                regs->nip = entry;
-               regs->gpr[2] = toc;
                regs->msr = MSR_USER64;
        } else {
                regs->nip = start;
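
For reference, the "function descriptor" the ELFv1 branch dereferences is a
structure of three doublewords in the binary's .opd section; the sketch
below uses a hypothetical name, for illustration only:

    struct elfv1_func_desc {
            unsigned long entry;    /* address of the first instruction  */
            unsigned long toc;      /* TOC base the callee expects in r2 */
            unsigned long env;      /* environment pointer, unused by C  */
    };

ELFv2 drops the descriptor entirely: the symbol value is the code address,
and r12 must carry the entry point so the callee can derive its own TOC.
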
index f3a4709..fa0ad8a 100644 (file)
@@ -777,6 +777,26 @@ int of_get_ibm_chip_id(struct device_node *np)
        return -1;
 }
 
+/**
+ * cpu_to_chip_id - Return the cpu's chip-id
+ * @cpu: The logical cpu number.
+ *
+ * Return the value of the ibm,chip-id property corresponding to the given
+ * logical cpu number. If the chip-id cannot be found, returns -1.
+ */
+int cpu_to_chip_id(int cpu)
+{
+       struct device_node *np;
+
+       np = of_get_cpu_node(cpu, NULL);
+       if (!np)
+               return -1;
+
+       of_node_put(np);
+       return of_get_ibm_chip_id(np);
+}
+EXPORT_SYMBOL(cpu_to_chip_id);
+
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Fix up the uninitialized fields in a new device node:
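
With cpu_to_chip_id() now exported from prom.c, topology code outside this
file can group logical cpus by chip. A hypothetical caller, as a sketch:

    /* Count how many online cpus sit on the chip with the given id. */
    static int cpus_on_chip(int id)
    {
            int cpu, n = 0;

            for_each_online_cpu(cpu)
                    if (cpu_to_chip_id(cpu) == id)
                            n++;
            return n;
    }
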
index 749778e..1844298 100644 (file)
@@ -457,7 +457,15 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
                if (copy_vsx_to_user(&frame->mc_vsregs, current))
                        return 1;
                msr |= MSR_VSX;
-       }
+       } else if (!ctx_has_vsx_region)
+               /*
+                * With a small context structure we can't hold the VSX
+                * registers, hence clear the MSR value to indicate the state
+                * was not saved.
+                */
+               msr &= ~MSR_VSX;
+
+
 #endif /* CONFIG_VSX */
 #ifdef CONFIG_SPE
        /* save spe registers */
index b3c6157..e66f67b 100644 (file)
@@ -701,12 +701,6 @@ badframe:
 int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
                sigset_t *set, struct pt_regs *regs)
 {
-       /* Handler is *really* a pointer to the function descriptor for
-        * the signal routine.  The first entry in the function
-        * descriptor is the entry address of signal and the second
-        * entry is the TOC value we need to use.
-        */
-       func_descr_t __user *funct_desc_ptr;
        struct rt_sigframe __user *frame;
        unsigned long newsp = 0;
        long err = 0;
@@ -766,19 +760,32 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
                        goto badframe;
                regs->link = (unsigned long) &frame->tramp[0];
        }
-       funct_desc_ptr = (func_descr_t __user *) ka->sa.sa_handler;
 
        /* Allocate a dummy caller frame for the signal handler. */
        newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
        err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
 
        /* Set up "regs" so we "return" to the signal handler. */
-       err |= get_user(regs->nip, &funct_desc_ptr->entry);
+       if (is_elf2_task()) {
+               regs->nip = (unsigned long) ka->sa.sa_handler;
+               regs->gpr[12] = regs->nip;
+       } else {
+               /* Handler is *really* a pointer to the function descriptor for
+                * the signal routine.  The first entry in the function
+                * descriptor is the entry address of signal and the second
+                * entry is the TOC value we need to use.
+                */
+               func_descr_t __user *funct_desc_ptr =
+                       (func_descr_t __user *) ka->sa.sa_handler;
+
+               err |= get_user(regs->nip, &funct_desc_ptr->entry);
+               err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
+       }
+
        /* enter the signal handler in native-endian mode */
        regs->msr &= ~MSR_LE;
        regs->msr |= (MSR_KERNEL & MSR_LE);
        regs->gpr[1] = newsp;
-       err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
        regs->gpr[3] = signr;
        regs->result = 0;
        if (ka->sa.sa_flags & SA_SIGINFO) {
index 930cd8a..a3b64f3 100644 (file)
@@ -597,22 +597,6 @@ out:
        return id;
 }
 
-/* Return the value of the chip-id property corresponding
- * to the given logical cpu.
- */
-int cpu_to_chip_id(int cpu)
-{
-       struct device_node *np;
-
-       np = of_get_cpu_node(cpu, NULL);
-       if (!np)
-               return -1;
-
-       of_node_put(np);
-       return of_get_ibm_chip_id(np);
-}
-EXPORT_SYMBOL(cpu_to_chip_id);
-
 /* Helper routines for cpu to core mapping */
 int cpu_core_index_of_thread(int cpu)
 {
index 192b051..b3b1441 100644 (file)
@@ -213,8 +213,6 @@ static u64 scan_dispatch_log(u64 stop_tb)
        if (i == be64_to_cpu(vpa->dtl_idx))
                return 0;
        while (i < be64_to_cpu(vpa->dtl_idx)) {
-               if (dtl_consumer)
-                       dtl_consumer(dtl, i);
                dtb = be64_to_cpu(dtl->timebase);
                tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
                        be32_to_cpu(dtl->ready_to_enqueue_time);
@@ -227,6 +225,8 @@ static u64 scan_dispatch_log(u64 stop_tb)
                }
                if (dtb > stop_tb)
                        break;
+               if (dtl_consumer)
+                       dtl_consumer(dtl, i);
                stolen += tb_delta;
                ++i;
                ++dtl;
index 45ea281..542c6f4 100644 (file)
@@ -142,6 +142,13 @@ V_FUNCTION_END(__kernel_sigtramp_rt64)
 /* Size of CR reg in DWARF unwind info. */
 #define CRSIZE 4
 
+/* Offset of CR reg within a full word. */
+#ifdef __LITTLE_ENDIAN__
+#define CROFF 0
+#else
+#define CROFF (RSIZE - CRSIZE)
+#endif
+
 /* This is the offset of the VMX reg pointer.  */
 #define VREGS  48*RSIZE+33*8
 
@@ -181,7 +188,14 @@ V_FUNCTION_END(__kernel_sigtramp_rt64)
   rsave (31, 31*RSIZE);                                                        \
   rsave (67, 32*RSIZE);                /* ap, used as temp for nip */          \
   rsave (65, 36*RSIZE);                /* lr */                                \
-  rsave (70, 38*RSIZE + (RSIZE - CRSIZE)) /* cr */
+  rsave (68, 38*RSIZE + CROFF);        /* cr fields */                         \
+  rsave (69, 38*RSIZE + CROFF);                                                \
+  rsave (70, 38*RSIZE + CROFF);                                                \
+  rsave (71, 38*RSIZE + CROFF);                                                \
+  rsave (72, 38*RSIZE + CROFF);                                                \
+  rsave (73, 38*RSIZE + CROFF);                                                \
+  rsave (74, 38*RSIZE + CROFF);                                                \
+  rsave (75, 38*RSIZE + CROFF)
 
 /* Describe where the FP regs are saved.  */
 #define EH_FRAME_FP \
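
The CROFF arithmetic above encodes where the 4-byte CR value sits inside
its 8-byte save slot: byte 0 on little endian, byte RSIZE - CRSIZE = 4 on
big endian. A standalone C check of the same fact, assuming an LP64 host:

    #include <stdio.h>

    int main(void)
    {
            union {
                    unsigned long reg;      /* RSIZE-sized save slot   */
                    unsigned int  word[2];  /* two CRSIZE-sized halves */
            } u = { .reg = 0x11223344UL };  /* CR widened into a slot  */

            /* word[0] holds the value on LE (CROFF 0); word[1] on BE. */
            printf("CR offset within slot: %d\n",
                   u.word[0] == 0x11223344u ? 0 : 4);
            return 0;
    }
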
index e7d0c88..76a6482 100644 (file)
@@ -1419,7 +1419,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node)
 
                /* needed to ensure proper operation of coherent allocations
                 * later, in case the driver doesn't set it explicitly */
-               dma_set_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
+               dma_coerce_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64));
        }
 
        /* register with generic device framework */
index 6936547..c5f734e 100644 (file)
@@ -123,6 +123,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
+       unsigned long flags;
        pgd_t *pgdp;
        int nr = 0;
 
@@ -156,7 +157,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
         * So long as we atomically load page table pointers versus teardown,
         * we can follow the address down to the page and take a ref on it.
         */
-       local_irq_disable();
+       local_irq_save(flags);
 
        pgdp = pgd_offset(mm, addr);
        do {
@@ -179,7 +180,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        break;
        } while (pgdp++, addr = next, addr != end);
 
-       local_irq_enable();
+       local_irq_restore(flags);
 
        return nr;
 }
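
The point of switching to local_irq_save()/local_irq_restore() is that the
pair preserves whatever interrupt state the caller had, so the fast GUP
walk stays correct even when entered with interrupts already disabled; a
bare disable/enable pair would unconditionally re-enable them on return.
In kernel-style sketch form:

    unsigned long flags;

    local_irq_save(flags);      /* remember current IRQ state, then mask */
    /* ... atomically walk the page tables ... */
    local_irq_restore(flags);   /* put back whatever the caller had      */
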
index 3e99c14..7ce9cf3 100644 (file)
@@ -258,7 +258,7 @@ static bool slice_scan_available(unsigned long addr,
                slice = GET_HIGH_SLICE_INDEX(addr);
                *boundary_addr = (slice + end) ?
                        ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
-               return !!(available.high_slices & (1u << slice));
+               return !!(available.high_slices & (1ul << slice));
        }
 }
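
The one-character fix above matters because high_slices needs a 64-bit
mask: with a 32-bit constant, shifting by a slice index of 32 or more is
undefined behaviour in C, and on powerpc the shift count typically wraps,
testing the wrong slice. A standalone demonstration on an LP64 host:

    #include <stdio.h>

    int main(void)
    {
            int slice = 40;                     /* high slice index > 31 */
            unsigned long mask = 1ul << slice;  /* well defined: bit 40  */

            /* (1u << slice) would shift a 32-bit value by 40 bits --
             * undefined behaviour, often evaluated as 1u << 8. */
            printf("mask = %#lx\n", mask);
            return 0;
    }
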
 
index c2a566f..132f872 100644 (file)
@@ -403,3 +403,14 @@ config PPC_DOORBELL
        default n
 
 endmenu
+
+config CPU_LITTLE_ENDIAN
+       bool "Build little endian kernel"
+       default n
+       help
+         This option selects whether a big endian or little endian kernel will
+         be built.
+
+         Note that if cross compiling a little endian kernel,
+         CROSS_COMPILE must point to a toolchain capable of targeting
+         little endian powerpc.
index 8844628..1cb160d 100644 (file)
@@ -19,6 +19,7 @@
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/machdep.h>
+#include <asm/smp.h>
 
 
 struct powernv_rng {
index 7fbc25b..ccb633e 100644 (file)
@@ -189,8 +189,9 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
        struct eeh_dev *edev;
        struct eeh_pe pe;
        struct pci_dn *pdn = PCI_DN(dn);
-       const u32 *class_code, *vendor_id, *device_id;
-       const u32 *regs;
+       const __be32 *classp, *vendorp, *devicep;
+       u32 class_code;
+       const __be32 *regs;
        u32 pcie_flags;
        int enable = 0;
        int ret;
@@ -201,22 +202,24 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
                return NULL;
 
        /* Retrieve class/vendor/device IDs */
-       class_code = of_get_property(dn, "class-code", NULL);
-       vendor_id  = of_get_property(dn, "vendor-id", NULL);
-       device_id  = of_get_property(dn, "device-id", NULL);
+       classp = of_get_property(dn, "class-code", NULL);
+       vendorp = of_get_property(dn, "vendor-id", NULL);
+       devicep = of_get_property(dn, "device-id", NULL);
 
        /* Skip for bad OF node or PCI-ISA bridge */
-       if (!class_code || !vendor_id || !device_id)
+       if (!classp || !vendorp || !devicep)
                return NULL;
        if (dn->type && !strcmp(dn->type, "isa"))
                return NULL;
 
+       class_code = of_read_number(classp, 1);
+
        /*
         * Update the class code and mode of the eeh device. We need to
         * correctly reflect whether the current device is a root port
         * or a PCIe switch downstream port.
         */
-       edev->class_code = *class_code;
+       edev->class_code = class_code;
        edev->pcie_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_EXP);
        edev->mode &= 0xFFFFFF00;
        if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
@@ -243,12 +246,12 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag)
        /* Initialize the fake PE */
        memset(&pe, 0, sizeof(struct eeh_pe));
        pe.phb = edev->phb;
-       pe.config_addr = regs[0];
+       pe.config_addr = of_read_number(regs, 1);
 
        /* Enable EEH on the device */
        ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
        if (!ret) {
-               edev->config_addr = regs[0];
+               edev->config_addr = of_read_number(regs, 1);
                /* Retrieve PE address */
                edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
                pe.addr = edev->pe_config_addr;
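
The recurring pattern in this hunk is that flattened device tree properties
are arrays of big endian cells, so each cell must go through
of_read_number() rather than be dereferenced directly. A userspace
analogue of that helper, assuming glibc's <endian.h>:

    #include <endian.h>
    #include <stdint.h>

    /* Combine 'size' big endian 32-bit cells into one host-order value. */
    static uint64_t read_cells(const uint32_t *cell, int size)
    {
            uint64_t r = 0;

            while (size--)
                    r = (r << 32) | be32toh(*cell++);
            return r;
    }
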
index 356bc75..4fca3de 100644 (file)
@@ -245,6 +245,23 @@ static void pSeries_lpar_hptab_clear(void)
                                        &(ptes[j].pteh), &(ptes[j].ptel));
                }
        }
+
+#ifdef __LITTLE_ENDIAN__
+       /* Reset exceptions to big endian */
+       if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+               long rc;
+
+               rc = pseries_big_endian_exceptions();
+               /*
+                * At this point it is unlikely panic() will get anything
+                * out to the user, but at least this will stop us from
+                * continuing on further and creating an even more
+                * difficult to debug situation.
+                */
+               if (rc)
+                       panic("Could not enable big endian exceptions");
+       }
+#endif
 }
 
 /*
index a702f1c..72a1027 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/of.h>
 #include <asm/archrandom.h>
 #include <asm/machdep.h>
+#include <asm/plpar_wrappers.h>
 
 
 static int pseries_get_random_long(unsigned long *v)
index 1f97e2b..c1f1908 100644 (file)
@@ -442,6 +442,32 @@ static void pSeries_machine_kexec(struct kimage *image)
 }
 #endif
 
+#ifdef __LITTLE_ENDIAN__
+long pseries_big_endian_exceptions(void)
+{
+       long rc;
+
+       while (1) {
+               rc = enable_big_endian_exceptions();
+               if (!H_IS_LONG_BUSY(rc))
+                       return rc;
+               mdelay(get_longbusy_msecs(rc));
+       }
+}
+
+static long pseries_little_endian_exceptions(void)
+{
+       long rc;
+
+       while (1) {
+               rc = enable_little_endian_exceptions();
+               if (!H_IS_LONG_BUSY(rc))
+                       return rc;
+               mdelay(get_longbusy_msecs(rc));
+       }
+}
+#endif
+
 static void __init pSeries_setup_arch(void)
 {
        panic_timeout = 10;
@@ -698,6 +724,22 @@ static int __init pSeries_probe(void)
        /* Now try to figure out if we are running on LPAR */
        of_scan_flat_dt(pseries_probe_fw_features, NULL);
 
+#ifdef __LITTLE_ENDIAN__
+       if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+               long rc;
+               /*
+                * Tell the hypervisor that we want our exceptions to
+                * be taken in little endian mode. If this fails we don't
+                * want to use BUG() because it will trigger an exception.
+                */
+               rc = pseries_little_endian_exceptions();
+               if (rc) {
+                       ppc_md.progress("H_SET_MODE LE exception fail", 0);
+                       panic("Could not enable little endian exceptions");
+               }
+       }
+#endif
+
        if (firmware_has_feature(FW_FEATURE_LPAR))
                hpte_init_lpar();
        else
index 8ef53bc..aaa46b3 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/time.h>
+#include <linux/of_fdt.h>
 
 #include <asm/machdep.h>
 #include <asm/udbg.h>
index d18e6cc..a3c87f3 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/io.h>
+#include <linux/of_address.h>
 
 #include "wsp.h"
 
index 2d3b1dd..9cd92e6 100644 (file)
@@ -18,6 +18,8 @@
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
index cb565bf..3f67298 100644 (file)
@@ -15,6 +15,8 @@
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/reg_a2.h>
 #include <asm/irq.h>
index 508ec82..a87b414 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/time.h>
+#include <linux/of_fdt.h>
 
 #include <asm/machdep.h>
 #include <asm/udbg.h>
index 8928507..6538b4d 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/of.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/of_address.h>
 
 #include <asm/cputhreads.h>
 #include <asm/reg_a2.h>
index ddb6efe..58cd1f0 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/time.h>
+#include <linux/of_address.h>
 
 #include <asm/scom.h>
 
index 7d74432..947b5c4 100644 (file)
@@ -15,7 +15,7 @@ struct pci_sysdata {
        int             domain;         /* PCI domain */
        int             node;           /* NUMA node */
 #ifdef CONFIG_ACPI
-       void            *acpi;          /* ACPI-specific data */
+       struct acpi_device *companion;  /* ACPI companion device */
 #endif
 #ifdef CONFIG_X86_64
        void            *iommu;         /* IOMMU private data */
index b93e09a..37813b5 100644 (file)
 #define MSR_PP1_ENERGY_STATUS          0x00000641
 #define MSR_PP1_POLICY                 0x00000642
 
+#define MSR_CORE_C1_RES                        0x00000660
+
 #define MSR_AMD64_MC0_MASK             0xc0010044
 
 #define MSR_IA32_MCx_CTL(x)            (MSR_IA32_MC0_CTL + 4*(x))
index daff69e..1185fe7 100644 (file)
@@ -296,4 +296,4 @@ static struct kernel_param_ops audit_param_ops = {
        .get = param_get_bool,
 };
 
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
+arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
index a7cccb6..c96314a 100644 (file)
@@ -61,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 #if PAGETABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
+       struct page *page = virt_to_page(pmd);
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -69,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 #ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
 #endif
-       tlb_remove_page(tlb, virt_to_page(pmd));
+       pgtable_pmd_page_dtor(page);
+       tlb_remove_page(tlb, page);
 }
 
 #if PAGETABLE_LEVELS > 3
@@ -209,7 +211,7 @@ static int preallocate_pmds(pmd_t *pmds[])
                if (!pmd)
                        failed = true;
                if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
-                       free_page((unsigned long)pmds[i]);
+                       free_page((unsigned long)pmd);
                        pmd = NULL;
                        failed = true;
                }
index 7fb24e5..4f25ec0 100644 (file)
@@ -518,7 +518,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
        sd = &info->sd;
        sd->domain = domain;
        sd->node = node;
-       sd->acpi = device->handle;
+       sd->companion = device;
        /*
         * Maybe the desired pci bus has been already scanned. In such case
         * it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -589,7 +589,7 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
 {
        struct pci_sysdata *sd = bridge->bus->sysdata;
 
-       ACPI_HANDLE_SET(&bridge->dev, sd->acpi);
+       ACPI_COMPANION_SET(&bridge->dev, sd->companion);
        return 0;
 }
 
index 862f458..cdc629c 100644 (file)
@@ -171,9 +171,12 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
-static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
-                              unsigned int rw_flags)
+static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+                              struct request *rq, unsigned int rw_flags)
 {
+       if (blk_queue_io_stat(q))
+               rw_flags |= REQ_IO_STAT;
+
        rq->mq_ctx = ctx;
        rq->cmd_flags = rw_flags;
        ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
@@ -197,7 +200,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 
                rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
                if (rq) {
-                       blk_mq_rq_ctx_init(ctx, rq, rw);
+                       blk_mq_rq_ctx_init(q, ctx, rq, rw);
                        break;
                } else if (!(gfp & __GFP_WAIT))
                        break;
@@ -718,6 +721,8 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
 
+       trace_block_rq_insert(hctx->queue, rq);
+
        list_add_tail(&rq->queuelist, &ctx->rq_list);
        blk_mq_hctx_mark_pending(hctx, ctx);
 
@@ -921,7 +926,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
        trace_block_getrq(q, bio, rw);
        rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
        if (likely(rq))
-               blk_mq_rq_ctx_init(ctx, rq, rw);
+               blk_mq_rq_ctx_init(q, ctx, rq, rw);
        else {
                blk_mq_put_ctx(ctx);
                trace_block_sleeprq(q, bio, rw);
@@ -1377,6 +1382,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
        q->queue_hw_ctx = hctxs;
 
        q->mq_ops = reg->ops;
+       q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
        blk_queue_make_request(q, blk_mq_make_request);
        blk_queue_rq_timed_out(q, reg->ops->timeout);
index a8287b4..dc51f46 100644 (file)
@@ -96,6 +96,7 @@
  * - Code works, detects all the partitions.
  *
  ************************************************************/
+#include <linux/kernel.h>
 #include <linux/crc32.h>
 #include <linux/ctype.h>
 #include <linux/math64.h>
@@ -715,8 +716,8 @@ int efi_partition(struct parsed_partitions *state)
                efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
 
                /* Naively convert UTF16-LE to 7 bits. */
-               label_max = min(sizeof(info->volname) - 1,
-                               sizeof(ptes[i].partition_name));
+               label_max = min(ARRAY_SIZE(info->volname) - 1,
+                               ARRAY_SIZE(ptes[i].partition_name));
                info->volname[label_max] = 0;
                while (label_count < label_max) {
                        u8 c = ptes[i].partition_name[label_count] & 0xff;
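
The switch from sizeof() to ARRAY_SIZE() matters because partition_name
holds 16-bit characters: sizeof() counts bytes and so overstates its
length in characters by a factor of two, while ARRAY_SIZE() counts
elements. A minimal illustration, with a hypothetical 36-element label
like partition_name:

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    int main(void)
    {
            unsigned short name[36];    /* UTF-16 label */

            /* 72 bytes but only 36 characters; a byte count used as a
             * character limit would run past the end of the label. */
            printf("bytes=%zu elements=%zu\n",
                   sizeof(name), ARRAY_SIZE(name));
            return 0;
    }
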
index 71f337a..4ae5734 100644 (file)
@@ -1402,6 +1402,9 @@ config CRYPTO_USER_API_SKCIPHER
          This option enables the user-space interface for symmetric
          key cipher algorithms.
 
+config CRYPTO_HASH_INFO
+       bool
+
 source "drivers/crypto/Kconfig"
 source crypto/asymmetric_keys/Kconfig
 
index 80019ba..b3a7e80 100644 (file)
@@ -104,3 +104,4 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
 obj-$(CONFIG_XOR_BLOCKS) += xor.o
 obj-$(CONFIG_ASYNC_CORE) += async_tx/
 obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/
+obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o
index 6d2c2ea..03a6eb9 100644 (file)
@@ -12,6 +12,8 @@ if ASYMMETRIC_KEY_TYPE
 config ASYMMETRIC_PUBLIC_KEY_SUBTYPE
        tristate "Asymmetric public-key crypto algorithm subtype"
        select MPILIB
+       select PUBLIC_KEY_ALGO_RSA
+       select CRYPTO_HASH_INFO
        help
          This option provides support for asymmetric public key type handling.
          If signature generation and/or verification are to be used,
@@ -20,8 +22,8 @@ config ASYMMETRIC_PUBLIC_KEY_SUBTYPE
 
 config PUBLIC_KEY_ALGO_RSA
        tristate "RSA public-key algorithm"
-       depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE
        select MPILIB_EXTRA
+       select MPILIB
        help
          This option enables support for the RSA algorithm (PKCS#1, RFC3447).
 
index cf80765..b77eb53 100644 (file)
@@ -209,6 +209,7 @@ struct key_type key_type_asymmetric = {
        .match          = asymmetric_key_match,
        .destroy        = asymmetric_key_destroy,
        .describe       = asymmetric_key_describe,
+       .def_lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE,
 };
 EXPORT_SYMBOL_GPL(key_type_asymmetric);
 
index cb2e291..97eb001 100644 (file)
 
 MODULE_LICENSE("GPL");
 
-const char *const pkey_algo[PKEY_ALGO__LAST] = {
+const char *const pkey_algo_name[PKEY_ALGO__LAST] = {
        [PKEY_ALGO_DSA]         = "DSA",
        [PKEY_ALGO_RSA]         = "RSA",
 };
-EXPORT_SYMBOL_GPL(pkey_algo);
+EXPORT_SYMBOL_GPL(pkey_algo_name);
 
-const char *const pkey_hash_algo[PKEY_HASH__LAST] = {
-       [PKEY_HASH_MD4]         = "md4",
-       [PKEY_HASH_MD5]         = "md5",
-       [PKEY_HASH_SHA1]        = "sha1",
-       [PKEY_HASH_RIPE_MD_160] = "rmd160",
-       [PKEY_HASH_SHA256]      = "sha256",
-       [PKEY_HASH_SHA384]      = "sha384",
-       [PKEY_HASH_SHA512]      = "sha512",
-       [PKEY_HASH_SHA224]      = "sha224",
+const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST] = {
+#if defined(CONFIG_PUBLIC_KEY_ALGO_RSA) || \
+       defined(CONFIG_PUBLIC_KEY_ALGO_RSA_MODULE)
+       [PKEY_ALGO_RSA]         = &RSA_public_key_algorithm,
+#endif
 };
-EXPORT_SYMBOL_GPL(pkey_hash_algo);
+EXPORT_SYMBOL_GPL(pkey_algo);
 
-const char *const pkey_id_type[PKEY_ID_TYPE__LAST] = {
+const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST] = {
        [PKEY_ID_PGP]           = "PGP",
        [PKEY_ID_X509]          = "X509",
 };
-EXPORT_SYMBOL_GPL(pkey_id_type);
+EXPORT_SYMBOL_GPL(pkey_id_type_name);
 
 /*
  * Provide a part of a description of the key for /proc/keys.
@@ -56,7 +52,7 @@ static void public_key_describe(const struct key *asymmetric_key,
 
        if (key)
                seq_printf(m, "%s.%s",
-                          pkey_id_type[key->id_type], key->algo->name);
+                          pkey_id_type_name[key->id_type], key->algo->name);
 }
 
 /*
@@ -78,21 +74,45 @@ EXPORT_SYMBOL_GPL(public_key_destroy);
 /*
  * Verify a signature using a public key.
  */
-static int public_key_verify_signature(const struct key *key,
-                                      const struct public_key_signature *sig)
+int public_key_verify_signature(const struct public_key *pk,
+                               const struct public_key_signature *sig)
 {
-       const struct public_key *pk = key->payload.data;
+       const struct public_key_algorithm *algo;
+
+       BUG_ON(!pk);
+       BUG_ON(!pk->mpi[0]);
+       BUG_ON(!pk->mpi[1]);
+       BUG_ON(!sig);
+       BUG_ON(!sig->digest);
+       BUG_ON(!sig->mpi[0]);
+
+       algo = pk->algo;
+       if (!algo) {
+               if (pk->pkey_algo >= PKEY_ALGO__LAST)
+                       return -ENOPKG;
+               algo = pkey_algo[pk->pkey_algo];
+               if (!algo)
+                       return -ENOPKG;
+       }
 
-       if (!pk->algo->verify_signature)
+       if (!algo->verify_signature)
                return -ENOTSUPP;
 
-       if (sig->nr_mpi != pk->algo->n_sig_mpi) {
+       if (sig->nr_mpi != algo->n_sig_mpi) {
                pr_debug("Signature has %u MPI not %u\n",
-                        sig->nr_mpi, pk->algo->n_sig_mpi);
+                        sig->nr_mpi, algo->n_sig_mpi);
                return -EINVAL;
        }
 
-       return pk->algo->verify_signature(pk, sig);
+       return algo->verify_signature(pk, sig);
+}
+EXPORT_SYMBOL_GPL(public_key_verify_signature);
+
+static int public_key_verify_signature_2(const struct key *key,
+                                        const struct public_key_signature *sig)
+{
+       const struct public_key *pk = key->payload.data;
+       return public_key_verify_signature(pk, sig);
 }
 
 /*
@@ -103,6 +123,6 @@ struct asymmetric_key_subtype public_key_subtype = {
        .name                   = "public_key",
        .describe               = public_key_describe,
        .destroy                = public_key_destroy,
-       .verify_signature       = public_key_verify_signature,
+       .verify_signature       = public_key_verify_signature_2,
 };
 EXPORT_SYMBOL_GPL(public_key_subtype);
index 5e5e356..5c37a22 100644 (file)
@@ -28,3 +28,9 @@ struct public_key_algorithm {
 };
 
 extern const struct public_key_algorithm RSA_public_key_algorithm;
+
+/*
+ * public_key.c
+ */
+extern int public_key_verify_signature(const struct public_key *pk,
+                                      const struct public_key_signature *sig);
index 4a6a069..90a17f5 100644 (file)
@@ -73,13 +73,13 @@ static const struct {
        size_t size;
 } RSA_ASN1_templates[PKEY_HASH__LAST] = {
 #define _(X) { RSA_digest_info_##X, sizeof(RSA_digest_info_##X) }
-       [PKEY_HASH_MD5]         = _(MD5),
-       [PKEY_HASH_SHA1]        = _(SHA1),
-       [PKEY_HASH_RIPE_MD_160] = _(RIPE_MD_160),
-       [PKEY_HASH_SHA256]      = _(SHA256),
-       [PKEY_HASH_SHA384]      = _(SHA384),
-       [PKEY_HASH_SHA512]      = _(SHA512),
-       [PKEY_HASH_SHA224]      = _(SHA224),
+       [HASH_ALGO_MD5]         = _(MD5),
+       [HASH_ALGO_SHA1]        = _(SHA1),
+       [HASH_ALGO_RIPE_MD_160] = _(RIPE_MD_160),
+       [HASH_ALGO_SHA256]      = _(SHA256),
+       [HASH_ALGO_SHA384]      = _(SHA384),
+       [HASH_ALGO_SHA512]      = _(SHA512),
+       [HASH_ALGO_SHA224]      = _(SHA224),
 #undef _
 };
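
Each template above is the DER-encoded DigestInfo prefix that PKCS#1 v1.5
requires in front of the raw hash. For illustration, the SHA-256 entry
expands to the following well-known bytes (per RFC 3447, section 9.2; the
symbol name here is illustrative, not the kernel's):

    static const unsigned char RSA_digest_info_SHA256_bytes[] = {
            0x30, 0x31,             /* SEQUENCE, 49 bytes follow        */
            0x30, 0x0d,             /*   AlgorithmIdentifier SEQUENCE   */
            0x06, 0x09,             /*     OID, 9 bytes                 */
            0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01,
                                    /*     2.16.840.1.101.3.4.2.1       */
            0x05, 0x00,             /*     NULL parameters              */
            0x04, 0x20              /*   OCTET STRING, 32-byte digest   */
    };
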
 
index facbf26..2989316 100644 (file)
@@ -47,6 +47,8 @@ void x509_free_certificate(struct x509_certificate *cert)
                kfree(cert->subject);
                kfree(cert->fingerprint);
                kfree(cert->authority);
+               kfree(cert->sig.digest);
+               mpi_free(cert->sig.rsa.s);
                kfree(cert);
        }
 }
@@ -152,33 +154,33 @@ int x509_note_pkey_algo(void *context, size_t hdrlen,
                return -ENOPKG; /* Unsupported combination */
 
        case OID_md4WithRSAEncryption:
-               ctx->cert->sig_hash_algo = PKEY_HASH_MD5;
-               ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA;
+               ctx->cert->sig.pkey_hash_algo = HASH_ALGO_MD5;
+               ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA;
                break;
 
        case OID_sha1WithRSAEncryption:
-               ctx->cert->sig_hash_algo = PKEY_HASH_SHA1;
-               ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA;
+               ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA1;
+               ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA;
                break;
 
        case OID_sha256WithRSAEncryption:
-               ctx->cert->sig_hash_algo = PKEY_HASH_SHA256;
-               ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA;
+               ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA256;
+               ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA;
                break;
 
        case OID_sha384WithRSAEncryption:
-               ctx->cert->sig_hash_algo = PKEY_HASH_SHA384;
-               ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA;
+               ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA384;
+               ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA;
                break;
 
        case OID_sha512WithRSAEncryption:
-               ctx->cert->sig_hash_algo = PKEY_HASH_SHA512;
-               ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA;
+               ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA512;
+               ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA;
                break;
 
        case OID_sha224WithRSAEncryption:
-               ctx->cert->sig_hash_algo = PKEY_HASH_SHA224;
-               ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA;
+               ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA224;
+               ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA;
                break;
        }
 
@@ -203,8 +205,8 @@ int x509_note_signature(void *context, size_t hdrlen,
                return -EINVAL;
        }
 
-       ctx->cert->sig = value;
-       ctx->cert->sig_size = vlen;
+       ctx->cert->raw_sig = value;
+       ctx->cert->raw_sig_size = vlen;
        return 0;
 }
 
@@ -343,8 +345,9 @@ int x509_extract_key_data(void *context, size_t hdrlen,
        if (ctx->last_oid != OID_rsaEncryption)
                return -ENOPKG;
 
-       /* There seems to be an extraneous 0 byte on the front of the data */
-       ctx->cert->pkey_algo = PKEY_ALGO_RSA;
+       ctx->cert->pub->pkey_algo = PKEY_ALGO_RSA;
+
+       /* Discard the BIT STRING metadata */
        ctx->key = value + 1;
        ctx->key_size = vlen - 1;
        return 0;
index f86dc5f..87d9cc2 100644 (file)
@@ -9,6 +9,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#include <linux/time.h>
 #include <crypto/public_key.h>
 
 struct x509_certificate {
@@ -20,13 +21,11 @@ struct x509_certificate {
        char            *authority;             /* Authority key fingerprint as hex */
        struct tm       valid_from;
        struct tm       valid_to;
-       enum pkey_algo  pkey_algo : 8;          /* Public key algorithm */
-       enum pkey_algo  sig_pkey_algo : 8;      /* Signature public key algorithm */
-       enum pkey_hash_algo sig_hash_algo : 8;  /* Signature hash algorithm */
        const void      *tbs;                   /* Signed data */
-       size_t          tbs_size;               /* Size of signed data */
-       const void      *sig;                   /* Signature data */
-       size_t          sig_size;               /* Size of sigature */
+       unsigned        tbs_size;               /* Size of signed data */
+       unsigned        raw_sig_size;           /* Size of signature */
+       const void      *raw_sig;               /* Signature data */
+       struct public_key_signature sig;        /* Signature parameters */
 };
 
 /*
@@ -34,3 +33,10 @@ struct x509_certificate {
  */
 extern void x509_free_certificate(struct x509_certificate *cert);
 extern struct x509_certificate *x509_cert_parse(const void *data, size_t datalen);
+
+/*
+ * x509_public_key.c
+ */
+extern int x509_get_sig_params(struct x509_certificate *cert);
+extern int x509_check_signature(const struct public_key *pub,
+                               struct x509_certificate *cert);
index 06007f0..f83300b 100644 (file)
 #include <linux/asn1_decoder.h>
 #include <keys/asymmetric-subtype.h>
 #include <keys/asymmetric-parser.h>
+#include <keys/system_keyring.h>
 #include <crypto/hash.h>
 #include "asymmetric_keys.h"
 #include "public_key.h"
 #include "x509_parser.h"
 
-static const
-struct public_key_algorithm *x509_public_key_algorithms[PKEY_ALGO__LAST] = {
-       [PKEY_ALGO_DSA]         = NULL,
-#if defined(CONFIG_PUBLIC_KEY_ALGO_RSA) || \
-       defined(CONFIG_PUBLIC_KEY_ALGO_RSA_MODULE)
-       [PKEY_ALGO_RSA]         = &RSA_public_key_algorithm,
-#endif
-};
+/*
+ * Find a key in the given keyring by issuer and authority.
+ */
+static struct key *x509_request_asymmetric_key(
+       struct key *keyring,
+       const char *signer, size_t signer_len,
+       const char *authority, size_t auth_len)
+{
+       key_ref_t key;
+       char *id;
+
+       /* Construct an identifier. */
+       id = kmalloc(signer_len + 2 + auth_len + 1, GFP_KERNEL);
+       if (!id)
+               return ERR_PTR(-ENOMEM);
+
+       memcpy(id, signer, signer_len);
+       id[signer_len + 0] = ':';
+       id[signer_len + 1] = ' ';
+       memcpy(id + signer_len + 2, authority, auth_len);
+       id[signer_len + 2 + auth_len] = 0;
+
+       pr_debug("Look up: \"%s\"\n", id);
+
+       key = keyring_search(make_key_ref(keyring, 1),
+                            &key_type_asymmetric, id);
+       if (IS_ERR(key))
+               pr_debug("Request for module key '%s' err %ld\n",
+                        id, PTR_ERR(key));
+       kfree(id);
+
+       if (IS_ERR(key)) {
+               switch (PTR_ERR(key)) {
+                       /* Hide some search errors */
+               case -EACCES:
+               case -ENOTDIR:
+               case -EAGAIN:
+                       return ERR_PTR(-ENOKEY);
+               default:
+                       return ERR_CAST(key);
+               }
+       }
+
+       pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
+       return key_ref_to_ptr(key);
+}
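
The hand-rolled identifier above is "<issuer>: <authority fingerprint>".
For illustration, the same string could be built with the kernel's
kasprintf() helper, though the patch keeps the explicit copies:

    id = kasprintf(GFP_KERNEL, "%.*s: %.*s",
                   (int)signer_len, signer, (int)auth_len, authority);
    if (!id)
            return ERR_PTR(-ENOMEM);
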
 
 /*
- * Check the signature on a certificate using the provided public key
+ * Set up the signature parameters in an X.509 certificate.  This involves
+ * digesting the signed data and extracting the signature.
  */
-static int x509_check_signature(const struct public_key *pub,
-                               const struct x509_certificate *cert)
+int x509_get_sig_params(struct x509_certificate *cert)
 {
-       struct public_key_signature *sig;
        struct crypto_shash *tfm;
        struct shash_desc *desc;
        size_t digest_size, desc_size;
+       void *digest;
        int ret;
 
        pr_devel("==>%s()\n", __func__);
-       
+
+       if (cert->sig.rsa.s)
+               return 0;
+
+       cert->sig.rsa.s = mpi_read_raw_data(cert->raw_sig, cert->raw_sig_size);
+       if (!cert->sig.rsa.s)
+               return -ENOMEM;
+       cert->sig.nr_mpi = 1;
+
        /* Allocate the hashing algorithm we're going to need and find out how
         * big the hash operational data will be.
         */
-       tfm = crypto_alloc_shash(pkey_hash_algo[cert->sig_hash_algo], 0, 0);
+       tfm = crypto_alloc_shash(hash_algo_name[cert->sig.pkey_hash_algo], 0, 0);
        if (IS_ERR(tfm))
                return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm);
 
        desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
        digest_size = crypto_shash_digestsize(tfm);
 
-       /* We allocate the hash operational data storage on the end of our
-        * context data.
+       /* We allocate the hash operational data storage on the end of the
+        * digest storage space.
         */
        ret = -ENOMEM;
-       sig = kzalloc(sizeof(*sig) + desc_size + digest_size, GFP_KERNEL);
-       if (!sig)
-               goto error_no_sig;
+       digest = kzalloc(digest_size + desc_size, GFP_KERNEL);
+       if (!digest)
+               goto error;
 
-       sig->pkey_hash_algo     = cert->sig_hash_algo;
-       sig->digest             = (u8 *)sig + sizeof(*sig) + desc_size;
-       sig->digest_size        = digest_size;
+       cert->sig.digest = digest;
+       cert->sig.digest_size = digest_size;
 
-       desc = (void *)sig + sizeof(*sig);
-       desc->tfm       = tfm;
-       desc->flags     = CRYPTO_TFM_REQ_MAY_SLEEP;
+       desc = digest + digest_size;
+       desc->tfm = tfm;
+       desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 
        ret = crypto_shash_init(desc);
        if (ret < 0)
                goto error;
+       might_sleep();
+       ret = crypto_shash_finup(desc, cert->tbs, cert->tbs_size, digest);
+error:
+       crypto_free_shash(tfm);
+       pr_devel("<==%s() = %d\n", __func__, ret);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(x509_get_sig_params);
 
-       ret = -ENOMEM;
-       sig->rsa.s = mpi_read_raw_data(cert->sig, cert->sig_size);
-       if (!sig->rsa.s)
-               goto error;
+/*
+ * Check the signature on a certificate using the provided public key
+ */
+int x509_check_signature(const struct public_key *pub,
+                        struct x509_certificate *cert)
+{
+       int ret;
 
-       ret = crypto_shash_finup(desc, cert->tbs, cert->tbs_size, sig->digest);
-       if (ret < 0)
-               goto error_mpi;
+       pr_devel("==>%s()\n", __func__);
 
-       ret = pub->algo->verify_signature(pub, sig);
+       ret = x509_get_sig_params(cert);
+       if (ret < 0)
+               return ret;
 
+       ret = public_key_verify_signature(pub, &cert->sig);
        pr_debug("Cert Verification: %d\n", ret);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(x509_check_signature);
 
-error_mpi:
-       mpi_free(sig->rsa.s);
-error:
-       kfree(sig);
-error_no_sig:
-       crypto_free_shash(tfm);
+/*
+ * Check the new certificate against the ones in the trust keyring.  If one of
+ * those is the signing key and validates the new certificate, then mark the
+ * new certificate as being trusted.
+ *
+ * Return 0 if the new certificate was successfully validated, 1 if we couldn't
+ * find a matching parent certificate in the trusted list, and an error if there
+ * is a matching certificate but the signature check fails.
+ */
+static int x509_validate_trust(struct x509_certificate *cert,
+                              struct key *trust_keyring)
+{
+       const struct public_key *pk;
+       struct key *key;
+       int ret = 1;
 
-       pr_devel("<==%s() = %d\n", __func__, ret);
+       key = x509_request_asymmetric_key(trust_keyring,
+                                         cert->issuer, strlen(cert->issuer),
+                                         cert->authority,
+                                         strlen(cert->authority));
+       if (!IS_ERR(key))  {
+               pk = key->payload.data;
+               ret = x509_check_signature(pk, cert);
+       }
        return ret;
 }
 
@@ -106,7 +183,6 @@ error_no_sig:
 static int x509_key_preparse(struct key_preparsed_payload *prep)
 {
        struct x509_certificate *cert;
-       struct tm now;
        size_t srlen, sulen;
        char *desc = NULL;
        int ret;
@@ -117,7 +193,18 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
 
        pr_devel("Cert Issuer: %s\n", cert->issuer);
        pr_devel("Cert Subject: %s\n", cert->subject);
-       pr_devel("Cert Key Algo: %s\n", pkey_algo[cert->pkey_algo]);
+
+       if (cert->pub->pkey_algo >= PKEY_ALGO__LAST ||
+           cert->sig.pkey_algo >= PKEY_ALGO__LAST ||
+           cert->sig.pkey_hash_algo >= PKEY_HASH__LAST ||
+           !pkey_algo[cert->pub->pkey_algo] ||
+           !pkey_algo[cert->sig.pkey_algo] ||
+           !hash_algo_name[cert->sig.pkey_hash_algo]) {
+               ret = -ENOPKG;
+               goto error_free_cert;
+       }
+
+       pr_devel("Cert Key Algo: %s\n", pkey_algo_name[cert->pub->pkey_algo]);
        pr_devel("Cert Valid From: %04ld-%02d-%02d %02d:%02d:%02d\n",
                 cert->valid_from.tm_year + 1900, cert->valid_from.tm_mon + 1,
                 cert->valid_from.tm_mday, cert->valid_from.tm_hour,
@@ -127,61 +214,29 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
                 cert->valid_to.tm_mday, cert->valid_to.tm_hour,
                 cert->valid_to.tm_min,  cert->valid_to.tm_sec);
        pr_devel("Cert Signature: %s + %s\n",
-                pkey_algo[cert->sig_pkey_algo],
-                pkey_hash_algo[cert->sig_hash_algo]);
+                pkey_algo_name[cert->sig.pkey_algo],
+                hash_algo_name[cert->sig.pkey_hash_algo]);
 
-       if (!cert->fingerprint || !cert->authority) {
-               pr_warn("Cert for '%s' must have SubjKeyId and AuthKeyId extensions\n",
+       if (!cert->fingerprint) {
+               pr_warn("Cert for '%s' must have a SubjKeyId extension\n",
                        cert->subject);
                ret = -EKEYREJECTED;
                goto error_free_cert;
        }
 
-       time_to_tm(CURRENT_TIME.tv_sec, 0, &now);
-       pr_devel("Now: %04ld-%02d-%02d %02d:%02d:%02d\n",
-                now.tm_year + 1900, now.tm_mon + 1, now.tm_mday,
-                now.tm_hour, now.tm_min,  now.tm_sec);
-       if (now.tm_year < cert->valid_from.tm_year ||
-           (now.tm_year == cert->valid_from.tm_year &&
-            (now.tm_mon < cert->valid_from.tm_mon ||
-             (now.tm_mon == cert->valid_from.tm_mon &&
-              (now.tm_mday < cert->valid_from.tm_mday ||
-               (now.tm_mday == cert->valid_from.tm_mday &&
-                (now.tm_hour < cert->valid_from.tm_hour ||
-                 (now.tm_hour == cert->valid_from.tm_hour &&
-                  (now.tm_min < cert->valid_from.tm_min ||
-                   (now.tm_min == cert->valid_from.tm_min &&
-                    (now.tm_sec < cert->valid_from.tm_sec
-                     ))))))))))) {
-               pr_warn("Cert %s is not yet valid\n", cert->fingerprint);
-               ret = -EKEYREJECTED;
-               goto error_free_cert;
-       }
-       if (now.tm_year > cert->valid_to.tm_year ||
-           (now.tm_year == cert->valid_to.tm_year &&
-            (now.tm_mon > cert->valid_to.tm_mon ||
-             (now.tm_mon == cert->valid_to.tm_mon &&
-              (now.tm_mday > cert->valid_to.tm_mday ||
-               (now.tm_mday == cert->valid_to.tm_mday &&
-                (now.tm_hour > cert->valid_to.tm_hour ||
-                 (now.tm_hour == cert->valid_to.tm_hour &&
-                  (now.tm_min > cert->valid_to.tm_min ||
-                   (now.tm_min == cert->valid_to.tm_min &&
-                    (now.tm_sec > cert->valid_to.tm_sec
-                     ))))))))))) {
-               pr_warn("Cert %s has expired\n", cert->fingerprint);
-               ret = -EKEYEXPIRED;
-               goto error_free_cert;
-       }
-
-       cert->pub->algo = x509_public_key_algorithms[cert->pkey_algo];
+       cert->pub->algo = pkey_algo[cert->pub->pkey_algo];
        cert->pub->id_type = PKEY_ID_X509;
 
-       /* Check the signature on the key */
-       if (strcmp(cert->fingerprint, cert->authority) == 0) {
-               ret = x509_check_signature(cert->pub, cert);
+       /* Check the signature on the key if it appears to be self-signed */
+       if (!cert->authority ||
+           strcmp(cert->fingerprint, cert->authority) == 0) {
+               ret = x509_check_signature(cert->pub, cert); /* self-signed */
                if (ret < 0)
                        goto error_free_cert;
+       } else {
+               ret = x509_validate_trust(cert, system_trusted_keyring);
+               if (!ret)
+                       prep->trusted = 1;
        }
 
        /* Propose a description */
@@ -237,3 +292,6 @@ static void __exit x509_key_exit(void)
 
 module_init(x509_key_init);
 module_exit(x509_key_exit);
+
+MODULE_DESCRIPTION("X.509 certificate parser");
+MODULE_LICENSE("GPL");
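
The validity-window check dropped above compared two struct tm values through eleven levels of nested conditionals. Purely as an illustration (not part of this patch), the same test collapses into a field-by-field helper:

    /* Illustrative sketch: lexicographic comparison of broken-down times,
     * most-significant field first.  Returns <0, 0 or >0, like memcmp().
     */
    static int tm_cmp(const struct tm *a, const struct tm *b)
    {
            if (a->tm_year != b->tm_year) return a->tm_year < b->tm_year ? -1 : 1;
            if (a->tm_mon  != b->tm_mon)  return a->tm_mon  < b->tm_mon  ? -1 : 1;
            if (a->tm_mday != b->tm_mday) return a->tm_mday < b->tm_mday ? -1 : 1;
            if (a->tm_hour != b->tm_hour) return a->tm_hour < b->tm_hour ? -1 : 1;
            if (a->tm_min  != b->tm_min)  return a->tm_min  < b->tm_min  ? -1 : 1;
            return (a->tm_sec > b->tm_sec) - (a->tm_sec < b->tm_sec);
    }

"Not yet valid" is then tm_cmp(&now, &cert->valid_from) < 0, and "has expired" is tm_cmp(&now, &cert->valid_to) > 0.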
index 9e62fef..f8c0b8d 100644 (file)
@@ -50,33 +50,36 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
                                                      &dest, 1, &src, 1, len);
        struct dma_device *device = chan ? chan->device : NULL;
        struct dma_async_tx_descriptor *tx = NULL;
+       struct dmaengine_unmap_data *unmap = NULL;
 
-       if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
-               dma_addr_t dma_dest, dma_src;
+       if (device)
+               unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOIO);
+
+       if (unmap && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
                unsigned long dma_prep_flags = 0;
 
                if (submit->cb_fn)
                        dma_prep_flags |= DMA_PREP_INTERRUPT;
                if (submit->flags & ASYNC_TX_FENCE)
                        dma_prep_flags |= DMA_PREP_FENCE;
-               dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
-                                       DMA_FROM_DEVICE);
-
-               dma_src = dma_map_page(device->dev, src, src_offset, len,
-                                      DMA_TO_DEVICE);
-
-               tx = device->device_prep_dma_memcpy(chan, dma_dest, dma_src,
-                                                   len, dma_prep_flags);
-               if (!tx) {
-                       dma_unmap_page(device->dev, dma_dest, len,
-                                      DMA_FROM_DEVICE);
-                       dma_unmap_page(device->dev, dma_src, len,
-                                      DMA_TO_DEVICE);
-               }
+
+               unmap->to_cnt = 1;
+               unmap->addr[0] = dma_map_page(device->dev, src, src_offset, len,
+                                             DMA_TO_DEVICE);
+               unmap->from_cnt = 1;
+               unmap->addr[1] = dma_map_page(device->dev, dest, dest_offset, len,
+                                             DMA_FROM_DEVICE);
+               unmap->len = len;
+
+               tx = device->device_prep_dma_memcpy(chan, unmap->addr[1],
+                                                   unmap->addr[0], len,
+                                                   dma_prep_flags);
        }
 
        if (tx) {
                pr_debug("%s: (async) len: %zu\n", __func__, len);
+
+               dma_set_unmap(tx, unmap);
                async_tx_submit(chan, tx, submit);
        } else {
                void *dest_buf, *src_buf;
@@ -96,6 +99,8 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
                async_tx_sync_epilog(submit);
        }
 
+       dmaengine_unmap_put(unmap);
+
        return tx;
 }
 EXPORT_SYMBOL_GPL(async_memcpy);
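
The async_memcpy conversion above is the template the rest of this series follows: allocate a refcounted dmaengine_unmap_data, record every mapping in it, attach it to the descriptor, then drop the local reference. A condensed sketch of the life cycle (variable names assumed, error handling elided):

    struct dmaengine_unmap_data *unmap;

    unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOIO); /* ref = 1 */
    if (unmap) {
            unmap->to_cnt = 1;      /* entries unmapped DMA_TO_DEVICE */
            unmap->addr[0] = dma_map_page(device->dev, src, src_off, len,
                                          DMA_TO_DEVICE);
            unmap->from_cnt = 1;    /* entries unmapped DMA_FROM_DEVICE */
            unmap->addr[1] = dma_map_page(device->dev, dest, dest_off, len,
                                          DMA_FROM_DEVICE);
            unmap->len = len;

            tx = device->device_prep_dma_memcpy(chan, unmap->addr[1],
                                                unmap->addr[0], len, flags);
            if (tx)
                    dma_set_unmap(tx, unmap);  /* descriptor takes a reference */
    }
    dmaengine_unmap_put(unmap);  /* NULL-safe; pages unmap at the last put */

Because the descriptor now owns the unmap state, completion-time unmapping no longer depends on the DMA_COMPL_SKIP_*_UNMAP flags, which later hunks on this page delete.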
index 91d5d38..d05327c 100644 (file)
@@ -46,49 +46,24 @@ static struct page *pq_scribble_page;
  * do_async_gen_syndrome - asynchronously calculate P and/or Q
  */
 static __async_inline struct dma_async_tx_descriptor *
-do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
-                     const unsigned char *scfs, unsigned int offset, int disks,
-                     size_t len, dma_addr_t *dma_src,
+do_async_gen_syndrome(struct dma_chan *chan,
+                     const unsigned char *scfs, int disks,
+                     struct dmaengine_unmap_data *unmap,
+                     enum dma_ctrl_flags dma_flags,
                      struct async_submit_ctl *submit)
 {
        struct dma_async_tx_descriptor *tx = NULL;
        struct dma_device *dma = chan->device;
-       enum dma_ctrl_flags dma_flags = 0;
        enum async_tx_flags flags_orig = submit->flags;
        dma_async_tx_callback cb_fn_orig = submit->cb_fn;
        dma_async_tx_callback cb_param_orig = submit->cb_param;
        int src_cnt = disks - 2;
-       unsigned char coefs[src_cnt];
        unsigned short pq_src_cnt;
        dma_addr_t dma_dest[2];
        int src_off = 0;
-       int idx;
-       int i;
 
-       /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */
-       if (P(blocks, disks))
-               dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset,
-                                          len, DMA_BIDIRECTIONAL);
-       else
-               dma_flags |= DMA_PREP_PQ_DISABLE_P;
-       if (Q(blocks, disks))
-               dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset,
-                                          len, DMA_BIDIRECTIONAL);
-       else
-               dma_flags |= DMA_PREP_PQ_DISABLE_Q;
-
-       /* convert source addresses being careful to collapse 'empty'
-        * sources and update the coefficients accordingly
-        */
-       for (i = 0, idx = 0; i < src_cnt; i++) {
-               if (blocks[i] == NULL)
-                       continue;
-               dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len,
-                                           DMA_TO_DEVICE);
-               coefs[idx] = scfs[i];
-               idx++;
-       }
-       src_cnt = idx;
+       if (submit->flags & ASYNC_TX_FENCE)
+               dma_flags |= DMA_PREP_FENCE;
 
        while (src_cnt > 0) {
                submit->flags = flags_orig;
@@ -100,28 +75,25 @@ do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
                if (src_cnt > pq_src_cnt) {
                        submit->flags &= ~ASYNC_TX_ACK;
                        submit->flags |= ASYNC_TX_FENCE;
-                       dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
                        submit->cb_fn = NULL;
                        submit->cb_param = NULL;
                } else {
-                       dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP;
                        submit->cb_fn = cb_fn_orig;
                        submit->cb_param = cb_param_orig;
                        if (cb_fn_orig)
                                dma_flags |= DMA_PREP_INTERRUPT;
                }
-               if (submit->flags & ASYNC_TX_FENCE)
-                       dma_flags |= DMA_PREP_FENCE;
 
-               /* Since we have clobbered the src_list we are committed
-                * to doing this asynchronously.  Drivers force forward
-                * progress in case they can not provide a descriptor
+               /* Drivers force forward progress in case they can not provide
+                * a descriptor
                 */
                for (;;) {
+                       dma_dest[0] = unmap->addr[disks - 2];
+                       dma_dest[1] = unmap->addr[disks - 1];
                        tx = dma->device_prep_dma_pq(chan, dma_dest,
-                                                    &dma_src[src_off],
+                                                    &unmap->addr[src_off],
                                                     pq_src_cnt,
-                                                    &coefs[src_off], len,
+                                                    &scfs[src_off], unmap->len,
                                                     dma_flags);
                        if (likely(tx))
                                break;
@@ -129,6 +101,7 @@ do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks,
                        dma_async_issue_pending(chan);
                }
 
+               dma_set_unmap(tx, unmap);
                async_tx_submit(chan, tx, submit);
                submit->depend_tx = tx;
 
@@ -188,10 +161,6 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
  * set to NULL those buffers will be replaced with the raid6_zero_page
  * in the synchronous path and omitted in the hardware-asynchronous
  * path.
- *
- * 'blocks' note: if submit->scribble is NULL then the contents of
- * 'blocks' may be overwritten to perform address conversions
- * (dma_map_page() or page_address()).
  */
 struct dma_async_tx_descriptor *
 async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
@@ -202,26 +171,69 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
                                                      &P(blocks, disks), 2,
                                                      blocks, src_cnt, len);
        struct dma_device *device = chan ? chan->device : NULL;
-       dma_addr_t *dma_src = NULL;
+       struct dmaengine_unmap_data *unmap = NULL;
 
        BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
 
-       if (submit->scribble)
-               dma_src = submit->scribble;
-       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
-               dma_src = (dma_addr_t *) blocks;
+       if (device)
+               unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
 
-       if (dma_src && device &&
+       if (unmap &&
            (src_cnt <= dma_maxpq(device, 0) ||
             dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
            is_dma_pq_aligned(device, offset, 0, len)) {
+               struct dma_async_tx_descriptor *tx;
+               enum dma_ctrl_flags dma_flags = 0;
+               unsigned char coefs[src_cnt];
+               int i, j;
+
                /* run the p+q asynchronously */
                pr_debug("%s: (async) disks: %d len: %zu\n",
                         __func__, disks, len);
-               return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset,
-                                            disks, len, dma_src, submit);
+
+               /* convert source addresses being careful to collapse 'empty'
+                * sources and update the coefficients accordingly
+                */
+               unmap->len = len;
+               for (i = 0, j = 0; i < src_cnt; i++) {
+                       if (blocks[i] == NULL)
+                               continue;
+                       unmap->addr[j] = dma_map_page(device->dev, blocks[i], offset,
+                                                     len, DMA_TO_DEVICE);
+                       coefs[j] = raid6_gfexp[i];
+                       unmap->to_cnt++;
+                       j++;
+               }
+
+               /*
+                * DMAs use destinations as sources,
+                * so use BIDIRECTIONAL mapping
+                */
+               unmap->bidi_cnt++;
+               if (P(blocks, disks))
+                       unmap->addr[j++] = dma_map_page(device->dev, P(blocks, disks),
+                                                       offset, len, DMA_BIDIRECTIONAL);
+               else {
+                       unmap->addr[j++] = 0;
+                       dma_flags |= DMA_PREP_PQ_DISABLE_P;
+               }
+
+               unmap->bidi_cnt++;
+               if (Q(blocks, disks))
+                       unmap->addr[j++] = dma_map_page(device->dev, Q(blocks, disks),
+                                                      offset, len, DMA_BIDIRECTIONAL);
+               else {
+                       unmap->addr[j++] = 0;
+                       dma_flags |= DMA_PREP_PQ_DISABLE_Q;
+               }
+
+               tx = do_async_gen_syndrome(chan, coefs, j, unmap, dma_flags, submit);
+               dmaengine_unmap_put(unmap);
+               return tx;
        }
 
+       dmaengine_unmap_put(unmap);
+
        /* run the pq synchronously */
        pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len);
 
@@ -277,50 +289,60 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
        struct dma_async_tx_descriptor *tx;
        unsigned char coefs[disks-2];
        enum dma_ctrl_flags dma_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
-       dma_addr_t *dma_src = NULL;
-       int src_cnt = 0;
+       struct dmaengine_unmap_data *unmap = NULL;
 
        BUG_ON(disks < 4);
 
-       if (submit->scribble)
-               dma_src = submit->scribble;
-       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
-               dma_src = (dma_addr_t *) blocks;
+       if (device)
+               unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
 
-       if (dma_src && device && disks <= dma_maxpq(device, 0) &&
+       if (unmap && disks <= dma_maxpq(device, 0) &&
            is_dma_pq_aligned(device, offset, 0, len)) {
                struct device *dev = device->dev;
-               dma_addr_t *pq = &dma_src[disks-2];
-               int i;
+               dma_addr_t pq[2];
+               int i, j = 0, src_cnt = 0;
 
                pr_debug("%s: (async) disks: %d len: %zu\n",
                         __func__, disks, len);
-               if (!P(blocks, disks))
+
+               unmap->len = len;
+               for (i = 0; i < disks-2; i++)
+                       if (likely(blocks[i])) {
+                               unmap->addr[j] = dma_map_page(dev, blocks[i],
+                                                             offset, len,
+                                                             DMA_TO_DEVICE);
+                               coefs[j] = raid6_gfexp[i];
+                               unmap->to_cnt++;
+                               src_cnt++;
+                               j++;
+                       }
+
+               if (!P(blocks, disks)) {
+                       pq[0] = 0;
                        dma_flags |= DMA_PREP_PQ_DISABLE_P;
-               else
+               } else {
                        pq[0] = dma_map_page(dev, P(blocks, disks),
                                             offset, len,
                                             DMA_TO_DEVICE);
-               if (!Q(blocks, disks))
+                       unmap->addr[j++] = pq[0];
+                       unmap->to_cnt++;
+               }
+               if (!Q(blocks, disks)) {
+                       pq[1] = 0;
                        dma_flags |= DMA_PREP_PQ_DISABLE_Q;
-               else
+               } else {
                        pq[1] = dma_map_page(dev, Q(blocks, disks),
                                             offset, len,
                                             DMA_TO_DEVICE);
+                       unmap->addr[j++] = pq[1];
+                       unmap->to_cnt++;
+               }
 
                if (submit->flags & ASYNC_TX_FENCE)
                        dma_flags |= DMA_PREP_FENCE;
-               for (i = 0; i < disks-2; i++)
-                       if (likely(blocks[i])) {
-                               dma_src[src_cnt] = dma_map_page(dev, blocks[i],
-                                                               offset, len,
-                                                               DMA_TO_DEVICE);
-                               coefs[src_cnt] = raid6_gfexp[i];
-                               src_cnt++;
-                       }
-
                for (;;) {
-                       tx = device->device_prep_dma_pq_val(chan, pq, dma_src,
+                       tx = device->device_prep_dma_pq_val(chan, pq,
+                                                           unmap->addr,
                                                            src_cnt,
                                                            coefs,
                                                            len, pqres,
@@ -330,6 +352,8 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks,
                        async_tx_quiesce(&submit->depend_tx);
                        dma_async_issue_pending(chan);
                }
+
+               dma_set_unmap(tx, unmap);
                async_tx_submit(chan, tx, submit);
 
                return tx;
index a9f08a6..934a849 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
+#include <linux/dmaengine.h>
 
 static struct dma_async_tx_descriptor *
 async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
@@ -34,35 +35,45 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef,
        struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
                                                      &dest, 1, srcs, 2, len);
        struct dma_device *dma = chan ? chan->device : NULL;
+       struct dmaengine_unmap_data *unmap = NULL;
        const u8 *amul, *bmul;
        u8 ax, bx;
        u8 *a, *b, *c;
 
-       if (dma) {
-               dma_addr_t dma_dest[2];
-               dma_addr_t dma_src[2];
+       if (dma)
+               unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOIO);
+
+       if (unmap) {
                struct device *dev = dma->dev;
+               dma_addr_t pq[2];
                struct dma_async_tx_descriptor *tx;
                enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
 
                if (submit->flags & ASYNC_TX_FENCE)
                        dma_flags |= DMA_PREP_FENCE;
-               dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
-               dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
-               dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
-               tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef,
+               unmap->addr[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
+               unmap->addr[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
+               unmap->to_cnt = 2;
+
+               unmap->addr[2] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+               unmap->bidi_cnt = 1;
+               /* engine only looks at Q, but expects it to follow P */
+               pq[1] = unmap->addr[2];
+
+               unmap->len = len;
+               tx = dma->device_prep_dma_pq(chan, pq, unmap->addr, 2, coef,
                                             len, dma_flags);
                if (tx) {
+                       dma_set_unmap(tx, unmap);
                        async_tx_submit(chan, tx, submit);
+                       dmaengine_unmap_put(unmap);
                        return tx;
                }
 
                /* could not get a descriptor, unmap and fall through to
                 * the synchronous path
                 */
-               dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
-               dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
-               dma_unmap_page(dev, dma_src[1], len, DMA_TO_DEVICE);
+               dmaengine_unmap_put(unmap);
        }
 
        /* run the operation synchronously */
@@ -89,23 +100,38 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
        struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ,
                                                      &dest, 1, &src, 1, len);
        struct dma_device *dma = chan ? chan->device : NULL;
+       struct dmaengine_unmap_data *unmap = NULL;
        const u8 *qmul; /* Q multiplier table */
        u8 *d, *s;
 
-       if (dma) {
+       if (dma)
+               unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOIO);
+
+       if (unmap) {
                dma_addr_t dma_dest[2];
-               dma_addr_t dma_src[1];
                struct device *dev = dma->dev;
                struct dma_async_tx_descriptor *tx;
                enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
 
                if (submit->flags & ASYNC_TX_FENCE)
                        dma_flags |= DMA_PREP_FENCE;
-               dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
-               dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
-               tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
-                                            len, dma_flags);
+               unmap->addr[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
+               unmap->to_cnt++;
+               unmap->addr[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
+               dma_dest[1] = unmap->addr[1];
+               unmap->bidi_cnt++;
+               unmap->len = len;
+
+               /* this looks funny, but the engine looks for Q at
+                * dma_dest[1] and ignores dma_dest[0] as a dest
+                * due to DMA_PREP_PQ_DISABLE_P
+                */
+               tx = dma->device_prep_dma_pq(chan, dma_dest, unmap->addr,
+                                            1, &coef, len, dma_flags);
+
                if (tx) {
+                       dma_set_unmap(tx, unmap);
+                       dmaengine_unmap_put(unmap);
                        async_tx_submit(chan, tx, submit);
                        return tx;
                }
@@ -113,8 +139,7 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len,
                /* could not get a descriptor, unmap and fall through to
                 * the synchronous path
                 */
-               dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL);
-               dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE);
+               dmaengine_unmap_put(unmap);
        }
 
        /* no channel available, or failed to allocate a descriptor, so
index 7be3424..39ea479 100644 (file)
@@ -128,7 +128,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx,
                }
                device->device_issue_pending(chan);
        } else {
-               if (dma_wait_for_async_tx(depend_tx) != DMA_SUCCESS)
+               if (dma_wait_for_async_tx(depend_tx) != DMA_COMPLETE)
                        panic("%s: DMA error waiting for depend_tx\n",
                              __func__);
                tx->tx_submit(tx);
@@ -280,7 +280,7 @@ void async_tx_quiesce(struct dma_async_tx_descriptor **tx)
                 * we are referring to the correct operation
                 */
                BUG_ON(async_tx_test_ack(*tx));
-               if (dma_wait_for_async_tx(*tx) != DMA_SUCCESS)
+               if (dma_wait_for_async_tx(*tx) != DMA_COMPLETE)
                        panic("%s: DMA error waiting for transaction\n",
                              __func__);
                async_tx_ack(*tx);
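
Both hunks above are mechanical fallout of the dmaengine rename of DMA_SUCCESS to DMA_COMPLETE. The same enum is returned by cookie polling; a minimal sketch, assuming chan and cookie are already in hand:

    enum dma_status status;

    do {
            status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
            cpu_relax();
    } while (status == DMA_IN_PROGRESS);

    if (status != DMA_COMPLETE)
            pr_err("%s: DMA transaction failed\n", __func__);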
index 8ade0a0..3c562f5 100644 (file)
 
 /* do_async_xor - dma map the pages and perform the xor with an engine */
 static __async_inline struct dma_async_tx_descriptor *
-do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
-            unsigned int offset, int src_cnt, size_t len, dma_addr_t *dma_src,
+do_async_xor(struct dma_chan *chan, struct dmaengine_unmap_data *unmap,
             struct async_submit_ctl *submit)
 {
        struct dma_device *dma = chan->device;
        struct dma_async_tx_descriptor *tx = NULL;
-       int src_off = 0;
-       int i;
        dma_async_tx_callback cb_fn_orig = submit->cb_fn;
        void *cb_param_orig = submit->cb_param;
        enum async_tx_flags flags_orig = submit->flags;
-       enum dma_ctrl_flags dma_flags;
-       int xor_src_cnt = 0;
-       dma_addr_t dma_dest;
-
-       /* map the dest bidrectional in case it is re-used as a source */
-       dma_dest = dma_map_page(dma->dev, dest, offset, len, DMA_BIDIRECTIONAL);
-       for (i = 0; i < src_cnt; i++) {
-               /* only map the dest once */
-               if (!src_list[i])
-                       continue;
-               if (unlikely(src_list[i] == dest)) {
-                       dma_src[xor_src_cnt++] = dma_dest;
-                       continue;
-               }
-               dma_src[xor_src_cnt++] = dma_map_page(dma->dev, src_list[i], offset,
-                                                     len, DMA_TO_DEVICE);
-       }
-       src_cnt = xor_src_cnt;
+       enum dma_ctrl_flags dma_flags = 0;
+       int src_cnt = unmap->to_cnt;
+       int xor_src_cnt;
+       dma_addr_t dma_dest = unmap->addr[unmap->to_cnt];
+       dma_addr_t *src_list = unmap->addr;
 
        while (src_cnt) {
+               dma_addr_t tmp;
+
                submit->flags = flags_orig;
-               dma_flags = 0;
                xor_src_cnt = min(src_cnt, (int)dma->max_xor);
-               /* if we are submitting additional xors, leave the chain open,
-                * clear the callback parameters, and leave the destination
-                * buffer mapped
+               /* if we are submitting additional xors, leave the chain open
+                * and clear the callback parameters
                 */
                if (src_cnt > xor_src_cnt) {
                        submit->flags &= ~ASYNC_TX_ACK;
                        submit->flags |= ASYNC_TX_FENCE;
-                       dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
                        submit->cb_fn = NULL;
                        submit->cb_param = NULL;
                } else {
@@ -85,12 +68,18 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
                        dma_flags |= DMA_PREP_INTERRUPT;
                if (submit->flags & ASYNC_TX_FENCE)
                        dma_flags |= DMA_PREP_FENCE;
-               /* Since we have clobbered the src_list we are committed
-                * to doing this asynchronously.  Drivers force forward progress
-                * in case they can not provide a descriptor
+
+               /* Drivers force forward progress in case they can not provide a
+                * descriptor
                 */
-               tx = dma->device_prep_dma_xor(chan, dma_dest, &dma_src[src_off],
-                                             xor_src_cnt, len, dma_flags);
+               tmp = src_list[0];
+               if (src_list > unmap->addr)
+                       src_list[0] = dma_dest;
+               tx = dma->device_prep_dma_xor(chan, dma_dest, src_list,
+                                             xor_src_cnt, unmap->len,
+                                             dma_flags);
+               src_list[0] = tmp;
+
 
                if (unlikely(!tx))
                        async_tx_quiesce(&submit->depend_tx);
@@ -99,22 +88,21 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
                while (unlikely(!tx)) {
                        dma_async_issue_pending(chan);
                        tx = dma->device_prep_dma_xor(chan, dma_dest,
-                                                     &dma_src[src_off],
-                                                     xor_src_cnt, len,
+                                                     src_list,
+                                                     xor_src_cnt, unmap->len,
                                                      dma_flags);
                }
 
+               dma_set_unmap(tx, unmap);
                async_tx_submit(chan, tx, submit);
                submit->depend_tx = tx;
 
                if (src_cnt > xor_src_cnt) {
                        /* drop completed sources */
                        src_cnt -= xor_src_cnt;
-                       src_off += xor_src_cnt;
-
                        /* use the intermediate result as a source */
-                       dma_src[--src_off] = dma_dest;
                        src_cnt++;
+                       src_list += xor_src_cnt - 1;
                } else
                        break;
        }
@@ -189,22 +177,40 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
        struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR,
                                                      &dest, 1, src_list,
                                                      src_cnt, len);
-       dma_addr_t *dma_src = NULL;
+       struct dma_device *device = chan ? chan->device : NULL;
+       struct dmaengine_unmap_data *unmap = NULL;
 
        BUG_ON(src_cnt <= 1);
 
-       if (submit->scribble)
-               dma_src = submit->scribble;
-       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
-               dma_src = (dma_addr_t *) src_list;
+       if (device)
+               unmap = dmaengine_get_unmap_data(device->dev, src_cnt+1, GFP_NOIO);
+
+       if (unmap && is_dma_xor_aligned(device, offset, 0, len)) {
+               struct dma_async_tx_descriptor *tx;
+               int i, j;
 
-       if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) {
                /* run the xor asynchronously */
                pr_debug("%s (async): len: %zu\n", __func__, len);
 
-               return do_async_xor(chan, dest, src_list, offset, src_cnt, len,
-                                   dma_src, submit);
+               unmap->len = len;
+               for (i = 0, j = 0; i < src_cnt; i++) {
+                       if (!src_list[i])
+                               continue;
+                       unmap->to_cnt++;
+                       unmap->addr[j++] = dma_map_page(device->dev, src_list[i],
+                                                       offset, len, DMA_TO_DEVICE);
+               }
+
+               /* map it bidirectional as it may be re-used as a source */
+               unmap->addr[j] = dma_map_page(device->dev, dest, offset, len,
+                                             DMA_BIDIRECTIONAL);
+               unmap->bidi_cnt = 1;
+
+               tx = do_async_xor(chan, unmap, submit);
+               dmaengine_unmap_put(unmap);
+               return tx;
        } else {
+               dmaengine_unmap_put(unmap);
                /* run the xor synchronously */
                pr_debug("%s (sync): len: %zu\n", __func__, len);
                WARN_ONCE(chan, "%s: no space for dma address conversion\n",
@@ -268,16 +274,14 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
        struct dma_chan *chan = xor_val_chan(submit, dest, src_list, src_cnt, len);
        struct dma_device *device = chan ? chan->device : NULL;
        struct dma_async_tx_descriptor *tx = NULL;
-       dma_addr_t *dma_src = NULL;
+       struct dmaengine_unmap_data *unmap = NULL;
 
        BUG_ON(src_cnt <= 1);
 
-       if (submit->scribble)
-               dma_src = submit->scribble;
-       else if (sizeof(dma_addr_t) <= sizeof(struct page *))
-               dma_src = (dma_addr_t *) src_list;
+       if (device)
+               unmap = dmaengine_get_unmap_data(device->dev, src_cnt, GFP_NOIO);
 
-       if (dma_src && device && src_cnt <= device->max_xor &&
+       if (unmap && src_cnt <= device->max_xor &&
            is_dma_xor_aligned(device, offset, 0, len)) {
                unsigned long dma_prep_flags = 0;
                int i;
@@ -288,11 +292,15 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
                        dma_prep_flags |= DMA_PREP_INTERRUPT;
                if (submit->flags & ASYNC_TX_FENCE)
                        dma_prep_flags |= DMA_PREP_FENCE;
-               for (i = 0; i < src_cnt; i++)
-                       dma_src[i] = dma_map_page(device->dev, src_list[i],
-                                                 offset, len, DMA_TO_DEVICE);
 
-               tx = device->device_prep_dma_xor_val(chan, dma_src, src_cnt,
+               for (i = 0; i < src_cnt; i++) {
+                       unmap->addr[i] = dma_map_page(device->dev, src_list[i],
+                                                     offset, len, DMA_TO_DEVICE);
+                       unmap->to_cnt++;
+               }
+               unmap->len = len;
+
+               tx = device->device_prep_dma_xor_val(chan, unmap->addr, src_cnt,
                                                     len, result,
                                                     dma_prep_flags);
                if (unlikely(!tx)) {
@@ -301,11 +309,11 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
                        while (!tx) {
                                dma_async_issue_pending(chan);
                                tx = device->device_prep_dma_xor_val(chan,
-                                       dma_src, src_cnt, len, result,
+                                       unmap->addr, src_cnt, len, result,
                                        dma_prep_flags);
                        }
                }
-
+               dma_set_unmap(tx, unmap);
                async_tx_submit(chan, tx, submit);
        } else {
                enum async_tx_flags flags_orig = submit->flags;
@@ -327,6 +335,7 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset,
                async_tx_sync_epilog(submit);
                submit->flags = flags_orig;
        }
+       dmaengine_unmap_put(unmap);
 
        return tx;
 }
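
The subtlest part of do_async_xor() above is the continuation splice: when src_cnt exceeds dma->max_xor, the destination of one iteration is fed back as the first source of the next, but only for the duration of the prep call. An annotated restatement of that fragment:

    tmp = src_list[0];
    if (src_list > unmap->addr)          /* i.e. not the first iteration */
            src_list[0] = dma_dest;      /* splice in the intermediate xor */
    tx = dma->device_prep_dma_xor(chan, dma_dest, src_list,
                                  xor_src_cnt, unmap->len, dma_flags);
    src_list[0] = tmp;                   /* restore the caller's address */

Restoring src_list[0] keeps unmap->addr[] an intact record of every mapped page, which is what makes completion-time unmapping possible; the old scribble-based code clobbered the source list and could not offer that guarantee.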
index 4a92bac..dad95f4 100644 (file)
@@ -28,7 +28,7 @@
 #undef pr
 #define pr(fmt, args...) pr_info("raid6test: " fmt, ##args)
 
-#define NDISKS 16 /* Including P and Q */
+#define NDISKS 64 /* Including P and Q */
 
 static struct page *dataptrs[NDISKS];
 static addr_conv_t addr_conv[NDISKS];
@@ -219,6 +219,14 @@ static int raid6_test(void)
                err += test(11, &tests);
                err += test(12, &tests);
        }
+
+       /* the 24 disk case is special for ioatdma as it is the boundary point
+        * at which it needs to switch from 8-source ops to 16-source
+        * ops for continuation (assumes DMA_HAS_PQ_CONTINUE is not set)
+        */
+       if (NDISKS > 24)
+               err += test(24, &tests);
+
        err += test(NDISKS, &tests);
 
        pr("\n");
diff --git a/crypto/hash_info.c b/crypto/hash_info.c
new file mode 100644 (file)
index 0000000..3e7ff46
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/export.h>
+#include <crypto/hash_info.h>
+
+const char *const hash_algo_name[HASH_ALGO__LAST] = {
+       [HASH_ALGO_MD4]         = "md4",
+       [HASH_ALGO_MD5]         = "md5",
+       [HASH_ALGO_SHA1]        = "sha1",
+       [HASH_ALGO_RIPE_MD_160] = "rmd160",
+       [HASH_ALGO_SHA256]      = "sha256",
+       [HASH_ALGO_SHA384]      = "sha384",
+       [HASH_ALGO_SHA512]      = "sha512",
+       [HASH_ALGO_SHA224]      = "sha224",
+       [HASH_ALGO_RIPE_MD_128] = "rmd128",
+       [HASH_ALGO_RIPE_MD_256] = "rmd256",
+       [HASH_ALGO_RIPE_MD_320] = "rmd320",
+       [HASH_ALGO_WP_256]      = "wp256",
+       [HASH_ALGO_WP_384]      = "wp384",
+       [HASH_ALGO_WP_512]      = "wp512",
+       [HASH_ALGO_TGR_128]     = "tgr128",
+       [HASH_ALGO_TGR_160]     = "tgr160",
+       [HASH_ALGO_TGR_192]     = "tgr192",
+};
+EXPORT_SYMBOL_GPL(hash_algo_name);
+
+const int hash_digest_size[HASH_ALGO__LAST] = {
+       [HASH_ALGO_MD4]         = MD5_DIGEST_SIZE,
+       [HASH_ALGO_MD5]         = MD5_DIGEST_SIZE,
+       [HASH_ALGO_SHA1]        = SHA1_DIGEST_SIZE,
+       [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE,
+       [HASH_ALGO_SHA256]      = SHA256_DIGEST_SIZE,
+       [HASH_ALGO_SHA384]      = SHA384_DIGEST_SIZE,
+       [HASH_ALGO_SHA512]      = SHA512_DIGEST_SIZE,
+       [HASH_ALGO_SHA224]      = SHA224_DIGEST_SIZE,
+       [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE,
+       [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE,
+       [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE,
+       [HASH_ALGO_WP_256]      = WP256_DIGEST_SIZE,
+       [HASH_ALGO_WP_384]      = WP384_DIGEST_SIZE,
+       [HASH_ALGO_WP_512]      = WP512_DIGEST_SIZE,
+       [HASH_ALGO_TGR_128]     = TGR128_DIGEST_SIZE,
+       [HASH_ALGO_TGR_160]     = TGR160_DIGEST_SIZE,
+       [HASH_ALGO_TGR_192]     = TGR192_DIGEST_SIZE,
+};
+EXPORT_SYMBOL_GPL(hash_digest_size);
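
The two exported tables map the shared enum hash_algo onto crypto API names and digest lengths. A hedged usage sketch (describe_algo() is a hypothetical caller):

    #include <linux/printk.h>
    #include <crypto/hash_info.h>

    static int describe_algo(enum hash_algo algo)
    {
            if (algo >= HASH_ALGO__LAST)
                    return -EINVAL;
            pr_info("%s produces a %d-byte digest\n",
                    hash_algo_name[algo], hash_digest_size[algo]);
            return 0;
    }

The MD4 entry reusing MD5_DIGEST_SIZE is intentional: both digests are 16 bytes.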
index c95df0b..5d92485 100644 (file)
@@ -235,17 +235,6 @@ config ACPI_INITRD_TABLE_OVERRIDE
          initrd, therefore it's safe to say Y.
          See Documentation/acpi/initrd_table_override.txt for details
 
-config ACPI_BLACKLIST_YEAR
-       int "Disable ACPI for systems before Jan 1st this year" if X86_32
-       default 0
-       help
-         Enter a 4-digit year, e.g., 2001, to disable ACPI by default
-         on platforms with DMI BIOS date before January 1st that year.
-         "acpi=force" can be used to override this mechanism.
-
-         Enter 0 to disable this mechanism and allow ACPI to
-         run by default no matter what the year.  (default)
-
 config ACPI_DEBUG
        bool "Debug Statements"
        default n
index b9f0d5f..8711e37 100644 (file)
@@ -56,7 +56,6 @@ static int ac_sleep_before_get_state_ms;
 
 struct acpi_ac {
        struct power_supply charger;
-       struct acpi_device *adev;
        struct platform_device *pdev;
        unsigned long long state;
 };
@@ -70,8 +69,9 @@ struct acpi_ac {
 static int acpi_ac_get_state(struct acpi_ac *ac)
 {
        acpi_status status;
+       acpi_handle handle = ACPI_HANDLE(&ac->pdev->dev);
 
-       status = acpi_evaluate_integer(ac->adev->handle, "_PSR", NULL,
+       status = acpi_evaluate_integer(handle, "_PSR", NULL,
                                       &ac->state);
        if (ACPI_FAILURE(status)) {
                ACPI_EXCEPTION((AE_INFO, status,
@@ -119,6 +119,7 @@ static enum power_supply_property ac_props[] = {
 static void acpi_ac_notify_handler(acpi_handle handle, u32 event, void *data)
 {
        struct acpi_ac *ac = data;
+       struct acpi_device *adev;
 
        if (!ac)
                return;
@@ -141,10 +142,11 @@ static void acpi_ac_notify_handler(acpi_handle handle, u32 event, void *data)
                        msleep(ac_sleep_before_get_state_ms);
 
                acpi_ac_get_state(ac);
-               acpi_bus_generate_netlink_event(ac->adev->pnp.device_class,
+               adev = ACPI_COMPANION(&ac->pdev->dev);
+               acpi_bus_generate_netlink_event(adev->pnp.device_class,
                                                dev_name(&ac->pdev->dev),
                                                event, (u32) ac->state);
-               acpi_notifier_call_chain(ac->adev, event, (u32) ac->state);
+               acpi_notifier_call_chain(adev, event, (u32) ac->state);
                kobject_uevent(&ac->charger.dev->kobj, KOBJ_CHANGE);
        }
 
@@ -178,8 +180,8 @@ static int acpi_ac_probe(struct platform_device *pdev)
        if (!pdev)
                return -EINVAL;
 
-       result = acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev);
-       if (result)
+       adev = ACPI_COMPANION(&pdev->dev);
+       if (!adev)
                return -ENODEV;
 
        ac = kzalloc(sizeof(struct acpi_ac), GFP_KERNEL);
@@ -188,7 +190,6 @@ static int acpi_ac_probe(struct platform_device *pdev)
 
        strcpy(acpi_device_name(adev), ACPI_AC_DEVICE_NAME);
        strcpy(acpi_device_class(adev), ACPI_AC_CLASS);
-       ac->adev = adev;
        ac->pdev = pdev;
        platform_set_drvdata(pdev, ac);
 
index d396101..6745fe1 100644 (file)
@@ -163,6 +163,15 @@ static const struct acpi_device_id acpi_lpss_device_ids[] = {
        { "80860F41", (unsigned long)&byt_i2c_dev_desc },
        { "INT33B2", },
 
+       { "INT3430", (unsigned long)&lpt_dev_desc },
+       { "INT3431", (unsigned long)&lpt_dev_desc },
+       { "INT3432", (unsigned long)&lpt_dev_desc },
+       { "INT3433", (unsigned long)&lpt_dev_desc },
+       { "INT3434", (unsigned long)&lpt_uart_dev_desc },
+       { "INT3435", (unsigned long)&lpt_uart_dev_desc },
+       { "INT3436", (unsigned long)&lpt_sdio_dev_desc },
+       { "INT3437", },
+
        { }
 };
 
index 8a4cfc7..dbfe49e 100644 (file)
@@ -111,7 +111,7 @@ int acpi_create_platform_device(struct acpi_device *adev,
        pdevinfo.id = -1;
        pdevinfo.res = resources;
        pdevinfo.num_res = count;
-       pdevinfo.acpi_node.handle = adev->handle;
+       pdevinfo.acpi_node.companion = adev;
        pdev = platform_device_register_full(&pdevinfo);
        if (IS_ERR(pdev)) {
                dev_err(&adev->dev, "platform device creation failed: %ld\n",
index fb84837..078c4f7 100644 (file)
@@ -75,39 +75,6 @@ static struct acpi_blacklist_item acpi_blacklist[] __initdata = {
        {""}
 };
 
-#if    CONFIG_ACPI_BLACKLIST_YEAR
-
-static int __init blacklist_by_year(void)
-{
-       int year;
-
-       /* Doesn't exist? Likely an old system */
-       if (!dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL)) {
-               printk(KERN_ERR PREFIX "no DMI BIOS year, "
-                       "acpi=force is required to enable ACPI\n" );
-               return 1;
-       }
-       /* 0? Likely a buggy new BIOS */
-       if (year == 0) {
-               printk(KERN_ERR PREFIX "DMI BIOS year==0, "
-                       "assuming ACPI-capable machine\n" );
-               return 0;
-       }
-       if (year < CONFIG_ACPI_BLACKLIST_YEAR) {
-               printk(KERN_ERR PREFIX "BIOS age (%d) fails cutoff (%d), "
-                      "acpi=force is required to enable ACPI\n",
-                      year, CONFIG_ACPI_BLACKLIST_YEAR);
-               return 1;
-       }
-       return 0;
-}
-#else
-static inline int blacklist_by_year(void)
-{
-       return 0;
-}
-#endif
-
 int __init acpi_blacklisted(void)
 {
        int i = 0;
@@ -166,8 +133,6 @@ int __init acpi_blacklisted(void)
                }
        }
 
-       blacklisted += blacklist_by_year();
-
        dmi_check_system(acpi_osi_dmi_table);
 
        return blacklisted;
index d42b2fb..b3480cf 100644 (file)
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
 
-#include <linux/device.h>
+#include <linux/acpi.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/pm_qos.h>
 #include <linux/pm_runtime.h>
 
-#include <acpi/acpi.h>
-#include <acpi/acpi_bus.h>
-#include <acpi/acpi_drivers.h>
-
 #include "internal.h"
 
 #define _COMPONENT     ACPI_POWER_COMPONENT
@@ -548,7 +544,7 @@ static int acpi_dev_pm_get_state(struct device *dev, struct acpi_device *adev,
  */
 int acpi_pm_device_sleep_state(struct device *dev, int *d_min_p, int d_max_in)
 {
-       acpi_handle handle = DEVICE_ACPI_HANDLE(dev);
+       acpi_handle handle = ACPI_HANDLE(dev);
        struct acpi_device *adev;
        int ret, d_min, d_max;
 
@@ -656,7 +652,7 @@ int acpi_pm_device_run_wake(struct device *phys_dev, bool enable)
        if (!device_run_wake(phys_dev))
                return -EINVAL;
 
-       handle = DEVICE_ACPI_HANDLE(phys_dev);
+       handle = ACPI_HANDLE(phys_dev);
        if (!handle || acpi_bus_get_device(handle, &adev)) {
                dev_dbg(phys_dev, "ACPI handle without context in %s!\n",
                        __func__);
@@ -700,7 +696,7 @@ int acpi_pm_device_sleep_wake(struct device *dev, bool enable)
        if (!device_can_wakeup(dev))
                return -EINVAL;
 
-       handle = DEVICE_ACPI_HANDLE(dev);
+       handle = ACPI_HANDLE(dev);
        if (!handle || acpi_bus_get_device(handle, &adev)) {
                dev_dbg(dev, "ACPI handle without context in %s!\n", __func__);
                return -ENODEV;
@@ -722,7 +718,7 @@ int acpi_pm_device_sleep_wake(struct device *dev, bool enable)
  */
 struct acpi_device *acpi_dev_pm_get_node(struct device *dev)
 {
-       acpi_handle handle = DEVICE_ACPI_HANDLE(dev);
+       acpi_handle handle = ACPI_HANDLE(dev);
        struct acpi_device *adev;
 
        return handle && !acpi_bus_get_device(handle, &adev) ? adev : NULL;
index d5309fd..ba5b56d 100644 (file)
@@ -173,9 +173,10 @@ static void start_transaction(struct acpi_ec *ec)
 static void advance_transaction(struct acpi_ec *ec, u8 status)
 {
        unsigned long flags;
-       struct transaction *t = ec->curr;
+       struct transaction *t;
 
        spin_lock_irqsave(&ec->lock, flags);
+       t = ec->curr;
        if (!t)
                goto unlock;
        if (t->wlen > t->wi) {
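
The moved assignment is the entire fix: ec->curr is updated by other contexts holding ec->lock, so sampling it before taking the lock could hand advance_transaction() a stale transaction. The general pattern, as a sketch with stand-in names:

    spin_lock_irqsave(&lock, flags);
    t = shared_ptr;          /* sample under the lock the writers hold */
    if (t)
            use(t);          /* use() stands in for the real work */
    spin_unlock_irqrestore(&lock, flags);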
index 10f0f40..a22a295 100644 (file)
@@ -197,30 +197,28 @@ static void acpi_physnode_link_name(char *buf, unsigned int node_id)
 
 int acpi_bind_one(struct device *dev, acpi_handle handle)
 {
-       struct acpi_device *acpi_dev;
-       acpi_status status;
+       struct acpi_device *acpi_dev = NULL;
        struct acpi_device_physical_node *physical_node, *pn;
        char physical_node_name[PHYSICAL_NODE_NAME_SIZE];
        struct list_head *physnode_list;
        unsigned int node_id;
        int retval = -EINVAL;
 
-       if (ACPI_HANDLE(dev)) {
+       if (ACPI_COMPANION(dev)) {
                if (handle) {
-                       dev_warn(dev, "ACPI handle is already set\n");
+                       dev_warn(dev, "ACPI companion already set\n");
                        return -EINVAL;
                } else {
-                       handle = ACPI_HANDLE(dev);
+                       acpi_dev = ACPI_COMPANION(dev);
                }
+       } else {
+               acpi_bus_get_device(handle, &acpi_dev);
        }
-       if (!handle)
+       if (!acpi_dev)
                return -EINVAL;
 
+       get_device(&acpi_dev->dev);
        get_device(dev);
-       status = acpi_bus_get_device(handle, &acpi_dev);
-       if (ACPI_FAILURE(status))
-               goto err;
-
        physical_node = kzalloc(sizeof(*physical_node), GFP_KERNEL);
        if (!physical_node) {
                retval = -ENOMEM;
@@ -242,10 +240,11 @@ int acpi_bind_one(struct device *dev, acpi_handle handle)
 
                        dev_warn(dev, "Already associated with ACPI node\n");
                        kfree(physical_node);
-                       if (ACPI_HANDLE(dev) != handle)
+                       if (ACPI_COMPANION(dev) != acpi_dev)
                                goto err;
 
                        put_device(dev);
+                       put_device(&acpi_dev->dev);
                        return 0;
                }
                if (pn->node_id == node_id) {
@@ -259,8 +258,8 @@ int acpi_bind_one(struct device *dev, acpi_handle handle)
        list_add(&physical_node->node, physnode_list);
        acpi_dev->physical_node_count++;
 
-       if (!ACPI_HANDLE(dev))
-               ACPI_HANDLE_SET(dev, acpi_dev->handle);
+       if (!ACPI_COMPANION(dev))
+               ACPI_COMPANION_SET(dev, acpi_dev);
 
        acpi_physnode_link_name(physical_node_name, node_id);
        retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj,
@@ -283,27 +282,21 @@ int acpi_bind_one(struct device *dev, acpi_handle handle)
        return 0;
 
  err:
-       ACPI_HANDLE_SET(dev, NULL);
+       ACPI_COMPANION_SET(dev, NULL);
        put_device(dev);
+       put_device(&acpi_dev->dev);
        return retval;
 }
 EXPORT_SYMBOL_GPL(acpi_bind_one);
 
 int acpi_unbind_one(struct device *dev)
 {
+       struct acpi_device *acpi_dev = ACPI_COMPANION(dev);
        struct acpi_device_physical_node *entry;
-       struct acpi_device *acpi_dev;
-       acpi_status status;
 
-       if (!ACPI_HANDLE(dev))
+       if (!acpi_dev)
                return 0;
 
-       status = acpi_bus_get_device(ACPI_HANDLE(dev), &acpi_dev);
-       if (ACPI_FAILURE(status)) {
-               dev_err(dev, "Oops, ACPI handle corrupt in %s()\n", __func__);
-               return -EINVAL;
-       }
-
        mutex_lock(&acpi_dev->physical_node_lock);
 
        list_for_each_entry(entry, &acpi_dev->physical_node_list, node)
@@ -316,9 +309,10 @@ int acpi_unbind_one(struct device *dev)
                        acpi_physnode_link_name(physnode_name, entry->node_id);
                        sysfs_remove_link(&acpi_dev->dev.kobj, physnode_name);
                        sysfs_remove_link(&dev->kobj, "firmware_node");
-                       ACPI_HANDLE_SET(dev, NULL);
-                       /* acpi_bind_one() increase refcnt by one. */
+                       ACPI_COMPANION_SET(dev, NULL);
+                       /* Drop references taken by acpi_bind_one(). */
                        put_device(dev);
+                       put_device(&acpi_dev->dev);
                        kfree(entry);
                        break;
                }
@@ -328,6 +322,15 @@ int acpi_unbind_one(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(acpi_unbind_one);
 
+void acpi_preset_companion(struct device *dev, acpi_handle parent, u64 addr)
+{
+       struct acpi_device *adev;
+
+       if (!acpi_bus_get_device(acpi_get_child(parent, addr), &adev))
+               ACPI_COMPANION_SET(dev, adev);
+}
+EXPORT_SYMBOL_GPL(acpi_preset_companion);
+
 static int acpi_platform_notify(struct device *dev)
 {
        struct acpi_bus_type *type = acpi_get_bus_type(dev);
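
acpi_preset_companion() packages a lookup-then-bind sequence that callers used to open-code. The libata hunks later on this page show the conversion:

    /* before */
    ACPI_HANDLE_SET(&ap->tdev, acpi_get_child(host_handle, ap->port_no));

    /* after */
    acpi_preset_companion(&ap->tdev, host_handle, ap->port_no);

The helper resolves the child handle to a struct acpi_device and installs it with ACPI_COMPANION_SET(), quietly doing nothing when no matching child exists.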
index 56f0586..0703bff 100644 (file)
@@ -575,6 +575,7 @@ static int acpi_pci_root_add(struct acpi_device *device,
                dev_err(&device->dev,
                        "Bus %04x:%02x not present in PCI namespace\n",
                        root->segment, (unsigned int)root->secondary.start);
+               device->driver_data = NULL;
                result = -ENODEV;
                goto end;
        }
index 55f9ded..15daa21 100644 (file)
@@ -289,24 +289,17 @@ void acpi_bus_device_eject(void *data, u32 ost_src)
 {
        struct acpi_device *device = data;
        acpi_handle handle = device->handle;
-       struct acpi_scan_handler *handler;
        u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE;
        int error;
 
        lock_device_hotplug();
        mutex_lock(&acpi_scan_lock);
 
-       handler = device->handler;
-       if (!handler || !handler->hotplug.enabled) {
-               put_device(&device->dev);
-               goto err_support;
-       }
-
        if (ost_src == ACPI_NOTIFY_EJECT_REQUEST)
                acpi_evaluate_hotplug_ost(handle, ACPI_NOTIFY_EJECT_REQUEST,
                                          ACPI_OST_SC_EJECT_IN_PROGRESS, NULL);
 
-       if (handler->hotplug.mode == AHM_CONTAINER)
+       if (device->handler && device->handler->hotplug.mode == AHM_CONTAINER)
                kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE);
 
        error = acpi_scan_hot_remove(device);
@@ -411,8 +404,7 @@ static void acpi_hotplug_notify_cb(acpi_handle handle, u32 type, void *data)
                break;
        case ACPI_NOTIFY_EJECT_REQUEST:
                acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n");
-               status = acpi_bus_get_device(handle, &adev);
-               if (ACPI_FAILURE(status))
+               if (acpi_bus_get_device(handle, &adev))
                        goto err_out;
 
                get_device(&adev->dev);
@@ -1997,6 +1989,7 @@ static int acpi_bus_scan_fixed(void)
                if (result)
                        return result;
 
+               device->flags.match_driver = true;
                result = device_attach(&device->dev);
                if (result < 0)
                        return result;
@@ -2013,6 +2006,7 @@ static int acpi_bus_scan_fixed(void)
                if (result)
                        return result;
 
+               device->flags.match_driver = true;
                result = device_attach(&device->dev);
        }
 
index 18dbdff..995e91b 100644 (file)
@@ -82,13 +82,6 @@ static bool allow_duplicates;
 module_param(allow_duplicates, bool, 0644);
 
 /*
- * Some BIOSes claim they use minimum backlight at boot,
- * and this may bring dimming screen after boot
- */
-static bool use_bios_initial_backlight = 1;
-module_param(use_bios_initial_backlight, bool, 0644);
-
-/*
  * For Windows 8 systems: if set true and the GPU driver has
  * registered a backlight interface, skip registering ACPI video's.
  */
@@ -406,12 +399,6 @@ static int __init video_set_bqc_offset(const struct dmi_system_id *d)
        return 0;
 }
 
-static int video_ignore_initial_backlight(const struct dmi_system_id *d)
-{
-       use_bios_initial_backlight = 0;
-       return 0;
-}
-
 static struct dmi_system_id video_dmi_table[] __initdata = {
        /*
         * Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121
@@ -456,54 +443,6 @@ static struct dmi_system_id video_dmi_table[] __initdata = {
                DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 7720"),
                },
        },
-       {
-        .callback = video_ignore_initial_backlight,
-        .ident = "HP Folio 13-2000",
-        .matches = {
-               DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-               DMI_MATCH(DMI_PRODUCT_NAME, "HP Folio 13 - 2000 Notebook PC"),
-               },
-       },
-       {
-        .callback = video_ignore_initial_backlight,
-        .ident = "Fujitsu E753",
-        .matches = {
-               DMI_MATCH(DMI_BOARD_VENDOR, "FUJITSU"),
-               DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E753"),
-               },
-       },
-       {
-        .callback = video_ignore_initial_backlight,
-        .ident = "HP Pavilion dm4",
-        .matches = {
-               DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-               DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dm4 Notebook PC"),
-               },
-       },
-       {
-        .callback = video_ignore_initial_backlight,
-        .ident = "HP Pavilion g6 Notebook PC",
-        .matches = {
-                DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-                DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion g6 Notebook PC"),
-               },
-       },
-       {
-        .callback = video_ignore_initial_backlight,
-        .ident = "HP 1000 Notebook PC",
-        .matches = {
-               DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-               DMI_MATCH(DMI_PRODUCT_NAME, "HP 1000 Notebook PC"),
-               },
-       },
-       {
-        .callback = video_ignore_initial_backlight,
-        .ident = "HP Pavilion m4",
-        .matches = {
-               DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-               DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion m4 Notebook PC"),
-               },
-       },
        {}
 };
 
@@ -839,20 +778,18 @@ acpi_video_init_brightness(struct acpi_video_device *device)
        if (!device->cap._BQC)
                goto set_level;
 
-       if (use_bios_initial_backlight) {
-               level = acpi_video_bqc_value_to_level(device, level_old);
-               /*
-                * On some buggy laptops, _BQC returns an uninitialized
-                * value when invoked for the first time, i.e.
-                * level_old is invalid (no matter whether it's a level
-                * or an index). Set the backlight to max_level in this case.
-                */
-               for (i = 2; i < br->count; i++)
-                       if (level == br->levels[i])
-                               break;
-               if (i == br->count || !level)
-                       level = max_level;
-       }
+       level = acpi_video_bqc_value_to_level(device, level_old);
+       /*
+        * On some buggy laptops, _BQC returns an uninitialized
+        * value when invoked for the first time, i.e.
+        * level_old is invalid (no matter whether it's a level
+        * or an index). Set the backlight to max_level in this case.
+        */
+       for (i = 2; i < br->count; i++)
+               if (level == br->levels[i])
+                       break;
+       if (i == br->count || !level)
+               level = max_level;
 
 set_level:
        result = acpi_video_device_lcd_set_level(device, level);
index ab714d2..4372cfa 100644 (file)
@@ -185,7 +185,7 @@ void ata_acpi_bind_port(struct ata_port *ap)
        if (libata_noacpi || ap->flags & ATA_FLAG_ACPI_SATA || !host_handle)
                return;
 
-       ACPI_HANDLE_SET(&ap->tdev, acpi_get_child(host_handle, ap->port_no));
+       acpi_preset_companion(&ap->tdev, host_handle, ap->port_no);
 
        if (ata_acpi_gtm(ap, &ap->__acpi_init_gtm) == 0)
                ap->pflags |= ATA_PFLAG_INIT_GTM_VALID;
@@ -222,7 +222,7 @@ void ata_acpi_bind_dev(struct ata_device *dev)
                parent_handle = port_handle;
        }
 
-       ACPI_HANDLE_SET(&dev->tdev, acpi_get_child(parent_handle, adr));
+       acpi_preset_companion(&dev->tdev, parent_handle, adr);
 
        register_hotplug_dock_device(ata_dev_acpi_handle(dev),
                                     &ata_acpi_dev_dock_ops, dev, NULL, NULL);
index 853f610..e88690e 100644 (file)
@@ -396,8 +396,7 @@ dma_xfer(struct arasan_cf_dev *acdev, dma_addr_t src, dma_addr_t dest, u32 len)
        struct dma_async_tx_descriptor *tx;
        struct dma_chan *chan = acdev->dma_chan;
        dma_cookie_t cookie;
-       unsigned long flags = DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP |
-               DMA_COMPL_SKIP_DEST_UNMAP;
+       unsigned long flags = DMA_PREP_INTERRUPT;
        int ret = 0;
 
        tx = chan->device->device_prep_dma_memcpy(chan, dest, src, len, flags);
index 47051cd..3a94b79 100644 (file)
@@ -432,7 +432,7 @@ struct platform_device *platform_device_register_full(
                goto err_alloc;
 
        pdev->dev.parent = pdevinfo->parent;
-       ACPI_HANDLE_SET(&pdev->dev, pdevinfo->acpi_node.handle);
+       ACPI_COMPANION_SET(&pdev->dev, pdevinfo->acpi_node.companion);
 
        if (pdevinfo->dma_mask) {
                /*
@@ -463,7 +463,7 @@ struct platform_device *platform_device_register_full(
        ret = platform_device_add(pdev);
        if (ret) {
 err:
-               ACPI_HANDLE_SET(&pdev->dev, NULL);
+               ACPI_COMPANION_SET(&pdev->dev, NULL);
                kfree(pdev->dev.dma_mask);
 
 err_alloc:
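Together with the libata hunks above, this is part of the 3.13 conversion from storing a raw acpi_handle in struct device to storing a pointer to the ACPI companion object. The resulting usage pattern, as a minimal sketch rather than the full ACPI glue:

static void bind_acpi_companion(struct device *dev, struct acpi_device *adev)
{
	ACPI_COMPANION_SET(dev, adev);	/* replaces ACPI_HANDLE_SET() */
}

static struct acpi_device *lookup_acpi_companion(struct device *dev)
{
	return ACPI_COMPANION(dev);	/* NULL when no companion is bound */
}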
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index c12e9b9..1b41fca 100644
@@ -1350,6 +1350,9 @@ static int device_prepare(struct device *dev, pm_message_t state)
 
        device_unlock(dev);
 
+       if (error)
+               pm_runtime_put(dev);
+
        return error;
 }
 
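device_prepare() takes a runtime-PM reference on entry via pm_runtime_get_noresume(); before this hunk, an error from the ->prepare() callback leaked that reference and left the device pinned active. The balance, as a hedged sketch with a hypothetical do_prepare_callback() helper:

/* Sketch of the get/put balance the hunk restores (not the PM core). */
static int prepare_sketch(struct device *dev)
{
	int error;

	pm_runtime_get_noresume(dev);		/* taken before ->prepare() */
	error = do_prepare_callback(dev);	/* hypothetical helper */
	if (error)
		pm_runtime_put(dev);		/* drop the reference on failure */
	return error;
}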
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index b5d8423..ea192ec 100644
@@ -223,7 +223,7 @@ static void null_softirq_done_fn(struct request *rq)
        blk_end_request_all(rq, 0);
 }
 
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
 
 static void null_ipi_cmd_end_io(void *data)
 {
@@ -260,7 +260,7 @@ static void null_cmd_end_ipi(struct nullb_cmd *cmd)
        put_cpu();
 }
 
-#endif /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+#endif /* CONFIG_SMP */
 
 static inline void null_handle_cmd(struct nullb_cmd *cmd)
 {
@@ -270,7 +270,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
                end_cmd(cmd);
                break;
        case NULL_IRQ_SOFTIRQ:
-#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#ifdef CONFIG_SMP
                null_cmd_end_ipi(cmd);
 #else
                end_cmd(cmd);
@@ -571,7 +571,7 @@ static int __init null_init(void)
 {
        unsigned int i;
 
-#if !defined(CONFIG_SMP) || !defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+#if !defined(CONFIG_SMP)
        if (irqmode == NULL_IRQ_SOFTIRQ) {
                pr_warn("null_blk: softirq completions not available.\n");
                pr_warn("null_blk: using direct completions.\n");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 588479d..6a680d4 100644
@@ -199,15 +199,16 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 
        spin_lock_irqsave(&vblk->vq_lock, flags);
        if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) {
+               virtqueue_kick(vblk->vq);
                spin_unlock_irqrestore(&vblk->vq_lock, flags);
                blk_mq_stop_hw_queue(hctx);
-               virtqueue_kick(vblk->vq);
                return BLK_MQ_RQ_QUEUE_BUSY;
        }
-       spin_unlock_irqrestore(&vblk->vq_lock, flags);
 
        if (last)
                virtqueue_kick(vblk->vq);
+
+       spin_unlock_irqrestore(&vblk->vq_lock, flags);
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig
index 94c0c74..1a65838 100644
@@ -33,6 +33,15 @@ config TCG_TIS
          from within Linux.  To compile this driver as a module, choose
          M here; the module will be called tpm_tis.
 
+config TCG_TIS_I2C_ATMEL
+       tristate "TPM Interface Specification 1.2 Interface (I2C - Atmel)"
+       depends on I2C
+       ---help---
+         If you have an Atmel I2C TPM security chip say Yes and it will be
+         accessible from within Linux.
+         To compile this driver as a module, choose M here; the module will
+         be called tpm_tis_i2c_atmel.
+
 config TCG_TIS_I2C_INFINEON
        tristate "TPM Interface Specification 1.2 Interface (I2C - Infineon)"
        depends on I2C
@@ -42,7 +51,17 @@ config TCG_TIS_I2C_INFINEON
          Specification 0.20 say Yes and it will be accessible from within
          Linux.
          To compile this driver as a module, choose M here; the module
-         will be called tpm_tis_i2c_infineon.
+         will be called tpm_i2c_infineon.
+
+config TCG_TIS_I2C_NUVOTON
+       tristate "TPM Interface Specification 1.2 Interface (I2C - Nuvoton)"
+       depends on I2C
+       ---help---
+         If you have a TPM security chip with an I2C interface from
+         Nuvoton Technology Corp. say Yes and it will be accessible
+         from within Linux.
+         To compile this driver as a module, choose M here; the module
+         will be called tpm_i2c_nuvoton.
 
 config TCG_NSC
        tristate "National Semiconductor TPM Interface"
@@ -82,14 +101,14 @@ config TCG_IBMVTPM
          as a module, choose M here; the module will be called tpm_ibmvtpm.
 
 config TCG_ST33_I2C
-        tristate "STMicroelectronics ST33 I2C TPM"
-        depends on I2C
-        depends on GPIOLIB
-        ---help---
-        If you have a TPM security chip from STMicroelectronics working with
-        an I2C bus say Yes and it will be accessible from within Linux.
-        To compile this driver as a module, choose M here; the module will be
-        called tpm_stm_st33_i2c.
+       tristate "STMicroelectronics ST33 I2C TPM"
+       depends on I2C
+       depends on GPIOLIB
+       ---help---
+         If you have a TPM security chip from STMicroelectronics working with
+         an I2C bus say Yes and it will be accessible from within Linux.
+         To compile this driver as a module, choose M here; the module will be
+         called tpm_stm_st33_i2c.
 
 config TCG_XEN
        tristate "XEN TPM Interface"
diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile
index eb41ff9..b80a400 100644
@@ -2,17 +2,20 @@
 # Makefile for the kernel tpm device drivers.
 #
 obj-$(CONFIG_TCG_TPM) += tpm.o
+tpm-y := tpm-interface.o
+tpm-$(CONFIG_ACPI) += tpm_ppi.o
+
 ifdef CONFIG_ACPI
-       obj-$(CONFIG_TCG_TPM) += tpm_bios.o
-       tpm_bios-objs += tpm_eventlog.o tpm_acpi.o tpm_ppi.o
+       tpm-y += tpm_eventlog.o tpm_acpi.o
 else
 ifdef CONFIG_TCG_IBMVTPM
-       obj-$(CONFIG_TCG_TPM) += tpm_bios.o
-       tpm_bios-objs += tpm_eventlog.o tpm_of.o
+       tpm-y += tpm_eventlog.o tpm_of.o
 endif
 endif
 obj-$(CONFIG_TCG_TIS) += tpm_tis.o
+obj-$(CONFIG_TCG_TIS_I2C_ATMEL) += tpm_i2c_atmel.o
 obj-$(CONFIG_TCG_TIS_I2C_INFINEON) += tpm_i2c_infineon.o
+obj-$(CONFIG_TCG_TIS_I2C_NUVOTON) += tpm_i2c_nuvoton.o
 obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
 obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
 obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm-interface.c
similarity index 93%
rename from drivers/char/tpm/tpm.c
rename to drivers/char/tpm/tpm-interface.c
index e3c974a..6ae41d3 100644
  * Maintained by: <tpmdd-devel@lists.sourceforge.net>
  *
  * Device driver for TCG/TCPA TPM (trusted platform module).
- * Specifications at www.trustedcomputinggroup.org      
+ * Specifications at www.trustedcomputinggroup.org
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation, version 2 of the
  * License.
- * 
+ *
  * Note, the TPM chip is not interrupt driven (only polling)
  * and can have very long timeouts (minutes!). Hence the unusual
  * calls to msleep.
@@ -371,13 +371,14 @@ static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf,
                return -ENODATA;
        if (count > bufsiz) {
                dev_err(chip->dev,
-                       "invalid count value %x %zx \n", count, bufsiz);
+                       "invalid count value %x %zx\n", count, bufsiz);
                return -E2BIG;
        }
 
        mutex_lock(&chip->tpm_mutex);
 
-       if ((rc = chip->vendor.send(chip, (u8 *) buf, count)) < 0) {
+       rc = chip->vendor.send(chip, (u8 *) buf, count);
+       if (rc < 0) {
                dev_err(chip->dev,
                        "tpm_transmit: tpm_send: error %zd\n", rc);
                goto out;
@@ -444,7 +445,7 @@ static ssize_t transmit_cmd(struct tpm_chip *chip, struct tpm_cmd_t *cmd,
 {
        int err;
 
-       len = tpm_transmit(chip,(u8 *) cmd, len);
+       len = tpm_transmit(chip, (u8 *) cmd, len);
        if (len <  0)
                return len;
        else if (len < TPM_HEADER_SIZE)
@@ -658,7 +659,7 @@ static int tpm_continue_selftest(struct tpm_chip *chip)
        return rc;
 }
 
-ssize_t tpm_show_enabled(struct device * dev, struct device_attribute * attr,
+ssize_t tpm_show_enabled(struct device *dev, struct device_attribute *attr,
                        char *buf)
 {
        cap_t cap;
@@ -674,7 +675,7 @@ ssize_t tpm_show_enabled(struct device * dev, struct device_attribute * attr,
 }
 EXPORT_SYMBOL_GPL(tpm_show_enabled);
 
-ssize_t tpm_show_active(struct device * dev, struct device_attribute * attr,
+ssize_t tpm_show_active(struct device *dev, struct device_attribute *attr,
                        char *buf)
 {
        cap_t cap;
@@ -690,7 +691,7 @@ ssize_t tpm_show_active(struct device * dev, struct device_attribute * attr,
 }
 EXPORT_SYMBOL_GPL(tpm_show_active);
 
-ssize_t tpm_show_owned(struct device * dev, struct device_attribute * attr,
+ssize_t tpm_show_owned(struct device *dev, struct device_attribute *attr,
                        char *buf)
 {
        cap_t cap;
@@ -706,8 +707,8 @@ ssize_t tpm_show_owned(struct device * dev, struct device_attribute * attr,
 }
 EXPORT_SYMBOL_GPL(tpm_show_owned);
 
-ssize_t tpm_show_temp_deactivated(struct device * dev,
-                               struct device_attribute * attr, char *buf)
+ssize_t tpm_show_temp_deactivated(struct device *dev,
+                               struct device_attribute *attr, char *buf)
 {
        cap_t cap;
        ssize_t rc;
@@ -769,10 +770,10 @@ static int __tpm_pcr_read(struct tpm_chip *chip, int pcr_idx, u8 *res_buf)
 
 /**
  * tpm_pcr_read - read a pcr value
- * @chip_num:  tpm idx # or ANY
+ * @chip_num:  tpm idx # or ANY
  * @pcr_idx:   pcr idx to retrieve
- * @res_buf:   TPM_PCR value
- *             size of res_buf is 20 bytes (or NULL if you don't care)
+ * @res_buf:   TPM_PCR value
+ *             size of res_buf is 20 bytes (or NULL if you don't care)
  *
  * The TPM driver should be built-in, but for whatever reason it
  * isn't, protect against the chip disappearing, by incrementing
@@ -794,9 +795,9 @@ EXPORT_SYMBOL_GPL(tpm_pcr_read);
 
 /**
  * tpm_pcr_extend - extend pcr value with hash
- * @chip_num:  tpm idx # or AN&
+ * @chip_num:  tpm idx # or ANY
  * @pcr_idx:   pcr idx to extend
- * @hash:      hash value used to extend pcr value
+ * @hash:      hash value used to extend pcr value
  *
  * The TPM driver should be built-in, but for whatever reason it
  * isn't, protect against the chip disappearing, by incrementing
@@ -847,8 +848,7 @@ int tpm_do_selftest(struct tpm_chip *chip)
        unsigned long duration;
        struct tpm_cmd_t cmd;
 
-       duration = tpm_calc_ordinal_duration(chip,
-                                            TPM_ORD_CONTINUE_SELFTEST);
+       duration = tpm_calc_ordinal_duration(chip, TPM_ORD_CONTINUE_SELFTEST);
 
        loops = jiffies_to_msecs(duration) / delay_msec;
 
@@ -965,12 +965,12 @@ ssize_t tpm_show_pubek(struct device *dev, struct device_attribute *attr,
        if (err)
                goto out;
 
-       /* 
+       /*
           ignore header 10 bytes
           algorithm 32 bits (1 == RSA )
           encscheme 16 bits
           sigscheme 16 bits
-          parameters (RSA 12->bytes: keybit, #primes, expbit)  
+          parameters (RSA 12->bytes: keybit, #primes, expbit)
           keylenbytes 32 bits
           256 byte modulus
           ignore checksum 20 bytes
@@ -1020,43 +1020,33 @@ ssize_t tpm_show_caps(struct device *dev, struct device_attribute *attr,
        str += sprintf(str, "Manufacturer: 0x%x\n",
                       be32_to_cpu(cap.manufacturer_id));
 
-       rc = tpm_getcap(dev, CAP_VERSION_1_1, &cap,
-                       "attempting to determine the 1.1 version");
-       if (rc)
-               return 0;
-       str += sprintf(str,
-                      "TCG version: %d.%d\nFirmware version: %d.%d\n",
-                      cap.tpm_version.Major, cap.tpm_version.Minor,
-                      cap.tpm_version.revMajor, cap.tpm_version.revMinor);
-       return str - buf;
-}
-EXPORT_SYMBOL_GPL(tpm_show_caps);
-
-ssize_t tpm_show_caps_1_2(struct device * dev,
-                         struct device_attribute * attr, char *buf)
-{
-       cap_t cap;
-       ssize_t rc;
-       char *str = buf;
-
-       rc = tpm_getcap(dev, TPM_CAP_PROP_MANUFACTURER, &cap,
-                       "attempting to determine the manufacturer");
-       if (rc)
-               return 0;
-       str += sprintf(str, "Manufacturer: 0x%x\n",
-                      be32_to_cpu(cap.manufacturer_id));
+       /* Try to get a TPM version 1.2 TPM_CAP_VERSION_INFO */
        rc = tpm_getcap(dev, CAP_VERSION_1_2, &cap,
                         "attempting to determine the 1.2 version");
-       if (rc)
-               return 0;
-       str += sprintf(str,
-                      "TCG version: %d.%d\nFirmware version: %d.%d\n",
-                      cap.tpm_version_1_2.Major, cap.tpm_version_1_2.Minor,
-                      cap.tpm_version_1_2.revMajor,
-                      cap.tpm_version_1_2.revMinor);
+       if (!rc) {
+               str += sprintf(str,
+                              "TCG version: %d.%d\nFirmware version: %d.%d\n",
+                              cap.tpm_version_1_2.Major,
+                              cap.tpm_version_1_2.Minor,
+                              cap.tpm_version_1_2.revMajor,
+                              cap.tpm_version_1_2.revMinor);
+       } else {
+               /* Otherwise just use TPM_STRUCT_VER */
+               rc = tpm_getcap(dev, CAP_VERSION_1_1, &cap,
+                               "attempting to determine the 1.1 version");
+               if (rc)
+                       return 0;
+               str += sprintf(str,
+                              "TCG version: %d.%d\nFirmware version: %d.%d\n",
+                              cap.tpm_version.Major,
+                              cap.tpm_version.Minor,
+                              cap.tpm_version.revMajor,
+                              cap.tpm_version.revMinor);
+       }
+
        return str - buf;
 }
-EXPORT_SYMBOL_GPL(tpm_show_caps_1_2);
+EXPORT_SYMBOL_GPL(tpm_show_caps);
 
 ssize_t tpm_show_durations(struct device *dev, struct device_attribute *attr,
                          char *buf)
@@ -1102,8 +1092,8 @@ ssize_t tpm_store_cancel(struct device *dev, struct device_attribute *attr,
 }
 EXPORT_SYMBOL_GPL(tpm_store_cancel);
 
-static bool wait_for_tpm_stat_cond(struct tpm_chip *chip, u8 mask, bool check_cancel,
-                                  bool *canceled)
+static bool wait_for_tpm_stat_cond(struct tpm_chip *chip, u8 mask,
+                                       bool check_cancel, bool *canceled)
 {
        u8 status = chip->vendor.status(chip);
 
@@ -1170,38 +1160,25 @@ EXPORT_SYMBOL_GPL(wait_for_tpm_stat);
  */
 int tpm_open(struct inode *inode, struct file *file)
 {
-       int minor = iminor(inode);
-       struct tpm_chip *chip = NULL, *pos;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(pos, &tpm_chip_list, list) {
-               if (pos->vendor.miscdev.minor == minor) {
-                       chip = pos;
-                       get_device(chip->dev);
-                       break;
-               }
-       }
-       rcu_read_unlock();
-
-       if (!chip)
-               return -ENODEV;
+       struct miscdevice *misc = file->private_data;
+       struct tpm_chip *chip = container_of(misc, struct tpm_chip,
+                                            vendor.miscdev);
 
        if (test_and_set_bit(0, &chip->is_open)) {
                dev_dbg(chip->dev, "Another process owns this TPM\n");
-               put_device(chip->dev);
                return -EBUSY;
        }
 
        chip->data_buffer = kzalloc(TPM_BUFSIZE, GFP_KERNEL);
        if (chip->data_buffer == NULL) {
                clear_bit(0, &chip->is_open);
-               put_device(chip->dev);
                return -ENOMEM;
        }
 
        atomic_set(&chip->data_pending, 0);
 
        file->private_data = chip;
+       get_device(chip->dev);
        return 0;
 }
 EXPORT_SYMBOL_GPL(tpm_open);
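The rewritten tpm_open() leans on a guarantee of the misc-device core: misc_open() stores the struct miscdevice in file->private_data before calling the driver's open(). Because the miscdevice is embedded in struct tpm_chip, container_of() recovers the chip directly and the minor-number walk of the global chip list goes away. The pattern in isolation, with a toy structure:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/miscdevice.h>

struct toy_chip {
	int id;
	struct miscdevice miscdev;	/* embedded, as in struct tpm_chip */
};

static int toy_open(struct inode *inode, struct file *file)
{
	/* the misc core has set file->private_data = &chip->miscdev */
	struct miscdevice *misc = file->private_data;
	struct toy_chip *chip = container_of(misc, struct toy_chip, miscdev);

	file->private_data = chip;	/* hand the chip to read()/write() */
	return 0;
}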
@@ -1463,7 +1440,6 @@ void tpm_dev_vendor_release(struct tpm_chip *chip)
                chip->vendor.release(chip->dev);
 
        clear_bit(chip->dev_num, dev_mask);
-       kfree(chip->vendor.miscdev.name);
 }
 EXPORT_SYMBOL_GPL(tpm_dev_vendor_release);
 
@@ -1487,7 +1463,7 @@ void tpm_dev_release(struct device *dev)
 EXPORT_SYMBOL_GPL(tpm_dev_release);
 
 /*
- * Called from tpm_<specific>.c probe function only for devices 
+ * Called from tpm_<specific>.c probe function only for devices
  * the driver has determined it should claim.  Prior to calling
  * this function the specific probe function has called pci_enable_device
  * upon errant exit from this function specific probe function should call
@@ -1496,17 +1472,13 @@ EXPORT_SYMBOL_GPL(tpm_dev_release);
 struct tpm_chip *tpm_register_hardware(struct device *dev,
                                        const struct tpm_vendor_specific *entry)
 {
-#define DEVNAME_SIZE 7
-
-       char *devname;
        struct tpm_chip *chip;
 
        /* Driver specific per-device data */
        chip = kzalloc(sizeof(*chip), GFP_KERNEL);
-       devname = kmalloc(DEVNAME_SIZE, GFP_KERNEL);
 
-       if (chip == NULL || devname == NULL)
-               goto out_free;
+       if (chip == NULL)
+               return NULL;
 
        mutex_init(&chip->buffer_mutex);
        mutex_init(&chip->tpm_mutex);
@@ -1531,8 +1503,9 @@ struct tpm_chip *tpm_register_hardware(struct device *dev,
 
        set_bit(chip->dev_num, dev_mask);
 
-       scnprintf(devname, DEVNAME_SIZE, "%s%d", "tpm", chip->dev_num);
-       chip->vendor.miscdev.name = devname;
+       scnprintf(chip->devname, sizeof(chip->devname), "%s%d", "tpm",
+                 chip->dev_num);
+       chip->vendor.miscdev.name = chip->devname;
 
        chip->vendor.miscdev.parent = dev;
        chip->dev = get_device(dev);
@@ -1558,7 +1531,7 @@ struct tpm_chip *tpm_register_hardware(struct device *dev,
                goto put_device;
        }
 
-       chip->bios_dir = tpm_bios_log_setup(devname);
+       chip->bios_dir = tpm_bios_log_setup(chip->devname);
 
        /* Make chip available */
        spin_lock(&driver_lock);
@@ -1571,7 +1544,6 @@ put_device:
        put_device(chip->dev);
 out_free:
        kfree(chip);
-       kfree(devname);
        return NULL;
 }
 EXPORT_SYMBOL_GPL(tpm_register_hardware);
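The miscdev name used to be a separate kmalloc() that had to be freed on every error path and in tpm_dev_vendor_release(); embedding a fixed buffer in struct tpm_chip (see the tpm.h hunk below) removes the allocation altogether. Seven bytes cover "tpm", up to three digits of device number, and the terminating NUL. Sketched:

struct chip_name_sketch {
	int dev_num;
	char devname[7];	/* "tpm" + up to 3 digits + NUL */
};

static void set_chip_name(struct chip_name_sketch *chip)
{
	scnprintf(chip->devname, sizeof(chip->devname), "tpm%d",
		  chip->dev_num);
}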
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index a7bfc17..f328478 100644
@@ -59,8 +59,6 @@ extern ssize_t tpm_show_pcrs(struct device *, struct device_attribute *attr,
                                char *);
 extern ssize_t tpm_show_caps(struct device *, struct device_attribute *attr,
                                char *);
-extern ssize_t tpm_show_caps_1_2(struct device *, struct device_attribute *attr,
-                               char *);
 extern ssize_t tpm_store_cancel(struct device *, struct device_attribute *attr,
                                const char *, size_t);
 extern ssize_t tpm_show_enabled(struct device *, struct device_attribute *attr,
@@ -122,6 +120,7 @@ struct tpm_chip {
        struct device *dev;     /* Device stuff */
 
        int dev_num;            /* /dev/tpm# */
+       char devname[7];
        unsigned long is_open;  /* only one allowed */
        int time_expired;
 
diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c
index 99d6820..c9a528d 100644
@@ -202,7 +202,7 @@ static int __init init_atmel(void)
 
        have_region =
            (atmel_request_region
-            (tpm_atmel.base, region_size, "tpm_atmel0") == NULL) ? 0 : 1;
+            (base, region_size, "tpm_atmel0") == NULL) ? 0 : 1;
 
        pdev = platform_device_register_simple("tpm_atmel", -1, NULL, 0);
        if (IS_ERR(pdev)) {
diff --git a/drivers/char/tpm/tpm_eventlog.c b/drivers/char/tpm/tpm_eventlog.c
index 84ddc55..59f7cb2 100644
@@ -406,7 +406,6 @@ out_tpm:
 out:
        return NULL;
 }
-EXPORT_SYMBOL_GPL(tpm_bios_log_setup);
 
 void tpm_bios_log_teardown(struct dentry **lst)
 {
@@ -415,5 +414,3 @@ void tpm_bios_log_teardown(struct dentry **lst)
        for (i = 0; i < 3; i++)
                securityfs_remove(lst[i]);
 }
-EXPORT_SYMBOL_GPL(tpm_bios_log_teardown);
-MODULE_LICENSE("GPL");
diff --git a/drivers/char/tpm/tpm_i2c_atmel.c b/drivers/char/tpm/tpm_i2c_atmel.c
new file mode 100644
index 0000000..c3cd7fe
--- /dev/null
+++ b/drivers/char/tpm/tpm_i2c_atmel.c
@@ -0,0 +1,284 @@
+/*
+ * ATMEL I2C TPM AT97SC3204T
+ *
+ * Copyright (C) 2012 V Lab Technologies
+ *  Teddy Reed <teddy@prosauce.org>
+ * Copyright (C) 2013, Obsidian Research Corp.
+ *  Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+ * Device driver for ATMEL I2C TPMs.
+ *
+ * Teddy Reed determined the basic I2C command flow; unlike other I2C TPM
+ * devices, raw TCG-formatted TPM command data is written via I2C and raw
+ * TCG-formatted TPM response data is then returned via I2C.
+ *
+ * TCG status/locality/etc. functions seen in the LPC implementation do not
+ * seem to be present.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/i2c.h>
+#include "tpm.h"
+
+#define I2C_DRIVER_NAME "tpm_i2c_atmel"
+
+#define TPM_I2C_SHORT_TIMEOUT  750     /* ms */
+#define TPM_I2C_LONG_TIMEOUT   2000    /* 2 sec */
+
+#define ATMEL_STS_OK 1
+
+struct priv_data {
+       size_t len;
+       /* This is the amount we read on the first try. 25 was chosen to fit a
+        * fair number of read responses in the buffer so a 2nd retry can be
+        * avoided in small message cases. */
+       u8 buffer[sizeof(struct tpm_output_header) + 25];
+};
+
+static int i2c_atmel_send(struct tpm_chip *chip, u8 *buf, size_t len)
+{
+       struct priv_data *priv = chip->vendor.priv;
+       struct i2c_client *client = to_i2c_client(chip->dev);
+       s32 status;
+
+       priv->len = 0;
+
+       if (len <= 2)
+               return -EIO;
+
+       status = i2c_master_send(client, buf, len);
+
+       dev_dbg(chip->dev,
+               "%s(buf=%*ph len=%0zx) -> sts=%d\n", __func__,
+               (int)min_t(size_t, 64, len), buf, len, status);
+       return status;
+}
+
+static int i2c_atmel_recv(struct tpm_chip *chip, u8 *buf, size_t count)
+{
+       struct priv_data *priv = chip->vendor.priv;
+       struct i2c_client *client = to_i2c_client(chip->dev);
+       struct tpm_output_header *hdr =
+               (struct tpm_output_header *)priv->buffer;
+       u32 expected_len;
+       int rc;
+
+       if (priv->len == 0)
+               return -EIO;
+
+       /* Get the message size from the message header; if we didn't get the
+        * whole message in read_status then we need to re-read the
+        * message. */
+       expected_len = be32_to_cpu(hdr->length);
+       if (expected_len > count)
+               return -ENOMEM;
+
+       if (priv->len >= expected_len) {
+               dev_dbg(chip->dev,
+                       "%s early(buf=%*ph count=%0zx) -> ret=%d\n", __func__,
+                       (int)min_t(size_t, 64, expected_len), buf, count,
+                       expected_len);
+               memcpy(buf, priv->buffer, expected_len);
+               return expected_len;
+       }
+
+       rc = i2c_master_recv(client, buf, expected_len);
+       dev_dbg(chip->dev,
+               "%s reread(buf=%*ph count=%0zx) -> ret=%d\n", __func__,
+               (int)min_t(size_t, 64, expected_len), buf, count,
+               expected_len);
+       return rc;
+}
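The length check above parses the standard TCG response header, which struct tpm_output_header describes: a 2-byte tag, a 4-byte big-endian total length (header included), and a 4-byte return code. A free-standing sketch of the parse:

/* Sketch: pulling the total response length out of a TCG header. */
struct tcg_header_sketch {
	__be16 tag;
	__be32 length;		/* total size of the response, in bytes */
	__be32 return_code;
} __packed;

static u32 response_length(const u8 *buf)
{
	const struct tcg_header_sketch *hdr = (const void *)buf;

	return be32_to_cpu(hdr->length);
}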
+
+static void i2c_atmel_cancel(struct tpm_chip *chip)
+{
+       dev_err(chip->dev, "TPM operation cancellation was requested, but is not supported");
+}
+
+static u8 i2c_atmel_read_status(struct tpm_chip *chip)
+{
+       struct priv_data *priv = chip->vendor.priv;
+       struct i2c_client *client = to_i2c_client(chip->dev);
+       int rc;
+
+       /* The TPM fails the I2C read until it is ready, so we do the entire
+        * transfer here and buffer it locally. This way the common code can
+        * properly handle the timeouts. */
+       priv->len = 0;
+       memset(priv->buffer, 0, sizeof(priv->buffer));
+
+       /* Once the TPM has completed the command the command remains readable
+        * until another command is issued. */
+       rc = i2c_master_recv(client, priv->buffer, sizeof(priv->buffer));
+       dev_dbg(chip->dev,
+               "%s: sts=%d", __func__, rc);
+       if (rc <= 0)
+               return 0;
+
+       priv->len = rc;
+
+       return ATMEL_STS_OK;
+}
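Buffering the whole response inside read_status() is what lets the unmodified tpm.c polling loop drive this chip: the core polls ->status() until the value matches req_complete_mask/req_complete_val (both ATMEL_STS_OK here), and read_status() reports that value only once a complete buffered read has succeeded. In effect:

/* Hedged sketch of what the generic core checks before calling recv(). */
static bool atmel_response_ready(struct tpm_chip *chip)
{
	/* ATMEL_STS_OK is returned only after a successful buffered read */
	return chip->vendor.status(chip) == ATMEL_STS_OK;
}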
+
+static const struct file_operations i2c_atmel_ops = {
+       .owner = THIS_MODULE,
+       .llseek = no_llseek,
+       .open = tpm_open,
+       .read = tpm_read,
+       .write = tpm_write,
+       .release = tpm_release,
+};
+
+static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
+static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
+static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
+static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
+static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
+static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
+static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL);
+static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL);
+
+static struct attribute *i2c_atmel_attrs[] = {
+       &dev_attr_pubek.attr,
+       &dev_attr_pcrs.attr,
+       &dev_attr_enabled.attr,
+       &dev_attr_active.attr,
+       &dev_attr_owned.attr,
+       &dev_attr_temp_deactivated.attr,
+       &dev_attr_caps.attr,
+       &dev_attr_cancel.attr,
+       &dev_attr_durations.attr,
+       &dev_attr_timeouts.attr,
+       NULL,
+};
+
+static struct attribute_group i2c_atmel_attr_grp = {
+       .attrs = i2c_atmel_attrs
+};
+
+static bool i2c_atmel_req_canceled(struct tpm_chip *chip, u8 status)
+{
+       return 0;
+}
+
+static const struct tpm_vendor_specific i2c_atmel = {
+       .status = i2c_atmel_read_status,
+       .recv = i2c_atmel_recv,
+       .send = i2c_atmel_send,
+       .cancel = i2c_atmel_cancel,
+       .req_complete_mask = ATMEL_STS_OK,
+       .req_complete_val = ATMEL_STS_OK,
+       .req_canceled = i2c_atmel_req_canceled,
+       .attr_group = &i2c_atmel_attr_grp,
+       .miscdev.fops = &i2c_atmel_ops,
+};
+
+static int i2c_atmel_probe(struct i2c_client *client,
+                          const struct i2c_device_id *id)
+{
+       int rc;
+       struct tpm_chip *chip;
+       struct device *dev = &client->dev;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C))
+               return -ENODEV;
+
+       chip = tpm_register_hardware(dev, &i2c_atmel);
+       if (!chip) {
+               dev_err(dev, "%s() error in tpm_register_hardware\n", __func__);
+               return -ENODEV;
+       }
+
+       chip->vendor.priv = devm_kzalloc(dev, sizeof(struct priv_data),
+                                        GFP_KERNEL);
+
+       /* Default timeouts */
+       chip->vendor.timeout_a = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT);
+       chip->vendor.timeout_b = msecs_to_jiffies(TPM_I2C_LONG_TIMEOUT);
+       chip->vendor.timeout_c = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT);
+       chip->vendor.timeout_d = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT);
+       chip->vendor.irq = 0;
+
+       /* There is no known way to probe for this device, and all version
+        * information seems to be read via TPM commands. Thus we rely on the
+        * TPM startup process in the common code to detect the device. */
+       if (tpm_get_timeouts(chip)) {
+               rc = -ENODEV;
+               goto out_err;
+       }
+
+       if (tpm_do_selftest(chip)) {
+               rc = -ENODEV;
+               goto out_err;
+       }
+
+       return 0;
+
+out_err:
+       tpm_dev_vendor_release(chip);
+       tpm_remove_hardware(chip->dev);
+       return rc;
+}
+
+static int i2c_atmel_remove(struct i2c_client *client)
+{
+       struct device *dev = &(client->dev);
+       struct tpm_chip *chip = dev_get_drvdata(dev);
+
+       if (chip)
+               tpm_dev_vendor_release(chip);
+       tpm_remove_hardware(dev);
+       kfree(chip);
+       return 0;
+}
+
+static const struct i2c_device_id i2c_atmel_id[] = {
+       {I2C_DRIVER_NAME, 0},
+       {}
+};
+MODULE_DEVICE_TABLE(i2c, i2c_atmel_id);
+
+#ifdef CONFIG_OF
+static const struct of_device_id i2c_atmel_of_match[] = {
+       {.compatible = "atmel,at97sc3204t"},
+       {},
+};
+MODULE_DEVICE_TABLE(of, i2c_atmel_of_match);
+#endif
+
+static SIMPLE_DEV_PM_OPS(i2c_atmel_pm_ops, tpm_pm_suspend, tpm_pm_resume);
+
+static struct i2c_driver i2c_atmel_driver = {
+       .id_table = i2c_atmel_id,
+       .probe = i2c_atmel_probe,
+       .remove = i2c_atmel_remove,
+       .driver = {
+               .name = I2C_DRIVER_NAME,
+               .owner = THIS_MODULE,
+               .pm = &i2c_atmel_pm_ops,
+               .of_match_table = of_match_ptr(i2c_atmel_of_match),
+       },
+};
+
+module_i2c_driver(i2c_atmel_driver);
+
+MODULE_AUTHOR("Jason Gunthorpe <jgunthorpe@obsidianresearch.com>");
+MODULE_DESCRIPTION("Atmel TPM I2C Driver");
+MODULE_LICENSE("GPL");
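Without a device-tree node, such a device can be instantiated from board code through the regular I2C core. A hypothetical example; the bus number 0 and the 7-bit address 0x29 are placeholders, not values taken from this patch:

#include <linux/i2c.h>
#include <linux/init.h>

static struct i2c_board_info tpm_board_info __initdata = {
	I2C_BOARD_INFO("tpm_i2c_atmel", 0x29),	/* name matches i2c_atmel_id */
};

static int __init board_tpm_init(void)
{
	/* must run before the adapter for bus 0 is registered */
	return i2c_register_board_info(0, &tpm_board_info, 1);
}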
diff --git a/drivers/char/tpm/tpm_i2c_infineon.c b/drivers/char/tpm/tpm_i2c_infineon.c
index b8735de..fefd2aa 100644
@@ -581,7 +581,7 @@ static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
 static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
 static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
 static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL);
-static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
 static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
 static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL);
 static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL);
@@ -685,7 +685,6 @@ out_vendor:
        chip->dev->release = NULL;
        chip->release = NULL;
        tpm_dev.client = NULL;
-       dev_set_drvdata(chip->dev, chip);
 out_err:
        return rc;
 }
@@ -766,7 +765,6 @@ static int tpm_tis_i2c_remove(struct i2c_client *client)
        chip->dev->release = NULL;
        chip->release = NULL;
        tpm_dev.client = NULL;
-       dev_set_drvdata(chip->dev, chip);
 
        return 0;
 }
diff --git a/drivers/char/tpm/tpm_i2c_nuvoton.c b/drivers/char/tpm/tpm_i2c_nuvoton.c
new file mode 100644
index 0000000..6276fea
--- /dev/null
+++ b/drivers/char/tpm/tpm_i2c_nuvoton.c
@@ -0,0 +1,710 @@
+/******************************************************************************
+ * Nuvoton TPM I2C Device Driver Interface for WPCT301/NPCT501,
+ * based on the TCG TPM Interface Spec version 1.2.
+ * Specifications at www.trustedcomputinggroup.org
+ *
+ * Copyright (C) 2011, Nuvoton Technology Corporation.
+ *  Dan Morav <dan.morav@nuvoton.com>
+ * Copyright (C) 2013, Obsidian Research Corp.
+ *  Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Nuvoton contact information: APC.Support@nuvoton.com
+ *****************************************************************************/
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/i2c.h>
+#include "tpm.h"
+
+/* I2C interface offsets */
+#define TPM_STS                0x00
+#define TPM_BURST_COUNT        0x01
+#define TPM_DATA_FIFO_W        0x20
+#define TPM_DATA_FIFO_R        0x40
+#define TPM_VID_DID_RID        0x60
+/* TPM command header size */
+#define TPM_HEADER_SIZE        10
+#define TPM_RETRY      5
+/*
+ * I2C bus device maximum buffer size w/o counting I2C address or command
+ * i.e. max size required for I2C write is 34 = addr, command, 32 bytes data
+ */
+#define TPM_I2C_MAX_BUF_SIZE           32
+#define TPM_I2C_RETRY_COUNT            32
+#define TPM_I2C_BUS_DELAY              1       /* msec */
+#define TPM_I2C_RETRY_DELAY_SHORT      2       /* msec */
+#define TPM_I2C_RETRY_DELAY_LONG       10      /* msec */
+
+#define I2C_DRIVER_NAME "tpm_i2c_nuvoton"
+
+struct priv_data {
+       unsigned int intrs;
+};
+
+static s32 i2c_nuvoton_read_buf(struct i2c_client *client, u8 offset, u8 size,
+                               u8 *data)
+{
+       s32 status;
+
+       status = i2c_smbus_read_i2c_block_data(client, offset, size, data);
+       dev_dbg(&client->dev,
+               "%s(offset=%u size=%u data=%*ph) -> sts=%d\n", __func__,
+               offset, size, (int)size, data, status);
+       return status;
+}
+
+static s32 i2c_nuvoton_write_buf(struct i2c_client *client, u8 offset, u8 size,
+                                u8 *data)
+{
+       s32 status;
+
+       status = i2c_smbus_write_i2c_block_data(client, offset, size, data);
+       dev_dbg(&client->dev,
+               "%s(offset=%u size=%u data=%*ph) -> sts=%d\n", __func__,
+               offset, size, (int)size, data, status);
+       return status;
+}
+
+#define TPM_STS_VALID          0x80
+#define TPM_STS_COMMAND_READY  0x40
+#define TPM_STS_GO             0x20
+#define TPM_STS_DATA_AVAIL     0x10
+#define TPM_STS_EXPECT         0x08
+#define TPM_STS_RESPONSE_RETRY 0x02
+#define TPM_STS_ERR_VAL        0x07    /* bits 2..0 always read 0 */
+
+#define TPM_I2C_SHORT_TIMEOUT  750     /* ms */
+#define TPM_I2C_LONG_TIMEOUT   2000    /* 2 sec */
+
+/* read TPM_STS register */
+static u8 i2c_nuvoton_read_status(struct tpm_chip *chip)
+{
+       struct i2c_client *client = to_i2c_client(chip->dev);
+       s32 status;
+       u8 data;
+
+       status = i2c_nuvoton_read_buf(client, TPM_STS, 1, &data);
+       if (status <= 0) {
+               dev_err(chip->dev, "%s() error return %d\n", __func__,
+                       status);
+               data = TPM_STS_ERR_VAL;
+       }
+
+       return data;
+}
+
+/* write byte to TPM_STS register */
+static s32 i2c_nuvoton_write_status(struct i2c_client *client, u8 data)
+{
+       s32 status;
+       int i;
+
+       /* this causes the current command to be aborted */
+       for (i = 0, status = -1; i < TPM_I2C_RETRY_COUNT && status < 0; i++) {
+               status = i2c_nuvoton_write_buf(client, TPM_STS, 1, &data);
+               msleep(TPM_I2C_BUS_DELAY);
+       }
+       return status;
+}
+
+/* write commandReady to TPM_STS register */
+static void i2c_nuvoton_ready(struct tpm_chip *chip)
+{
+       struct i2c_client *client = to_i2c_client(chip->dev);
+       s32 status;
+
+       /* this causes the current command to be aborted */
+       status = i2c_nuvoton_write_status(client, TPM_STS_COMMAND_READY);
+       if (status < 0)
+               dev_err(chip->dev,
+                       "%s() fail to write TPM_STS.commandReady\n", __func__);
+}
+
+/* read burstCount field from TPM_STS register
+ * return -1 on failure to read */
+static int i2c_nuvoton_get_burstcount(struct i2c_client *client,
+                                     struct tpm_chip *chip)
+{
+       unsigned long stop = jiffies + chip->vendor.timeout_d;
+       s32 status;
+       int burst_count = -1;
+       u8 data;
+
+       /* wait for burstcount to be non-zero */
+       do {
+               /* in I2C burstCount is 1 byte */
+               status = i2c_nuvoton_read_buf(client, TPM_BURST_COUNT, 1,
+                                             &data);
+               if (status > 0 && data > 0) {
+                       burst_count = min_t(u8, TPM_I2C_MAX_BUF_SIZE, data);
+                       break;
+               }
+               msleep(TPM_I2C_BUS_DELAY);
+       } while (time_before(jiffies, stop));
+
+       return burst_count;
+}
+
+/*
+ * WPCT301/NPCT501 SINT# supports only dataAvail;
+ * any call to this function that is not waiting for dataAvail will
+ * set queue to NULL to avoid waiting for the interrupt
+ */
+static bool i2c_nuvoton_check_status(struct tpm_chip *chip, u8 mask, u8 value)
+{
+       u8 status = i2c_nuvoton_read_status(chip);
+       return (status != TPM_STS_ERR_VAL) && ((status & mask) == value);
+}
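Every wait in this driver funnels through this mask/value check. Command completion, for instance, is signalled by stsValid and dataAvail both set, matching the req_complete_mask/req_complete_val pair declared further down:

/* Sketch: "response ready" expressed with the helper above. */
static bool nuvoton_response_ready(struct tpm_chip *chip)
{
	return i2c_nuvoton_check_status(chip,
					TPM_STS_DATA_AVAIL | TPM_STS_VALID,
					TPM_STS_DATA_AVAIL | TPM_STS_VALID);
}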
+
+static int i2c_nuvoton_wait_for_stat(struct tpm_chip *chip, u8 mask, u8 value,
+                                    u32 timeout, wait_queue_head_t *queue)
+{
+       if (chip->vendor.irq && queue) {
+               s32 rc;
+               DEFINE_WAIT(wait);
+               struct priv_data *priv = chip->vendor.priv;
+               unsigned int cur_intrs = priv->intrs;
+
+               enable_irq(chip->vendor.irq);
+               rc = wait_event_interruptible_timeout(*queue,
+                                                     cur_intrs != priv->intrs,
+                                                     timeout);
+               if (rc > 0)
+                       return 0;
+               /* At this point we know that the SINT pin is asserted, so we
+                * do not need to do i2c_nuvoton_check_status */
+       } else {
+               unsigned long ten_msec, stop;
+               bool status_valid;
+
+               /* check current status */
+               status_valid = i2c_nuvoton_check_status(chip, mask, value);
+               if (status_valid)
+                       return 0;
+
+               /* use polling to wait for the event */
+               ten_msec = jiffies + msecs_to_jiffies(TPM_I2C_RETRY_DELAY_LONG);
+               stop = jiffies + timeout;
+               do {
+                       if (time_before(jiffies, ten_msec))
+                               msleep(TPM_I2C_RETRY_DELAY_SHORT);
+                       else
+                               msleep(TPM_I2C_RETRY_DELAY_LONG);
+                       status_valid = i2c_nuvoton_check_status(chip, mask,
+                                                               value);
+                       if (status_valid)
+                               return 0;
+               } while (time_before(jiffies, stop));
+       }
+       dev_err(chip->dev, "%s(%02x, %02x) -> timeout\n", __func__, mask,
+               value);
+       return -ETIMEDOUT;
+}
+
+/* wait for dataAvail field to be set in the TPM_STS register */
+static int i2c_nuvoton_wait_for_data_avail(struct tpm_chip *chip, u32 timeout,
+                                          wait_queue_head_t *queue)
+{
+       return i2c_nuvoton_wait_for_stat(chip,
+                                        TPM_STS_DATA_AVAIL | TPM_STS_VALID,
+                                        TPM_STS_DATA_AVAIL | TPM_STS_VALID,
+                                        timeout, queue);
+}
+
+/* Read @count bytes into @buf from the TPM_DATA_FIFO_R register */
+static int i2c_nuvoton_recv_data(struct i2c_client *client,
+                                struct tpm_chip *chip, u8 *buf, size_t count)
+{
+       s32 rc;
+       int burst_count, bytes2read, size = 0;
+
+       while (size < count &&
+              i2c_nuvoton_wait_for_data_avail(chip,
+                                              chip->vendor.timeout_c,
+                                              &chip->vendor.read_queue) == 0) {
+               burst_count = i2c_nuvoton_get_burstcount(client, chip);
+               if (burst_count < 0) {
+                       dev_err(chip->dev,
+                               "%s() fail to read burstCount=%d\n", __func__,
+                               burst_count);
+                       return -EIO;
+               }
+               bytes2read = min_t(size_t, burst_count, count - size);
+               rc = i2c_nuvoton_read_buf(client, TPM_DATA_FIFO_R,
+                                         bytes2read, &buf[size]);
+               if (rc < 0) {
+                       dev_err(chip->dev,
+                               "%s() fail on i2c_nuvoton_read_buf()=%d\n",
+                               __func__, rc);
+                       return -EIO;
+               }
+               dev_dbg(chip->dev, "%s(%d):", __func__, bytes2read);
+               size += bytes2read;
+       }
+
+       return size;
+}
+
+/* Read TPM command results */
+static int i2c_nuvoton_recv(struct tpm_chip *chip, u8 *buf, size_t count)
+{
+       struct device *dev = chip->dev;
+       struct i2c_client *client = to_i2c_client(dev);
+       s32 rc;
+       int expected, status, burst_count, retries, size = 0;
+
+       if (count < TPM_HEADER_SIZE) {
+               i2c_nuvoton_ready(chip);    /* return to idle */
+               dev_err(dev, "%s() count < header size\n", __func__);
+               return -EIO;
+       }
+       for (retries = 0; retries < TPM_RETRY; retries++) {
+               if (retries > 0) {
+                       /* if this is not the first trial, set responseRetry */
+                       i2c_nuvoton_write_status(client,
+                                                TPM_STS_RESPONSE_RETRY);
+               }
+               /*
+                * read first available (> 10 bytes), including:
+                * tag, paramsize, and result
+                */
+               status = i2c_nuvoton_wait_for_data_avail(
+                       chip, chip->vendor.timeout_c, &chip->vendor.read_queue);
+               if (status != 0) {
+                       dev_err(dev, "%s() timeout on dataAvail\n", __func__);
+                       size = -ETIMEDOUT;
+                       continue;
+               }
+               burst_count = i2c_nuvoton_get_burstcount(client, chip);
+               if (burst_count < 0) {
+                       dev_err(dev, "%s() fail to get burstCount\n", __func__);
+                       size = -EIO;
+                       continue;
+               }
+               size = i2c_nuvoton_recv_data(client, chip, buf,
+                                            burst_count);
+               if (size < TPM_HEADER_SIZE) {
+                       dev_err(dev, "%s() fail to read header\n", __func__);
+                       size = -EIO;
+                       continue;
+               }
+               /*
+                * convert number of expected bytes field from big endian 32 bit
+                * to machine native
+                */
+               expected = be32_to_cpu(*(__be32 *) (buf + 2));
+               if (expected > count) {
+                       dev_err(dev, "%s() expected > count\n", __func__);
+                       size = -EIO;
+                       continue;
+               }
+               rc = i2c_nuvoton_recv_data(client, chip, &buf[size],
+                                          expected - size);
+               size += rc;
+               if (rc < 0 || size < expected) {
+                       dev_err(dev, "%s() fail to read remainder of result\n",
+                               __func__);
+                       size = -EIO;
+                       continue;
+               }
+               if (i2c_nuvoton_wait_for_stat(
+                           chip, TPM_STS_VALID | TPM_STS_DATA_AVAIL,
+                           TPM_STS_VALID, chip->vendor.timeout_c,
+                           NULL)) {
+                       dev_err(dev, "%s() error left over data\n", __func__);
+                       size = -ETIMEDOUT;
+                       continue;
+               }
+               break;
+       }
+       i2c_nuvoton_ready(chip);
+       dev_dbg(chip->dev, "%s() -> %d\n", __func__, size);
+       return size;
+}
+
+/*
+ * Send TPM command.
+ *
+ * If interrupts are used (signaled by an irq set in the vendor structure)
+ * tpm.c can skip polling for the data to be available as the interrupt is
+ * waited for here
+ */
+static int i2c_nuvoton_send(struct tpm_chip *chip, u8 *buf, size_t len)
+{
+       struct device *dev = chip->dev;
+       struct i2c_client *client = to_i2c_client(dev);
+       u32 ordinal;
+       size_t count = 0;
+       int burst_count, bytes2write, retries, rc = -EIO;
+
+       for (retries = 0; retries < TPM_RETRY; retries++) {
+               i2c_nuvoton_ready(chip);
+               if (i2c_nuvoton_wait_for_stat(chip, TPM_STS_COMMAND_READY,
+                                             TPM_STS_COMMAND_READY,
+                                             chip->vendor.timeout_b, NULL)) {
+                       dev_err(dev, "%s() timeout on commandReady\n",
+                               __func__);
+                       rc = -EIO;
+                       continue;
+               }
+               rc = 0;
+               while (count < len - 1) {
+                       burst_count = i2c_nuvoton_get_burstcount(client,
+                                                                chip);
+                       if (burst_count < 0) {
+                               dev_err(dev, "%s() fail get burstCount\n",
+                                       __func__);
+                               rc = -EIO;
+                               break;
+                       }
+                       bytes2write = min_t(size_t, burst_count,
+                                           len - 1 - count);
+                       rc = i2c_nuvoton_write_buf(client, TPM_DATA_FIFO_W,
+                                                  bytes2write, &buf[count]);
+                       if (rc < 0) {
+                               dev_err(dev, "%s() fail i2cWriteBuf\n",
+                                       __func__);
+                               break;
+                       }
+                       dev_dbg(dev, "%s(%d):", __func__, bytes2write);
+                       count += bytes2write;
+                       rc = i2c_nuvoton_wait_for_stat(chip,
+                                                      TPM_STS_VALID |
+                                                      TPM_STS_EXPECT,
+                                                      TPM_STS_VALID |
+                                                      TPM_STS_EXPECT,
+                                                      chip->vendor.timeout_c,
+                                                      NULL);
+                       if (rc < 0) {
+                               dev_err(dev, "%s() timeout on Expect\n",
+                                       __func__);
+                               rc = -ETIMEDOUT;
+                               break;
+                       }
+               }
+               if (rc < 0)
+                       continue;
+
+               /* write last byte */
+               rc = i2c_nuvoton_write_buf(client, TPM_DATA_FIFO_W, 1,
+                                          &buf[count]);
+               if (rc < 0) {
+                       dev_err(dev, "%s() fail to write last byte\n",
+                               __func__);
+                       rc = -EIO;
+                       continue;
+               }
+               dev_dbg(dev, "%s(last): %02x", __func__, buf[count]);
+               rc = i2c_nuvoton_wait_for_stat(chip,
+                                              TPM_STS_VALID | TPM_STS_EXPECT,
+                                              TPM_STS_VALID,
+                                              chip->vendor.timeout_c, NULL);
+               if (rc) {
+                       dev_err(dev, "%s() timeout on Expect to clear\n",
+                               __func__);
+                       rc = -ETIMEDOUT;
+                       continue;
+               }
+               break;
+       }
+       if (rc < 0) {
+               /* retries == TPM_RETRY */
+               i2c_nuvoton_ready(chip);
+               return rc;
+       }
+       /* execute the TPM command */
+       rc = i2c_nuvoton_write_status(client, TPM_STS_GO);
+       if (rc < 0) {
+               dev_err(dev, "%s() fail to write Go\n", __func__);
+               i2c_nuvoton_ready(chip);
+               return rc;
+       }
+       ordinal = be32_to_cpu(*((__be32 *) (buf + 6)));
+       rc = i2c_nuvoton_wait_for_data_avail(chip,
+                                            tpm_calc_ordinal_duration(chip,
+                                                                      ordinal),
+                                            &chip->vendor.read_queue);
+       if (rc) {
+               dev_err(dev, "%s() timeout command duration\n", __func__);
+               i2c_nuvoton_ready(chip);
+               return rc;
+       }
+
+       dev_dbg(dev, "%s() -> %zd\n", __func__, len);
+       return len;
+}
+
+static bool i2c_nuvoton_req_canceled(struct tpm_chip *chip, u8 status)
+{
+       return (status == TPM_STS_COMMAND_READY);
+}
+
+static const struct file_operations i2c_nuvoton_ops = {
+       .owner = THIS_MODULE,
+       .llseek = no_llseek,
+       .open = tpm_open,
+       .read = tpm_read,
+       .write = tpm_write,
+       .release = tpm_release,
+};
+
+static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
+static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
+static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
+static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
+static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
+static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
+static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL);
+static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL);
+
+static struct attribute *i2c_nuvoton_attrs[] = {
+       &dev_attr_pubek.attr,
+       &dev_attr_pcrs.attr,
+       &dev_attr_enabled.attr,
+       &dev_attr_active.attr,
+       &dev_attr_owned.attr,
+       &dev_attr_temp_deactivated.attr,
+       &dev_attr_caps.attr,
+       &dev_attr_cancel.attr,
+       &dev_attr_durations.attr,
+       &dev_attr_timeouts.attr,
+       NULL,
+};
+
+static struct attribute_group i2c_nuvoton_attr_grp = {
+       .attrs = i2c_nuvoton_attrs
+};
+
+static const struct tpm_vendor_specific tpm_i2c = {
+       .status = i2c_nuvoton_read_status,
+       .recv = i2c_nuvoton_recv,
+       .send = i2c_nuvoton_send,
+       .cancel = i2c_nuvoton_ready,
+       .req_complete_mask = TPM_STS_DATA_AVAIL | TPM_STS_VALID,
+       .req_complete_val = TPM_STS_DATA_AVAIL | TPM_STS_VALID,
+       .req_canceled = i2c_nuvoton_req_canceled,
+       .attr_group = &i2c_nuvoton_attr_grp,
+       .miscdev.fops = &i2c_nuvoton_ops,
+};
+
+/* The only purpose for the handler is to signal to any waiting threads that
+ * the interrupt is currently being asserted. The driver does not do any
+ * processing triggered by interrupts, and the chip provides no way to mask at
+ * the source (plus that would be slow over I2C). Run the IRQ as a one-shot;
+ * this means it cannot be shared. */
+static irqreturn_t i2c_nuvoton_int_handler(int dummy, void *dev_id)
+{
+       struct tpm_chip *chip = dev_id;
+       struct priv_data *priv = chip->vendor.priv;
+
+       priv->intrs++;
+       wake_up(&chip->vendor.read_queue);
+       disable_irq_nosync(chip->vendor.irq);
+       return IRQ_HANDLED;
+}
+
+static int get_vid(struct i2c_client *client, u32 *res)
+{
+       static const u8 vid_did_rid_value[] = { 0x50, 0x10, 0xfe };
+       u32 temp;
+       s32 rc;
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA))
+               return -ENODEV;
+       rc = i2c_nuvoton_read_buf(client, TPM_VID_DID_RID, 4, (u8 *)&temp);
+       if (rc < 0)
+               return rc;
+
+       /* check WPCT301 values - ignore RID */
+       if (memcmp(&temp, vid_did_rid_value, sizeof(vid_did_rid_value))) {
+               /*
+                * f/w rev 2.81 has an issue where the VID_DID_RID is not
+                * reporting the right value, so give it another chance at
+                * offset 0x20 (FIFO_W).
+                */
+               rc = i2c_nuvoton_read_buf(client, TPM_DATA_FIFO_W, 4,
+                                         (u8 *) (&temp));
+               if (rc < 0)
+                       return rc;
+
+               /* check WPCT301 values - ignore RID */
+               if (memcmp(&temp, vid_did_rid_value,
+                          sizeof(vid_did_rid_value)))
+                       return -ENODEV;
+       }
+
+       *res = temp;
+       return 0;
+}
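On a little-endian host the four bytes read above land in @temp as a 16-bit vendor ID in the low word (0x1050, the Winbond/Nuvoton ID per the table), then one byte each of device ID and revision; the 3-byte memcmp deliberately ignores the revision. Decoding, as a hedged sketch mirroring the probe message below:

static void decode_vid(u32 reg)
{
	u16 vid = reg & 0xffff;		/* vendor ID, 0x1050 expected */
	u8 did = (reg >> 16) & 0xff;	/* device ID, 0xfe per the table */
	u8 rid = (reg >> 24) & 0xff;	/* revision ID, not matched */

	pr_info("VID: %04X DID: %02X RID: %02X\n", vid, did, rid);
}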
+
+static int i2c_nuvoton_probe(struct i2c_client *client,
+                            const struct i2c_device_id *id)
+{
+       int rc;
+       struct tpm_chip *chip;
+       struct device *dev = &client->dev;
+       u32 vid = 0;
+
+       rc = get_vid(client, &vid);
+       if (rc)
+               return rc;
+
+       dev_info(dev, "VID: %04X DID: %02X RID: %02X\n", (u16) vid,
+                (u8) (vid >> 16), (u8) (vid >> 24));
+
+       chip = tpm_register_hardware(dev, &tpm_i2c);
+       if (!chip) {
+               dev_err(dev, "%s() error in tpm_register_hardware\n", __func__);
+               return -ENODEV;
+       }
+
+       chip->vendor.priv = devm_kzalloc(dev, sizeof(struct priv_data),
+                                        GFP_KERNEL);
+       init_waitqueue_head(&chip->vendor.read_queue);
+       init_waitqueue_head(&chip->vendor.int_queue);
+
+       /* Default timeouts */
+       chip->vendor.timeout_a = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT);
+       chip->vendor.timeout_b = msecs_to_jiffies(TPM_I2C_LONG_TIMEOUT);
+       chip->vendor.timeout_c = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT);
+       chip->vendor.timeout_d = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT);
+
+       /*
+        * I2C intfcaps (interrupt capabilities) in the chip are hard coded to:
+        *   TPM_INTF_INT_LEVEL_LOW | TPM_INTF_DATA_AVAIL_INT
+        * The IRQ should be set in the i2c_board_info (which is done
+        * automatically in of_i2c_register_devices for device tree users). */
+       chip->vendor.irq = client->irq;
+
+       if (chip->vendor.irq) {
+               dev_dbg(dev, "%s() chip-vendor.irq\n", __func__);
+               rc = devm_request_irq(dev, chip->vendor.irq,
+                                     i2c_nuvoton_int_handler,
+                                     IRQF_TRIGGER_LOW,
+                                     chip->vendor.miscdev.name,
+                                     chip);
+               if (rc) {
+                       dev_err(dev, "%s() Unable to request irq: %d for use\n",
+                               __func__, chip->vendor.irq);
+                       chip->vendor.irq = 0;
+               } else {
+                       /* Clear any pending interrupt */
+                       i2c_nuvoton_ready(chip);
+                       /* - wait for TPM_STS==0xA0 (stsValid, commandReady) */
+                       rc = i2c_nuvoton_wait_for_stat(chip,
+                                                      TPM_STS_COMMAND_READY,
+                                                      TPM_STS_COMMAND_READY,
+                                                      chip->vendor.timeout_b,
+                                                      NULL);
+                       if (rc == 0) {
+                               /*
+                                * TIS is in ready state
+                                * write dummy byte to enter reception state
+                                * TPM_DATA_FIFO_W <- rc (0)
+                                */
+                               rc = i2c_nuvoton_write_buf(client,
+                                                          TPM_DATA_FIFO_W,
+                                                          1, (u8 *) (&rc));
+                               if (rc < 0)
+                                       goto out_err;
+                               /* TPM_STS <- 0x40 (commandReady) */
+                               i2c_nuvoton_ready(chip);
+                       } else {
+                               /*
+                                * timeout_b reached - command was
+                                * aborted. TIS should now be in idle state -
+                                * only TPM_STS_VALID should be set
+                                */
+                               if (i2c_nuvoton_read_status(chip) !=
+                                   TPM_STS_VALID) {
+                                       rc = -EIO;
+                                       goto out_err;
+                               }
+                       }
+               }
+       }
+
+       if (tpm_get_timeouts(chip)) {
+               rc = -ENODEV;
+               goto out_err;
+       }
+
+       if (tpm_do_selftest(chip)) {
+               rc = -ENODEV;
+               goto out_err;
+       }
+
+       return 0;
+
+out_err:
+       tpm_dev_vendor_release(chip);
+       tpm_remove_hardware(chip->dev);
+       return rc;
+}
+
+static int i2c_nuvoton_remove(struct i2c_client *client)
+{
+       struct device *dev = &(client->dev);
+       struct tpm_chip *chip = dev_get_drvdata(dev);
+
+       if (chip)
+               tpm_dev_vendor_release(chip);
+       tpm_remove_hardware(dev);
+       kfree(chip);
+       return 0;
+}
+
+
+static const struct i2c_device_id i2c_nuvoton_id[] = {
+       {I2C_DRIVER_NAME, 0},
+       {}
+};
+MODULE_DEVICE_TABLE(i2c, i2c_nuvoton_id);
+
+#ifdef CONFIG_OF
+static const struct of_device_id i2c_nuvoton_of_match[] = {
+       {.compatible = "nuvoton,npct501"},
+       {.compatible = "winbond,wpct301"},
+       {},
+};
+MODULE_DEVICE_TABLE(of, i2c_nuvoton_of_match);
+#endif
+
+static SIMPLE_DEV_PM_OPS(i2c_nuvoton_pm_ops, tpm_pm_suspend, tpm_pm_resume);
+
+static struct i2c_driver i2c_nuvoton_driver = {
+       .id_table = i2c_nuvoton_id,
+       .probe = i2c_nuvoton_probe,
+       .remove = i2c_nuvoton_remove,
+       .driver = {
+               .name = I2C_DRIVER_NAME,
+               .owner = THIS_MODULE,
+               .pm = &i2c_nuvoton_pm_ops,
+               .of_match_table = of_match_ptr(i2c_nuvoton_of_match),
+       },
+};
+
+module_i2c_driver(i2c_nuvoton_driver);
+
+MODULE_AUTHOR("Dan Morav (dan.morav@nuvoton.com)");
+MODULE_DESCRIPTION("Nuvoton TPM I2C Driver");
+MODULE_LICENSE("GPL");
index 5bb8e2d..a0d6ceb 100644 (file)
@@ -584,7 +584,7 @@ static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
 static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
 static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
 static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL);
-static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
 static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
 
 static struct attribute *stm_tpm_attrs[] = {
@@ -746,8 +746,6 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id)
 
        tpm_get_timeouts(chip);
 
-       i2c_set_clientdata(client, chip);
-
        dev_info(chip->dev, "TPM I2C Initialized\n");
        return 0;
 _irq_set:
@@ -807,24 +805,18 @@ static int tpm_st33_i2c_remove(struct i2c_client *client)
 #ifdef CONFIG_PM_SLEEP
 /*
  * tpm_st33_i2c_pm_suspend suspend the TPM device
- * Added: Work around when suspend and no tpm application is running, suspend
- * may fail because chip->data_buffer is not set (only set in tpm_open in Linux
- * TPM core)
  * @param: client, the i2c_client description (TPM I2C description).
  * @param: mesg, the power management message.
  * @return: 0 in case of success.
  */
 static int tpm_st33_i2c_pm_suspend(struct device *dev)
 {
-       struct tpm_chip *chip = dev_get_drvdata(dev);
        struct st33zp24_platform_data *pin_infos = dev->platform_data;
        int ret = 0;
 
        if (power_mgt) {
                gpio_set_value(pin_infos->io_lpcpd, 0);
        } else {
-               if (chip->data_buffer == NULL)
-                       chip->data_buffer = pin_infos->tpm_i2c_buffer[0];
                ret = tpm_pm_suspend(dev);
        }
        return ret;
@@ -849,8 +841,6 @@ static int tpm_st33_i2c_pm_resume(struct device *dev)
                                          TPM_STS_VALID) == TPM_STS_VALID,
                                          chip->vendor.timeout_b);
        } else {
-               if (chip->data_buffer == NULL)
-                       chip->data_buffer = pin_infos->tpm_i2c_buffer[0];
                ret = tpm_pm_resume(dev);
                if (!ret)
                        tpm_do_selftest(chip);
index 56b07c3..2783a42 100644 (file)
@@ -98,7 +98,7 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
 
        if (count < len) {
                dev_err(ibmvtpm->dev,
-                       "Invalid size in recv: count=%ld, crq_size=%d\n",
+                       "Invalid size in recv: count=%zd, crq_size=%d\n",
                        count, len);
                return -EIO;
        }
@@ -136,7 +136,7 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
 
        if (count > ibmvtpm->rtce_size) {
                dev_err(ibmvtpm->dev,
-                       "Invalid size in send: count=%ld, rtce_size=%d\n",
+                       "Invalid size in send: count=%zd, rtce_size=%d\n",
                        count, ibmvtpm->rtce_size);
                return -EIO;
        }
@@ -419,7 +419,7 @@ static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
 static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
 static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
                   NULL);
-static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
 static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
 static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL);
 static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL);
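
The two `%ld` to `%zd` hunks above are format-correctness fixes: `count` is a `size_t`, and `%ld` expects a `long`, which trips gcc's printf format checking and can differ in width across ABIs. The `z` length modifier always matches `size_t`/`ssize_t`. A two-line illustration, not taken from the driver:

    size_t count = 16;
    pr_info("count=%zu\n", count);  /* %zu for size_t, %zd for ssize_t, on any ABI */
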
index 2168d15..8e562dc 100644 (file)
@@ -452,12 +452,8 @@ int tpm_add_ppi(struct kobject *parent)
 {
        return sysfs_create_group(parent, &ppi_attr_grp);
 }
-EXPORT_SYMBOL_GPL(tpm_add_ppi);
 
 void tpm_remove_ppi(struct kobject *parent)
 {
        sysfs_remove_group(parent, &ppi_attr_grp);
 }
-EXPORT_SYMBOL_GPL(tpm_remove_ppi);
-
-MODULE_LICENSE("GPL");
index 5796d01..1b74459 100644 (file)
@@ -448,7 +448,7 @@ static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
 static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
 static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
                   NULL);
-static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
 static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel);
 static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL);
 static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL);
index 94c280d..c8ff4df 100644 (file)
@@ -351,8 +351,6 @@ static int tpmfront_probe(struct xenbus_device *dev,
 
        tpm_get_timeouts(priv->chip);
 
-       dev_set_drvdata(&dev->dev, priv->chip);
-
        return rv;
 }
 
index 218460f..25a70d0 100644 (file)
@@ -68,6 +68,9 @@ static void cs_check_cpu(int cpu, unsigned int load)
 
                dbs_info->requested_freq += get_freq_target(cs_tuners, policy);
 
+               if (dbs_info->requested_freq > policy->max)
+                       dbs_info->requested_freq = policy->max;
+
                __cpufreq_driver_target(policy, dbs_info->requested_freq,
                        CPUFREQ_RELATION_H);
                return;
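
The added clamp matters because the conservative governor raises requested_freq by a fixed step on every busy sample; without the cap, requested_freq keeps growing past policy->max, so later down-steps start from a fictitious frequency. A sketch of the invariant the fix establishes, using the names in the hunk:

    unsigned int want = dbs_info->requested_freq + get_freq_target(cs_tuners, policy);
    dbs_info->requested_freq = min(want, policy->max);  /* never track a frequency above the policy cap */
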
index 0806c31..e6be635 100644 (file)
@@ -328,10 +328,6 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                                             dbs_data->cdata->gov_dbs_timer);
                }
 
-               /*
-                * conservative does not implement micro like ondemand
-                * governor, thus we are bound to jiffes/HZ
-                */
                if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
                        cs_dbs_info->down_skip = 0;
                        cs_dbs_info->enable = 1;
index be6d143..a0acd0b 100644 (file)
@@ -53,6 +53,7 @@ static unsigned int omap_getspeed(unsigned int cpu)
 
 static int omap_target(struct cpufreq_policy *policy, unsigned int index)
 {
+       int r, ret;
        struct dev_pm_opp *opp;
        unsigned long freq, volt = 0, volt_old = 0, tol = 0;
        unsigned int old_freq, new_freq;
index dd2874e..446687c 100644 (file)
@@ -89,14 +89,15 @@ config AT_HDMAC
          Support the Atmel AHB DMA controller.
 
 config FSL_DMA
-       tristate "Freescale Elo and Elo Plus DMA support"
+       tristate "Freescale Elo series DMA support"
        depends on FSL_SOC
        select DMA_ENGINE
        select ASYNC_TX_ENABLE_CHANNEL_SWITCH
        ---help---
-         Enable support for the Freescale Elo and Elo Plus DMA controllers.
-         The Elo is the DMA controller on some 82xx and 83xx parts, and the
-         Elo Plus is the DMA controller on 85xx and 86xx parts.
+         Enable support for the Freescale Elo series DMA controllers.
+         The Elo is the DMA controller on some mpc82xx and mpc83xx parts, the
+         EloPlus is on mpc85xx and mpc86xx and Pxxx parts, and the Elo3 is on
+         some Txxx and Bxxx parts.
 
 config MPC512X_DMA
        tristate "Freescale MPC512x built-in DMA engine support"
index e51a983..16a2aa2 100644 (file)
@@ -1164,42 +1164,12 @@ static void pl08x_free_txd(struct pl08x_driver_data *pl08x,
        kfree(txd);
 }
 
-static void pl08x_unmap_buffers(struct pl08x_txd *txd)
-{
-       struct device *dev = txd->vd.tx.chan->device->dev;
-       struct pl08x_sg *dsg;
-
-       if (!(txd->vd.tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               if (txd->vd.tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                       list_for_each_entry(dsg, &txd->dsg_list, node)
-                               dma_unmap_single(dev, dsg->src_addr, dsg->len,
-                                               DMA_TO_DEVICE);
-               else {
-                       list_for_each_entry(dsg, &txd->dsg_list, node)
-                               dma_unmap_page(dev, dsg->src_addr, dsg->len,
-                                               DMA_TO_DEVICE);
-               }
-       }
-       if (!(txd->vd.tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               if (txd->vd.tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                       list_for_each_entry(dsg, &txd->dsg_list, node)
-                               dma_unmap_single(dev, dsg->dst_addr, dsg->len,
-                                               DMA_FROM_DEVICE);
-               else
-                       list_for_each_entry(dsg, &txd->dsg_list, node)
-                               dma_unmap_page(dev, dsg->dst_addr, dsg->len,
-                                               DMA_FROM_DEVICE);
-       }
-}
-
 static void pl08x_desc_free(struct virt_dma_desc *vd)
 {
        struct pl08x_txd *txd = to_pl08x_txd(&vd->tx);
        struct pl08x_dma_chan *plchan = to_pl08x_chan(vd->tx.chan);
 
-       if (!plchan->slave)
-               pl08x_unmap_buffers(txd);
-
+       dma_descriptor_unmap(txd);
        if (!txd->done)
                pl08x_release_mux(plchan);
 
@@ -1252,7 +1222,7 @@ static enum dma_status pl08x_dma_tx_status(struct dma_chan *chan,
        size_t bytes = 0;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        /*
@@ -1267,7 +1237,7 @@ static enum dma_status pl08x_dma_tx_status(struct dma_chan *chan,
 
        spin_lock_irqsave(&plchan->vc.lock, flags);
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret != DMA_SUCCESS) {
+       if (ret != DMA_COMPLETE) {
                vd = vchan_find_desc(&plchan->vc, cookie);
                if (vd) {
                        /* On the issued list, so hasn't been processed yet */
@@ -2138,8 +2108,7 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id)
        writel(0x000000FF, pl08x->base + PL080_ERR_CLEAR);
        writel(0x000000FF, pl08x->base + PL080_TC_CLEAR);
 
-       ret = request_irq(adev->irq[0], pl08x_irq, IRQF_DISABLED,
-                         DRIVER_NAME, pl08x);
+       ret = request_irq(adev->irq[0], pl08x_irq, 0, DRIVER_NAME, pl08x);
        if (ret) {
                dev_err(&adev->dev, "%s failed to request interrupt %d\n",
                        __func__, adev->irq[0]);
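
Dropping IRQF_DISABLED in the request_irq() call above is not a behavior change: since the genirq rework, all handlers run with interrupts disabled and the flag is a no-op, so passing 0 is equivalent. Sketch with hypothetical names:

    ret = request_irq(irq, my_handler, 0 /* was IRQF_DISABLED, long a no-op */,
                      "my-device", my_dev);
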
index c787f38..e2c04dc 100644 (file)
@@ -344,31 +344,7 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
        /* move myself to free_list */
        list_move(&desc->desc_node, &atchan->free_list);
 
-       /* unmap dma addresses (not on slave channels) */
-       if (!atchan->chan_common.private) {
-               struct device *parent = chan2parent(&atchan->chan_common);
-               if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                       if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                               dma_unmap_single(parent,
-                                               desc->lli.daddr,
-                                               desc->len, DMA_FROM_DEVICE);
-                       else
-                               dma_unmap_page(parent,
-                                               desc->lli.daddr,
-                                               desc->len, DMA_FROM_DEVICE);
-               }
-               if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                       if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                               dma_unmap_single(parent,
-                                               desc->lli.saddr,
-                                               desc->len, DMA_TO_DEVICE);
-                       else
-                               dma_unmap_page(parent,
-                                               desc->lli.saddr,
-                                               desc->len, DMA_TO_DEVICE);
-               }
-       }
-
+       dma_descriptor_unmap(txd);
        /* for cyclic transfers,
         * no need to replay callback function while stopping */
        if (!atc_chan_is_cyclic(atchan)) {
@@ -1102,7 +1078,7 @@ atc_tx_status(struct dma_chan *chan,
        int bytes = 0;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
        /*
         * There's no point calculating the residue if there's
index 31011d2..3c6716e 100644 (file)
@@ -2369,7 +2369,7 @@ coh901318_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
        enum dma_status ret;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        dma_set_residue(txstate, coh901318_get_bytes_left(chan));
@@ -2694,7 +2694,7 @@ static int __init coh901318_probe(struct platform_device *pdev)
        if (irq < 0)
                return irq;
 
-       err = devm_request_irq(&pdev->dev, irq, dma_irq_handler, IRQF_DISABLED,
+       err = devm_request_irq(&pdev->dev, irq, dma_irq_handler, 0,
                               "coh901318", base);
        if (err)
                return err;
index 7c82b92..c29dacf 100644 (file)
@@ -141,6 +141,9 @@ struct cppi41_dd {
        const struct chan_queues *queues_rx;
        const struct chan_queues *queues_tx;
        struct chan_queues td_queue;
+
+       /* context for suspend/resume */
+       unsigned int dma_tdfdq;
 };
 
 #define FIST_COMPLETION_QUEUE  93
@@ -263,6 +266,15 @@ static u32 pd_trans_len(u32 val)
        return val & ((1 << (DESC_LENGTH_BITS_NUM + 1)) - 1);
 }
 
+static u32 cppi41_pop_desc(struct cppi41_dd *cdd, unsigned queue_num)
+{
+       u32 desc;
+
+       desc = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(queue_num));
+       desc &= ~0x1f;
+       return desc;
+}
+
 static irqreturn_t cppi41_irq(int irq, void *data)
 {
        struct cppi41_dd *cdd = data;
@@ -300,8 +312,7 @@ static irqreturn_t cppi41_irq(int irq, void *data)
                        q_num = __fls(val);
                        val &= ~(1 << q_num);
                        q_num += 32 * i;
-                       desc = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(q_num));
-                       desc &= ~0x1f;
+                       desc = cppi41_pop_desc(cdd, q_num);
                        c = desc_to_chan(cdd, desc);
                        if (WARN_ON(!c)) {
                                pr_err("%s() q %d desc %08x\n", __func__,
@@ -353,7 +364,7 @@ static enum dma_status cppi41_dma_tx_status(struct dma_chan *chan,
 
        /* lock */
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (txstate && ret == DMA_SUCCESS)
+       if (txstate && ret == DMA_COMPLETE)
                txstate->residue = c->residue;
        /* unlock */
 
@@ -517,15 +528,6 @@ static void cppi41_compute_td_desc(struct cppi41_desc *d)
        d->pd0 = DESC_TYPE_TEARD << DESC_TYPE;
 }
 
-static u32 cppi41_pop_desc(struct cppi41_dd *cdd, unsigned queue_num)
-{
-       u32 desc;
-
-       desc = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(queue_num));
-       desc &= ~0x1f;
-       return desc;
-}
-
 static int cppi41_tear_down_chan(struct cppi41_channel *c)
 {
        struct cppi41_dd *cdd = c->cdd;
@@ -561,36 +563,26 @@ static int cppi41_tear_down_chan(struct cppi41_channel *c)
                c->td_retry = 100;
        }
 
-       if (!c->td_seen) {
-               unsigned td_comp_queue;
+       if (!c->td_seen || !c->td_desc_seen) {
 
-               if (c->is_tx)
-                       td_comp_queue =  cdd->td_queue.complete;
-               else
-                       td_comp_queue =  c->q_comp_num;
+               desc_phys = cppi41_pop_desc(cdd, cdd->td_queue.complete);
+               if (!desc_phys)
+                       desc_phys = cppi41_pop_desc(cdd, c->q_comp_num);
 
-               desc_phys = cppi41_pop_desc(cdd, td_comp_queue);
-               if (desc_phys) {
-                       __iormb();
+               if (desc_phys == c->desc_phys) {
+                       c->td_desc_seen = 1;
+
+               } else if (desc_phys == td_desc_phys) {
+                       u32 pd0;
 
-                       if (desc_phys == td_desc_phys) {
-                               u32 pd0;
-                               pd0 = td->pd0;
-                               WARN_ON((pd0 >> DESC_TYPE) != DESC_TYPE_TEARD);
-                               WARN_ON(!c->is_tx && !(pd0 & TD_DESC_IS_RX));
-                               WARN_ON((pd0 & 0x1f) != c->port_num);
-                       } else {
-                               WARN_ON_ONCE(1);
-                       }
-                       c->td_seen = 1;
-               }
-       }
-       if (!c->td_desc_seen) {
-               desc_phys = cppi41_pop_desc(cdd, c->q_comp_num);
-               if (desc_phys) {
                        __iormb();
-                       WARN_ON(c->desc_phys != desc_phys);
-                       c->td_desc_seen = 1;
+                       pd0 = td->pd0;
+                       WARN_ON((pd0 >> DESC_TYPE) != DESC_TYPE_TEARD);
+                       WARN_ON(!c->is_tx && !(pd0 & TD_DESC_IS_RX));
+                       WARN_ON((pd0 & 0x1f) != c->port_num);
+                       c->td_seen = 1;
+               } else if (desc_phys) {
+                       WARN_ON_ONCE(1);
                }
        }
        c->td_retry--;
@@ -609,7 +601,7 @@ static int cppi41_tear_down_chan(struct cppi41_channel *c)
 
        WARN_ON(!c->td_retry);
        if (!c->td_desc_seen) {
-               desc_phys = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(c->q_num));
+               desc_phys = cppi41_pop_desc(cdd, c->q_num);
                WARN_ON(!desc_phys);
        }
 
@@ -674,14 +666,14 @@ static void cleanup_chans(struct cppi41_dd *cdd)
        }
 }
 
-static int cppi41_add_chans(struct platform_device *pdev, struct cppi41_dd *cdd)
+static int cppi41_add_chans(struct device *dev, struct cppi41_dd *cdd)
 {
        struct cppi41_channel *cchan;
        int i;
        int ret;
        u32 n_chans;
 
-       ret = of_property_read_u32(pdev->dev.of_node, "#dma-channels",
+       ret = of_property_read_u32(dev->of_node, "#dma-channels",
                        &n_chans);
        if (ret)
                return ret;
@@ -719,7 +711,7 @@ err:
        return -ENOMEM;
 }
 
-static void purge_descs(struct platform_device *pdev, struct cppi41_dd *cdd)
+static void purge_descs(struct device *dev, struct cppi41_dd *cdd)
 {
        unsigned int mem_decs;
        int i;
@@ -731,7 +723,7 @@ static void purge_descs(struct platform_device *pdev, struct cppi41_dd *cdd)
                cppi_writel(0, cdd->qmgr_mem + QMGR_MEMBASE(i));
                cppi_writel(0, cdd->qmgr_mem + QMGR_MEMCTRL(i));
 
-               dma_free_coherent(&pdev->dev, mem_decs, cdd->cd,
+               dma_free_coherent(dev, mem_decs, cdd->cd,
                                cdd->descs_phys);
        }
 }
@@ -741,19 +733,19 @@ static void disable_sched(struct cppi41_dd *cdd)
        cppi_writel(0, cdd->sched_mem + DMA_SCHED_CTRL);
 }
 
-static void deinit_cpii41(struct platform_device *pdev, struct cppi41_dd *cdd)
+static void deinit_cppi41(struct device *dev, struct cppi41_dd *cdd)
 {
        disable_sched(cdd);
 
-       purge_descs(pdev, cdd);
+       purge_descs(dev, cdd);
 
        cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM0_BASE);
        cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM0_BASE);
-       dma_free_coherent(&pdev->dev, QMGR_SCRATCH_SIZE, cdd->qmgr_scratch,
+       dma_free_coherent(dev, QMGR_SCRATCH_SIZE, cdd->qmgr_scratch,
                        cdd->scratch_phys);
 }
 
-static int init_descs(struct platform_device *pdev, struct cppi41_dd *cdd)
+static int init_descs(struct device *dev, struct cppi41_dd *cdd)
 {
        unsigned int desc_size;
        unsigned int mem_decs;
@@ -777,7 +769,7 @@ static int init_descs(struct platform_device *pdev, struct cppi41_dd *cdd)
                reg |= ilog2(ALLOC_DECS_NUM) - 5;
 
                BUILD_BUG_ON(DESCS_AREAS != 1);
-               cdd->cd = dma_alloc_coherent(&pdev->dev, mem_decs,
+               cdd->cd = dma_alloc_coherent(dev, mem_decs,
                                &cdd->descs_phys, GFP_KERNEL);
                if (!cdd->cd)
                        return -ENOMEM;
@@ -813,12 +805,12 @@ static void init_sched(struct cppi41_dd *cdd)
        cppi_writel(reg, cdd->sched_mem + DMA_SCHED_CTRL);
 }
 
-static int init_cppi41(struct platform_device *pdev, struct cppi41_dd *cdd)
+static int init_cppi41(struct device *dev, struct cppi41_dd *cdd)
 {
        int ret;
 
        BUILD_BUG_ON(QMGR_SCRATCH_SIZE > ((1 << 14) - 1));
-       cdd->qmgr_scratch = dma_alloc_coherent(&pdev->dev, QMGR_SCRATCH_SIZE,
+       cdd->qmgr_scratch = dma_alloc_coherent(dev, QMGR_SCRATCH_SIZE,
                        &cdd->scratch_phys, GFP_KERNEL);
        if (!cdd->qmgr_scratch)
                return -ENOMEM;
@@ -827,7 +819,7 @@ static int init_cppi41(struct platform_device *pdev, struct cppi41_dd *cdd)
        cppi_writel(QMGR_SCRATCH_SIZE, cdd->qmgr_mem + QMGR_LRAM_SIZE);
        cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM1_BASE);
 
-       ret = init_descs(pdev, cdd);
+       ret = init_descs(dev, cdd);
        if (ret)
                goto err_td;
 
@@ -835,7 +827,7 @@ static int init_cppi41(struct platform_device *pdev, struct cppi41_dd *cdd)
        init_sched(cdd);
        return 0;
 err_td:
-       deinit_cpii41(pdev, cdd);
+       deinit_cppi41(dev, cdd);
        return ret;
 }
 
@@ -914,11 +906,11 @@ static const struct of_device_id cppi41_dma_ids[] = {
 };
 MODULE_DEVICE_TABLE(of, cppi41_dma_ids);
 
-static const struct cppi_glue_infos *get_glue_info(struct platform_device *pdev)
+static const struct cppi_glue_infos *get_glue_info(struct device *dev)
 {
        const struct of_device_id *of_id;
 
-       of_id = of_match_node(cppi41_dma_ids, pdev->dev.of_node);
+       of_id = of_match_node(cppi41_dma_ids, dev->of_node);
        if (!of_id)
                return NULL;
        return of_id->data;
@@ -927,11 +919,12 @@ static const struct cppi_glue_infos *get_glue_info(struct platform_device *pdev)
 static int cppi41_dma_probe(struct platform_device *pdev)
 {
        struct cppi41_dd *cdd;
+       struct device *dev = &pdev->dev;
        const struct cppi_glue_infos *glue_info;
        int irq;
        int ret;
 
-       glue_info = get_glue_info(pdev);
+       glue_info = get_glue_info(dev);
        if (!glue_info)
                return -EINVAL;
 
@@ -946,14 +939,14 @@ static int cppi41_dma_probe(struct platform_device *pdev)
        cdd->ddev.device_issue_pending = cppi41_dma_issue_pending;
        cdd->ddev.device_prep_slave_sg = cppi41_dma_prep_slave_sg;
        cdd->ddev.device_control = cppi41_dma_control;
-       cdd->ddev.dev = &pdev->dev;
+       cdd->ddev.dev = dev;
        INIT_LIST_HEAD(&cdd->ddev.channels);
        cpp41_dma_info.dma_cap = cdd->ddev.cap_mask;
 
-       cdd->usbss_mem = of_iomap(pdev->dev.of_node, 0);
-       cdd->ctrl_mem = of_iomap(pdev->dev.of_node, 1);
-       cdd->sched_mem = of_iomap(pdev->dev.of_node, 2);
-       cdd->qmgr_mem = of_iomap(pdev->dev.of_node, 3);
+       cdd->usbss_mem = of_iomap(dev->of_node, 0);
+       cdd->ctrl_mem = of_iomap(dev->of_node, 1);
+       cdd->sched_mem = of_iomap(dev->of_node, 2);
+       cdd->qmgr_mem = of_iomap(dev->of_node, 3);
 
        if (!cdd->usbss_mem || !cdd->ctrl_mem || !cdd->sched_mem ||
                        !cdd->qmgr_mem) {
@@ -961,31 +954,31 @@ static int cppi41_dma_probe(struct platform_device *pdev)
                goto err_remap;
        }
 
-       pm_runtime_enable(&pdev->dev);
-       ret = pm_runtime_get_sync(&pdev->dev);
-       if (ret)
+       pm_runtime_enable(dev);
+       ret = pm_runtime_get_sync(dev);
+       if (ret < 0)
                goto err_get_sync;
 
        cdd->queues_rx = glue_info->queues_rx;
        cdd->queues_tx = glue_info->queues_tx;
        cdd->td_queue = glue_info->td_queue;
 
-       ret = init_cppi41(pdev, cdd);
+       ret = init_cppi41(dev, cdd);
        if (ret)
                goto err_init_cppi;
 
-       ret = cppi41_add_chans(pdev, cdd);
+       ret = cppi41_add_chans(dev, cdd);
        if (ret)
                goto err_chans;
 
-       irq = irq_of_parse_and_map(pdev->dev.of_node, 0);
-       if (!irq)
+       irq = irq_of_parse_and_map(dev->of_node, 0);
+       if (!irq) {
+               ret = -EINVAL;
                goto err_irq;
+       }
 
        cppi_writel(USBSS_IRQ_PD_COMP, cdd->usbss_mem + USBSS_IRQ_ENABLER);
 
        ret = request_irq(irq, glue_info->isr, IRQF_SHARED,
-                       dev_name(&pdev->dev), cdd);
+                       dev_name(dev), cdd);
        if (ret)
                goto err_irq;
        cdd->irq = irq;
@@ -994,7 +987,7 @@ static int cppi41_dma_probe(struct platform_device *pdev)
        if (ret)
                goto err_dma_reg;
 
-       ret = of_dma_controller_register(pdev->dev.of_node,
+       ret = of_dma_controller_register(dev->of_node,
                        cppi41_dma_xlate, &cpp41_dma_info);
        if (ret)
                goto err_of;
@@ -1009,11 +1002,11 @@ err_irq:
        cppi_writel(0, cdd->usbss_mem + USBSS_IRQ_CLEARR);
        cleanup_chans(cdd);
 err_chans:
-       deinit_cpii41(pdev, cdd);
+       deinit_cppi41(dev, cdd);
 err_init_cppi:
-       pm_runtime_put(&pdev->dev);
+       pm_runtime_put(dev);
 err_get_sync:
-       pm_runtime_disable(&pdev->dev);
+       pm_runtime_disable(dev);
        iounmap(cdd->usbss_mem);
        iounmap(cdd->ctrl_mem);
        iounmap(cdd->sched_mem);
@@ -1033,7 +1026,7 @@ static int cppi41_dma_remove(struct platform_device *pdev)
        cppi_writel(0, cdd->usbss_mem + USBSS_IRQ_CLEARR);
        free_irq(cdd->irq, cdd);
        cleanup_chans(cdd);
-       deinit_cpii41(pdev, cdd);
+       deinit_cppi41(&pdev->dev, cdd);
        iounmap(cdd->usbss_mem);
        iounmap(cdd->ctrl_mem);
        iounmap(cdd->sched_mem);
@@ -1044,12 +1037,53 @@ static int cppi41_dma_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int cppi41_suspend(struct device *dev)
+{
+       struct cppi41_dd *cdd = dev_get_drvdata(dev);
+
+       cdd->dma_tdfdq = cppi_readl(cdd->ctrl_mem + DMA_TDFDQ);
+       cppi_writel(0, cdd->usbss_mem + USBSS_IRQ_CLEARR);
+       disable_sched(cdd);
+
+       return 0;
+}
+
+static int cppi41_resume(struct device *dev)
+{
+       struct cppi41_dd *cdd = dev_get_drvdata(dev);
+       struct cppi41_channel *c;
+       int i;
+
+       for (i = 0; i < DESCS_AREAS; i++)
+               cppi_writel(cdd->descs_phys, cdd->qmgr_mem + QMGR_MEMBASE(i));
+
+       list_for_each_entry(c, &cdd->ddev.channels, chan.device_node)
+               if (!c->is_tx)
+                       cppi_writel(c->q_num, c->gcr_reg + RXHPCRA0);
+
+       init_sched(cdd);
+
+       cppi_writel(cdd->dma_tdfdq, cdd->ctrl_mem + DMA_TDFDQ);
+       cppi_writel(cdd->scratch_phys, cdd->qmgr_mem + QMGR_LRAM0_BASE);
+       cppi_writel(QMGR_SCRATCH_SIZE, cdd->qmgr_mem + QMGR_LRAM_SIZE);
+       cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM1_BASE);
+
+       cppi_writel(USBSS_IRQ_PD_COMP, cdd->usbss_mem + USBSS_IRQ_ENABLER);
+
+       return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(cppi41_pm_ops, cppi41_suspend, cppi41_resume);
+
 static struct platform_driver cpp41_dma_driver = {
        .probe  = cppi41_dma_probe,
        .remove = cppi41_dma_remove,
        .driver = {
                .name = "cppi41-dma-engine",
                .owner = THIS_MODULE,
+               .pm = &cppi41_pm_ops,
                .of_match_table = of_match_ptr(cppi41_dma_ids),
        },
 };
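
One fix above is easy to miss: pm_runtime_get_sync() returns 1, not 0, when the device was already active, so the old `if (ret)` treated that success as a failure. Only negative returns are errors. A hedged sketch of the corrected check:

    ret = pm_runtime_get_sync(dev);
    if (ret < 0) {                  /* ret > 0 just means 'already resumed' */
            pm_runtime_put_noidle(dev);     /* one way to rebalance the usage count */
            return ret;
    }
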
index b0c0c82..94c380f 100644 (file)
@@ -491,7 +491,7 @@ static enum dma_status jz4740_dma_tx_status(struct dma_chan *c,
        unsigned long flags;
 
        status = dma_cookie_status(c, cookie, state);
-       if (status == DMA_SUCCESS || !state)
+       if (status == DMA_COMPLETE || !state)
                return status;
 
        spin_lock_irqsave(&chan->vchan.lock, flags);
index 9162ac8..ea806bd 100644 (file)
@@ -65,6 +65,7 @@
 #include <linux/acpi.h>
 #include <linux/acpi_dma.h>
 #include <linux/of_dma.h>
+#include <linux/mempool.h>
 
 static DEFINE_MUTEX(dma_list_mutex);
 static DEFINE_IDR(dma_idr);
@@ -901,98 +902,132 @@ void dma_async_device_unregister(struct dma_device *device)
 }
 EXPORT_SYMBOL(dma_async_device_unregister);
 
-/**
- * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
- * @chan: DMA channel to offload copy to
- * @dest: destination address (virtual)
- * @src: source address (virtual)
- * @len: length
- *
- * Both @dest and @src must be mappable to a bus address according to the
- * DMA mapping API rules for streaming mappings.
- * Both @dest and @src must stay memory resident (kernel memory or locked
- * user space pages).
- */
-dma_cookie_t
-dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest,
-                       void *src, size_t len)
-{
-       struct dma_device *dev = chan->device;
-       struct dma_async_tx_descriptor *tx;
-       dma_addr_t dma_dest, dma_src;
-       dma_cookie_t cookie;
-       unsigned long flags;
+struct dmaengine_unmap_pool {
+       struct kmem_cache *cache;
+       const char *name;
+       mempool_t *pool;
+       size_t size;
+};
 
-       dma_src = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE);
-       dma_dest = dma_map_single(dev->dev, dest, len, DMA_FROM_DEVICE);
-       flags = DMA_CTRL_ACK |
-               DMA_COMPL_SRC_UNMAP_SINGLE |
-               DMA_COMPL_DEST_UNMAP_SINGLE;
-       tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags);
+#define __UNMAP_POOL(x) { .size = x, .name = "dmaengine-unmap-" __stringify(x) }
+static struct dmaengine_unmap_pool unmap_pool[] = {
+       __UNMAP_POOL(2),
+       #if IS_ENABLED(CONFIG_ASYNC_TX_DMA)
+       __UNMAP_POOL(16),
+       __UNMAP_POOL(128),
+       __UNMAP_POOL(256),
+       #endif
+};
 
-       if (!tx) {
-               dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE);
-               dma_unmap_single(dev->dev, dma_dest, len, DMA_FROM_DEVICE);
-               return -ENOMEM;
+static struct dmaengine_unmap_pool *__get_unmap_pool(int nr)
+{
+       int order = get_count_order(nr);
+
+       switch (order) {
+       case 0 ... 1:
+               return &unmap_pool[0];
+       case 2 ... 4:
+               return &unmap_pool[1];
+       case 5 ... 7:
+               return &unmap_pool[2];
+       case 8:
+               return &unmap_pool[3];
+       default:
+               BUG();
+               return NULL;
        }
+}
 
-       tx->callback = NULL;
-       cookie = tx->tx_submit(tx);
+static void dmaengine_unmap(struct kref *kref)
+{
+       struct dmaengine_unmap_data *unmap = container_of(kref, typeof(*unmap), kref);
+       struct device *dev = unmap->dev;
+       int cnt, i;
+
+       cnt = unmap->to_cnt;
+       for (i = 0; i < cnt; i++)
+               dma_unmap_page(dev, unmap->addr[i], unmap->len,
+                              DMA_TO_DEVICE);
+       cnt += unmap->from_cnt;
+       for (; i < cnt; i++)
+               dma_unmap_page(dev, unmap->addr[i], unmap->len,
+                              DMA_FROM_DEVICE);
+       cnt += unmap->bidi_cnt;
+       for (; i < cnt; i++) {
+               if (unmap->addr[i] == 0)
+                       continue;
+               dma_unmap_page(dev, unmap->addr[i], unmap->len,
+                              DMA_BIDIRECTIONAL);
+       }
+       mempool_free(unmap, __get_unmap_pool(cnt)->pool);
+}
 
-       preempt_disable();
-       __this_cpu_add(chan->local->bytes_transferred, len);
-       __this_cpu_inc(chan->local->memcpy_count);
-       preempt_enable();
+void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap)
+{
+       if (unmap)
+               kref_put(&unmap->kref, dmaengine_unmap);
+}
+EXPORT_SYMBOL_GPL(dmaengine_unmap_put);
 
-       return cookie;
+static void dmaengine_destroy_unmap_pool(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(unmap_pool); i++) {
+               struct dmaengine_unmap_pool *p = &unmap_pool[i];
+
+               if (p->pool)
+                       mempool_destroy(p->pool);
+               p->pool = NULL;
+               if (p->cache)
+                       kmem_cache_destroy(p->cache);
+               p->cache = NULL;
+       }
 }
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
 
-/**
- * dma_async_memcpy_buf_to_pg - offloaded copy from address to page
- * @chan: DMA channel to offload copy to
- * @page: destination page
- * @offset: offset in page to copy to
- * @kdata: source address (virtual)
- * @len: length
- *
- * Both @page/@offset and @kdata must be mappable to a bus address according
- * to the DMA mapping API rules for streaming mappings.
- * Both @page/@offset and @kdata must stay memory resident (kernel memory or
- * locked user space pages)
- */
-dma_cookie_t
-dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page,
-                       unsigned int offset, void *kdata, size_t len)
+static int __init dmaengine_init_unmap_pool(void)
 {
-       struct dma_device *dev = chan->device;
-       struct dma_async_tx_descriptor *tx;
-       dma_addr_t dma_dest, dma_src;
-       dma_cookie_t cookie;
-       unsigned long flags;
+       int i;
 
-       dma_src = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE);
-       dma_dest = dma_map_page(dev->dev, page, offset, len, DMA_FROM_DEVICE);
-       flags = DMA_CTRL_ACK | DMA_COMPL_SRC_UNMAP_SINGLE;
-       tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags);
+       for (i = 0; i < ARRAY_SIZE(unmap_pool); i++) {
+               struct dmaengine_unmap_pool *p = &unmap_pool[i];
+               size_t size;
 
-       if (!tx) {
-               dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE);
-               dma_unmap_page(dev->dev, dma_dest, len, DMA_FROM_DEVICE);
-               return -ENOMEM;
+               size = sizeof(struct dmaengine_unmap_data) +
+                      sizeof(dma_addr_t) * p->size;
+
+               p->cache = kmem_cache_create(p->name, size, 0,
+                                            SLAB_HWCACHE_ALIGN, NULL);
+               if (!p->cache)
+                       break;
+               p->pool = mempool_create_slab_pool(1, p->cache);
+               if (!p->pool)
+                       break;
        }
 
-       tx->callback = NULL;
-       cookie = tx->tx_submit(tx);
+       if (i == ARRAY_SIZE(unmap_pool))
+               return 0;
 
-       preempt_disable();
-       __this_cpu_add(chan->local->bytes_transferred, len);
-       __this_cpu_inc(chan->local->memcpy_count);
-       preempt_enable();
+       dmaengine_destroy_unmap_pool();
+       return -ENOMEM;
+}
 
-       return cookie;
+struct dmaengine_unmap_data *
+dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags)
+{
+       struct dmaengine_unmap_data *unmap;
+
+       unmap = mempool_alloc(__get_unmap_pool(nr)->pool, flags);
+       if (!unmap)
+               return NULL;
+
+       memset(unmap, 0, sizeof(*unmap));
+       kref_init(&unmap->kref);
+       unmap->dev = dev;
+
+       return unmap;
 }
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dmaengine_get_unmap_data);
 
 /**
  * dma_async_memcpy_pg_to_pg - offloaded copy from page to page
@@ -1015,24 +1050,33 @@ dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg,
 {
        struct dma_device *dev = chan->device;
        struct dma_async_tx_descriptor *tx;
-       dma_addr_t dma_dest, dma_src;
+       struct dmaengine_unmap_data *unmap;
        dma_cookie_t cookie;
        unsigned long flags;
 
-       dma_src = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE);
-       dma_dest = dma_map_page(dev->dev, dest_pg, dest_off, len,
-                               DMA_FROM_DEVICE);
+       unmap = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOIO);
+       if (!unmap)
+               return -ENOMEM;
+
+       unmap->to_cnt = 1;
+       unmap->from_cnt = 1;
+       unmap->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len,
+                                     DMA_TO_DEVICE);
+       unmap->addr[1] = dma_map_page(dev->dev, dest_pg, dest_off, len,
+                                     DMA_FROM_DEVICE);
+       unmap->len = len;
        flags = DMA_CTRL_ACK;
-       tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags);
+       tx = dev->device_prep_dma_memcpy(chan, unmap->addr[1], unmap->addr[0],
+                                        len, flags);
 
        if (!tx) {
-               dma_unmap_page(dev->dev, dma_src, len, DMA_TO_DEVICE);
-               dma_unmap_page(dev->dev, dma_dest, len, DMA_FROM_DEVICE);
+               dmaengine_unmap_put(unmap);
                return -ENOMEM;
        }
 
-       tx->callback = NULL;
+       dma_set_unmap(tx, unmap);
        cookie = tx->tx_submit(tx);
+       dmaengine_unmap_put(unmap);
 
        preempt_disable();
        __this_cpu_add(chan->local->bytes_transferred, len);
@@ -1043,6 +1087,52 @@ dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg,
 }
 EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
 
+/**
+ * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
+ * @chan: DMA channel to offload copy to
+ * @dest: destination address (virtual)
+ * @src: source address (virtual)
+ * @len: length
+ *
+ * Both @dest and @src must be mappable to a bus address according to the
+ * DMA mapping API rules for streaming mappings.
+ * Both @dest and @src must stay memory resident (kernel memory or locked
+ * user space pages).
+ */
+dma_cookie_t
+dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest,
+                           void *src, size_t len)
+{
+       return dma_async_memcpy_pg_to_pg(chan, virt_to_page(dest),
+                                        (unsigned long) dest & ~PAGE_MASK,
+                                        virt_to_page(src),
+                                        (unsigned long) src & ~PAGE_MASK, len);
+}
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+
+/**
+ * dma_async_memcpy_buf_to_pg - offloaded copy from address to page
+ * @chan: DMA channel to offload copy to
+ * @page: destination page
+ * @offset: offset in page to copy to
+ * @kdata: source address (virtual)
+ * @len: length
+ *
+ * Both @page/@offset and @kdata must be mappable to a bus address according
+ * to the DMA mapping API rules for streaming mappings.
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or
+ * locked user space pages)
+ */
+dma_cookie_t
+dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page,
+                          unsigned int offset, void *kdata, size_t len)
+{
+       return dma_async_memcpy_pg_to_pg(chan, page, offset,
+                                        virt_to_page(kdata),
+                                        (unsigned long) kdata & ~PAGE_MASK, len);
+}
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+
 void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
        struct dma_chan *chan)
 {
@@ -1062,7 +1152,7 @@ dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
        unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000);
 
        if (!tx)
-               return DMA_SUCCESS;
+               return DMA_COMPLETE;
 
        while (tx->cookie == -EBUSY) {
                if (time_after_eq(jiffies, dma_sync_wait_timeout)) {
@@ -1116,6 +1206,10 @@ EXPORT_SYMBOL_GPL(dma_run_dependencies);
 
 static int __init dma_bus_init(void)
 {
+       int err = dmaengine_init_unmap_pool();
+
+       if (err)
+               return err;
        return class_register(&dma_devclass);
 }
 arch_initcall(dma_bus_init);
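
The unmap objects introduced here are reference counted, which is what lets dma_async_memcpy_pg_to_pg() drop its reference right after submit: dma_set_unmap() takes a reference on behalf of the descriptor, and the completion path puts the last one, triggering dmaengine_unmap() and the real dma_unmap_page() calls. A sketch of the lifetime, using the calls added above (dmaengine_submit() is the standard wrapper for tx->tx_submit()):

    unmap = dmaengine_get_unmap_data(dev, 2, GFP_NOWAIT);  /* kref == 1 */
    /* ...fill unmap->addr[], to_cnt/from_cnt, len; prep the descriptor... */
    dma_set_unmap(tx, unmap);       /* descriptor takes a reference, kref == 2 */
    cookie = dmaengine_submit(tx);
    dmaengine_unmap_put(unmap);     /* drop the caller's reference, kref == 1 */
    /* completion later puts the final reference and unmaps all addresses */
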
index 92f796c..20f9a3a 100644 (file)
@@ -8,6 +8,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/wait.h>
-#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
-#include <linux/seq_file.h>
 
 static unsigned int test_buf_size = 16384;
 module_param(test_buf_size, uint, S_IRUGO | S_IWUSR);
@@ -68,92 +66,13 @@ module_param(timeout, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(timeout, "Transfer Timeout in msec (default: 3000), "
                 "Pass -1 for infinite timeout");
 
-/* Maximum amount of mismatched bytes in buffer to print */
-#define MAX_ERROR_COUNT                32
-
-/*
- * Initialization patterns. All bytes in the source buffer has bit 7
- * set, all bytes in the destination buffer has bit 7 cleared.
- *
- * Bit 6 is set for all bytes which are to be copied by the DMA
- * engine. Bit 5 is set for all bytes which are to be overwritten by
- * the DMA engine.
- *
- * The remaining bits are the inverse of a counter which increments by
- * one for each byte address.
- */
-#define PATTERN_SRC            0x80
-#define PATTERN_DST            0x00
-#define PATTERN_COPY           0x40
-#define PATTERN_OVERWRITE      0x20
-#define PATTERN_COUNT_MASK     0x1f
-
-enum dmatest_error_type {
-       DMATEST_ET_OK,
-       DMATEST_ET_MAP_SRC,
-       DMATEST_ET_MAP_DST,
-       DMATEST_ET_PREP,
-       DMATEST_ET_SUBMIT,
-       DMATEST_ET_TIMEOUT,
-       DMATEST_ET_DMA_ERROR,
-       DMATEST_ET_DMA_IN_PROGRESS,
-       DMATEST_ET_VERIFY,
-       DMATEST_ET_VERIFY_BUF,
-};
-
-struct dmatest_verify_buffer {
-       unsigned int    index;
-       u8              expected;
-       u8              actual;
-};
-
-struct dmatest_verify_result {
-       unsigned int                    error_count;
-       struct dmatest_verify_buffer    data[MAX_ERROR_COUNT];
-       u8                              pattern;
-       bool                            is_srcbuf;
-};
-
-struct dmatest_thread_result {
-       struct list_head        node;
-       unsigned int            n;
-       unsigned int            src_off;
-       unsigned int            dst_off;
-       unsigned int            len;
-       enum dmatest_error_type type;
-       union {
-               unsigned long                   data;
-               dma_cookie_t                    cookie;
-               enum dma_status                 status;
-               int                             error;
-               struct dmatest_verify_result    *vr;
-       };
-};
-
-struct dmatest_result {
-       struct list_head        node;
-       char                    *name;
-       struct list_head        results;
-};
-
-struct dmatest_info;
-
-struct dmatest_thread {
-       struct list_head        node;
-       struct dmatest_info     *info;
-       struct task_struct      *task;
-       struct dma_chan         *chan;
-       u8                      **srcs;
-       u8                      **dsts;
-       enum dma_transaction_type type;
-       bool                    done;
-};
+static bool noverify;
+module_param(noverify, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(noverify, "Disable random data setup and verification");
 
-struct dmatest_chan {
-       struct list_head        node;
-       struct dma_chan         *chan;
-       struct list_head        threads;
-};
+static bool verbose;
+module_param(verbose, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(verbose, "Enable \"success\" result messages (default: off)");
 
 /**
  * struct dmatest_params - test parameters.
@@ -177,6 +96,7 @@ struct dmatest_params {
        unsigned int    xor_sources;
        unsigned int    pq_sources;
        int             timeout;
+       bool            noverify;
 };
 
 /**
@@ -184,7 +104,7 @@ struct dmatest_params {
  * @params:            test parameters
  * @lock:              access protection to the fields of this structure
  */
-struct dmatest_info {
+static struct dmatest_info {
        /* Test parameters */
        struct dmatest_params   params;
 
@@ -192,16 +112,95 @@ struct dmatest_info {
        struct list_head        channels;
        unsigned int            nr_channels;
        struct mutex            lock;
+       bool                    did_init;
+} test_info = {
+       .channels = LIST_HEAD_INIT(test_info.channels),
+       .lock = __MUTEX_INITIALIZER(test_info.lock),
+};
+
+static int dmatest_run_set(const char *val, const struct kernel_param *kp);
+static int dmatest_run_get(char *val, const struct kernel_param *kp);
+static struct kernel_param_ops run_ops = {
+       .set = dmatest_run_set,
+       .get = dmatest_run_get,
+};
+static bool dmatest_run;
+module_param_cb(run, &run_ops, &dmatest_run, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(run, "Run the test (default: false)");
+
+/* Maximum amount of mismatched bytes in buffer to print */
+#define MAX_ERROR_COUNT                32
+
+/*
+ * Initialization patterns. All bytes in the source buffer have bit 7
+ * set; all bytes in the destination buffer have bit 7 cleared.
+ *
+ * Bit 6 is set for all bytes which are to be copied by the DMA
+ * engine. Bit 5 is set for all bytes which are to be overwritten by
+ * the DMA engine.
+ *
+ * The remaining bits are the inverse of a counter which increments by
+ * one for each byte address.
+ */
+#define PATTERN_SRC            0x80
+#define PATTERN_DST            0x00
+#define PATTERN_COPY           0x40
+#define PATTERN_OVERWRITE      0x20
+#define PATTERN_COUNT_MASK     0x1f
 
-       /* debugfs related stuff */
-       struct dentry           *root;
+struct dmatest_thread {
+       struct list_head        node;
+       struct dmatest_info     *info;
+       struct task_struct      *task;
+       struct dma_chan         *chan;
+       u8                      **srcs;
+       u8                      **dsts;
+       enum dma_transaction_type type;
+       bool                    done;
+};
 
-       /* Test results */
-       struct list_head        results;
-       struct mutex            results_lock;
+struct dmatest_chan {
+       struct list_head        node;
+       struct dma_chan         *chan;
+       struct list_head        threads;
 };
 
-static struct dmatest_info test_info;
+static DECLARE_WAIT_QUEUE_HEAD(thread_wait);
+static bool wait;
+
+static bool is_threaded_test_run(struct dmatest_info *info)
+{
+       struct dmatest_chan *dtc;
+
+       list_for_each_entry(dtc, &info->channels, node) {
+               struct dmatest_thread *thread;
+
+               list_for_each_entry(thread, &dtc->threads, node) {
+                       if (!thread->done)
+                               return true;
+               }
+       }
+
+       return false;
+}
+
+static int dmatest_wait_get(char *val, const struct kernel_param *kp)
+{
+       struct dmatest_info *info = &test_info;
+       struct dmatest_params *params = &info->params;
+
+       if (params->iterations)
+               wait_event(thread_wait, !is_threaded_test_run(info));
+       wait = true;
+       return param_get_bool(val, kp);
+}
+
+static struct kernel_param_ops wait_ops = {
+       .get = dmatest_wait_get,
+       .set = param_set_bool,
+};
+module_param_cb(wait, &wait_ops, &wait, S_IRUGO);
+MODULE_PARM_DESC(wait, "Wait for tests to complete (default: false)");
 
 static bool dmatest_match_channel(struct dmatest_params *params,
                struct dma_chan *chan)
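
A worked example of the patterns above: byte i of a source buffer inside the to-be-copied window is PATTERN_SRC | PATTERN_COPY | (~i & PATTERN_COUNT_MASK), so offsets 0, 1, 2 carry 0xdf, 0xde, 0xdd, and dmatest_mismatch() can classify a failure (source overwritten, destination never copied, and so on) purely from which bits differ. With the new knobs, a typical run per the dmatest documentation is `modprobe dmatest timeout=2000 iterations=1 run=1`. Hypothetical helper for the source pattern:

    static u8 dmatest_src_byte(unsigned int i)      /* illustration only */
    {
            return PATTERN_SRC | PATTERN_COPY | (~i & PATTERN_COUNT_MASK);
    }
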
@@ -223,7 +222,7 @@ static unsigned long dmatest_random(void)
 {
        unsigned long buf;
 
-       get_random_bytes(&buf, sizeof(buf));
+       prandom_bytes(&buf, sizeof(buf));
        return buf;
 }
 
@@ -262,9 +261,31 @@ static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len,
        }
 }
 
-static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs,
-               unsigned int start, unsigned int end, unsigned int counter,
-               u8 pattern, bool is_srcbuf)
+static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index,
+               unsigned int counter, bool is_srcbuf)
+{
+       u8              diff = actual ^ pattern;
+       u8              expected = pattern | (~counter & PATTERN_COUNT_MASK);
+       const char      *thread_name = current->comm;
+
+       if (is_srcbuf)
+               pr_warn("%s: srcbuf[0x%x] overwritten! Expected %02x, got %02x\n",
+                       thread_name, index, expected, actual);
+       else if ((pattern & PATTERN_COPY)
+                       && (diff & (PATTERN_COPY | PATTERN_OVERWRITE)))
+               pr_warn("%s: dstbuf[0x%x] not copied! Expected %02x, got %02x\n",
+                       thread_name, index, expected, actual);
+       else if (diff & PATTERN_SRC)
+               pr_warn("%s: dstbuf[0x%x] was copied! Expected %02x, got %02x\n",
+                       thread_name, index, expected, actual);
+       else
+               pr_warn("%s: dstbuf[0x%x] mismatch! Expected %02x, got %02x\n",
+                       thread_name, index, expected, actual);
+}
+
+static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
+               unsigned int end, unsigned int counter, u8 pattern,
+               bool is_srcbuf)
 {
        unsigned int i;
        unsigned int error_count = 0;
@@ -272,7 +293,6 @@ static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs,
        u8 expected;
        u8 *buf;
        unsigned int counter_orig = counter;
-       struct dmatest_verify_buffer *vb;
 
        for (; (buf = *bufs); bufs++) {
                counter = counter_orig;
@@ -280,12 +300,9 @@ static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs,
                        actual = buf[i];
                        expected = pattern | (~counter & PATTERN_COUNT_MASK);
                        if (actual != expected) {
-                               if (error_count < MAX_ERROR_COUNT && vr) {
-                                       vb = &vr->data[error_count];
-                                       vb->index = i;
-                                       vb->expected = expected;
-                                       vb->actual = actual;
-                               }
+                               if (error_count < MAX_ERROR_COUNT)
+                                       dmatest_mismatch(actual, pattern, i,
+                                                        counter, is_srcbuf);
                                error_count++;
                        }
                        counter++;
@@ -293,7 +310,7 @@ static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs,
        }
 
        if (error_count > MAX_ERROR_COUNT)
-               pr_warning("%s: %u errors suppressed\n",
+               pr_warn("%s: %u errors suppressed\n",
                        current->comm, error_count - MAX_ERROR_COUNT);
 
        return error_count;
@@ -313,20 +330,6 @@ static void dmatest_callback(void *arg)
        wake_up_all(done->wait);
 }
 
-static inline void unmap_src(struct device *dev, dma_addr_t *addr, size_t len,
-                            unsigned int count)
-{
-       while (count--)
-               dma_unmap_single(dev, addr[count], len, DMA_TO_DEVICE);
-}
-
-static inline void unmap_dst(struct device *dev, dma_addr_t *addr, size_t len,
-                            unsigned int count)
-{
-       while (count--)
-               dma_unmap_single(dev, addr[count], len, DMA_BIDIRECTIONAL);
-}
-
 static unsigned int min_odd(unsigned int x, unsigned int y)
 {
        unsigned int val = min(x, y);
@@ -334,172 +337,49 @@ static unsigned int min_odd(unsigned int x, unsigned int y)
        return val % 2 ? val : val - 1;
 }
 
-static char *verify_result_get_one(struct dmatest_verify_result *vr,
-               unsigned int i)
+static void result(const char *err, unsigned int n, unsigned int src_off,
+                  unsigned int dst_off, unsigned int len, unsigned long data)
 {
-       struct dmatest_verify_buffer *vb = &vr->data[i];
-       u8 diff = vb->actual ^ vr->pattern;
-       static char buf[512];
-       char *msg;
-
-       if (vr->is_srcbuf)
-               msg = "srcbuf overwritten!";
-       else if ((vr->pattern & PATTERN_COPY)
-                       && (diff & (PATTERN_COPY | PATTERN_OVERWRITE)))
-               msg = "dstbuf not copied!";
-       else if (diff & PATTERN_SRC)
-               msg = "dstbuf was copied!";
-       else
-               msg = "dstbuf mismatch!";
-
-       snprintf(buf, sizeof(buf) - 1, "%s [0x%x] Expected %02x, got %02x", msg,
-                vb->index, vb->expected, vb->actual);
-
-       return buf;
+       pr_info("%s: result #%u: '%s' with src_off=0x%x dst_off=0x%x len=0x%x (%lu)",
+               current->comm, n, err, src_off, dst_off, len, data);
 }
 
-static char *thread_result_get(const char *name,
-               struct dmatest_thread_result *tr)
+static void dbg_result(const char *err, unsigned int n, unsigned int src_off,
+                      unsigned int dst_off, unsigned int len,
+                      unsigned long data)
 {
-       static const char * const messages[] = {
-               [DMATEST_ET_OK]                 = "No errors",
-               [DMATEST_ET_MAP_SRC]            = "src mapping error",
-               [DMATEST_ET_MAP_DST]            = "dst mapping error",
-               [DMATEST_ET_PREP]               = "prep error",
-               [DMATEST_ET_SUBMIT]             = "submit error",
-               [DMATEST_ET_TIMEOUT]            = "test timed out",
-               [DMATEST_ET_DMA_ERROR]          =
-                       "got completion callback (DMA_ERROR)",
-               [DMATEST_ET_DMA_IN_PROGRESS]    =
-                       "got completion callback (DMA_IN_PROGRESS)",
-               [DMATEST_ET_VERIFY]             = "errors",
-               [DMATEST_ET_VERIFY_BUF]         = "verify errors",
-       };
-       static char buf[512];
-
-       snprintf(buf, sizeof(buf) - 1,
-                "%s: #%u: %s with src_off=0x%x ""dst_off=0x%x len=0x%x (%lu)",
-                name, tr->n, messages[tr->type], tr->src_off, tr->dst_off,
-                tr->len, tr->data);
-
-       return buf;
+       pr_debug("%s: result #%u: '%s' with src_off=0x%x dst_off=0x%x len=0x%x (%lu)",
+                  current->comm, n, err, src_off, dst_off, len, data);
 }
 
-static int thread_result_add(struct dmatest_info *info,
-               struct dmatest_result *r, enum dmatest_error_type type,
-               unsigned int n, unsigned int src_off, unsigned int dst_off,
-               unsigned int len, unsigned long data)
-{
-       struct dmatest_thread_result *tr;
-
-       tr = kzalloc(sizeof(*tr), GFP_KERNEL);
-       if (!tr)
-               return -ENOMEM;
-
-       tr->type = type;
-       tr->n = n;
-       tr->src_off = src_off;
-       tr->dst_off = dst_off;
-       tr->len = len;
-       tr->data = data;
+#define verbose_result(err, n, src_off, dst_off, len, data) ({ \
+       if (verbose) \
+               result(err, n, src_off, dst_off, len, data); \
+       else \
+               dbg_result(err, n, src_off, dst_off, len, data); \
+})
 
-       mutex_lock(&info->results_lock);
-       list_add_tail(&tr->node, &r->results);
-       mutex_unlock(&info->results_lock);
-
-       if (tr->type == DMATEST_ET_OK)
-               pr_debug("%s\n", thread_result_get(r->name, tr));
-       else
-               pr_warn("%s\n", thread_result_get(r->name, tr));
-
-       return 0;
-}
-
-static unsigned int verify_result_add(struct dmatest_info *info,
-               struct dmatest_result *r, unsigned int n,
-               unsigned int src_off, unsigned int dst_off, unsigned int len,
-               u8 **bufs, int whence, unsigned int counter, u8 pattern,
-               bool is_srcbuf)
+static unsigned long long dmatest_persec(s64 runtime, unsigned int val)
 {
-       struct dmatest_verify_result *vr;
-       unsigned int error_count;
-       unsigned int buf_off = is_srcbuf ? src_off : dst_off;
-       unsigned int start, end;
-
-       if (whence < 0) {
-               start = 0;
-               end = buf_off;
-       } else if (whence > 0) {
-               start = buf_off + len;
-               end = info->params.buf_size;
-       } else {
-               start = buf_off;
-               end = buf_off + len;
-       }
+       unsigned long long per_sec = 1000000;
 
-       vr = kmalloc(sizeof(*vr), GFP_KERNEL);
-       if (!vr) {
-               pr_warn("dmatest: No memory to store verify result\n");
-               return dmatest_verify(NULL, bufs, start, end, counter, pattern,
-                                     is_srcbuf);
-       }
-
-       vr->pattern = pattern;
-       vr->is_srcbuf = is_srcbuf;
-
-       error_count = dmatest_verify(vr, bufs, start, end, counter, pattern,
-                                    is_srcbuf);
-       if (error_count) {
-               vr->error_count = error_count;
-               thread_result_add(info, r, DMATEST_ET_VERIFY_BUF, n, src_off,
-                                 dst_off, len, (unsigned long)vr);
-               return error_count;
-       }
-
-       kfree(vr);
-       return 0;
-}
-
-static void result_free(struct dmatest_info *info, const char *name)
-{
-       struct dmatest_result *r, *_r;
-
-       mutex_lock(&info->results_lock);
-       list_for_each_entry_safe(r, _r, &info->results, node) {
-               struct dmatest_thread_result *tr, *_tr;
-
-               if (name && strcmp(r->name, name))
-                       continue;
-
-               list_for_each_entry_safe(tr, _tr, &r->results, node) {
-                       if (tr->type == DMATEST_ET_VERIFY_BUF)
-                               kfree(tr->vr);
-                       list_del(&tr->node);
-                       kfree(tr);
-               }
+       if (runtime <= 0)
+               return 0;
 
-               kfree(r->name);
-               list_del(&r->node);
-               kfree(r);
+       /* drop precision until runtime is 32-bits */
+       while (runtime > UINT_MAX) {
+               runtime >>= 1;
+               per_sec <<= 1;
        }
 
-       mutex_unlock(&info->results_lock);
+       per_sec *= val;
+       do_div(per_sec, runtime);
+       return per_sec;
 }
 
-static struct dmatest_result *result_init(struct dmatest_info *info,
-               const char *name)
+static unsigned long long dmatest_KBs(s64 runtime, unsigned long long len)
 {
-       struct dmatest_result *r;
-
-       r = kzalloc(sizeof(*r), GFP_KERNEL);
-       if (r) {
-               r->name = kstrdup(name, GFP_KERNEL);
-               INIT_LIST_HEAD(&r->results);
-               mutex_lock(&info->results_lock);
-               list_add_tail(&r->node, &info->results);
-               mutex_unlock(&info->results_lock);
-       }
-       return r;
+       return dmatest_persec(runtime, len >> 10);
 }
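
dmatest_persec() computes val per second of runtime (given in microseconds) without a full 64-by-64 division: halving runtime while doubling per_sec preserves the ratio and shrinks the divisor into the u32 range that do_div() requires. A userspace re-derivation of the same arithmetic (plain '/' standing in for do_div()):

	#include <stdint.h>
	#include <stdio.h>

	static unsigned long long persec(int64_t runtime_us, unsigned int val)
	{
		unsigned long long per_sec = 1000000;

		if (runtime_us <= 0)
			return 0;
		/* halve runtime, double per_sec: the ratio is preserved and
		 * the divisor now fits the 32 bits do_div() insists on */
		while (runtime_us > UINT32_MAX) {
			runtime_us >>= 1;
			per_sec <<= 1;
		}
		per_sec *= val;
		return per_sec / runtime_us;
	}

	int main(void)
	{
		/* 2500 tests over 10 s of runtime -> 250 iops */
		printf("%llu\n", persec(10 * 1000000LL, 2500));
		return 0;
	}
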
 
 /*
@@ -525,7 +405,6 @@ static int dmatest_func(void *data)
        struct dmatest_params   *params;
        struct dma_chan         *chan;
        struct dma_device       *dev;
-       const char              *thread_name;
        unsigned int            src_off, dst_off, len;
        unsigned int            error_count;
        unsigned int            failed_tests = 0;
@@ -538,9 +417,10 @@ static int dmatest_func(void *data)
        int                     src_cnt;
        int                     dst_cnt;
        int                     i;
-       struct dmatest_result   *result;
+       ktime_t                 ktime;
+       s64                     runtime = 0;
+       unsigned long long      total_len = 0;
 
-       thread_name = current->comm;
        set_freezable();
 
        ret = -ENOMEM;
@@ -570,10 +450,6 @@ static int dmatest_func(void *data)
        } else
                goto err_thread_type;
 
-       result = result_init(info, thread_name);
-       if (!result)
-               goto err_srcs;
-
        thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL);
        if (!thread->srcs)
                goto err_srcs;
@@ -597,17 +473,17 @@ static int dmatest_func(void *data)
        set_user_nice(current, 10);
 
        /*
-        * src buffers are freed by the DMAEngine code with dma_unmap_single()
-        * dst buffers are freed by ourselves below
+        * src and dst buffers are freed by ourselves below
         */
-       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT
-             | DMA_COMPL_SKIP_DEST_UNMAP | DMA_COMPL_SRC_UNMAP_SINGLE;
+       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
 
+       ktime = ktime_get();
        while (!kthread_should_stop()
               && !(params->iterations && total_tests >= params->iterations)) {
                struct dma_async_tx_descriptor *tx = NULL;
-               dma_addr_t dma_srcs[src_cnt];
-               dma_addr_t dma_dsts[dst_cnt];
+               struct dmaengine_unmap_data *um;
+               dma_addr_t srcs[src_cnt];
+               dma_addr_t *dsts;
                u8 align = 0;
 
                total_tests++;
@@ -626,81 +502,103 @@ static int dmatest_func(void *data)
                        break;
                }
 
-               len = dmatest_random() % params->buf_size + 1;
+               if (params->noverify) {
+                       len = params->buf_size;
+                       src_off = 0;
+                       dst_off = 0;
+               } else {
+                       len = dmatest_random() % params->buf_size + 1;
+                       len = (len >> align) << align;
+                       if (!len)
+                               len = 1 << align;
+                       src_off = dmatest_random() % (params->buf_size - len + 1);
+                       dst_off = dmatest_random() % (params->buf_size - len + 1);
+
+                       src_off = (src_off >> align) << align;
+                       dst_off = (dst_off >> align) << align;
+
+                       dmatest_init_srcs(thread->srcs, src_off, len,
+                                         params->buf_size);
+                       dmatest_init_dsts(thread->dsts, dst_off, len,
+                                         params->buf_size);
+               }
+
                len = (len >> align) << align;
                if (!len)
                        len = 1 << align;
-               src_off = dmatest_random() % (params->buf_size - len + 1);
-               dst_off = dmatest_random() % (params->buf_size - len + 1);
+               total_len += len;
 
-               src_off = (src_off >> align) << align;
-               dst_off = (dst_off >> align) << align;
-
-               dmatest_init_srcs(thread->srcs, src_off, len, params->buf_size);
-               dmatest_init_dsts(thread->dsts, dst_off, len, params->buf_size);
+               um = dmaengine_get_unmap_data(dev->dev, src_cnt+dst_cnt,
+                                             GFP_KERNEL);
+               if (!um) {
+                       failed_tests++;
+                       result("unmap data NULL", total_tests,
+                              src_off, dst_off, len, ret);
+                       continue;
+               }
 
+               um->len = params->buf_size;
                for (i = 0; i < src_cnt; i++) {
-                       u8 *buf = thread->srcs[i] + src_off;
-
-                       dma_srcs[i] = dma_map_single(dev->dev, buf, len,
-                                                    DMA_TO_DEVICE);
-                       ret = dma_mapping_error(dev->dev, dma_srcs[i]);
+                       unsigned long buf = (unsigned long) thread->srcs[i];
+                       struct page *pg = virt_to_page(buf);
+                       unsigned pg_off = buf & ~PAGE_MASK;
+
+                       um->addr[i] = dma_map_page(dev->dev, pg, pg_off,
+                                                  um->len, DMA_TO_DEVICE);
+                       srcs[i] = um->addr[i] + src_off;
+                       ret = dma_mapping_error(dev->dev, um->addr[i]);
                        if (ret) {
-                               unmap_src(dev->dev, dma_srcs, len, i);
-                               thread_result_add(info, result,
-                                                 DMATEST_ET_MAP_SRC,
-                                                 total_tests, src_off, dst_off,
-                                                 len, ret);
+                               dmaengine_unmap_put(um);
+                               result("src mapping error", total_tests,
+                                      src_off, dst_off, len, ret);
                                failed_tests++;
                                continue;
                        }
+                       um->to_cnt++;
                }
                /* map with DMA_BIDIRECTIONAL to force writeback/invalidate */
+               dsts = &um->addr[src_cnt];
                for (i = 0; i < dst_cnt; i++) {
-                       dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i],
-                                                    params->buf_size,
-                                                    DMA_BIDIRECTIONAL);
-                       ret = dma_mapping_error(dev->dev, dma_dsts[i]);
+                       unsigned long buf = (unsigned long) thread->dsts[i];
+                       struct page *pg = virt_to_page(buf);
+                       unsigned pg_off = buf & ~PAGE_MASK;
+
+                       dsts[i] = dma_map_page(dev->dev, pg, pg_off, um->len,
+                                              DMA_BIDIRECTIONAL);
+                       ret = dma_mapping_error(dev->dev, dsts[i]);
                        if (ret) {
-                               unmap_src(dev->dev, dma_srcs, len, src_cnt);
-                               unmap_dst(dev->dev, dma_dsts, params->buf_size,
-                                         i);
-                               thread_result_add(info, result,
-                                                 DMATEST_ET_MAP_DST,
-                                                 total_tests, src_off, dst_off,
-                                                 len, ret);
+                               dmaengine_unmap_put(um);
+                               result("dst mapping error", total_tests,
+                                      src_off, dst_off, len, ret);
                                failed_tests++;
                                continue;
                        }
+                       um->bidi_cnt++;
                }
 
                if (thread->type == DMA_MEMCPY)
                        tx = dev->device_prep_dma_memcpy(chan,
-                                                        dma_dsts[0] + dst_off,
-                                                        dma_srcs[0], len,
-                                                        flags);
+                                                        dsts[0] + dst_off,
+                                                        srcs[0], len, flags);
                else if (thread->type == DMA_XOR)
                        tx = dev->device_prep_dma_xor(chan,
-                                                     dma_dsts[0] + dst_off,
-                                                     dma_srcs, src_cnt,
+                                                     dsts[0] + dst_off,
+                                                     srcs, src_cnt,
                                                      len, flags);
                else if (thread->type == DMA_PQ) {
                        dma_addr_t dma_pq[dst_cnt];
 
                        for (i = 0; i < dst_cnt; i++)
-                               dma_pq[i] = dma_dsts[i] + dst_off;
-                       tx = dev->device_prep_dma_pq(chan, dma_pq, dma_srcs,
+                               dma_pq[i] = dsts[i] + dst_off;
+                       tx = dev->device_prep_dma_pq(chan, dma_pq, srcs,
                                                     src_cnt, pq_coefs,
                                                     len, flags);
                }
 
                if (!tx) {
-                       unmap_src(dev->dev, dma_srcs, len, src_cnt);
-                       unmap_dst(dev->dev, dma_dsts, params->buf_size,
-                                 dst_cnt);
-                       thread_result_add(info, result, DMATEST_ET_PREP,
-                                         total_tests, src_off, dst_off,
-                                         len, 0);
+                       dmaengine_unmap_put(um);
+                       result("prep error", total_tests, src_off,
+                              dst_off, len, ret);
                        msleep(100);
                        failed_tests++;
                        continue;
@@ -712,9 +610,9 @@ static int dmatest_func(void *data)
                cookie = tx->tx_submit(tx);
 
                if (dma_submit_error(cookie)) {
-                       thread_result_add(info, result, DMATEST_ET_SUBMIT,
-                                         total_tests, src_off, dst_off,
-                                         len, cookie);
+                       dmaengine_unmap_put(um);
+                       result("submit error", total_tests, src_off,
+                              dst_off, len, ret);
                        msleep(100);
                        failed_tests++;
                        continue;
@@ -735,59 +633,59 @@ static int dmatest_func(void *data)
                         * free it this time?" dancing.  For now, just
                         * leave it dangling.
                         */
-                       thread_result_add(info, result, DMATEST_ET_TIMEOUT,
-                                         total_tests, src_off, dst_off,
-                                         len, 0);
+                       dmaengine_unmap_put(um);
+                       result("test timed out", total_tests, src_off, dst_off,
+                              len, 0);
                        failed_tests++;
                        continue;
-               } else if (status != DMA_SUCCESS) {
-                       enum dmatest_error_type type = (status == DMA_ERROR) ?
-                               DMATEST_ET_DMA_ERROR : DMATEST_ET_DMA_IN_PROGRESS;
-                       thread_result_add(info, result, type,
-                                         total_tests, src_off, dst_off,
-                                         len, status);
+               } else if (status != DMA_COMPLETE) {
+                       dmaengine_unmap_put(um);
+                       result(status == DMA_ERROR ?
+                              "completion error status" :
+                              "completion busy status", total_tests, src_off,
+                              dst_off, len, ret);
                        failed_tests++;
                        continue;
                }
 
-               /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */
-               unmap_dst(dev->dev, dma_dsts, params->buf_size, dst_cnt);
+               dmaengine_unmap_put(um);
 
-               error_count = 0;
+               if (params->noverify) {
+                       verbose_result("test passed", total_tests, src_off,
+                                      dst_off, len, 0);
+                       continue;
+               }
 
-               pr_debug("%s: verifying source buffer...\n", thread_name);
-               error_count += verify_result_add(info, result, total_tests,
-                               src_off, dst_off, len, thread->srcs, -1,
+               pr_debug("%s: verifying source buffer...\n", current->comm);
+               error_count = dmatest_verify(thread->srcs, 0, src_off,
                                0, PATTERN_SRC, true);
-               error_count += verify_result_add(info, result, total_tests,
-                               src_off, dst_off, len, thread->srcs, 0,
-                               src_off, PATTERN_SRC | PATTERN_COPY, true);
-               error_count += verify_result_add(info, result, total_tests,
-                               src_off, dst_off, len, thread->srcs, 1,
-                               src_off + len, PATTERN_SRC, true);
-
-               pr_debug("%s: verifying dest buffer...\n", thread_name);
-               error_count += verify_result_add(info, result, total_tests,
-                               src_off, dst_off, len, thread->dsts, -1,
+               error_count += dmatest_verify(thread->srcs, src_off,
+                               src_off + len, src_off,
+                               PATTERN_SRC | PATTERN_COPY, true);
+               error_count += dmatest_verify(thread->srcs, src_off + len,
+                               params->buf_size, src_off + len,
+                               PATTERN_SRC, true);
+
+               pr_debug("%s: verifying dest buffer...\n", current->comm);
+               error_count += dmatest_verify(thread->dsts, 0, dst_off,
                                0, PATTERN_DST, false);
-               error_count += verify_result_add(info, result, total_tests,
-                               src_off, dst_off, len, thread->dsts, 0,
-                               src_off, PATTERN_SRC | PATTERN_COPY, false);
-               error_count += verify_result_add(info, result, total_tests,
-                               src_off, dst_off, len, thread->dsts, 1,
-                               dst_off + len, PATTERN_DST, false);
+               error_count += dmatest_verify(thread->dsts, dst_off,
+                               dst_off + len, src_off,
+                               PATTERN_SRC | PATTERN_COPY, false);
+               error_count += dmatest_verify(thread->dsts, dst_off + len,
+                               params->buf_size, dst_off + len,
+                               PATTERN_DST, false);
 
                if (error_count) {
-                       thread_result_add(info, result, DMATEST_ET_VERIFY,
-                                         total_tests, src_off, dst_off,
-                                         len, error_count);
+                       result("data error", total_tests, src_off, dst_off,
+                              len, error_count);
                        failed_tests++;
                } else {
-                       thread_result_add(info, result, DMATEST_ET_OK,
-                                         total_tests, src_off, dst_off,
-                                         len, 0);
+                       verbose_result("test passed", total_tests, src_off,
+                                      dst_off, len, 0);
                }
        }
+       runtime = ktime_us_delta(ktime_get(), ktime);
 
        ret = 0;
        for (i = 0; thread->dsts[i]; i++)
@@ -802,20 +700,17 @@ err_srcbuf:
 err_srcs:
        kfree(pq_coefs);
 err_thread_type:
-       pr_notice("%s: terminating after %u tests, %u failures (status %d)\n",
-                       thread_name, total_tests, failed_tests, ret);
+       pr_info("%s: summary %u tests, %u failures %llu iops %llu KB/s (%d)\n",
+               current->comm, total_tests, failed_tests,
+               dmatest_persec(runtime, total_tests),
+               dmatest_KBs(runtime, total_len), ret);
 
        /* terminate all transfers on specified channels */
        if (ret)
                dmaengine_terminate_all(chan);
 
        thread->done = true;
-
-       if (params->iterations > 0)
-               while (!kthread_should_stop()) {
-                       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait_dmatest_exit);
-                       interruptible_sleep_on(&wait_dmatest_exit);
-               }
+       wake_up(&thread_wait);
 
        return ret;
 }
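
The conversion above replaces per-buffer dma_unmap_single() bookkeeping with one refcounted struct dmaengine_unmap_data per test iteration. A condensed sketch of that flow for a single src/dst pair (function name hypothetical, prep/submit elided):

	#include <linux/dmaengine.h>

	static int one_copy_mapping(struct dma_device *dev,
				    struct page *src, struct page *dst)
	{
		struct dmaengine_unmap_data *um;
		int ret = -EIO;

		um = dmaengine_get_unmap_data(dev->dev, 2, GFP_KERNEL);
		if (!um)
			return -ENOMEM;

		um->len = PAGE_SIZE;
		um->addr[0] = dma_map_page(dev->dev, src, 0, um->len,
					   DMA_TO_DEVICE);
		if (dma_mapping_error(dev->dev, um->addr[0]))
			goto put;
		um->to_cnt++;		/* unmapped later as DMA_TO_DEVICE */

		um->addr[1] = dma_map_page(dev->dev, dst, 0, um->len,
					   DMA_BIDIRECTIONAL);
		if (dma_mapping_error(dev->dev, um->addr[1]))
			goto put;
		um->bidi_cnt++;		/* unmapped later as DMA_BIDIRECTIONAL */

		/* ... prep, submit and wait, as dmatest_func() does ... */
		ret = 0;
	put:
		dmaengine_unmap_put(um);	/* drops every counted mapping */
		return ret;
	}
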
@@ -828,9 +723,10 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc)
 
        list_for_each_entry_safe(thread, _thread, &dtc->threads, node) {
                ret = kthread_stop(thread->task);
-               pr_debug("dmatest: thread %s exited with status %d\n",
-                               thread->task->comm, ret);
+               pr_debug("thread %s exited with status %d\n",
+                        thread->task->comm, ret);
                list_del(&thread->node);
+               put_task_struct(thread->task);
                kfree(thread);
        }
 
@@ -861,27 +757,27 @@ static int dmatest_add_threads(struct dmatest_info *info,
        for (i = 0; i < params->threads_per_chan; i++) {
                thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL);
                if (!thread) {
-                       pr_warning("dmatest: No memory for %s-%s%u\n",
-                                  dma_chan_name(chan), op, i);
-
+                       pr_warn("No memory for %s-%s%u\n",
+                               dma_chan_name(chan), op, i);
                        break;
                }
                thread->info = info;
                thread->chan = dtc->chan;
                thread->type = type;
                smp_wmb();
-               thread->task = kthread_run(dmatest_func, thread, "%s-%s%u",
+               thread->task = kthread_create(dmatest_func, thread, "%s-%s%u",
                                dma_chan_name(chan), op, i);
                if (IS_ERR(thread->task)) {
-                       pr_warning("dmatest: Failed to run thread %s-%s%u\n",
-                                       dma_chan_name(chan), op, i);
+                       pr_warn("Failed to create thread %s-%s%u\n",
+                               dma_chan_name(chan), op, i);
                        kfree(thread);
                        break;
                }
 
                /* srcbuf and dstbuf are allocated by the thread itself */
-
+               get_task_struct(thread->task);
                list_add_tail(&thread->node, &dtc->threads);
+               wake_up_process(thread->task);
        }
 
        return i;
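
Switching from kthread_run() to kthread_create() + get_task_struct() + wake_up_process() pins the task_struct, so the kthread_stop()/put_task_struct() pair in dmatest_cleanup_channel() stays valid even if the thread has already exited. The pattern in isolation (worker function hypothetical):

	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/sched.h>	/* get_task_struct()/put_task_struct() */

	static int my_worker(void *data);	/* hypothetical thread body */

	static struct task_struct *start_pinned_thread(void *data)
	{
		struct task_struct *task;

		task = kthread_create(my_worker, data, "my-worker");
		if (IS_ERR(task))
			return task;
		get_task_struct(task);	/* hold a ref across its lifetime */
		wake_up_process(task);	/* kthread_run() equivalent from here */
		return task;
	}

	static void stop_pinned_thread(struct task_struct *task)
	{
		kthread_stop(task);	/* safe: our ref pins the task_struct */
		put_task_struct(task);
	}
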
@@ -897,7 +793,7 @@ static int dmatest_add_channel(struct dmatest_info *info,
 
        dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL);
        if (!dtc) {
-               pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan));
+               pr_warn("No memory for %s\n", dma_chan_name(chan));
                return -ENOMEM;
        }
 
@@ -917,7 +813,7 @@ static int dmatest_add_channel(struct dmatest_info *info,
                thread_count += cnt > 0 ? cnt : 0;
        }
 
-       pr_info("dmatest: Started %u threads using %s\n",
+       pr_info("Started %u threads using %s\n",
                thread_count, dma_chan_name(chan));
 
        list_add_tail(&dtc->node, &info->channels);
@@ -937,20 +833,20 @@ static bool filter(struct dma_chan *chan, void *param)
                return true;
 }
 
-static int __run_threaded_test(struct dmatest_info *info)
+static void request_channels(struct dmatest_info *info,
+                            enum dma_transaction_type type)
 {
        dma_cap_mask_t mask;
-       struct dma_chan *chan;
-       struct dmatest_params *params = &info->params;
-       int err = 0;
 
        dma_cap_zero(mask);
-       dma_cap_set(DMA_MEMCPY, mask);
+       dma_cap_set(type, mask);
        for (;;) {
+               struct dmatest_params *params = &info->params;
+               struct dma_chan *chan;
+
                chan = dma_request_channel(mask, filter, params);
                if (chan) {
-                       err = dmatest_add_channel(info, chan);
-                       if (err) {
+                       if (dmatest_add_channel(info, chan)) {
                                dma_release_channel(chan);
                                break; /* add_channel failed, punt */
                        }
@@ -960,22 +856,30 @@ static int __run_threaded_test(struct dmatest_info *info)
                    info->nr_channels >= params->max_channels)
                        break; /* we have all we need */
        }
-       return err;
 }
 
-#ifndef MODULE
-static int run_threaded_test(struct dmatest_info *info)
+static void run_threaded_test(struct dmatest_info *info)
 {
-       int ret;
+       struct dmatest_params *params = &info->params;
 
-       mutex_lock(&info->lock);
-       ret = __run_threaded_test(info);
-       mutex_unlock(&info->lock);
-       return ret;
+       /* Copy test parameters */
+       params->buf_size = test_buf_size;
+       strlcpy(params->channel, strim(test_channel), sizeof(params->channel));
+       strlcpy(params->device, strim(test_device), sizeof(params->device));
+       params->threads_per_chan = threads_per_chan;
+       params->max_channels = max_channels;
+       params->iterations = iterations;
+       params->xor_sources = xor_sources;
+       params->pq_sources = pq_sources;
+       params->timeout = timeout;
+       params->noverify = noverify;
+
+       request_channels(info, DMA_MEMCPY);
+       request_channels(info, DMA_XOR);
+       request_channels(info, DMA_PQ);
 }
-#endif
 
-static void __stop_threaded_test(struct dmatest_info *info)
+static void stop_threaded_test(struct dmatest_info *info)
 {
        struct dmatest_chan *dtc, *_dtc;
        struct dma_chan *chan;
@@ -984,203 +888,86 @@ static void __stop_threaded_test(struct dmatest_info *info)
                list_del(&dtc->node);
                chan = dtc->chan;
                dmatest_cleanup_channel(dtc);
-               pr_debug("dmatest: dropped channel %s\n", dma_chan_name(chan));
+               pr_debug("dropped channel %s\n", dma_chan_name(chan));
                dma_release_channel(chan);
        }
 
        info->nr_channels = 0;
 }
 
-static void stop_threaded_test(struct dmatest_info *info)
+static void restart_threaded_test(struct dmatest_info *info, bool run)
 {
-       mutex_lock(&info->lock);
-       __stop_threaded_test(info);
-       mutex_unlock(&info->lock);
-}
-
-static int __restart_threaded_test(struct dmatest_info *info, bool run)
-{
-       struct dmatest_params *params = &info->params;
+       /* we might be called early to set run=, defer running until all
+        * parameters have been evaluated
+        */
+       if (!info->did_init)
+               return;
 
        /* Stop any running test first */
-       __stop_threaded_test(info);
-
-       if (run == false)
-               return 0;
-
-       /* Clear results from previous run */
-       result_free(info, NULL);
-
-       /* Copy test parameters */
-       params->buf_size = test_buf_size;
-       strlcpy(params->channel, strim(test_channel), sizeof(params->channel));
-       strlcpy(params->device, strim(test_device), sizeof(params->device));
-       params->threads_per_chan = threads_per_chan;
-       params->max_channels = max_channels;
-       params->iterations = iterations;
-       params->xor_sources = xor_sources;
-       params->pq_sources = pq_sources;
-       params->timeout = timeout;
+       stop_threaded_test(info);
 
        /* Run test with new parameters */
-       return __run_threaded_test(info);
-}
-
-static bool __is_threaded_test_run(struct dmatest_info *info)
-{
-       struct dmatest_chan *dtc;
-
-       list_for_each_entry(dtc, &info->channels, node) {
-               struct dmatest_thread *thread;
-
-               list_for_each_entry(thread, &dtc->threads, node) {
-                       if (!thread->done)
-                               return true;
-               }
-       }
-
-       return false;
+       run_threaded_test(info);
 }
 
-static ssize_t dtf_read_run(struct file *file, char __user *user_buf,
-               size_t count, loff_t *ppos)
+static int dmatest_run_get(char *val, const struct kernel_param *kp)
 {
-       struct dmatest_info *info = file->private_data;
-       char buf[3];
+       struct dmatest_info *info = &test_info;
 
        mutex_lock(&info->lock);
-
-       if (__is_threaded_test_run(info)) {
-               buf[0] = 'Y';
+       if (is_threaded_test_run(info)) {
+               dmatest_run = true;
        } else {
-               __stop_threaded_test(info);
-               buf[0] = 'N';
+               stop_threaded_test(info);
+               dmatest_run = false;
        }
-
        mutex_unlock(&info->lock);
-       buf[1] = '\n';
-       buf[2] = 0x00;
-       return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static ssize_t dtf_write_run(struct file *file, const char __user *user_buf,
-               size_t count, loff_t *ppos)
-{
-       struct dmatest_info *info = file->private_data;
-       char buf[16];
-       bool bv;
-       int ret = 0;
 
-       if (copy_from_user(buf, user_buf, min(count, (sizeof(buf) - 1))))
-               return -EFAULT;
-
-       if (strtobool(buf, &bv) == 0) {
-               mutex_lock(&info->lock);
-
-               if (__is_threaded_test_run(info))
-                       ret = -EBUSY;
-               else
-                       ret = __restart_threaded_test(info, bv);
-
-               mutex_unlock(&info->lock);
-       }
-
-       return ret ? ret : count;
+       return param_get_bool(val, kp);
 }
 
-static const struct file_operations dtf_run_fops = {
-       .read   = dtf_read_run,
-       .write  = dtf_write_run,
-       .open   = simple_open,
-       .llseek = default_llseek,
-};
-
-static int dtf_results_show(struct seq_file *sf, void *data)
+static int dmatest_run_set(const char *val, const struct kernel_param *kp)
 {
-       struct dmatest_info *info = sf->private;
-       struct dmatest_result *result;
-       struct dmatest_thread_result *tr;
-       unsigned int i;
+       struct dmatest_info *info = &test_info;
+       int ret;
 
-       mutex_lock(&info->results_lock);
-       list_for_each_entry(result, &info->results, node) {
-               list_for_each_entry(tr, &result->results, node) {
-                       seq_printf(sf, "%s\n",
-                               thread_result_get(result->name, tr));
-                       if (tr->type == DMATEST_ET_VERIFY_BUF) {
-                               for (i = 0; i < tr->vr->error_count; i++) {
-                                       seq_printf(sf, "\t%s\n",
-                                               verify_result_get_one(tr->vr, i));
-                               }
-                       }
-               }
+       mutex_lock(&info->lock);
+       ret = param_set_bool(val, kp);
+       if (ret) {
+               mutex_unlock(&info->lock);
+               return ret;
        }
 
-       mutex_unlock(&info->results_lock);
-       return 0;
-}
-
-static int dtf_results_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, dtf_results_show, inode->i_private);
-}
-
-static const struct file_operations dtf_results_fops = {
-       .open           = dtf_results_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int dmatest_register_dbgfs(struct dmatest_info *info)
-{
-       struct dentry *d;
-
-       d = debugfs_create_dir("dmatest", NULL);
-       if (IS_ERR(d))
-               return PTR_ERR(d);
-       if (!d)
-               goto err_root;
+       if (is_threaded_test_run(info))
+               ret = -EBUSY;
+       else if (dmatest_run)
+               restart_threaded_test(info, dmatest_run);
 
-       info->root = d;
-
-       /* Run or stop threaded test */
-       debugfs_create_file("run", S_IWUSR | S_IRUGO, info->root, info,
-                           &dtf_run_fops);
-
-       /* Results of test in progress */
-       debugfs_create_file("results", S_IRUGO, info->root, info,
-                           &dtf_results_fops);
-
-       return 0;
+       mutex_unlock(&info->lock);
 
-err_root:
-       pr_err("dmatest: Failed to initialize debugfs\n");
-       return -ENOMEM;
+       return ret;
 }
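
The dmatest_run_get()/dmatest_run_set() pair only takes effect once attached to the 'run' parameter; that registration sits outside this hunk, but presumably follows the standard module_param_cb() wiring:

	#include <linux/moduleparam.h>
	#include <linux/stat.h>

	static const struct kernel_param_ops run_ops = {
		.set = dmatest_run_set,
		.get = dmatest_run_get,
	};
	module_param_cb(run, &run_ops, &dmatest_run, S_IRUGO | S_IWUSR);
	MODULE_PARM_DESC(run, "Run the test (default: false)");
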
 
 static int __init dmatest_init(void)
 {
        struct dmatest_info *info = &test_info;
-       int ret;
-
-       memset(info, 0, sizeof(*info));
+       struct dmatest_params *params = &info->params;
 
-       mutex_init(&info->lock);
-       INIT_LIST_HEAD(&info->channels);
+       if (dmatest_run) {
+               mutex_lock(&info->lock);
+               run_threaded_test(info);
+               mutex_unlock(&info->lock);
+       }
 
-       mutex_init(&info->results_lock);
-       INIT_LIST_HEAD(&info->results);
+       if (params->iterations && wait)
+               wait_event(thread_wait, !is_threaded_test_run(info));
 
-       ret = dmatest_register_dbgfs(info);
-       if (ret)
-               return ret;
+       /* module parameters are stable and the init-time tests have been
+        * started; let userspace take over 'run' control
+        */
+       info->did_init = true;
 
-#ifdef MODULE
        return 0;
-#else
-       return run_threaded_test(info);
-#endif
 }
/* when compiled in, wait for drivers to load first */
 late_initcall(dmatest_init);
@@ -1189,9 +976,9 @@ static void __exit dmatest_exit(void)
 {
        struct dmatest_info *info = &test_info;
 
-       debugfs_remove_recursive(info->root);
+       mutex_lock(&info->lock);
        stop_threaded_test(info);
-       result_free(info, NULL);
+       mutex_unlock(&info->lock);
 }
 module_exit(dmatest_exit);
 
index 89eb89f..7516be4 100644
@@ -85,10 +85,6 @@ static struct device *chan2dev(struct dma_chan *chan)
 {
        return &chan->dev->device;
 }
-static struct device *chan2parent(struct dma_chan *chan)
-{
-       return chan->dev->device.parent;
-}
 
 static struct dw_desc *dwc_first_active(struct dw_dma_chan *dwc)
 {
@@ -311,26 +307,7 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc,
        list_splice_init(&desc->tx_list, &dwc->free_list);
        list_move(&desc->desc_node, &dwc->free_list);
 
-       if (!is_slave_direction(dwc->direction)) {
-               struct device *parent = chan2parent(&dwc->chan);
-               if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                       if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                               dma_unmap_single(parent, desc->lli.dar,
-                                       desc->total_len, DMA_FROM_DEVICE);
-                       else
-                               dma_unmap_page(parent, desc->lli.dar,
-                                       desc->total_len, DMA_FROM_DEVICE);
-               }
-               if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                       if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                               dma_unmap_single(parent, desc->lli.sar,
-                                       desc->total_len, DMA_TO_DEVICE);
-                       else
-                               dma_unmap_page(parent, desc->lli.sar,
-                                       desc->total_len, DMA_TO_DEVICE);
-               }
-       }
-
+       dma_descriptor_unmap(txd);
        spin_unlock_irqrestore(&dwc->lock, flags);
 
        if (callback)
@@ -1098,13 +1075,13 @@ dwc_tx_status(struct dma_chan *chan,
        enum dma_status         ret;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        dwc_scan_descriptors(to_dw_dma(chan->device), dwc);
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret != DMA_SUCCESS)
+       if (ret != DMA_COMPLETE)
                dma_set_residue(txstate, dwc_get_residue(dwc));
 
        if (dwc->paused && ret == DMA_IN_PROGRESS)
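
The DMA_SUCCESS -> DMA_COMPLETE rename leaves the usual two-step tx_status shape untouched: report immediately if the cookie already completed, otherwise fold in hardware progress and attach a residue. Schematically (the my_hw_*() hooks are hypothetical driver internals):

	#include <linux/dmaengine.h>
	#include "dmaengine.h"	/* driver-internal dma_cookie_*() helpers */

	static void my_hw_scan_descriptors(struct dma_chan *chan);	/* hypothetical */
	static u32 my_hw_residue(struct dma_chan *chan);		/* hypothetical */

	static enum dma_status my_tx_status(struct dma_chan *chan,
					    dma_cookie_t cookie,
					    struct dma_tx_state *txstate)
	{
		enum dma_status ret;

		ret = dma_cookie_status(chan, cookie, txstate);
		if (ret == DMA_COMPLETE)	/* fast path: already done */
			return ret;

		my_hw_scan_descriptors(chan);	/* advance completed cookies */

		ret = dma_cookie_status(chan, cookie, txstate);
		if (ret != DMA_COMPLETE)	/* still in flight: bytes left */
			dma_set_residue(txstate, my_hw_residue(chan));
		return ret;
	}
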
index bef8a36..2539ea0 100644
 #define EDMA_CHANS     64
 #endif /* CONFIG_ARCH_DAVINCI_DA8XX */
 
-/* Max of 16 segments per channel to conserve PaRAM slots */
-#define MAX_NR_SG              16
+/*
+ * Max of 20 segments per channel to conserve PaRAM slots
+ * Also note that MAX_NR_SG should be at least the number of periods
+ * that are required for ASoC, otherwise DMA prep calls will
+ * fail. Today davinci-pcm is the only user of this driver and
+ * requires at least 17 slots, so we set the default to 20.
+ */
+#define MAX_NR_SG              20
 #define EDMA_MAX_SLOTS         MAX_NR_SG
 #define EDMA_DESCRIPTORS       16
 
 struct edma_desc {
        struct virt_dma_desc            vdesc;
        struct list_head                node;
+       int                             cyclic;
        int                             absync;
        int                             pset_nr;
        int                             processed;
@@ -167,8 +174,13 @@ static void edma_execute(struct edma_chan *echan)
         * then setup a link to the dummy slot, this results in all future
         * events being absorbed and that's OK because we're done
         */
-       if (edesc->processed == edesc->pset_nr)
-               edma_link(echan->slot[nslots-1], echan->ecc->dummy_slot);
+       if (edesc->processed == edesc->pset_nr) {
+               if (edesc->cyclic)
+                       edma_link(echan->slot[nslots-1], echan->slot[1]);
+               else
+                       edma_link(echan->slot[nslots-1],
+                                 echan->ecc->dummy_slot);
+       }
 
        edma_resume(echan->ch_num);
 
@@ -250,6 +262,117 @@ static int edma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
        return ret;
 }
 
+/*
+ * A PaRAM set configuration abstraction used by other modes
+ * @chan: Channel whose PaRAM set we're configuring
+ * @pset: PaRAM set to initialize and set up
+ * @src_addr: Source address of the DMA
+ * @dst_addr: Destination address of the DMA
+ * @burst: In units of dev_width, how much to send
+ * @dev_width: Width of the device data bus
+ * @dma_length: Total length of the DMA transfer
+ * @direction: Direction of the transfer
+ */
+static int edma_config_pset(struct dma_chan *chan, struct edmacc_param *pset,
+       dma_addr_t src_addr, dma_addr_t dst_addr, u32 burst,
+       enum dma_slave_buswidth dev_width, unsigned int dma_length,
+       enum dma_transfer_direction direction)
+{
+       struct edma_chan *echan = to_edma_chan(chan);
+       struct device *dev = chan->device->dev;
+       int acnt, bcnt, ccnt, cidx;
+       int src_bidx, dst_bidx, src_cidx, dst_cidx;
+       int absync;
+
+       acnt = dev_width;
+       /*
+        * If the maxburst is equal to the fifo width, use
+        * A-synced transfers. This allows for large contiguous
+        * buffer transfers using only one PaRAM set.
+        */
+       if (burst == 1) {
+               /*
+                * For the A-sync case, bcnt and ccnt are the remainder
+                * and quotient respectively of the division of
+                * (dma_length / acnt) by (SZ_64K - 1). This is so
+                * that in case bcnt overflows, we have ccnt to use.
+                * Note: bcntrld is used in A-sync transfers only, and
+                * only applies for sg_dma_len(sg) >= SZ_64K.
+                * In that case, the approach adopted is: bcnt for the
+                * first frame will be the remainder below; for every
+                * successive frame, bcnt will be SZ_64K-1. This is
+                * assured because bcntrld = 0xffff at the end of this
+                * function.
+                */
+               absync = false;
+               ccnt = dma_length / acnt / (SZ_64K - 1);
+               bcnt = dma_length / acnt - ccnt * (SZ_64K - 1);
+               /*
+                * If bcnt is non-zero, we have a remainder and hence an
+                * extra frame to transfer, so increment ccnt.
+                */
+               if (bcnt)
+                       ccnt++;
+               else
+                       bcnt = SZ_64K - 1;
+               cidx = acnt;
+       } else {
+               /*
+                * If maxburst is greater than the fifo address_width,
+                * use AB-synced transfers where A count is the fifo
+                * address_width and B count is the maxburst. In this
+                * case, we are limited to transfers of C count frames
+                * of (address_width * maxburst) where C count is limited
+                * to SZ_64K-1. This places an upper bound on the length
+                * of an SG segment that can be handled.
+                */
+               absync = true;
+               bcnt = burst;
+               ccnt = dma_length / (acnt * bcnt);
+               if (ccnt > (SZ_64K - 1)) {
+                       dev_err(dev, "Exceeded max SG segment size\n");
+                       return -EINVAL;
+               }
+               cidx = acnt * bcnt;
+       }
+
+       if (direction == DMA_MEM_TO_DEV) {
+               src_bidx = acnt;
+               src_cidx = cidx;
+               dst_bidx = 0;
+               dst_cidx = 0;
+       } else if (direction == DMA_DEV_TO_MEM)  {
+               src_bidx = 0;
+               src_cidx = 0;
+               dst_bidx = acnt;
+               dst_cidx = cidx;
+       } else {
+               dev_err(dev, "%s: direction not implemented yet\n", __func__);
+               return -EINVAL;
+       }
+
+       pset->opt = EDMA_TCC(EDMA_CHAN_SLOT(echan->ch_num));
+       /* Configure A or AB synchronized transfers */
+       if (absync)
+               pset->opt |= SYNCDIM;
+
+       pset->src = src_addr;
+       pset->dst = dst_addr;
+
+       pset->src_dst_bidx = (dst_bidx << 16) | src_bidx;
+       pset->src_dst_cidx = (dst_cidx << 16) | src_cidx;
+
+       pset->a_b_cnt = bcnt << 16 | acnt;
+       pset->ccnt = ccnt;
+       /*
+        * Only time when (bcntrld) auto reload is required is for
+        * A-sync case, and in this case, a requirement of reload value
+        * of SZ_64K-1 only is assured. 'link' is initially set to NULL
+        * and then later will be populated by edma_execute.
+        */
+       pset->link_bcntrld = 0xffffffff;
+       return absync;
+}
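
To sanity-check the A-sync branch: with a 4-byte dev_width and a 1 MiB transfer there are 262144 elements, which split into one short first frame plus full SZ_64K-1 frames reloaded from bcntrld. A userspace re-run of the arithmetic (values illustrative):

	#include <stdio.h>

	int main(void)
	{
		unsigned int acnt = 4;			/* dev_width in bytes */
		unsigned int len = 1 << 20;		/* dma_length: 1 MiB */
		unsigned int elems = len / acnt;	/* 262144 elements */
		unsigned int ccnt = elems / 65535;	/* SZ_64K - 1 */
		unsigned int bcnt = elems - ccnt * 65535;

		if (bcnt)
			ccnt++;			/* short first frame */
		else
			bcnt = 65535;

		/* prints bcnt=4 ccnt=5 total=262144: the first frame moves 4
		 * elements, then bcnt reloads to 0xffff for the other four */
		printf("bcnt=%u ccnt=%u total=%u\n", bcnt, ccnt,
		       bcnt + (ccnt - 1) * 65535);
		return 0;
	}
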
+
 static struct dma_async_tx_descriptor *edma_prep_slave_sg(
        struct dma_chan *chan, struct scatterlist *sgl,
        unsigned int sg_len, enum dma_transfer_direction direction,
@@ -258,23 +381,21 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg(
        struct edma_chan *echan = to_edma_chan(chan);
        struct device *dev = chan->device->dev;
        struct edma_desc *edesc;
-       dma_addr_t dev_addr;
+       dma_addr_t src_addr = 0, dst_addr = 0;
        enum dma_slave_buswidth dev_width;
        u32 burst;
        struct scatterlist *sg;
-       int acnt, bcnt, ccnt, src, dst, cidx;
-       int src_bidx, dst_bidx, src_cidx, dst_cidx;
-       int i, nslots;
+       int i, nslots, ret;
 
        if (unlikely(!echan || !sgl || !sg_len))
                return NULL;
 
        if (direction == DMA_DEV_TO_MEM) {
-               dev_addr = echan->cfg.src_addr;
+               src_addr = echan->cfg.src_addr;
                dev_width = echan->cfg.src_addr_width;
                burst = echan->cfg.src_maxburst;
        } else if (direction == DMA_MEM_TO_DEV) {
-               dev_addr = echan->cfg.dst_addr;
+               dst_addr = echan->cfg.dst_addr;
                dev_width = echan->cfg.dst_addr_width;
                burst = echan->cfg.dst_maxburst;
        } else {
@@ -307,7 +428,6 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg(
                        if (echan->slot[i] < 0) {
                                kfree(edesc);
                                dev_err(dev, "Failed to allocate slot\n");
-                               kfree(edesc);
                                return NULL;
                        }
                }
@@ -315,64 +435,21 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg(
 
        /* Configure PaRAM sets for each SG */
        for_each_sg(sgl, sg, sg_len, i) {
-
-               acnt = dev_width;
-
-               /*
-                * If the maxburst is equal to the fifo width, use
-                * A-synced transfers. This allows for large contiguous
-                * buffer transfers using only one PaRAM set.
-                */
-               if (burst == 1) {
-                       edesc->absync = false;
-                       ccnt = sg_dma_len(sg) / acnt / (SZ_64K - 1);
-                       bcnt = sg_dma_len(sg) / acnt - ccnt * (SZ_64K - 1);
-                       if (bcnt)
-                               ccnt++;
-                       else
-                               bcnt = SZ_64K - 1;
-                       cidx = acnt;
-               /*
-                * If maxburst is greater than the fifo address_width,
-                * use AB-synced transfers where A count is the fifo
-                * address_width and B count is the maxburst. In this
-                * case, we are limited to transfers of C count frames
-                * of (address_width * maxburst) where C count is limited
-                * to SZ_64K-1. This places an upper bound on the length
-                * of an SG segment that can be handled.
-                */
-               } else {
-                       edesc->absync = true;
-                       bcnt = burst;
-                       ccnt = sg_dma_len(sg) / (acnt * bcnt);
-                       if (ccnt > (SZ_64K - 1)) {
-                               dev_err(dev, "Exceeded max SG segment size\n");
-                               kfree(edesc);
-                               return NULL;
-                       }
-                       cidx = acnt * bcnt;
+               /* Get address for each SG */
+               if (direction == DMA_DEV_TO_MEM)
+                       dst_addr = sg_dma_address(sg);
+               else
+                       src_addr = sg_dma_address(sg);
+
+               ret = edma_config_pset(chan, &edesc->pset[i], src_addr,
+                                      dst_addr, burst, dev_width,
+                                      sg_dma_len(sg), direction);
+               if (ret < 0) {
+                       kfree(edesc);
+                       return NULL;
                }
 
-               if (direction == DMA_MEM_TO_DEV) {
-                       src = sg_dma_address(sg);
-                       dst = dev_addr;
-                       src_bidx = acnt;
-                       src_cidx = cidx;
-                       dst_bidx = 0;
-                       dst_cidx = 0;
-               } else {
-                       src = dev_addr;
-                       dst = sg_dma_address(sg);
-                       src_bidx = 0;
-                       src_cidx = 0;
-                       dst_bidx = acnt;
-                       dst_cidx = cidx;
-               }
-
-               edesc->pset[i].opt = EDMA_TCC(EDMA_CHAN_SLOT(echan->ch_num));
-               /* Configure A or AB synchronized transfers */
-               if (edesc->absync)
-                       edesc->pset[i].opt |= SYNCDIM;
+               edesc->absync = ret;
 
                /* If this is the last in a current SG set of transactions,
                   enable interrupts so that next set is processed */
@@ -382,17 +459,138 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg(
                /* If this is the last set, enable completion interrupt flag */
                if (i == sg_len - 1)
                        edesc->pset[i].opt |= TCINTEN;
+       }
 
-               edesc->pset[i].src = src;
-               edesc->pset[i].dst = dst;
+       return vchan_tx_prep(&echan->vchan, &edesc->vdesc, tx_flags);
+}
 
-               edesc->pset[i].src_dst_bidx = (dst_bidx << 16) | src_bidx;
-               edesc->pset[i].src_dst_cidx = (dst_cidx << 16) | src_cidx;
+static struct dma_async_tx_descriptor *edma_prep_dma_cyclic(
+       struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
+       size_t period_len, enum dma_transfer_direction direction,
+       unsigned long tx_flags, void *context)
+{
+       struct edma_chan *echan = to_edma_chan(chan);
+       struct device *dev = chan->device->dev;
+       struct edma_desc *edesc;
+       dma_addr_t src_addr, dst_addr;
+       enum dma_slave_buswidth dev_width;
+       u32 burst;
+       int i, ret, nslots;
+
+       if (unlikely(!echan || !buf_len || !period_len))
+               return NULL;
+
+       if (direction == DMA_DEV_TO_MEM) {
+               src_addr = echan->cfg.src_addr;
+               dst_addr = buf_addr;
+               dev_width = echan->cfg.src_addr_width;
+               burst = echan->cfg.src_maxburst;
+       } else if (direction == DMA_MEM_TO_DEV) {
+               src_addr = buf_addr;
+               dst_addr = echan->cfg.dst_addr;
+               dev_width = echan->cfg.dst_addr_width;
+               burst = echan->cfg.dst_maxburst;
+       } else {
+               dev_err(dev, "%s: bad direction?\n", __func__);
+               return NULL;
+       }
+
+       if (dev_width == DMA_SLAVE_BUSWIDTH_UNDEFINED) {
+               dev_err(dev, "Undefined slave buswidth\n");
+               return NULL;
+       }
+
+       if (unlikely(buf_len % period_len)) {
+               dev_err(dev, "Period should be multiple of Buffer length\n");
+               return NULL;
+       }
+
+       nslots = (buf_len / period_len) + 1;
+
+       /*
+        * Cyclic DMA users such as audio cannot tolerate delays introduced
+        * by cases where the number of periods is more than the maximum
+        * number of SGs the EDMA driver can handle at a time. For DMA types
+        * such as Slave SGs, such delays are tolerable and synchronized,
+        * but the synchronization is difficult to achieve with Cyclic and
+        * cannot be guaranteed, so we error out early.
+        */
+       if (nslots > MAX_NR_SG)
+               return NULL;
+
+       edesc = kzalloc(sizeof(*edesc) + nslots *
+               sizeof(edesc->pset[0]), GFP_ATOMIC);
+       if (!edesc) {
+               dev_dbg(dev, "Failed to allocate a descriptor\n");
+               return NULL;
+       }
+
+       edesc->cyclic = 1;
+       edesc->pset_nr = nslots;
+
+       dev_dbg(dev, "%s: nslots=%d\n", __func__, nslots);
+       dev_dbg(dev, "%s: period_len=%d\n", __func__, period_len);
+       dev_dbg(dev, "%s: buf_len=%d\n", __func__, buf_len);
+
+       for (i = 0; i < nslots; i++) {
+               /* Allocate a PaRAM slot, if needed */
+               if (echan->slot[i] < 0) {
+                       echan->slot[i] =
+                               edma_alloc_slot(EDMA_CTLR(echan->ch_num),
+                                               EDMA_SLOT_ANY);
+                       if (echan->slot[i] < 0) {
+                               dev_err(dev, "Failed to allocate slot\n");
+                               kfree(edesc);
+                               return NULL;
+                       }
+               }
+
+               if (i == nslots - 1) {
+                       memcpy(&edesc->pset[i], &edesc->pset[0],
+                              sizeof(edesc->pset[0]));
+                       break;
+               }
+
+               ret = edma_config_pset(chan, &edesc->pset[i], src_addr,
+                                      dst_addr, burst, dev_width, period_len,
+                                      direction);
+               if (ret < 0) {
+                       kfree(edesc);
+                       return NULL;
+               }
 
-               edesc->pset[i].a_b_cnt = bcnt << 16 | acnt;
-               edesc->pset[i].ccnt = ccnt;
-               edesc->pset[i].link_bcntrld = 0xffffffff;
+               if (direction == DMA_DEV_TO_MEM)
+                       dst_addr += period_len;
+               else
+                       src_addr += period_len;
 
+               dev_dbg(dev, "%s: Configure period %d of buf:\n", __func__, i);
+               dev_dbg(dev,
+                       "\n pset[%d]:\n"
+                       "  chnum\t%d\n"
+                       "  slot\t%d\n"
+                       "  opt\t%08x\n"
+                       "  src\t%08x\n"
+                       "  dst\t%08x\n"
+                       "  abcnt\t%08x\n"
+                       "  ccnt\t%08x\n"
+                       "  bidx\t%08x\n"
+                       "  cidx\t%08x\n"
+                       "  lkrld\t%08x\n",
+                       i, echan->ch_num, echan->slot[i],
+                       edesc->pset[i].opt,
+                       edesc->pset[i].src,
+                       edesc->pset[i].dst,
+                       edesc->pset[i].a_b_cnt,
+                       edesc->pset[i].ccnt,
+                       edesc->pset[i].src_dst_bidx,
+                       edesc->pset[i].src_dst_cidx,
+                       edesc->pset[i].link_bcntrld);
+
+               edesc->absync = ret;
+
+               /*
+                * Enable interrupts for every period because callback
+                * has to be called for every period.
+                */
+               edesc->pset[i].opt |= TCINTEN;
        }
 
        return vchan_tx_prep(&echan->vchan, &edesc->vdesc, tx_flags);
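
Worked slot accounting for the cyclic case (numbers illustrative, e.g. an ASoC ring buffer):

	buf_len = 32 KiB, period_len = 4 KiB
	nslots  = buf_len / period_len + 1 = 8 + 1 = 9

The ninth slot holds the copy of pset[0]; it is what lets the edma_execute() change earlier link slot[nslots-1] back to slot[1], so the PaRAM chain loops with no CPU reprogramming, while TCINTEN on every pset delivers the per-period callback.
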
@@ -406,30 +604,34 @@ static void edma_callback(unsigned ch_num, u16 ch_status, void *data)
        unsigned long flags;
        struct edmacc_param p;
 
-       /* Pause the channel */
-       edma_pause(echan->ch_num);
+       edesc = echan->edesc;
+
+       /* Pause the channel for non-cyclic */
+       if (!edesc || !edesc->cyclic)
+               edma_pause(echan->ch_num);
 
        switch (ch_status) {
-       case DMA_COMPLETE:
+       case EDMA_DMA_COMPLETE:
                spin_lock_irqsave(&echan->vchan.lock, flags);
 
-               edesc = echan->edesc;
                if (edesc) {
-                       if (edesc->processed == edesc->pset_nr) {
+                       if (edesc->cyclic) {
+                               vchan_cyclic_callback(&edesc->vdesc);
+                       } else if (edesc->processed == edesc->pset_nr) {
                                dev_dbg(dev, "Transfer complete, stopping channel %d\n", ch_num);
                                edma_stop(echan->ch_num);
                                vchan_cookie_complete(&edesc->vdesc);
+                               edma_execute(echan);
                        } else {
                                dev_dbg(dev, "Intermediate transfer complete on channel %d\n", ch_num);
+                               edma_execute(echan);
                        }
-
-                       edma_execute(echan);
                }
 
                spin_unlock_irqrestore(&echan->vchan.lock, flags);
 
                break;
-       case DMA_CC_ERROR:
+       case EDMA_DMA_CC_ERROR:
                spin_lock_irqsave(&echan->vchan.lock, flags);
 
                edma_read_slot(EDMA_CHAN_SLOT(echan->slot[0]), &p);
@@ -579,7 +781,7 @@ static enum dma_status edma_tx_status(struct dma_chan *chan,
        unsigned long flags;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS || !txstate)
+       if (ret == DMA_COMPLETE || !txstate)
                return ret;
 
        spin_lock_irqsave(&echan->vchan.lock, flags);
@@ -619,6 +821,7 @@ static void edma_dma_init(struct edma_cc *ecc, struct dma_device *dma,
                          struct device *dev)
 {
        dma->device_prep_slave_sg = edma_prep_slave_sg;
+       dma->device_prep_dma_cyclic = edma_prep_dma_cyclic;
        dma->device_alloc_chan_resources = edma_alloc_chan_resources;
        dma->device_free_chan_resources = edma_free_chan_resources;
        dma->device_issue_pending = edma_issue_pending;
index 591cd8c..cb4bf68 100644
@@ -733,28 +733,6 @@ static void ep93xx_dma_advance_work(struct ep93xx_dma_chan *edmac)
        spin_unlock_irqrestore(&edmac->lock, flags);
 }
 
-static void ep93xx_dma_unmap_buffers(struct ep93xx_dma_desc *desc)
-{
-       struct device *dev = desc->txd.chan->device->dev;
-
-       if (!(desc->txd.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               if (desc->txd.flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                       dma_unmap_single(dev, desc->src_addr, desc->size,
-                                        DMA_TO_DEVICE);
-               else
-                       dma_unmap_page(dev, desc->src_addr, desc->size,
-                                      DMA_TO_DEVICE);
-       }
-       if (!(desc->txd.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               if (desc->txd.flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                       dma_unmap_single(dev, desc->dst_addr, desc->size,
-                                        DMA_FROM_DEVICE);
-               else
-                       dma_unmap_page(dev, desc->dst_addr, desc->size,
-                                      DMA_FROM_DEVICE);
-       }
-}
-
 static void ep93xx_dma_tasklet(unsigned long data)
 {
        struct ep93xx_dma_chan *edmac = (struct ep93xx_dma_chan *)data;
@@ -787,13 +765,7 @@ static void ep93xx_dma_tasklet(unsigned long data)
 
        /* Now we can release all the chained descriptors */
        list_for_each_entry_safe(desc, d, &list, node) {
-               /*
-                * For the memcpy channels the API requires us to unmap the
-                * buffers unless requested otherwise.
-                */
-               if (!edmac->chan.private)
-                       ep93xx_dma_unmap_buffers(desc);
-
+               dma_descriptor_unmap(&desc->txd);
                ep93xx_dma_desc_put(edmac, desc);
        }
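
The deletion above is one instance of a tree-wide cleanup in this series:
completion-time unmapping moves out of individual drivers and into the core,
which calls dma_descriptor_unmap() on state the submitter attaches to the
descriptor. A sketch of the submitting side using the 3.13-era unmap helpers;
dev, chan, and the page/offset variables are placeholders:

	struct dmaengine_unmap_data *unmap;
	struct dma_async_tx_descriptor *tx;

	unmap = dmaengine_get_unmap_data(dev, 2, GFP_NOWAIT);
	if (!unmap)
		return NULL;
	unmap->len = len;
	unmap->addr[0] = dma_map_page(dev, src_pg, src_off, len,
				      DMA_TO_DEVICE);
	unmap->to_cnt = 1;
	unmap->addr[1] = dma_map_page(dev, dst_pg, dst_off, len,
				      DMA_FROM_DEVICE);
	unmap->from_cnt = 1;

	tx = chan->device->device_prep_dma_memcpy(chan, unmap->addr[1],
						  unmap->addr[0], len, 0);
	if (tx)
		dma_set_unmap(tx, unmap);  /* descriptor takes a reference */
	dmaengine_unmap_put(unmap);        /* drop ours; core unmaps on completion */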
 
index 61517dd..7086a16 100644 (file)
@@ -870,22 +870,7 @@ static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
        /* Run any dependencies */
        dma_run_dependencies(txd);
 
-       /* Unmap the dst buffer, if requested */
-       if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                       dma_unmap_single(dev, dst, len, DMA_FROM_DEVICE);
-               else
-                       dma_unmap_page(dev, dst, len, DMA_FROM_DEVICE);
-       }
-
-       /* Unmap the src buffer, if requested */
-       if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                       dma_unmap_single(dev, src, len, DMA_TO_DEVICE);
-               else
-                       dma_unmap_page(dev, src, len, DMA_TO_DEVICE);
-       }
-
+       dma_descriptor_unmap(txd);
 #ifdef FSL_DMA_LD_DEBUG
        chan_dbg(chan, "LD %p free\n", desc);
 #endif
@@ -1255,7 +1240,9 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev,
        WARN_ON(fdev->feature != chan->feature);
 
        chan->dev = fdev->dev;
-       chan->id = ((res.start - 0x100) & 0xfff) >> 7;
+       chan->id = (res.start & 0xfff) < 0x300 ?
+                  ((res.start - 0x100) & 0xfff) >> 7 :
+                  ((res.start - 0x200) & 0xfff) >> 7;
        if (chan->id >= FSL_DMA_MAX_CHANS_PER_DEVICE) {
                dev_err(fdev->dev, "too many channels for device\n");
                err = -EINVAL;
@@ -1428,6 +1415,7 @@ static int fsldma_of_remove(struct platform_device *op)
 }
 
 static const struct of_device_id fsldma_of_ids[] = {
+       { .compatible = "fsl,elo3-dma", },
        { .compatible = "fsl,eloplus-dma", },
        { .compatible = "fsl,elo-dma", },
        {}
@@ -1449,7 +1437,7 @@ static struct platform_driver fsldma_of_driver = {
 
 static __init int fsldma_init(void)
 {
-       pr_info("Freescale Elo / Elo Plus DMA driver\n");
+       pr_info("Freescale Elo series DMA driver\n");
        return platform_driver_register(&fsldma_of_driver);
 }
 
@@ -1461,5 +1449,5 @@ static void __exit fsldma_exit(void)
 subsys_initcall(fsldma_init);
 module_exit(fsldma_exit);
 
-MODULE_DESCRIPTION("Freescale Elo / Elo Plus DMA driver");
+MODULE_DESCRIPTION("Freescale Elo series DMA driver");
 MODULE_LICENSE("GPL");
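
The reworked chan->id formula above, together with the bump of
FSL_DMA_MAX_CHANS_PER_DEVICE to 8 in the next file, exists because
"fsl,elo3-dma" controllers carry two banks of four channels at different base
offsets. A standalone check of the arithmetic; the register offsets follow the
elo3 device-tree binding and are an assumption here, not part of this diff:

#include <stdio.h>

int main(void)
{
	/* bank 0 at 0x100..0x280, bank 1 at 0x400..0x580, 0x80 apart */
	unsigned int off[] = { 0x100, 0x180, 0x200, 0x280,
			       0x400, 0x480, 0x500, 0x580 };

	for (int i = 0; i < 8; i++) {
		unsigned int id = (off[i] & 0xfff) < 0x300 ?
				  ((off[i] - 0x100) & 0xfff) >> 7 :
				  ((off[i] - 0x200) & 0xfff) >> 7;
		printf("offset 0x%03x -> channel id %u\n", off[i], id);
	}
	return 0;	/* prints ids 0..7 in order */
}
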
index f5c3879..1ffc244 100644 (file)
@@ -112,7 +112,7 @@ struct fsldma_chan_regs {
 };
 
 struct fsldma_chan;
-#define FSL_DMA_MAX_CHANS_PER_DEVICE 4
+#define FSL_DMA_MAX_CHANS_PER_DEVICE 8
 
 struct fsldma_device {
        void __iomem *regs;     /* DGSR register base */
index 55852c0..6f9ac20 100644 (file)
@@ -572,9 +572,11 @@ static int imxdma_xfer_desc(struct imxdma_desc *d)
 
                imx_dmav1_writel(imxdma, d->len, DMA_CNTR(imxdmac->channel));
 
-               dev_dbg(imxdma->dev, "%s channel: %d dest=0x%08x src=0x%08x "
-                       "dma_length=%d\n", __func__, imxdmac->channel,
-                       d->dest, d->src, d->len);
+               dev_dbg(imxdma->dev,
+                       "%s channel: %d dest=0x%08llx src=0x%08llx dma_length=%zu\n",
+                       __func__, imxdmac->channel,
+                       (unsigned long long)d->dest,
+                       (unsigned long long)d->src, d->len);
 
                break;
        /* Cyclic transfer is the same as slave_sg with special sg configuration. */
@@ -586,20 +588,22 @@ static int imxdma_xfer_desc(struct imxdma_desc *d)
                        imx_dmav1_writel(imxdma, imxdmac->ccr_from_device,
                                         DMA_CCR(imxdmac->channel));
 
-                       dev_dbg(imxdma->dev, "%s channel: %d sg=%p sgcount=%d "
-                               "total length=%d dev_addr=0x%08x (dev2mem)\n",
-                               __func__, imxdmac->channel, d->sg, d->sgcount,
-                               d->len, imxdmac->per_address);
+                       dev_dbg(imxdma->dev,
+                               "%s channel: %d sg=%p sgcount=%d total length=%zu dev_addr=0x%08llx (dev2mem)\n",
+                               __func__, imxdmac->channel,
+                               d->sg, d->sgcount, d->len,
+                               (unsigned long long)imxdmac->per_address);
                } else if (d->direction == DMA_MEM_TO_DEV) {
                        imx_dmav1_writel(imxdma, imxdmac->per_address,
                                         DMA_DAR(imxdmac->channel));
                        imx_dmav1_writel(imxdma, imxdmac->ccr_to_device,
                                         DMA_CCR(imxdmac->channel));
 
-                       dev_dbg(imxdma->dev, "%s channel: %d sg=%p sgcount=%d "
-                               "total length=%d dev_addr=0x%08x (mem2dev)\n",
-                               __func__, imxdmac->channel, d->sg, d->sgcount,
-                               d->len, imxdmac->per_address);
+                       dev_dbg(imxdma->dev,
+                               "%s channel: %d sg=%p sgcount=%d total length=%zu dev_addr=0x%08llx (mem2dev)\n",
+                               __func__, imxdmac->channel,
+                               d->sg, d->sgcount, d->len,
+                               (unsigned long long)imxdmac->per_address);
                } else {
                        dev_err(imxdma->dev, "%s channel: %d bad dma mode\n",
                                __func__, imxdmac->channel);
@@ -771,7 +775,7 @@ static int imxdma_alloc_chan_resources(struct dma_chan *chan)
                desc->desc.tx_submit = imxdma_tx_submit;
                /* txd.flags will be overwritten in prep funcs */
                desc->desc.flags = DMA_CTRL_ACK;
-               desc->status = DMA_SUCCESS;
+               desc->status = DMA_COMPLETE;
 
                list_add_tail(&desc->node, &imxdmac->ld_free);
                imxdmac->descs_allocated++;
@@ -870,7 +874,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic(
        int i;
        unsigned int periods = buf_len / period_len;
 
-       dev_dbg(imxdma->dev, "%s channel: %d buf_len=%d period_len=%d\n",
+       dev_dbg(imxdma->dev, "%s channel: %d buf_len=%zu period_len=%zu\n",
                        __func__, imxdmac->channel, buf_len, period_len);
 
        if (list_empty(&imxdmac->ld_free) ||
@@ -926,8 +930,9 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_memcpy(
        struct imxdma_engine *imxdma = imxdmac->imxdma;
        struct imxdma_desc *desc;
 
-       dev_dbg(imxdma->dev, "%s channel: %d src=0x%x dst=0x%x len=%d\n",
-                       __func__, imxdmac->channel, src, dest, len);
+       dev_dbg(imxdma->dev, "%s channel: %d src=0x%llx dst=0x%llx len=%zu\n",
+               __func__, imxdmac->channel, (unsigned long long)src,
+               (unsigned long long)dest, len);
 
        if (list_empty(&imxdmac->ld_free) ||
            imxdma_chan_is_doing_cyclic(imxdmac))
@@ -956,9 +961,10 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved(
        struct imxdma_engine *imxdma = imxdmac->imxdma;
        struct imxdma_desc *desc;
 
-       dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%x dst_start=0x%x\n"
-               "   src_sgl=%s dst_sgl=%s numf=%d frame_size=%d\n", __func__,
-               imxdmac->channel, xt->src_start, xt->dst_start,
+       dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx dst_start=0x%llx\n"
+               "   src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n", __func__,
+               imxdmac->channel, (unsigned long long)xt->src_start,
+               (unsigned long long) xt->dst_start,
                xt->src_sgl ? "true" : "false", xt->dst_sgl ? "true" : "false",
                xt->numf, xt->frame_size);
 
index c1fd504..c75679d 100644 (file)
@@ -638,7 +638,7 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac)
        if (error)
                sdmac->status = DMA_ERROR;
        else
-               sdmac->status = DMA_SUCCESS;
+               sdmac->status = DMA_COMPLETE;
 
        dma_cookie_complete(&sdmac->desc);
        if (sdmac->desc.callback)
@@ -1089,8 +1089,8 @@ static struct dma_async_tx_descriptor *sdma_prep_slave_sg(
                        param &= ~BD_CONT;
                }
 
-               dev_dbg(sdma->dev, "entry %d: count: %d dma: 0x%08x %s%s\n",
-                               i, count, sg->dma_address,
+               dev_dbg(sdma->dev, "entry %d: count: %d dma: %#llx %s%s\n",
+                               i, count, (u64)sg->dma_address,
                                param & BD_WRAP ? "wrap" : "",
                                param & BD_INTR ? " intr" : "");
 
@@ -1163,8 +1163,8 @@ static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic(
                if (i + 1 == num_periods)
                        param |= BD_WRAP;
 
-               dev_dbg(sdma->dev, "entry %d: count: %d dma: 0x%08x %s%s\n",
-                               i, period_len, dma_addr,
+               dev_dbg(sdma->dev, "entry %d: count: %d dma: %#llx %s%s\n",
+                               i, period_len, (u64)dma_addr,
                                param & BD_WRAP ? "wrap" : "",
                                param & BD_INTR ? " intr" : "");
 
index a975ebe..1aab813 100644 (file)
@@ -309,7 +309,7 @@ static void midc_descriptor_complete(struct intel_mid_dma_chan *midc,
                callback_txd(param_txd);
        }
        if (midc->raw_tfr) {
-               desc->status = DMA_SUCCESS;
+               desc->status = DMA_COMPLETE;
                if (desc->lli != NULL) {
                        pci_pool_free(desc->lli_pool, desc->lli,
                                                desc->lli_phys);
@@ -481,7 +481,7 @@ static enum dma_status intel_mid_dma_tx_status(struct dma_chan *chan,
        enum dma_status ret;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret != DMA_SUCCESS) {
+       if (ret != DMA_COMPLETE) {
                spin_lock_bh(&midc->lock);
                midc_scan_descriptors(to_middma_device(chan->device), midc);
                spin_unlock_bh(&midc->lock);
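
The DMA_SUCCESS -> DMA_COMPLETE rename that runs through these hunks reflects
what the enum actually reports: a transfer's completion state, not a verdict.
The resulting device_tx_status() shape, sketched with a hypothetical foo_
driver and helper:

static enum dma_status foo_tx_status(struct dma_chan *chan,
				     dma_cookie_t cookie,
				     struct dma_tx_state *txstate)
{
	enum dma_status ret = dma_cookie_status(chan, cookie, txstate);

	if (ret == DMA_COMPLETE)	/* cookie already retired */
		return ret;

	/* still in flight: refresh hardware state, then re-query */
	foo_scan_descriptors(chan);	/* hypothetical helper */
	return dma_cookie_status(chan, cookie, txstate);
}
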
index 5ff6fc1..1a49c77 100644 (file)
@@ -531,21 +531,6 @@ static void ioat1_cleanup_event(unsigned long data)
        writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET);
 }
 
-void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
-                   size_t len, struct ioat_dma_descriptor *hw)
-{
-       struct pci_dev *pdev = chan->device->pdev;
-       size_t offset = len - hw->size;
-
-       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
-               ioat_unmap(pdev, hw->dst_addr - offset, len,
-                          PCI_DMA_FROMDEVICE, flags, 1);
-
-       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP))
-               ioat_unmap(pdev, hw->src_addr - offset, len,
-                          PCI_DMA_TODEVICE, flags, 0);
-}
-
 dma_addr_t ioat_get_current_completion(struct ioat_chan_common *chan)
 {
        dma_addr_t phys_complete;
@@ -602,7 +587,7 @@ static void __cleanup(struct ioat_dma_chan *ioat, dma_addr_t phys_complete)
                dump_desc_dbg(ioat, desc);
                if (tx->cookie) {
                        dma_cookie_complete(tx);
-                       ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
+                       dma_descriptor_unmap(tx);
                        ioat->active -= desc->hw->tx_cnt;
                        if (tx->callback) {
                                tx->callback(tx->callback_param);
@@ -733,7 +718,7 @@ ioat_dma_tx_status(struct dma_chan *c, dma_cookie_t cookie,
        enum dma_status ret;
 
        ret = dma_cookie_status(c, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        device->cleanup_fn((unsigned long) c);
@@ -833,8 +818,7 @@ int ioat_dma_self_test(struct ioatdma_device *device)
 
        dma_src = dma_map_single(dev, src, IOAT_TEST_SIZE, DMA_TO_DEVICE);
        dma_dest = dma_map_single(dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE);
-       flags = DMA_COMPL_SKIP_SRC_UNMAP | DMA_COMPL_SKIP_DEST_UNMAP |
-               DMA_PREP_INTERRUPT;
+       flags = DMA_PREP_INTERRUPT;
        tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src,
                                                   IOAT_TEST_SIZE, flags);
        if (!tx) {
@@ -859,7 +843,7 @@ int ioat_dma_self_test(struct ioatdma_device *device)
 
        if (tmo == 0 ||
            dma->device_tx_status(dma_chan, cookie, NULL)
-                                       != DMA_SUCCESS) {
+                                       != DMA_COMPLETE) {
                dev_err(dev, "Self-test copy timed out, disabling\n");
                err = -ENODEV;
                goto unmap_dma;
@@ -885,8 +869,7 @@ static char ioat_interrupt_style[32] = "msix";
 module_param_string(ioat_interrupt_style, ioat_interrupt_style,
                    sizeof(ioat_interrupt_style), 0644);
 MODULE_PARM_DESC(ioat_interrupt_style,
-                "set ioat interrupt style: msix (default), "
-                "msix-single-vector, msi, intx)");
+                "set ioat interrupt style: msix (default), msi, intx");
 
 /**
  * ioat_dma_setup_interrupts - setup interrupt handler
@@ -904,8 +887,6 @@ int ioat_dma_setup_interrupts(struct ioatdma_device *device)
 
        if (!strcmp(ioat_interrupt_style, "msix"))
                goto msix;
-       if (!strcmp(ioat_interrupt_style, "msix-single-vector"))
-               goto msix_single_vector;
        if (!strcmp(ioat_interrupt_style, "msi"))
                goto msi;
        if (!strcmp(ioat_interrupt_style, "intx"))
@@ -920,10 +901,8 @@ msix:
                device->msix_entries[i].entry = i;
 
        err = pci_enable_msix(pdev, device->msix_entries, msixcnt);
-       if (err < 0)
+       if (err)
                goto msi;
-       if (err > 0)
-               goto msix_single_vector;
 
        for (i = 0; i < msixcnt; i++) {
                msix = &device->msix_entries[i];
@@ -937,29 +916,13 @@ msix:
                                chan = ioat_chan_by_index(device, j);
                                devm_free_irq(dev, msix->vector, chan);
                        }
-                       goto msix_single_vector;
+                       goto msi;
                }
        }
        intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL;
        device->irq_mode = IOAT_MSIX;
        goto done;
 
-msix_single_vector:
-       msix = &device->msix_entries[0];
-       msix->entry = 0;
-       err = pci_enable_msix(pdev, device->msix_entries, 1);
-       if (err)
-               goto msi;
-
-       err = devm_request_irq(dev, msix->vector, ioat_dma_do_interrupt, 0,
-                              "ioat-msix", device);
-       if (err) {
-               pci_disable_msix(pdev);
-               goto msi;
-       }
-       device->irq_mode = IOAT_MSIX_SINGLE;
-       goto done;
-
 msi:
        err = pci_enable_msi(pdev);
        if (err)
@@ -971,7 +934,7 @@ msi:
                pci_disable_msi(pdev);
                goto intx;
        }
-       device->irq_mode = IOAT_MSIX;
+       device->irq_mode = IOAT_MSI;
        goto done;
 
 intx:
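
With the msix-single-vector mode removed, interrupt setup degrades along one
straight chain. A compressed sketch of the flow the hunks above leave behind;
the request_* helpers are illustrative stand-ins for the devm_request_irq()
calls in the real code:

	/* MSI-X: one vector per channel, or abandon MSI-X entirely */
	if (pci_enable_msix(pdev, device->msix_entries, msixcnt) == 0 &&
	    request_channel_irqs(device) == 0) {
		device->irq_mode = IOAT_MSIX;
		return 0;
	}
	/* MSI: one shared vector */
	if (pci_enable_msi(pdev) == 0 && request_shared_irq(device) == 0) {
		device->irq_mode = IOAT_MSI;
		return 0;
	}
	/* legacy INTx as the last resort */
	device->irq_mode = IOAT_INTX;
	return request_shared_irq(device);
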
index 54fb7b9..11fb877 100644 (file)
@@ -52,7 +52,6 @@
 enum ioat_irq_mode {
        IOAT_NOIRQ = 0,
        IOAT_MSIX,
-       IOAT_MSIX_SINGLE,
        IOAT_MSI,
        IOAT_INTX
 };
@@ -83,7 +82,6 @@ struct ioatdma_device {
        struct pci_pool *completion_pool;
 #define MAX_SED_POOLS  5
        struct dma_pool *sed_hw_pool[MAX_SED_POOLS];
-       struct kmem_cache *sed_pool;
        struct dma_device common;
        u8 version;
        struct msix_entry msix_entries[4];
@@ -342,16 +340,6 @@ static inline bool is_ioat_bug(unsigned long err)
        return !!err;
 }
 
-static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len,
-                             int direction, enum dma_ctrl_flags flags, bool dst)
-{
-       if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) ||
-           (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE)))
-               pci_unmap_single(pdev, addr, len, direction);
-       else
-               pci_unmap_page(pdev, addr, len, direction);
-}
-
 int ioat_probe(struct ioatdma_device *device);
 int ioat_register(struct ioatdma_device *device);
 int ioat1_dma_probe(struct ioatdma_device *dev, int dca);
@@ -363,8 +351,6 @@ void ioat_init_channel(struct ioatdma_device *device,
                       struct ioat_chan_common *chan, int idx);
 enum dma_status ioat_dma_tx_status(struct dma_chan *c, dma_cookie_t cookie,
                                   struct dma_tx_state *txstate);
-void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags,
-                   size_t len, struct ioat_dma_descriptor *hw);
 bool ioat_cleanup_preamble(struct ioat_chan_common *chan,
                           dma_addr_t *phys_complete);
 void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type);
index b925e1b..5d3affe 100644 (file)
@@ -148,7 +148,7 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
                tx = &desc->txd;
                dump_desc_dbg(ioat, desc);
                if (tx->cookie) {
-                       ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw);
+                       dma_descriptor_unmap(tx);
                        dma_cookie_complete(tx);
                        if (tx->callback) {
                                tx->callback(tx->callback_param);
index 212d584..4702927 100644 (file)
@@ -157,7 +157,6 @@ static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr)
 
 int ioat2_dma_probe(struct ioatdma_device *dev, int dca);
 int ioat3_dma_probe(struct ioatdma_device *dev, int dca);
-void ioat3_dma_remove(struct ioatdma_device *dev);
 struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase);
 struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase);
 int ioat2_check_space_lock(struct ioat2_dma_chan *ioat, int num_descs);
index d8ececa..820817e 100644 (file)
@@ -67,6 +67,8 @@
 #include "dma.h"
 #include "dma_v2.h"
 
+extern struct kmem_cache *ioat3_sed_cache;
+
 /* ioat hardware assumes at least two sources for raid operations */
 #define src_cnt_to_sw(x) ((x) + 2)
 #define src_cnt_to_hw(x) ((x) - 2)
@@ -87,22 +89,8 @@ static const u8 pq_idx_to_field[] = { 1, 4, 5, 0, 1, 2, 4, 5 };
 static const u8 pq16_idx_to_field[] = { 1, 4, 1, 2, 3, 4, 5, 6, 7,
                                        0, 1, 2, 3, 4, 5, 6 };
 
-/*
- * technically sources 1 and 2 do not require SED, but the op will have
- * at least 9 descriptors so that's irrelevant.
- */
-static const u8 pq16_idx_to_sed[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                     1, 1, 1, 1, 1, 1, 1 };
-
 static void ioat3_eh(struct ioat2_dma_chan *ioat);
 
-static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx)
-{
-       struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1];
-
-       return raw->field[xor_idx_to_field[idx]];
-}
-
 static void xor_set_src(struct ioat_raw_descriptor *descs[2],
                        dma_addr_t addr, u32 offset, int idx)
 {
@@ -135,12 +123,6 @@ static void pq_set_src(struct ioat_raw_descriptor *descs[2],
        pq->coef[idx] = coef;
 }
 
-static int sed_get_pq16_pool_idx(int src_cnt)
-{
-
-       return pq16_idx_to_sed[src_cnt];
-}
-
 static bool is_jf_ioat(struct pci_dev *pdev)
 {
        switch (pdev->device) {
@@ -272,7 +254,7 @@ ioat3_alloc_sed(struct ioatdma_device *device, unsigned int hw_pool)
        struct ioat_sed_ent *sed;
        gfp_t flags = __GFP_ZERO | GFP_ATOMIC;
 
-       sed = kmem_cache_alloc(device->sed_pool, flags);
+       sed = kmem_cache_alloc(ioat3_sed_cache, flags);
        if (!sed)
                return NULL;
 
@@ -280,7 +262,7 @@ ioat3_alloc_sed(struct ioatdma_device *device, unsigned int hw_pool)
        sed->hw = dma_pool_alloc(device->sed_hw_pool[hw_pool],
                                 flags, &sed->dma);
        if (!sed->hw) {
-               kmem_cache_free(device->sed_pool, sed);
+               kmem_cache_free(ioat3_sed_cache, sed);
                return NULL;
        }
 
@@ -293,165 +275,7 @@ static void ioat3_free_sed(struct ioatdma_device *device, struct ioat_sed_ent *s
                return;
 
        dma_pool_free(device->sed_hw_pool[sed->hw_pool], sed->hw, sed->dma);
-       kmem_cache_free(device->sed_pool, sed);
-}
-
-static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
-                           struct ioat_ring_ent *desc, int idx)
-{
-       struct ioat_chan_common *chan = &ioat->base;
-       struct pci_dev *pdev = chan->device->pdev;
-       size_t len = desc->len;
-       size_t offset = len - desc->hw->size;
-       struct dma_async_tx_descriptor *tx = &desc->txd;
-       enum dma_ctrl_flags flags = tx->flags;
-
-       switch (desc->hw->ctl_f.op) {
-       case IOAT_OP_COPY:
-               if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */
-                       ioat_dma_unmap(chan, flags, len, desc->hw);
-               break;
-       case IOAT_OP_XOR_VAL:
-       case IOAT_OP_XOR: {
-               struct ioat_xor_descriptor *xor = desc->xor;
-               struct ioat_ring_ent *ext;
-               struct ioat_xor_ext_descriptor *xor_ex = NULL;
-               int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt);
-               struct ioat_raw_descriptor *descs[2];
-               int i;
-
-               if (src_cnt > 5) {
-                       ext = ioat2_get_ring_ent(ioat, idx + 1);
-                       xor_ex = ext->xor_ex;
-               }
-
-               if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                       descs[0] = (struct ioat_raw_descriptor *) xor;
-                       descs[1] = (struct ioat_raw_descriptor *) xor_ex;
-                       for (i = 0; i < src_cnt; i++) {
-                               dma_addr_t src = xor_get_src(descs, i);
-
-                               ioat_unmap(pdev, src - offset, len,
-                                          PCI_DMA_TODEVICE, flags, 0);
-                       }
-
-                       /* dest is a source in xor validate operations */
-                       if (xor->ctl_f.op == IOAT_OP_XOR_VAL) {
-                               ioat_unmap(pdev, xor->dst_addr - offset, len,
-                                          PCI_DMA_TODEVICE, flags, 1);
-                               break;
-                       }
-               }
-
-               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
-                       ioat_unmap(pdev, xor->dst_addr - offset, len,
-                                  PCI_DMA_FROMDEVICE, flags, 1);
-               break;
-       }
-       case IOAT_OP_PQ_VAL:
-       case IOAT_OP_PQ: {
-               struct ioat_pq_descriptor *pq = desc->pq;
-               struct ioat_ring_ent *ext;
-               struct ioat_pq_ext_descriptor *pq_ex = NULL;
-               int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt);
-               struct ioat_raw_descriptor *descs[2];
-               int i;
-
-               if (src_cnt > 3) {
-                       ext = ioat2_get_ring_ent(ioat, idx + 1);
-                       pq_ex = ext->pq_ex;
-               }
-
-               /* in the 'continue' case don't unmap the dests as sources */
-               if (dmaf_p_disabled_continue(flags))
-                       src_cnt--;
-               else if (dmaf_continue(flags))
-                       src_cnt -= 3;
-
-               if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                       descs[0] = (struct ioat_raw_descriptor *) pq;
-                       descs[1] = (struct ioat_raw_descriptor *) pq_ex;
-                       for (i = 0; i < src_cnt; i++) {
-                               dma_addr_t src = pq_get_src(descs, i);
-
-                               ioat_unmap(pdev, src - offset, len,
-                                          PCI_DMA_TODEVICE, flags, 0);
-                       }
-
-                       /* the dests are sources in pq validate operations */
-                       if (pq->ctl_f.op == IOAT_OP_XOR_VAL) {
-                               if (!(flags & DMA_PREP_PQ_DISABLE_P))
-                                       ioat_unmap(pdev, pq->p_addr - offset,
-                                                  len, PCI_DMA_TODEVICE, flags, 0);
-                               if (!(flags & DMA_PREP_PQ_DISABLE_Q))
-                                       ioat_unmap(pdev, pq->q_addr - offset,
-                                                  len, PCI_DMA_TODEVICE, flags, 0);
-                               break;
-                       }
-               }
-
-               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                       if (!(flags & DMA_PREP_PQ_DISABLE_P))
-                               ioat_unmap(pdev, pq->p_addr - offset, len,
-                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
-                       if (!(flags & DMA_PREP_PQ_DISABLE_Q))
-                               ioat_unmap(pdev, pq->q_addr - offset, len,
-                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
-               }
-               break;
-       }
-       case IOAT_OP_PQ_16S:
-       case IOAT_OP_PQ_VAL_16S: {
-               struct ioat_pq_descriptor *pq = desc->pq;
-               int src_cnt = src16_cnt_to_sw(pq->ctl_f.src_cnt);
-               struct ioat_raw_descriptor *descs[4];
-               int i;
-
-               /* in the 'continue' case don't unmap the dests as sources */
-               if (dmaf_p_disabled_continue(flags))
-                       src_cnt--;
-               else if (dmaf_continue(flags))
-                       src_cnt -= 3;
-
-               if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                       descs[0] = (struct ioat_raw_descriptor *)pq;
-                       descs[1] = (struct ioat_raw_descriptor *)(desc->sed->hw);
-                       descs[2] = (struct ioat_raw_descriptor *)(&desc->sed->hw->b[0]);
-                       for (i = 0; i < src_cnt; i++) {
-                               dma_addr_t src = pq16_get_src(descs, i);
-
-                               ioat_unmap(pdev, src - offset, len,
-                                          PCI_DMA_TODEVICE, flags, 0);
-                       }
-
-                       /* the dests are sources in pq validate operations */
-                       if (pq->ctl_f.op == IOAT_OP_XOR_VAL) {
-                               if (!(flags & DMA_PREP_PQ_DISABLE_P))
-                                       ioat_unmap(pdev, pq->p_addr - offset,
-                                                  len, PCI_DMA_TODEVICE,
-                                                  flags, 0);
-                               if (!(flags & DMA_PREP_PQ_DISABLE_Q))
-                                       ioat_unmap(pdev, pq->q_addr - offset,
-                                                  len, PCI_DMA_TODEVICE,
-                                                  flags, 0);
-                               break;
-                       }
-               }
-
-               if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                       if (!(flags & DMA_PREP_PQ_DISABLE_P))
-                               ioat_unmap(pdev, pq->p_addr - offset, len,
-                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
-                       if (!(flags & DMA_PREP_PQ_DISABLE_Q))
-                               ioat_unmap(pdev, pq->q_addr - offset, len,
-                                          PCI_DMA_BIDIRECTIONAL, flags, 1);
-               }
-               break;
-       }
-       default:
-               dev_err(&pdev->dev, "%s: unknown op type: %#x\n",
-                       __func__, desc->hw->ctl_f.op);
-       }
+       kmem_cache_free(ioat3_sed_cache, sed);
 }
 
 static bool desc_has_ext(struct ioat_ring_ent *desc)
@@ -577,7 +401,7 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete)
                tx = &desc->txd;
                if (tx->cookie) {
                        dma_cookie_complete(tx);
-                       ioat3_dma_unmap(ioat, desc, idx + i);
+                       dma_descriptor_unmap(tx);
                        if (tx->callback) {
                                tx->callback(tx->callback_param);
                                tx->callback = NULL;
@@ -807,7 +631,7 @@ ioat3_tx_status(struct dma_chan *c, dma_cookie_t cookie,
        enum dma_status ret;
 
        ret = dma_cookie_status(c, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        ioat3_cleanup(ioat);
@@ -1129,9 +953,6 @@ __ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result,
        u8 op;
        int i, s, idx, num_descs;
 
-       /* this function only handles src_cnt 9 - 16 */
-       BUG_ON(src_cnt < 9);
-
        /* this function is only called with 9-16 sources */
        op = result ? IOAT_OP_PQ_VAL_16S : IOAT_OP_PQ_16S;
 
@@ -1159,8 +980,7 @@ __ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result,
 
                descs[0] = (struct ioat_raw_descriptor *) pq;
 
-               desc->sed = ioat3_alloc_sed(device,
-                                           sed_get_pq16_pool_idx(src_cnt));
+               desc->sed = ioat3_alloc_sed(device, (src_cnt-2) >> 3);
                if (!desc->sed) {
                        dev_err(to_dev(chan),
                                "%s: no free sed entries\n", __func__);
@@ -1218,13 +1038,21 @@ __ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result,
        return &desc->txd;
 }
 
+static int src_cnt_flags(unsigned int src_cnt, unsigned long flags)
+{
+       if (dmaf_p_disabled_continue(flags))
+               return src_cnt + 1;
+       else if (dmaf_continue(flags))
+               return src_cnt + 3;
+       else
+               return src_cnt;
+}
+
 static struct dma_async_tx_descriptor *
 ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
              unsigned int src_cnt, const unsigned char *scf, size_t len,
              unsigned long flags)
 {
-       struct dma_device *dma = chan->device;
-
        /* specify valid address for disabled result */
        if (flags & DMA_PREP_PQ_DISABLE_P)
                dst[0] = dst[1];
@@ -1244,7 +1072,7 @@ ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
                single_source_coef[0] = scf[0];
                single_source_coef[1] = 0;
 
-               return (src_cnt > 8) && (dma->max_pq > 8) ?
+               return src_cnt_flags(src_cnt, flags) > 8 ?
                        __ioat3_prep_pq16_lock(chan, NULL, dst, single_source,
                                               2, single_source_coef, len,
                                               flags) :
@@ -1252,7 +1080,7 @@ ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
                                             single_source_coef, len, flags);
 
        } else {
-               return (src_cnt > 8) && (dma->max_pq > 8) ?
+               return src_cnt_flags(src_cnt, flags) > 8 ?
                        __ioat3_prep_pq16_lock(chan, NULL, dst, src, src_cnt,
                                               scf, len, flags) :
                        __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt,
@@ -1265,8 +1093,6 @@ ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
                  unsigned int src_cnt, const unsigned char *scf, size_t len,
                  enum sum_check_flags *pqres, unsigned long flags)
 {
-       struct dma_device *dma = chan->device;
-
        /* specify valid address for disabled result */
        if (flags & DMA_PREP_PQ_DISABLE_P)
                pq[0] = pq[1];
@@ -1278,7 +1104,7 @@ ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
         */
        *pqres = 0;
 
-       return (src_cnt > 8) && (dma->max_pq > 8) ?
+       return src_cnt_flags(src_cnt, flags) > 8 ?
                __ioat3_prep_pq16_lock(chan, pqres, pq, src, src_cnt, scf, len,
                                       flags) :
                __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len,
@@ -1289,7 +1115,6 @@ static struct dma_async_tx_descriptor *
 ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
                 unsigned int src_cnt, size_t len, unsigned long flags)
 {
-       struct dma_device *dma = chan->device;
        unsigned char scf[src_cnt];
        dma_addr_t pq[2];
 
@@ -1298,7 +1123,7 @@ ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src,
        flags |= DMA_PREP_PQ_DISABLE_Q;
        pq[1] = dst; /* specify valid address for disabled result */
 
-       return (src_cnt > 8) && (dma->max_pq > 8) ?
+       return src_cnt_flags(src_cnt, flags) > 8 ?
                __ioat3_prep_pq16_lock(chan, NULL, pq, src, src_cnt, scf, len,
                                       flags) :
                __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len,
@@ -1310,7 +1135,6 @@ ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
                     unsigned int src_cnt, size_t len,
                     enum sum_check_flags *result, unsigned long flags)
 {
-       struct dma_device *dma = chan->device;
        unsigned char scf[src_cnt];
        dma_addr_t pq[2];
 
@@ -1324,8 +1148,7 @@ ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src,
        flags |= DMA_PREP_PQ_DISABLE_Q;
        pq[1] = pq[0]; /* specify valid address for disabled result */
 
-
-       return (src_cnt > 8) && (dma->max_pq > 8) ?
+       return src_cnt_flags(src_cnt, flags) > 8 ?
                __ioat3_prep_pq16_lock(chan, result, pq, &src[1], src_cnt - 1,
                                       scf, len, flags) :
                __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1,
@@ -1444,9 +1267,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
                                           DMA_TO_DEVICE);
        tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs,
                                      IOAT_NUM_SRC_TEST, PAGE_SIZE,
-                                     DMA_PREP_INTERRUPT |
-                                     DMA_COMPL_SKIP_SRC_UNMAP |
-                                     DMA_COMPL_SKIP_DEST_UNMAP);
+                                     DMA_PREP_INTERRUPT);
 
        if (!tx) {
                dev_err(dev, "Self-test xor prep failed\n");
@@ -1468,7 +1289,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
 
        tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-       if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
+       if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_COMPLETE) {
                dev_err(dev, "Self-test xor timed out\n");
                err = -ENODEV;
                goto dma_unmap;
@@ -1507,9 +1328,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
                                           DMA_TO_DEVICE);
        tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
                                          IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
-                                         &xor_val_result, DMA_PREP_INTERRUPT |
-                                         DMA_COMPL_SKIP_SRC_UNMAP |
-                                         DMA_COMPL_SKIP_DEST_UNMAP);
+                                         &xor_val_result, DMA_PREP_INTERRUPT);
        if (!tx) {
                dev_err(dev, "Self-test zero prep failed\n");
                err = -ENODEV;
@@ -1530,7 +1349,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
 
        tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-       if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
+       if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_COMPLETE) {
                dev_err(dev, "Self-test validate timed out\n");
                err = -ENODEV;
                goto dma_unmap;
@@ -1545,6 +1364,8 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
                goto free_resources;
        }
 
+       memset(page_address(dest), 0, PAGE_SIZE);
+
        /* test for non-zero parity sum */
        op = IOAT_OP_XOR_VAL;
 
@@ -1554,9 +1375,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
                                           DMA_TO_DEVICE);
        tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs,
                                          IOAT_NUM_SRC_TEST + 1, PAGE_SIZE,
-                                         &xor_val_result, DMA_PREP_INTERRUPT |
-                                         DMA_COMPL_SKIP_SRC_UNMAP |
-                                         DMA_COMPL_SKIP_DEST_UNMAP);
+                                         &xor_val_result, DMA_PREP_INTERRUPT);
        if (!tx) {
                dev_err(dev, "Self-test 2nd zero prep failed\n");
                err = -ENODEV;
@@ -1577,7 +1396,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
 
        tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
 
-       if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
+       if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_COMPLETE) {
                dev_err(dev, "Self-test 2nd validate timed out\n");
                err = -ENODEV;
                goto dma_unmap;
@@ -1630,52 +1449,36 @@ static int ioat3_dma_self_test(struct ioatdma_device *device)
 
 static int ioat3_irq_reinit(struct ioatdma_device *device)
 {
-       int msixcnt = device->common.chancnt;
        struct pci_dev *pdev = device->pdev;
-       int i;
-       struct msix_entry *msix;
-       struct ioat_chan_common *chan;
-       int err = 0;
+       int irq = pdev->irq, i;
+
+       if (!is_bwd_ioat(pdev))
+               return 0;
 
        switch (device->irq_mode) {
        case IOAT_MSIX:
+               for (i = 0; i < device->common.chancnt; i++) {
+                       struct msix_entry *msix = &device->msix_entries[i];
+                       struct ioat_chan_common *chan;
 
-               for (i = 0; i < msixcnt; i++) {
-                       msix = &device->msix_entries[i];
                        chan = ioat_chan_by_index(device, i);
                        devm_free_irq(&pdev->dev, msix->vector, chan);
                }
 
                pci_disable_msix(pdev);
                break;
-
-       case IOAT_MSIX_SINGLE:
-               msix = &device->msix_entries[0];
-               chan = ioat_chan_by_index(device, 0);
-               devm_free_irq(&pdev->dev, msix->vector, chan);
-               pci_disable_msix(pdev);
-               break;
-
        case IOAT_MSI:
-               chan = ioat_chan_by_index(device, 0);
-               devm_free_irq(&pdev->dev, pdev->irq, chan);
                pci_disable_msi(pdev);
-               break;
-
+               /* fall through */
        case IOAT_INTX:
-               chan = ioat_chan_by_index(device, 0);
-               devm_free_irq(&pdev->dev, pdev->irq, chan);
+               devm_free_irq(&pdev->dev, irq, device);
                break;
-
        default:
                return 0;
        }
-
        device->irq_mode = IOAT_NOIRQ;
 
-       err = ioat_dma_setup_interrupts(device);
-
-       return err;
+       return ioat_dma_setup_interrupts(device);
 }
 
 static int ioat3_reset_hw(struct ioat_chan_common *chan)
@@ -1718,14 +1521,12 @@ static int ioat3_reset_hw(struct ioat_chan_common *chan)
        }
 
        err = ioat2_reset_sync(chan, msecs_to_jiffies(200));
-       if (err) {
-               dev_err(&pdev->dev, "Failed to reset!\n");
-               return err;
-       }
-
-       if (device->irq_mode != IOAT_NOIRQ && is_bwd_ioat(pdev))
+       if (!err)
                err = ioat3_irq_reinit(device);
 
+       if (err)
+               dev_err(&pdev->dev, "Failed to reset: %d\n", err);
+
        return err;
 }
 
@@ -1835,21 +1636,15 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca)
                char pool_name[14];
                int i;
 
-               /* allocate sw descriptor pool for SED */
-               device->sed_pool = kmem_cache_create("ioat_sed",
-                               sizeof(struct ioat_sed_ent), 0, 0, NULL);
-               if (!device->sed_pool)
-                       return -ENOMEM;
-
                for (i = 0; i < MAX_SED_POOLS; i++) {
                        snprintf(pool_name, 14, "ioat_hw%d_sed", i);
 
                        /* allocate SED DMA pool */
-                       device->sed_hw_pool[i] = dma_pool_create(pool_name,
+                       device->sed_hw_pool[i] = dmam_pool_create(pool_name,
                                        &pdev->dev,
                                        SED_SIZE * (i + 1), 64, 0);
                        if (!device->sed_hw_pool[i])
-                               goto sed_pool_cleanup;
+                               return -ENOMEM;
 
                }
        }
@@ -1875,28 +1670,4 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca)
                device->dca = ioat3_dca_init(pdev, device->reg_base);
 
        return 0;
-
-sed_pool_cleanup:
-       if (device->sed_pool) {
-               int i;
-               kmem_cache_destroy(device->sed_pool);
-
-               for (i = 0; i < MAX_SED_POOLS; i++)
-                       if (device->sed_hw_pool[i])
-                               dma_pool_destroy(device->sed_hw_pool[i]);
-       }
-
-       return -ENOMEM;
-}
-
-void ioat3_dma_remove(struct ioatdma_device *device)
-{
-       if (device->sed_pool) {
-               int i;
-               kmem_cache_destroy(device->sed_pool);
-
-               for (i = 0; i < MAX_SED_POOLS; i++)
-                       if (device->sed_hw_pool[i])
-                               dma_pool_destroy(device->sed_hw_pool[i]);
-       }
 }
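
The src_cnt_flags() helper introduced above fixes the 8-vs-16-source
descriptor choice for continued RAID6 operations: when an operation continues
a previous one, the engine re-reads earlier P/Q results as implicit extra
sources, so the raw source count understates what the hardware must address.
A worked reading, with the flag semantics as defined by dmaf_continue() and
dmaf_p_disabled_continue():

	src_cnt_flags(6, 0);			/* -> 6: plain operation */
	src_cnt_flags(6, DMA_PREP_CONTINUE |
			 DMA_PREP_PQ_DISABLE_P);/* -> 7: only Q re-read */
	src_cnt_flags(6, DMA_PREP_CONTINUE);	/* -> 9: full P+Q continue
						 * adds 3 slots and forces
						 * the 16-source path */
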
index 2c8d560..1d051cd 100644 (file)
@@ -123,6 +123,7 @@ module_param(ioat_dca_enabled, int, 0644);
 MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)");
 
 struct kmem_cache *ioat2_cache;
+struct kmem_cache *ioat3_sed_cache;
 
 #define DRV_NAME "ioatdma"
 
@@ -207,9 +208,6 @@ static void ioat_remove(struct pci_dev *pdev)
        if (!device)
                return;
 
-       if (device->version >= IOAT_VER_3_0)
-               ioat3_dma_remove(device);
-
        dev_err(&pdev->dev, "Removing dma and dca services\n");
        if (device->dca) {
                unregister_dca_provider(device->dca, &pdev->dev);
@@ -221,7 +219,7 @@ static void ioat_remove(struct pci_dev *pdev)
 
 static int __init ioat_init_module(void)
 {
-       int err;
+       int err = -ENOMEM;
 
        pr_info("%s: Intel(R) QuickData Technology Driver %s\n",
                DRV_NAME, IOAT_DMA_VERSION);
@@ -231,9 +229,21 @@ static int __init ioat_init_module(void)
        if (!ioat2_cache)
                return -ENOMEM;
 
+       ioat3_sed_cache = KMEM_CACHE(ioat_sed_ent, 0);
+       if (!ioat3_sed_cache)
+               goto err_ioat2_cache;
+
        err = pci_register_driver(&ioat_pci_driver);
        if (err)
-               kmem_cache_destroy(ioat2_cache);
+               goto err_ioat3_cache;
+
+       return 0;
+
+ err_ioat3_cache:
+       kmem_cache_destroy(ioat3_sed_cache);
+
+ err_ioat2_cache:
+       kmem_cache_destroy(ioat2_cache);
 
        return err;
 }
index dd8b44a..c56137b 100644 (file)
@@ -61,80 +61,6 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
        }
 }
 
-static void
-iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
-{
-       struct dma_async_tx_descriptor *tx = &desc->async_tx;
-       struct iop_adma_desc_slot *unmap = desc->group_head;
-       struct device *dev = &iop_chan->device->pdev->dev;
-       u32 len = unmap->unmap_len;
-       enum dma_ctrl_flags flags = tx->flags;
-       u32 src_cnt;
-       dma_addr_t addr;
-       dma_addr_t dest;
-
-       src_cnt = unmap->unmap_src_cnt;
-       dest = iop_desc_get_dest_addr(unmap, iop_chan);
-       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               enum dma_data_direction dir;
-
-               if (src_cnt > 1) /* is xor? */
-                       dir = DMA_BIDIRECTIONAL;
-               else
-                       dir = DMA_FROM_DEVICE;
-
-               dma_unmap_page(dev, dest, len, dir);
-       }
-
-       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               while (src_cnt--) {
-                       addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt);
-                       if (addr == dest)
-                               continue;
-                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
-               }
-       }
-       desc->group_head = NULL;
-}
-
-static void
-iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc)
-{
-       struct dma_async_tx_descriptor *tx = &desc->async_tx;
-       struct iop_adma_desc_slot *unmap = desc->group_head;
-       struct device *dev = &iop_chan->device->pdev->dev;
-       u32 len = unmap->unmap_len;
-       enum dma_ctrl_flags flags = tx->flags;
-       u32 src_cnt = unmap->unmap_src_cnt;
-       dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan);
-       dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan);
-       int i;
-
-       if (tx->flags & DMA_PREP_CONTINUE)
-               src_cnt -= 3;
-
-       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) {
-               dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL);
-               dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL);
-       }
-
-       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               dma_addr_t addr;
-
-               for (i = 0; i < src_cnt; i++) {
-                       addr = iop_desc_get_src_addr(unmap, iop_chan, i);
-                       dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
-               }
-               if (desc->pq_check_result) {
-                       dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE);
-                       dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE);
-               }
-       }
-
-       desc->group_head = NULL;
-}
-
-
 static dma_cookie_t
 iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
        struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
@@ -152,15 +78,9 @@ iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
                if (tx->callback)
                        tx->callback(tx->callback_param);
 
-               /* unmap dma addresses
-                * (unmap_single vs unmap_page?)
-                */
-               if (desc->group_head && desc->unmap_len) {
-                       if (iop_desc_is_pq(desc))
-                               iop_desc_unmap_pq(iop_chan, desc);
-                       else
-                               iop_desc_unmap(iop_chan, desc);
-               }
+               dma_descriptor_unmap(tx);
+               if (desc->group_head)
+                       desc->group_head = NULL;
        }
 
        /* run dependent operations */
@@ -591,7 +511,6 @@ iop_adma_prep_dma_interrupt(struct dma_chan *chan, unsigned long flags)
        if (sw_desc) {
                grp_start = sw_desc->group_head;
                iop_desc_init_interrupt(grp_start, iop_chan);
-               grp_start->unmap_len = 0;
                sw_desc->async_tx.flags = flags;
        }
        spin_unlock_bh(&iop_chan->lock);
@@ -623,8 +542,6 @@ iop_adma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dma_dest,
                iop_desc_set_byte_count(grp_start, iop_chan, len);
                iop_desc_set_dest_addr(grp_start, iop_chan, dma_dest);
                iop_desc_set_memcpy_src_addr(grp_start, dma_src);
-               sw_desc->unmap_src_cnt = 1;
-               sw_desc->unmap_len = len;
                sw_desc->async_tx.flags = flags;
        }
        spin_unlock_bh(&iop_chan->lock);
@@ -657,8 +574,6 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest,
                iop_desc_init_xor(grp_start, src_cnt, flags);
                iop_desc_set_byte_count(grp_start, iop_chan, len);
                iop_desc_set_dest_addr(grp_start, iop_chan, dma_dest);
-               sw_desc->unmap_src_cnt = src_cnt;
-               sw_desc->unmap_len = len;
                sw_desc->async_tx.flags = flags;
                while (src_cnt--)
                        iop_desc_set_xor_src_addr(grp_start, src_cnt,
@@ -694,8 +609,6 @@ iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src,
                grp_start->xor_check_result = result;
                pr_debug("\t%s: grp_start->xor_check_result: %p\n",
                        __func__, grp_start->xor_check_result);
-               sw_desc->unmap_src_cnt = src_cnt;
-               sw_desc->unmap_len = len;
                sw_desc->async_tx.flags = flags;
                while (src_cnt--)
                        iop_desc_set_zero_sum_src_addr(grp_start, src_cnt,
@@ -748,8 +661,6 @@ iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src,
                        dst[0] = dst[1] & 0x7;
 
                iop_desc_set_pq_addr(g, dst);
-               sw_desc->unmap_src_cnt = src_cnt;
-               sw_desc->unmap_len = len;
                sw_desc->async_tx.flags = flags;
                for (i = 0; i < src_cnt; i++)
                        iop_desc_set_pq_src_addr(g, i, src[i], scf[i]);
@@ -804,8 +715,6 @@ iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
                g->pq_check_result = pqres;
                pr_debug("\t%s: g->pq_check_result: %p\n",
                        __func__, g->pq_check_result);
-               sw_desc->unmap_src_cnt = src_cnt+2;
-               sw_desc->unmap_len = len;
                sw_desc->async_tx.flags = flags;
                while (src_cnt--)
                        iop_desc_set_pq_zero_sum_src_addr(g, src_cnt,
@@ -864,7 +773,7 @@ static enum dma_status iop_adma_status(struct dma_chan *chan,
        int ret;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        iop_adma_slot_cleanup(iop_chan);
@@ -983,7 +892,7 @@ static int iop_adma_memcpy_self_test(struct iop_adma_device *device)
        msleep(1);
 
        if (iop_adma_status(dma_chan, cookie, NULL) !=
-                       DMA_SUCCESS) {
+                       DMA_COMPLETE) {
                dev_err(dma_chan->device->dev,
                        "Self-test copy timed out, disabling\n");
                err = -ENODEV;
@@ -1083,7 +992,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device)
        msleep(8);
 
        if (iop_adma_status(dma_chan, cookie, NULL) !=
-               DMA_SUCCESS) {
+               DMA_COMPLETE) {
                dev_err(dma_chan->device->dev,
                        "Self-test xor timed out, disabling\n");
                err = -ENODEV;
@@ -1129,7 +1038,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device)
        iop_adma_issue_pending(dma_chan);
        msleep(8);
 
-       if (iop_adma_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
+       if (iop_adma_status(dma_chan, cookie, NULL) != DMA_COMPLETE) {
                dev_err(dma_chan->device->dev,
                        "Self-test zero sum timed out, disabling\n");
                err = -ENODEV;
@@ -1158,7 +1067,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device)
        iop_adma_issue_pending(dma_chan);
        msleep(8);
 
-       if (iop_adma_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
+       if (iop_adma_status(dma_chan, cookie, NULL) != DMA_COMPLETE) {
                dev_err(dma_chan->device->dev,
                        "Self-test non-zero sum timed out, disabling\n");
                err = -ENODEV;
@@ -1254,7 +1163,7 @@ iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device)
        msleep(8);
 
        if (iop_adma_status(dma_chan, cookie, NULL) !=
-               DMA_SUCCESS) {
+               DMA_COMPLETE) {
                dev_err(dev, "Self-test pq timed out, disabling\n");
                err = -ENODEV;
                goto free_resources;
@@ -1291,7 +1200,7 @@ iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device)
        msleep(8);
 
        if (iop_adma_status(dma_chan, cookie, NULL) !=
-               DMA_SUCCESS) {
+               DMA_COMPLETE) {
                dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n");
                err = -ENODEV;
                goto free_resources;
@@ -1323,7 +1232,7 @@ iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device)
        msleep(8);
 
        if (iop_adma_status(dma_chan, cookie, NULL) !=
-               DMA_SUCCESS) {
+               DMA_COMPLETE) {
                dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n");
                err = -ENODEV;
                goto free_resources;
index cb9c0bc..128ca14 100644 (file)
@@ -1232,8 +1232,10 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
        desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list);
        descnew = desc;
 
-       dev_dbg(dev, "IDMAC irq %d, dma 0x%08x, next dma 0x%08x, current %d, curbuf 0x%08x\n",
-               irq, sg_dma_address(*sg), sgnext ? sg_dma_address(sgnext) : 0, ichan->active_buffer, curbuf);
+       dev_dbg(dev, "IDMAC irq %d, dma %#llx, next dma %#llx, current %d, curbuf %#x\n",
+               irq, (u64)sg_dma_address(*sg),
+               sgnext ? (u64)sg_dma_address(sgnext) : 0,
+               ichan->active_buffer, curbuf);
 
        /* Find the descriptor of sgnext */
        sgnew = idmac_sg_next(ichan, &descnew, *sg);
index a2c330f..e260754 100644 (file)
@@ -344,7 +344,7 @@ static enum dma_status k3_dma_tx_status(struct dma_chan *chan,
        size_t bytes = 0;
 
        ret = dma_cookie_status(&c->vc.chan, cookie, state);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        spin_lock_irqsave(&c->vc.lock, flags);
@@ -693,7 +693,7 @@ static int k3_dma_probe(struct platform_device *op)
 
        irq = platform_get_irq(op, 0);
        ret = devm_request_irq(&op->dev, irq,
-                       k3_dma_int_handler, IRQF_DISABLED, DRIVER_NAME, d);
+                       k3_dma_int_handler, 0, DRIVER_NAME, d);
        if (ret)
                return ret;
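
The IRQF_DISABLED deletions here and in the two mmp drivers below are
mechanical: the flag has long been a no-op, since the core runs every handler
with interrupts disabled anyway, so passing 0 is behaviorally identical.
handler, name and ctx below are placeholders:

	/* before */
	ret = devm_request_irq(dev, irq, handler, IRQF_DISABLED, name, ctx);
	/* after -- same behavior */
	ret = devm_request_irq(dev, irq, handler, 0, name, ctx);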
 
index ff8d782..dcb1e05 100644 (file)
@@ -798,8 +798,7 @@ static void dma_do_tasklet(unsigned long data)
                 * move the descriptors to a temporary list so we can drop
                 * the lock during the entire cleanup operation
                 */
-               list_del(&desc->node);
-               list_add(&desc->node, &chain_cleanup);
+               list_move(&desc->node, &chain_cleanup);
 
                /*
                 * Look for the first list entry which has the ENDIRQEN flag
@@ -863,7 +862,7 @@ static int mmp_pdma_chan_init(struct mmp_pdma_device *pdev,
 
        if (irq) {
                ret = devm_request_irq(pdev->dev, irq,
-                       mmp_pdma_chan_handler, IRQF_DISABLED, "pdma", phy);
+                       mmp_pdma_chan_handler, 0, "pdma", phy);
                if (ret) {
                        dev_err(pdev->dev, "channel request irq fail!\n");
                        return ret;
@@ -970,7 +969,7 @@ static int mmp_pdma_probe(struct platform_device *op)
                /* all chan share one irq, demux inside */
                irq = platform_get_irq(op, 0);
                ret = devm_request_irq(pdev->dev, irq,
-                       mmp_pdma_int_handler, IRQF_DISABLED, "pdma", pdev);
+                       mmp_pdma_int_handler, 0, "pdma", pdev);
                if (ret)
                        return ret;
        }
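
The tasklet hunk above also swaps an open-coded delete/re-add pair for the
list_move() helper; the two forms are equivalent:

	list_del(&desc->node);
	list_add(&desc->node, &chain_cleanup);
	/* collapses to */
	list_move(&desc->node, &chain_cleanup);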
index d3b6358..3ddacc1 100644 (file)
 #define TDCR_BURSTSZ_16B       (0x3 << 6)
 #define TDCR_BURSTSZ_32B       (0x6 << 6)
 #define TDCR_BURSTSZ_64B       (0x7 << 6)
+#define TDCR_BURSTSZ_SQU_1B    (0x5 << 6)
+#define TDCR_BURSTSZ_SQU_2B    (0x6 << 6)
+#define TDCR_BURSTSZ_SQU_4B    (0x0 << 6)
+#define TDCR_BURSTSZ_SQU_8B    (0x1 << 6)
+#define TDCR_BURSTSZ_SQU_16B   (0x3 << 6)
 #define TDCR_BURSTSZ_SQU_32B   (0x7 << 6)
 #define TDCR_BURSTSZ_128B      (0x5 << 6)
 #define TDCR_DSTDIR_MSK                (0x3 << 4)      /* Dst Direction */
@@ -158,7 +163,7 @@ static void mmp_tdma_disable_chan(struct mmp_tdma_chan *tdmac)
        /* disable irq */
        writel(0, tdmac->reg_base + TDIMR);
 
-       tdmac->status = DMA_SUCCESS;
+       tdmac->status = DMA_COMPLETE;
 }
 
 static void mmp_tdma_resume_chan(struct mmp_tdma_chan *tdmac)
@@ -228,8 +233,31 @@ static int mmp_tdma_config_chan(struct mmp_tdma_chan *tdmac)
                        return -EINVAL;
                }
        } else if (tdmac->type == PXA910_SQU) {
-               tdcr |= TDCR_BURSTSZ_SQU_32B;
                tdcr |= TDCR_SSPMOD;
+
+               switch (tdmac->burst_sz) {
+               case 1:
+                       tdcr |= TDCR_BURSTSZ_SQU_1B;
+                       break;
+               case 2:
+                       tdcr |= TDCR_BURSTSZ_SQU_2B;
+                       break;
+               case 4:
+                       tdcr |= TDCR_BURSTSZ_SQU_4B;
+                       break;
+               case 8:
+                       tdcr |= TDCR_BURSTSZ_SQU_8B;
+                       break;
+               case 16:
+                       tdcr |= TDCR_BURSTSZ_SQU_16B;
+                       break;
+               case 32:
+                       tdcr |= TDCR_BURSTSZ_SQU_32B;
+                       break;
+               default:
+                       dev_err(tdmac->dev, "mmp_tdma: unknown burst size.\n");
+                       return -EINVAL;
+               }
        }
 
        writel(tdcr, tdmac->reg_base + TDCR);
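
The new switch picks a square-transfer burst-size encoding from the TDCR_BURSTSZ_SQU_* values added above; note the encodings are not monotonic in the byte count (4 bytes maps to field value 0), which is one reason an explicit switch reads better than arithmetic. The same mapping as a hypothetical lookup table, purely for illustration:

        /* Hypothetical table form of the switch above; values mirror
         * the TDCR_BURSTSZ_SQU_* defines from this patch. Index 4
         * legitimately maps to 0, so a table could not use 0 as a
         * "not supported" sentinel -- the switch avoids that ambiguity. */
        static const unsigned int squ_burst_tdcr[33] = {
                [1]  = 0x5 << 6,        /* TDCR_BURSTSZ_SQU_1B  */
                [2]  = 0x6 << 6,        /* TDCR_BURSTSZ_SQU_2B  */
                [4]  = 0x0 << 6,        /* TDCR_BURSTSZ_SQU_4B  */
                [8]  = 0x1 << 6,        /* TDCR_BURSTSZ_SQU_8B  */
                [16] = 0x3 << 6,        /* TDCR_BURSTSZ_SQU_16B */
                [32] = 0x7 << 6,        /* TDCR_BURSTSZ_SQU_32B */
        };
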
@@ -324,7 +352,7 @@ static int mmp_tdma_alloc_chan_resources(struct dma_chan *chan)
 
        if (tdmac->irq) {
                ret = devm_request_irq(tdmac->dev, tdmac->irq,
-                       mmp_tdma_chan_handler, IRQF_DISABLED, "tdma", tdmac);
+                       mmp_tdma_chan_handler, 0, "tdma", tdmac);
                if (ret)
                        return ret;
        }
@@ -365,7 +393,7 @@ static struct dma_async_tx_descriptor *mmp_tdma_prep_dma_cyclic(
        int num_periods = buf_len / period_len;
        int i = 0, buf = 0;
 
-       if (tdmac->status != DMA_SUCCESS)
+       if (tdmac->status != DMA_COMPLETE)
                return NULL;
 
        if (period_len > TDMA_MAX_XFER_BYTES) {
@@ -499,7 +527,7 @@ static int mmp_tdma_chan_init(struct mmp_tdma_device *tdev,
        tdmac->idx         = idx;
        tdmac->type        = type;
        tdmac->reg_base    = (unsigned long)tdev->base + idx * 4;
-       tdmac->status = DMA_SUCCESS;
+       tdmac->status = DMA_COMPLETE;
        tdev->tdmac[tdmac->idx] = tdmac;
        tasklet_init(&tdmac->tasklet, dma_do_tasklet, (unsigned long)tdmac);
 
@@ -554,7 +582,7 @@ static int mmp_tdma_probe(struct platform_device *pdev)
        if (irq_num != chan_num) {
                irq = platform_get_irq(pdev, 0);
                ret = devm_request_irq(&pdev->dev, irq,
-                       mmp_tdma_int_handler, IRQF_DISABLED, "tdma", tdev);
+                       mmp_tdma_int_handler, 0, "tdma", tdev);
                if (ret)
                        return ret;
        }
index 536dcb8..7807f0e 100644
@@ -60,14 +60,6 @@ static u32 mv_desc_get_dest_addr(struct mv_xor_desc_slot *desc)
        return hw_desc->phy_dest_addr;
 }
 
-static u32 mv_desc_get_src_addr(struct mv_xor_desc_slot *desc,
-                               int src_idx)
-{
-       struct mv_xor_desc *hw_desc = desc->hw_desc;
-       return hw_desc->phy_src_addr[mv_phy_src_idx(src_idx)];
-}
-
-
 static void mv_desc_set_byte_count(struct mv_xor_desc_slot *desc,
                                   u32 byte_count)
 {
@@ -278,42 +270,9 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc,
                        desc->async_tx.callback(
                                desc->async_tx.callback_param);
 
-               /* unmap dma addresses
-                * (unmap_single vs unmap_page?)
-                */
-               if (desc->group_head && desc->unmap_len) {
-                       struct mv_xor_desc_slot *unmap = desc->group_head;
-                       struct device *dev = mv_chan_to_devp(mv_chan);
-                       u32 len = unmap->unmap_len;
-                       enum dma_ctrl_flags flags = desc->async_tx.flags;
-                       u32 src_cnt;
-                       dma_addr_t addr;
-                       dma_addr_t dest;
-
-                       src_cnt = unmap->unmap_src_cnt;
-                       dest = mv_desc_get_dest_addr(unmap);
-                       if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                               enum dma_data_direction dir;
-
-                               if (src_cnt > 1) /* is xor ? */
-                                       dir = DMA_BIDIRECTIONAL;
-                               else
-                                       dir = DMA_FROM_DEVICE;
-                               dma_unmap_page(dev, dest, len, dir);
-                       }
-
-                       if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                               while (src_cnt--) {
-                                       addr = mv_desc_get_src_addr(unmap,
-                                                                   src_cnt);
-                                       if (addr == dest)
-                                               continue;
-                                       dma_unmap_page(dev, addr, len,
-                                                      DMA_TO_DEVICE);
-                               }
-                       }
+               dma_descriptor_unmap(&desc->async_tx);
+               if (desc->group_head)
                        desc->group_head = NULL;
-               }
        }
 
        /* run dependent operations */
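
The large deletion above (and the matching ones in the ppc440spe, timb_dma and txx9dmac hunks later in this series) moves DMA-API unmapping out of each driver and into the dmaengine core; completed descriptors now just call dma_descriptor_unmap(). Roughly the shape of the new helper, hedged; see include/linux/dmaengine.h for the authoritative version:

        /* Sketch of the core helper: drop the descriptor's reference on
         * its unmap data. The core, not the driver, performs the actual
         * dma_unmap_page()/dma_unmap_single() calls when the last
         * reference goes away. */
        static inline void
        dma_descriptor_unmap(struct dma_async_tx_descriptor *tx)
        {
                if (tx->unmap) {
                        dmaengine_unmap_put(tx->unmap);
                        tx->unmap = NULL;
                }
        }
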
@@ -749,7 +708,7 @@ static enum dma_status mv_xor_status(struct dma_chan *chan,
        enum dma_status ret;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS) {
+       if (ret == DMA_COMPLETE) {
                mv_xor_clean_completed_slots(mv_chan);
                return ret;
        }
@@ -874,7 +833,7 @@ static int mv_xor_memcpy_self_test(struct mv_xor_chan *mv_chan)
        msleep(1);
 
        if (mv_xor_status(dma_chan, cookie, NULL) !=
-           DMA_SUCCESS) {
+           DMA_COMPLETE) {
                dev_err(dma_chan->device->dev,
                        "Self-test copy timed out, disabling\n");
                err = -ENODEV;
@@ -968,7 +927,7 @@ mv_xor_xor_self_test(struct mv_xor_chan *mv_chan)
        msleep(8);
 
        if (mv_xor_status(dma_chan, cookie, NULL) !=
-           DMA_SUCCESS) {
+           DMA_COMPLETE) {
                dev_err(dma_chan->device->dev,
                        "Self-test xor timed out, disabling\n");
                err = -ENODEV;
@@ -1076,10 +1035,7 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
        }
 
        mv_chan->mmr_base = xordev->xor_base;
-       if (!mv_chan->mmr_base) {
-               ret = -ENOMEM;
-               goto err_free_dma;
-       }
+       mv_chan->mmr_high_base = xordev->xor_high_base;
        tasklet_init(&mv_chan->irq_tasklet, mv_xor_tasklet, (unsigned long)
                     mv_chan);
 
@@ -1138,7 +1094,7 @@ static void
 mv_xor_conf_mbus_windows(struct mv_xor_device *xordev,
                         const struct mbus_dram_target_info *dram)
 {
-       void __iomem *base = xordev->xor_base;
+       void __iomem *base = xordev->xor_high_base;
        u32 win_enable = 0;
        int i;
 
index 06b067f..d074922 100644
 #define XOR_OPERATION_MODE_MEMCPY      2
 #define XOR_DESCRIPTOR_SWAP            BIT(14)
 
-#define XOR_CURR_DESC(chan)    (chan->mmr_base + 0x210 + (chan->idx * 4))
-#define XOR_NEXT_DESC(chan)    (chan->mmr_base + 0x200 + (chan->idx * 4))
-#define XOR_BYTE_COUNT(chan)   (chan->mmr_base + 0x220 + (chan->idx * 4))
-#define XOR_DEST_POINTER(chan) (chan->mmr_base + 0x2B0 + (chan->idx * 4))
-#define XOR_BLOCK_SIZE(chan)   (chan->mmr_base + 0x2C0 + (chan->idx * 4))
-#define XOR_INIT_VALUE_LOW(chan)       (chan->mmr_base + 0x2E0)
-#define XOR_INIT_VALUE_HIGH(chan)      (chan->mmr_base + 0x2E4)
+#define XOR_CURR_DESC(chan)    (chan->mmr_high_base + 0x10 + (chan->idx * 4))
+#define XOR_NEXT_DESC(chan)    (chan->mmr_high_base + 0x00 + (chan->idx * 4))
+#define XOR_BYTE_COUNT(chan)   (chan->mmr_high_base + 0x20 + (chan->idx * 4))
+#define XOR_DEST_POINTER(chan) (chan->mmr_high_base + 0xB0 + (chan->idx * 4))
+#define XOR_BLOCK_SIZE(chan)   (chan->mmr_high_base + 0xC0 + (chan->idx * 4))
+#define XOR_INIT_VALUE_LOW(chan)       (chan->mmr_high_base + 0xE0)
+#define XOR_INIT_VALUE_HIGH(chan)      (chan->mmr_high_base + 0xE4)
 
 #define XOR_CONFIG(chan)       (chan->mmr_base + 0x10 + (chan->idx * 4))
 #define XOR_ACTIVATION(chan)   (chan->mmr_base + 0x20 + (chan->idx * 4))
 #define XOR_ERROR_ADDR(chan)   (chan->mmr_base + 0x60)
 #define XOR_INTR_MASK_VALUE    0x3F5
 
-#define WINDOW_BASE(w)         (0x250 + ((w) << 2))
-#define WINDOW_SIZE(w)         (0x270 + ((w) << 2))
-#define WINDOW_REMAP_HIGH(w)   (0x290 + ((w) << 2))
-#define WINDOW_BAR_ENABLE(chan)        (0x240 + ((chan) << 2))
-#define WINDOW_OVERRIDE_CTRL(chan)     (0x2A0 + ((chan) << 2))
+#define WINDOW_BASE(w)         (0x50 + ((w) << 2))
+#define WINDOW_SIZE(w)         (0x70 + ((w) << 2))
+#define WINDOW_REMAP_HIGH(w)   (0x90 + ((w) << 2))
+#define WINDOW_BAR_ENABLE(chan)        (0x40 + ((chan) << 2))
+#define WINDOW_OVERRIDE_CTRL(chan)     (0xA0 + ((chan) << 2))
 
 struct mv_xor_device {
        void __iomem         *xor_base;
@@ -82,6 +82,7 @@ struct mv_xor_chan {
        int                     pending;
        spinlock_t              lock; /* protects the descriptor slot pool */
        void __iomem            *mmr_base;
+       void __iomem            *mmr_high_base;
        unsigned int            idx;
        int                     irq;
        enum dma_transaction_type       current_type;
index ccd13df..ead4913 100644
@@ -27,6 +27,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_dma.h>
+#include <linux/list.h>
 
 #include <asm/irq.h>
 
@@ -57,6 +58,9 @@
        (((dma_is_apbh(d) && apbh_is_old(d)) ? 0x050 : 0x110) + (n) * 0x70)
 #define HW_APBHX_CHn_SEMA(d, n) \
        (((dma_is_apbh(d) && apbh_is_old(d)) ? 0x080 : 0x140) + (n) * 0x70)
+#define HW_APBHX_CHn_BAR(d, n) \
+       (((dma_is_apbh(d) && apbh_is_old(d)) ? 0x070 : 0x130) + (n) * 0x70)
+#define HW_APBX_CHn_DEBUG1(d, n) (0x150 + (n) * 0x70)
 
 /*
  * ccw bits definitions
@@ -115,7 +119,9 @@ struct mxs_dma_chan {
        int                             desc_count;
        enum dma_status                 status;
        unsigned int                    flags;
+       bool                            reset;
 #define MXS_DMA_SG_LOOP                        (1 << 0)
+#define MXS_DMA_USE_SEMAPHORE          (1 << 1)
 };
 
 #define MXS_DMA_CHANNELS               16
@@ -201,12 +207,47 @@ static void mxs_dma_reset_chan(struct mxs_dma_chan *mxs_chan)
        struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
        int chan_id = mxs_chan->chan.chan_id;
 
-       if (dma_is_apbh(mxs_dma) && apbh_is_old(mxs_dma))
+       /*
+        * mxs dma channel resets can cause a channel stall. To recover from a
+        * channel stall, we have to reset the whole DMA engine. To avoid this,
+        * we use cyclic DMA with semaphores, which are incremented in
+        * mxs_dma_int_handler. To reset the channel, we can simply stop writing
+        * into the semaphore counter.
+        */
+       if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE &&
+                       mxs_chan->flags & MXS_DMA_SG_LOOP) {
+               mxs_chan->reset = true;
+       } else if (dma_is_apbh(mxs_dma) && apbh_is_old(mxs_dma)) {
                writel(1 << (chan_id + BP_APBH_CTRL0_RESET_CHANNEL),
                        mxs_dma->base + HW_APBHX_CTRL0 + STMP_OFFSET_REG_SET);
-       else
+       } else {
+               unsigned long elapsed = 0;
+               const unsigned long max_wait = 50000; /* 50ms */
+               void __iomem *reg_dbg1 = mxs_dma->base +
+                               HW_APBX_CHn_DEBUG1(mxs_dma, chan_id);
+
+               /*
+                * On i.MX28 APBX, the DMA channel can stop working if we reset
+                * the channel while it is in READ_FLUSH (0x08) state.
+                * We wait here until the channel leaves that state, then
+                * trigger the reset. The wait is bounded at 50ms, so a
+                * stuck channel cannot hang the kernel.
+                */
+               while ((readl(reg_dbg1) & 0xf) == 0x8 && elapsed < max_wait) {
+                       udelay(100);
+                       elapsed += 100;
+               }
+
+               if (elapsed >= max_wait)
+                       dev_err(&mxs_chan->mxs_dma->pdev->dev,
+                                       "Failed waiting for the DMA channel %d to leave state READ_FLUSH, trying to reset channel in READ_FLUSH state now\n",
+                                       chan_id);
+
                writel(1 << (chan_id + BP_APBHX_CHANNEL_CTRL_RESET_CHANNEL),
                        mxs_dma->base + HW_APBHX_CHANNEL_CTRL + STMP_OFFSET_REG_SET);
+       }
+
+       mxs_chan->status = DMA_COMPLETE;
 }
 
 static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan)
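
The READ_FLUSH wait above is a plain bounded busy-poll: sample the channel state nibble every 100us and give up after 50ms. The same pattern in isolation, as a sketch (register layout per the HW_APBX_CHn_DEBUG1 definition added earlier in this patch; the helper name is hypothetical):

        #include <linux/io.h>
        #include <linux/delay.h>
        #include <linux/types.h>

        /* Returns true once the low nibble of the debug register leaves
         * the READ_FLUSH (0x8) state, false if 50ms elapse first. */
        static bool wait_leave_read_flush(void __iomem *reg_dbg1)
        {
                unsigned long elapsed;

                for (elapsed = 0; elapsed < 50000; elapsed += 100) {
                        if ((readl(reg_dbg1) & 0xf) != 0x8)
                                return true;
                        udelay(100);
                }
                return false;
        }
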
@@ -219,12 +260,21 @@ static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan)
                mxs_dma->base + HW_APBHX_CHn_NXTCMDAR(mxs_dma, chan_id));
 
        /* write 1 to SEMA to kick off the channel */
-       writel(1, mxs_dma->base + HW_APBHX_CHn_SEMA(mxs_dma, chan_id));
+       if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE &&
+                       mxs_chan->flags & MXS_DMA_SG_LOOP) {
+               /* A cyclic DMA consists of at least 2 segments, so initialize
+                * the semaphore with 2 so we have enough time to add 1 to the
+                * semaphore if we need to */
+               writel(2, mxs_dma->base + HW_APBHX_CHn_SEMA(mxs_dma, chan_id));
+       } else {
+               writel(1, mxs_dma->base + HW_APBHX_CHn_SEMA(mxs_dma, chan_id));
+       }
+       mxs_chan->reset = false;
 }
 
 static void mxs_dma_disable_chan(struct mxs_dma_chan *mxs_chan)
 {
-       mxs_chan->status = DMA_SUCCESS;
+       mxs_chan->status = DMA_COMPLETE;
 }
 
 static void mxs_dma_pause_chan(struct mxs_dma_chan *mxs_chan)
@@ -272,58 +322,88 @@ static void mxs_dma_tasklet(unsigned long data)
                mxs_chan->desc.callback(mxs_chan->desc.callback_param);
 }
 
+static int mxs_dma_irq_to_chan(struct mxs_dma_engine *mxs_dma, int irq)
+{
+       int i;
+
+       for (i = 0; i != mxs_dma->nr_channels; ++i)
+               if (mxs_dma->mxs_chans[i].chan_irq == irq)
+                       return i;
+
+       return -EINVAL;
+}
+
 static irqreturn_t mxs_dma_int_handler(int irq, void *dev_id)
 {
        struct mxs_dma_engine *mxs_dma = dev_id;
-       u32 stat1, stat2;
+       struct mxs_dma_chan *mxs_chan;
+       u32 completed;
+       u32 err;
+       int chan = mxs_dma_irq_to_chan(mxs_dma, irq);
+
+       if (chan < 0)
+               return IRQ_NONE;
 
        /* completion status */
-       stat1 = readl(mxs_dma->base + HW_APBHX_CTRL1);
-       stat1 &= MXS_DMA_CHANNELS_MASK;
-       writel(stat1, mxs_dma->base + HW_APBHX_CTRL1 + STMP_OFFSET_REG_CLR);
+       completed = readl(mxs_dma->base + HW_APBHX_CTRL1);
+       completed = (completed >> chan) & 0x1;
+
+       /* Clear interrupt */
+       writel((1 << chan),
+                       mxs_dma->base + HW_APBHX_CTRL1 + STMP_OFFSET_REG_CLR);
 
        /* error status */
-       stat2 = readl(mxs_dma->base + HW_APBHX_CTRL2);
-       writel(stat2, mxs_dma->base + HW_APBHX_CTRL2 + STMP_OFFSET_REG_CLR);
+       err = readl(mxs_dma->base + HW_APBHX_CTRL2);
+       err &= (1 << (MXS_DMA_CHANNELS + chan)) | (1 << chan);
+
+       /*
+        * error status bit is in the upper 16 bits, error irq bit in the lower
+        * 16 bits. We transform it into a simpler error code:
+        * err: 0x00 = no error, 0x01 = TERMINATION, 0x02 = BUS_ERROR
+        */
+       err = (err >> (MXS_DMA_CHANNELS + chan)) + (err >> chan);
+
+       /* Clear error irq */
+       writel((1 << chan),
+                       mxs_dma->base + HW_APBHX_CTRL2 + STMP_OFFSET_REG_CLR);
 
        /*
         * When both completion and error of termination bits set at the
         * same time, we do not take it as an error.  IOW, it only becomes
-        * an error we need to handle here in case of either it's (1) a bus
-        * error or (2) a termination error with no completion.
+        * an error we need to handle here if it is either a bus error or
+        * a termination error with no completion. 0x01 is the termination
+        * error code, so subtracting err & completed leaves the real errors.
         */
-       stat2 = ((stat2 >> MXS_DMA_CHANNELS) & stat2) | /* (1) */
-               (~(stat2 >> MXS_DMA_CHANNELS) & stat2 & ~stat1); /* (2) */
-
-       /* combine error and completion status for checking */
-       stat1 = (stat2 << MXS_DMA_CHANNELS) | stat1;
-       while (stat1) {
-               int channel = fls(stat1) - 1;
-               struct mxs_dma_chan *mxs_chan =
-                       &mxs_dma->mxs_chans[channel % MXS_DMA_CHANNELS];
-
-               if (channel >= MXS_DMA_CHANNELS) {
-                       dev_dbg(mxs_dma->dma_device.dev,
-                               "%s: error in channel %d\n", __func__,
-                               channel - MXS_DMA_CHANNELS);
-                       mxs_chan->status = DMA_ERROR;
-                       mxs_dma_reset_chan(mxs_chan);
-               } else {
-                       if (mxs_chan->flags & MXS_DMA_SG_LOOP)
-                               mxs_chan->status = DMA_IN_PROGRESS;
-                       else
-                               mxs_chan->status = DMA_SUCCESS;
-               }
+       err -= err & completed;
 
-               stat1 &= ~(1 << channel);
+       mxs_chan = &mxs_dma->mxs_chans[chan];
 
-               if (mxs_chan->status == DMA_SUCCESS)
-                       dma_cookie_complete(&mxs_chan->desc);
+       if (err) {
+               dev_dbg(mxs_dma->dma_device.dev,
+                       "%s: error in channel %d\n", __func__,
+                       chan);
+               mxs_chan->status = DMA_ERROR;
+               mxs_dma_reset_chan(mxs_chan);
+       } else if (mxs_chan->status != DMA_COMPLETE) {
+               if (mxs_chan->flags & MXS_DMA_SG_LOOP) {
+                       mxs_chan->status = DMA_IN_PROGRESS;
+                       if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE)
+                               writel(1, mxs_dma->base +
+                                       HW_APBHX_CHn_SEMA(mxs_dma, chan));
+               } else {
+                       mxs_chan->status = DMA_COMPLETE;
+               }
+       }
 
-               /* schedule tasklet on this channel */
-               tasklet_schedule(&mxs_chan->tasklet);
+       if (mxs_chan->status == DMA_COMPLETE) {
+               if (mxs_chan->reset)
+                       return IRQ_HANDLED;
+               dma_cookie_complete(&mxs_chan->desc);
        }
 
+       /* schedule tasklet on this channel */
+       tasklet_schedule(&mxs_chan->tasklet);
+
        return IRQ_HANDLED;
 }
 
@@ -523,6 +603,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
 
        mxs_chan->status = DMA_IN_PROGRESS;
        mxs_chan->flags |= MXS_DMA_SG_LOOP;
+       mxs_chan->flags |= MXS_DMA_USE_SEMAPHORE;
 
        if (num_periods > NUM_CCW) {
                dev_err(mxs_dma->dma_device.dev,
@@ -554,6 +635,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
                ccw->bits |= CCW_IRQ;
                ccw->bits |= CCW_HALT_ON_TERM;
                ccw->bits |= CCW_TERM_FLUSH;
+               ccw->bits |= CCW_DEC_SEM;
                ccw->bits |= BF_CCW(direction == DMA_DEV_TO_MEM ?
                                MXS_DMA_CMD_WRITE : MXS_DMA_CMD_READ, COMMAND);
 
@@ -599,8 +681,24 @@ static enum dma_status mxs_dma_tx_status(struct dma_chan *chan,
                        dma_cookie_t cookie, struct dma_tx_state *txstate)
 {
        struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan);
+       struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
+       u32 residue = 0;
+
+       if (mxs_chan->status == DMA_IN_PROGRESS &&
+                       mxs_chan->flags & MXS_DMA_SG_LOOP) {
+               struct mxs_dma_ccw *last_ccw;
+               u32 bar;
+
+               last_ccw = &mxs_chan->ccw[mxs_chan->desc_count - 1];
+               residue = last_ccw->xfer_bytes + last_ccw->bufaddr;
+
+               bar = readl(mxs_dma->base +
+                               HW_APBHX_CHn_BAR(mxs_dma, chan->chan_id));
+               residue -= bar;
+       }
 
-       dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, 0);
+       dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie,
+                       residue);
 
        return mxs_chan->status;
 }
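
The new residue path reads the channel's byte address register (BAR) and subtracts it from the end address of the cyclic buffer; since the periods are contiguous slices of one buffer, the last CCW's bufaddr plus its xfer_bytes is the buffer end. A worked example with assumed numbers, as a userspace demo of the same arithmetic:

        #include <stdio.h>

        int main(void)
        {
                /* assumed: 4 KiB cyclic buffer at 0x40000000, four
                 * 1 KiB periods, channel currently 0x500 bytes in */
                unsigned int bufaddr_last = 0x40000c00; /* last period start  */
                unsigned int xfer_bytes   = 0x400;      /* last period length */
                unsigned int bar          = 0x40000500; /* current position   */

                unsigned int residue = xfer_bytes + bufaddr_last - bar;
                printf("residue = %#x\n", residue);     /* 0xb00 bytes to go */
                return 0;
        }
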
index ec3fc4f..2f66cf4 100644
@@ -248,7 +248,7 @@ static enum dma_status omap_dma_tx_status(struct dma_chan *chan,
        unsigned long flags;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS || !txstate)
+       if (ret == DMA_COMPLETE || !txstate)
                return ret;
 
        spin_lock_irqsave(&c->vc.lock, flags);
index df8b10f..cdf0483 100644
@@ -2268,6 +2268,8 @@ static void pl330_tasklet(unsigned long data)
                        list_move_tail(&desc->node, &pch->dmac->desc_pool);
                }
 
+               dma_descriptor_unmap(&desc->txd);
+
                if (callback) {
                        spin_unlock_irqrestore(&pch->lock, flags);
                        callback(callback_param);
@@ -2314,7 +2316,7 @@ bool pl330_filter(struct dma_chan *chan, void *param)
                return false;
 
        peri_id = chan->private;
-       return *peri_id == (unsigned)param;
+       return *peri_id == (unsigned long)param;
 }
 EXPORT_SYMBOL(pl330_filter);
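
The pl330_filter cast fix matters on 64-bit: a void * does not fit in unsigned int there, so the old comparison truncated the cookie. unsigned long has pointer width on every Linux port, which makes the round trip safe; a tiny demo with made-up values:

        #include <stdio.h>

        int main(void)
        {
                /* a small integer smuggled through a void * filter param */
                void *param = (void *)12UL;
                unsigned long id = (unsigned long)param; /* no truncation */

                printf("peri_id = %lu\n", id);
                return 0;
        }
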
 
@@ -2926,16 +2928,23 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 
        amba_set_drvdata(adev, pdmac);
 
-       irq = adev->irq[0];
-       ret = request_irq(irq, pl330_irq_handler, 0,
-                       dev_name(&adev->dev), pi);
-       if (ret)
-               return ret;
+       for (i = 0; i < AMBA_NR_IRQS; i++) {
+               irq = adev->irq[i];
+               if (irq) {
+                       ret = devm_request_irq(&adev->dev, irq,
+                                              pl330_irq_handler, 0,
+                                              dev_name(&adev->dev), pi);
+                       if (ret)
+                               return ret;
+               } else {
+                       break;
+               }
+       }
 
        pi->pcfg.periph_id = adev->periphid;
        ret = pl330_add(pi);
        if (ret)
-               goto probe_err1;
+               return ret;
 
        INIT_LIST_HEAD(&pdmac->desc_pool);
        spin_lock_init(&pdmac->pool_lock);
@@ -3033,8 +3042,6 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 
        return 0;
 probe_err3:
-       amba_set_drvdata(adev, NULL);
-
        /* Idle the DMAC */
        list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels,
                        chan.device_node) {
@@ -3048,8 +3055,6 @@ probe_err3:
        }
 probe_err2:
        pl330_del(pi);
-probe_err1:
-       free_irq(irq, pi);
 
        return ret;
 }
@@ -3059,7 +3064,6 @@ static int pl330_remove(struct amba_device *adev)
        struct dma_pl330_dmac *pdmac = amba_get_drvdata(adev);
        struct dma_pl330_chan *pch, *_p;
        struct pl330_info *pi;
-       int irq;
 
        if (!pdmac)
                return 0;
@@ -3068,7 +3072,6 @@ static int pl330_remove(struct amba_device *adev)
                of_dma_controller_free(adev->dev.of_node);
 
        dma_async_device_unregister(&pdmac->ddma);
-       amba_set_drvdata(adev, NULL);
 
        /* Idle the DMAC */
        list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels,
@@ -3086,9 +3089,6 @@ static int pl330_remove(struct amba_device *adev)
 
        pl330_del(pi);
 
-       irq = adev->irq[0];
-       free_irq(irq, pi);
-
        return 0;
 }
 
index e24b5ef..8da48c6 100644
@@ -804,218 +804,6 @@ static void ppc440spe_desc_set_link(struct ppc440spe_adma_chan *chan,
 }
 
 /**
- * ppc440spe_desc_get_src_addr - extract the source address from the descriptor
- */
-static u32 ppc440spe_desc_get_src_addr(struct ppc440spe_adma_desc_slot *desc,
-                               struct ppc440spe_adma_chan *chan, int src_idx)
-{
-       struct dma_cdb *dma_hw_desc;
-       struct xor_cb *xor_hw_desc;
-
-       switch (chan->device->id) {
-       case PPC440SPE_DMA0_ID:
-       case PPC440SPE_DMA1_ID:
-               dma_hw_desc = desc->hw_desc;
-               /* May have 0, 1, 2, or 3 sources */
-               switch (dma_hw_desc->opc) {
-               case DMA_CDB_OPC_NO_OP:
-               case DMA_CDB_OPC_DFILL128:
-                       return 0;
-               case DMA_CDB_OPC_DCHECK128:
-                       if (unlikely(src_idx)) {
-                               printk(KERN_ERR "%s: try to get %d source for"
-                                   " DCHECK128\n", __func__, src_idx);
-                               BUG();
-                       }
-                       return le32_to_cpu(dma_hw_desc->sg1l);
-               case DMA_CDB_OPC_MULTICAST:
-               case DMA_CDB_OPC_MV_SG1_SG2:
-                       if (unlikely(src_idx > 2)) {
-                               printk(KERN_ERR "%s: try to get %d source from"
-                                   " DMA descr\n", __func__, src_idx);
-                               BUG();
-                       }
-                       if (src_idx) {
-                               if (le32_to_cpu(dma_hw_desc->sg1u) &
-                                   DMA_CUED_XOR_WIN_MSK) {
-                                       u8 region;
-
-                                       if (src_idx == 1)
-                                               return le32_to_cpu(
-                                                   dma_hw_desc->sg1l) +
-                                                       desc->unmap_len;
-
-                                       region = (le32_to_cpu(
-                                           dma_hw_desc->sg1u)) >>
-                                               DMA_CUED_REGION_OFF;
-
-                                       region &= DMA_CUED_REGION_MSK;
-                                       switch (region) {
-                                       case DMA_RXOR123:
-                                               return le32_to_cpu(
-                                                   dma_hw_desc->sg1l) +
-                                                       (desc->unmap_len << 1);
-                                       case DMA_RXOR124:
-                                               return le32_to_cpu(
-                                                   dma_hw_desc->sg1l) +
-                                                       (desc->unmap_len * 3);
-                                       case DMA_RXOR125:
-                                               return le32_to_cpu(
-                                                   dma_hw_desc->sg1l) +
-                                                       (desc->unmap_len << 2);
-                                       default:
-                                               printk(KERN_ERR
-                                                   "%s: try to"
-                                                   " get src3 for region %02x"
-                                                   "PPC440SPE_DESC_RXOR12?\n",
-                                                   __func__, region);
-                                               BUG();
-                                       }
-                               } else {
-                                       printk(KERN_ERR
-                                               "%s: try to get %d"
-                                               " source for non-cued descr\n",
-                                               __func__, src_idx);
-                                       BUG();
-                               }
-                       }
-                       return le32_to_cpu(dma_hw_desc->sg1l);
-               default:
-                       printk(KERN_ERR "%s: unknown OPC 0x%02x\n",
-                               __func__, dma_hw_desc->opc);
-                       BUG();
-               }
-               return le32_to_cpu(dma_hw_desc->sg1l);
-       case PPC440SPE_XOR_ID:
-               /* May have up to 16 sources */
-               xor_hw_desc = desc->hw_desc;
-               return xor_hw_desc->ops[src_idx].l;
-       }
-       return 0;
-}
-
-/**
- * ppc440spe_desc_get_dest_addr - extract the destination address from the
- * descriptor
- */
-static u32 ppc440spe_desc_get_dest_addr(struct ppc440spe_adma_desc_slot *desc,
-                               struct ppc440spe_adma_chan *chan, int idx)
-{
-       struct dma_cdb *dma_hw_desc;
-       struct xor_cb *xor_hw_desc;
-
-       switch (chan->device->id) {
-       case PPC440SPE_DMA0_ID:
-       case PPC440SPE_DMA1_ID:
-               dma_hw_desc = desc->hw_desc;
-
-               if (likely(!idx))
-                       return le32_to_cpu(dma_hw_desc->sg2l);
-               return le32_to_cpu(dma_hw_desc->sg3l);
-       case PPC440SPE_XOR_ID:
-               xor_hw_desc = desc->hw_desc;
-               return xor_hw_desc->cbtal;
-       }
-       return 0;
-}
-
-/**
- * ppc440spe_desc_get_src_num - extract the number of source addresses from
- * the descriptor
- */
-static u32 ppc440spe_desc_get_src_num(struct ppc440spe_adma_desc_slot *desc,
-                               struct ppc440spe_adma_chan *chan)
-{
-       struct dma_cdb *dma_hw_desc;
-       struct xor_cb *xor_hw_desc;
-
-       switch (chan->device->id) {
-       case PPC440SPE_DMA0_ID:
-       case PPC440SPE_DMA1_ID:
-               dma_hw_desc = desc->hw_desc;
-
-               switch (dma_hw_desc->opc) {
-               case DMA_CDB_OPC_NO_OP:
-               case DMA_CDB_OPC_DFILL128:
-                       return 0;
-               case DMA_CDB_OPC_DCHECK128:
-                       return 1;
-               case DMA_CDB_OPC_MV_SG1_SG2:
-               case DMA_CDB_OPC_MULTICAST:
-                       /*
-                        * Only for RXOR operations we have more than
-                        * one source
-                        */
-                       if (le32_to_cpu(dma_hw_desc->sg1u) &
-                           DMA_CUED_XOR_WIN_MSK) {
-                               /* RXOR op, there are 2 or 3 sources */
-                               if (((le32_to_cpu(dma_hw_desc->sg1u) >>
-                                   DMA_CUED_REGION_OFF) &
-                                     DMA_CUED_REGION_MSK) == DMA_RXOR12) {
-                                       /* RXOR 1-2 */
-                                       return 2;
-                               } else {
-                                       /* RXOR 1-2-3/1-2-4/1-2-5 */
-                                       return 3;
-                               }
-                       }
-                       return 1;
-               default:
-                       printk(KERN_ERR "%s: unknown OPC 0x%02x\n",
-                               __func__, dma_hw_desc->opc);
-                       BUG();
-               }
-       case PPC440SPE_XOR_ID:
-               /* up to 16 sources */
-               xor_hw_desc = desc->hw_desc;
-               return xor_hw_desc->cbc & XOR_CDCR_OAC_MSK;
-       default:
-               BUG();
-       }
-       return 0;
-}
-
-/**
- * ppc440spe_desc_get_dst_num - get the number of destination addresses in
- * this descriptor
- */
-static u32 ppc440spe_desc_get_dst_num(struct ppc440spe_adma_desc_slot *desc,
-                               struct ppc440spe_adma_chan *chan)
-{
-       struct dma_cdb *dma_hw_desc;
-
-       switch (chan->device->id) {
-       case PPC440SPE_DMA0_ID:
-       case PPC440SPE_DMA1_ID:
-               /* May be 1 or 2 destinations */
-               dma_hw_desc = desc->hw_desc;
-               switch (dma_hw_desc->opc) {
-               case DMA_CDB_OPC_NO_OP:
-               case DMA_CDB_OPC_DCHECK128:
-                       return 0;
-               case DMA_CDB_OPC_MV_SG1_SG2:
-               case DMA_CDB_OPC_DFILL128:
-                       return 1;
-               case DMA_CDB_OPC_MULTICAST:
-                       if (desc->dst_cnt == 2)
-                               return 2;
-                       else
-                               return 1;
-               default:
-                       printk(KERN_ERR "%s: unknown OPC 0x%02x\n",
-                               __func__, dma_hw_desc->opc);
-                       BUG();
-               }
-       case PPC440SPE_XOR_ID:
-               /* Always only 1 destination */
-               return 1;
-       default:
-               BUG();
-       }
-       return 0;
-}
-
-/**
  * ppc440spe_desc_get_link - get the address of the descriptor that
  * follows this one
  */
@@ -1707,43 +1495,6 @@ static void ppc440spe_adma_free_slots(struct ppc440spe_adma_desc_slot *slot,
        }
 }
 
-static void ppc440spe_adma_unmap(struct ppc440spe_adma_chan *chan,
-                                struct ppc440spe_adma_desc_slot *desc)
-{
-       u32 src_cnt, dst_cnt;
-       dma_addr_t addr;
-
-       /*
-        * get the number of sources & destination
-        * included in this descriptor and unmap
-        * them all
-        */
-       src_cnt = ppc440spe_desc_get_src_num(desc, chan);
-       dst_cnt = ppc440spe_desc_get_dst_num(desc, chan);
-
-       /* unmap destinations */
-       if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-               while (dst_cnt--) {
-                       addr = ppc440spe_desc_get_dest_addr(
-                               desc, chan, dst_cnt);
-                       dma_unmap_page(chan->device->dev,
-                                       addr, desc->unmap_len,
-                                       DMA_FROM_DEVICE);
-               }
-       }
-
-       /* unmap sources */
-       if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-               while (src_cnt--) {
-                       addr = ppc440spe_desc_get_src_addr(
-                               desc, chan, src_cnt);
-                       dma_unmap_page(chan->device->dev,
-                                       addr, desc->unmap_len,
-                                       DMA_TO_DEVICE);
-               }
-       }
-}
-
 /**
  * ppc440spe_adma_run_tx_complete_actions - call functions to be called
  * upon completion
@@ -1767,26 +1518,7 @@ static dma_cookie_t ppc440spe_adma_run_tx_complete_actions(
                        desc->async_tx.callback(
                                desc->async_tx.callback_param);
 
-               /* unmap dma addresses
-                * (unmap_single vs unmap_page?)
-                *
-                * actually, ppc's dma_unmap_page() functions are empty, so
-                * the following code is just for the sake of completeness
-                */
-               if (chan && chan->needs_unmap && desc->group_head &&
-                    desc->unmap_len) {
-                       struct ppc440spe_adma_desc_slot *unmap =
-                                                       desc->group_head;
-                       /* assume 1 slot per op always */
-                       u32 slot_count = unmap->slot_cnt;
-
-                       /* Run through the group list and unmap addresses */
-                       for (i = 0; i < slot_count; i++) {
-                               BUG_ON(!unmap);
-                               ppc440spe_adma_unmap(chan, unmap);
-                               unmap = unmap->hw_next;
-                       }
-               }
+               dma_descriptor_unmap(&desc->async_tx);
        }
 
        /* run dependent operations */
@@ -3893,7 +3625,7 @@ static enum dma_status ppc440spe_adma_tx_status(struct dma_chan *chan,
 
        ppc440spe_chan = to_ppc440spe_adma_chan(chan);
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        ppc440spe_adma_slot_cleanup(ppc440spe_chan);
index 461a91a..ab26d46 100644
@@ -436,7 +436,7 @@ static enum dma_status sa11x0_dma_tx_status(struct dma_chan *chan,
        enum dma_status ret;
 
        ret = dma_cookie_status(&c->vc.chan, cookie, state);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        if (!state)
index d94ab59..2e7b394 100644
@@ -724,7 +724,7 @@ static enum dma_status shdma_tx_status(struct dma_chan *chan,
         * If we don't find cookie on the queue, it has been aborted and we have
         * to report error
         */
-       if (status != DMA_SUCCESS) {
+       if (status != DMA_COMPLETE) {
                struct shdma_desc *sdesc;
                status = DMA_ERROR;
                list_for_each_entry(sdesc, &schan->ld_queue, node)
index 1069e88..0d765c0 100644
@@ -685,7 +685,7 @@ MODULE_DEVICE_TABLE(of, sh_dmae_of_match);
 static int sh_dmae_probe(struct platform_device *pdev)
 {
        const struct sh_dmae_pdata *pdata;
-       unsigned long irqflags = IRQF_DISABLED,
+       unsigned long irqflags = 0,
                chan_flag[SH_DMAE_MAX_CHANNELS] = {};
        int errirq, chan_irq[SH_DMAE_MAX_CHANNELS];
        int err, i, irq_cnt = 0, irqres = 0, irq_cap = 0;
@@ -838,7 +838,7 @@ static int sh_dmae_probe(struct platform_device *pdev)
                                    IORESOURCE_IRQ_SHAREABLE)
                                        chan_flag[irq_cnt] = IRQF_SHARED;
                                else
-                                       chan_flag[irq_cnt] = IRQF_DISABLED;
+                                       chan_flag[irq_cnt] = 0;
                                dev_dbg(&pdev->dev,
                                        "Found IRQ %d for channel %d\n",
                                        i, irq_cnt);
index 82d2b97..b8c031b 100644
@@ -14,6 +14,7 @@
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
+#include <linux/log2.h>
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
 #include <linux/err.h>
@@ -2626,7 +2627,7 @@ static enum dma_status d40_tx_status(struct dma_chan *chan,
        }
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret != DMA_SUCCESS)
+       if (ret != DMA_COMPLETE)
                dma_set_residue(txstate, stedma40_residue(chan));
 
        if (d40_is_paused(d40c))
@@ -2796,8 +2797,8 @@ static int d40_set_runtime_config(struct dma_chan *chan,
            src_addr_width >  DMA_SLAVE_BUSWIDTH_8_BYTES   ||
            dst_addr_width <= DMA_SLAVE_BUSWIDTH_UNDEFINED ||
            dst_addr_width >  DMA_SLAVE_BUSWIDTH_8_BYTES   ||
-           ((src_addr_width > 1) && (src_addr_width & 1)) ||
-           ((dst_addr_width > 1) && (dst_addr_width & 1)))
+           !is_power_of_2(src_addr_width) ||
+           !is_power_of_2(dst_addr_width))
                return -EINVAL;
 
        cfg->src_info.data_width = src_addr_width;
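
The old bus-width check only rejected odd values greater than one, so a bogus width of 6 slipped through; valid widths are 1, 2, 4 or 8 bytes, i.e. powers of two, hence the new is_power_of_2() test and the linux/log2.h include added at the top of this file. Roughly how the helper works:

        /* Roughly the log2.h helper: a power of two has exactly one bit
         * set, so clearing the lowest set bit must leave zero. */
        static inline bool is_power_of_2(unsigned long n)
        {
                return n != 0 && ((n & (n - 1)) == 0);
        }
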
index 5d4986e..73654e3 100644
@@ -570,7 +570,7 @@ static void handle_once_dma_done(struct tegra_dma_channel *tdc,
 
        list_del(&sgreq->node);
        if (sgreq->last_sg) {
-               dma_desc->dma_status = DMA_SUCCESS;
+               dma_desc->dma_status = DMA_COMPLETE;
                dma_cookie_complete(&dma_desc->txd);
                if (!dma_desc->cb_count)
                        list_add_tail(&dma_desc->cb_node, &tdc->cb_desc);
@@ -768,7 +768,7 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc,
        unsigned int residual;
 
        ret = dma_cookie_status(dc, cookie, txstate);
-       if (ret == DMA_SUCCESS)
+       if (ret == DMA_COMPLETE)
                return ret;
 
        spin_lock_irqsave(&tdc->lock, flags);
@@ -1018,7 +1018,7 @@ static struct dma_async_tx_descriptor *tegra_dma_prep_slave_sg(
        return &dma_desc->txd;
 }
 
-struct dma_async_tx_descriptor *tegra_dma_prep_dma_cyclic(
+static struct dma_async_tx_descriptor *tegra_dma_prep_dma_cyclic(
        struct dma_chan *dc, dma_addr_t buf_addr, size_t buf_len,
        size_t period_len, enum dma_transfer_direction direction,
        unsigned long flags, void *context)
index 28af214..4506a7b 100644
@@ -154,38 +154,6 @@ static bool __td_dma_done_ack(struct timb_dma_chan *td_chan)
        return done;
 }
 
-static void __td_unmap_desc(struct timb_dma_chan *td_chan, const u8 *dma_desc,
-       bool single)
-{
-       dma_addr_t addr;
-       int len;
-
-       addr = (dma_desc[7] << 24) | (dma_desc[6] << 16) | (dma_desc[5] << 8) |
-               dma_desc[4];
-
-       len = (dma_desc[3] << 8) | dma_desc[2];
-
-       if (single)
-               dma_unmap_single(chan2dev(&td_chan->chan), addr, len,
-                       DMA_TO_DEVICE);
-       else
-               dma_unmap_page(chan2dev(&td_chan->chan), addr, len,
-                       DMA_TO_DEVICE);
-}
-
-static void __td_unmap_descs(struct timb_dma_desc *td_desc, bool single)
-{
-       struct timb_dma_chan *td_chan = container_of(td_desc->txd.chan,
-               struct timb_dma_chan, chan);
-       u8 *descs;
-
-       for (descs = td_desc->desc_list; ; descs += TIMB_DMA_DESC_SIZE) {
-               __td_unmap_desc(td_chan, descs, single);
-               if (descs[0] & 0x02)
-                       break;
-       }
-}
-
 static int td_fill_desc(struct timb_dma_chan *td_chan, u8 *dma_desc,
        struct scatterlist *sg, bool last)
 {
@@ -293,10 +261,7 @@ static void __td_finish(struct timb_dma_chan *td_chan)
 
        list_move(&td_desc->desc_node, &td_chan->free_list);
 
-       if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP))
-               __td_unmap_descs(td_desc,
-                       txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE);
-
+       dma_descriptor_unmap(txd);
        /*
         * The API requires that no submissions are done from a
         * callback, so we don't need to drop the lock here
index 71e8e77..bae6c29 100644
@@ -419,30 +419,7 @@ txx9dmac_descriptor_complete(struct txx9dmac_chan *dc,
        list_splice_init(&desc->tx_list, &dc->free_list);
        list_move(&desc->desc_node, &dc->free_list);
 
-       if (!ds) {
-               dma_addr_t dmaaddr;
-               if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-                       dmaaddr = is_dmac64(dc) ?
-                               desc->hwdesc.DAR : desc->hwdesc32.DAR;
-                       if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-                               dma_unmap_single(chan2parent(&dc->chan),
-                                       dmaaddr, desc->len, DMA_FROM_DEVICE);
-                       else
-                               dma_unmap_page(chan2parent(&dc->chan),
-                                       dmaaddr, desc->len, DMA_FROM_DEVICE);
-               }
-               if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-                       dmaaddr = is_dmac64(dc) ?
-                               desc->hwdesc.SAR : desc->hwdesc32.SAR;
-                       if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-                               dma_unmap_single(chan2parent(&dc->chan),
-                                       dmaaddr, desc->len, DMA_TO_DEVICE);
-                       else
-                               dma_unmap_page(chan2parent(&dc->chan),
-                                       dmaaddr, desc->len, DMA_TO_DEVICE);
-               }
-       }
-
+       dma_descriptor_unmap(txd);
        /*
         * The API requires that no submissions are done from a
         * callback, so we don't need to drop the lock here
@@ -962,8 +939,8 @@ txx9dmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
        enum dma_status ret;
 
        ret = dma_cookie_status(chan, cookie, txstate);
-       if (ret == DMA_SUCCESS)
-               return DMA_SUCCESS;
+       if (ret == DMA_COMPLETE)
+               return DMA_COMPLETE;
 
        spin_lock_bh(&dc->lock);
        txx9dmac_scan_descriptors(dc);
index 7dd4461..4e10b10 100644
@@ -13,6 +13,7 @@
 #include <linux/acpi_gpio.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
+#include <linux/acpi.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/gpio.h>
index 43959ed..dfff090 100644
@@ -196,7 +196,7 @@ static bool intel_dsm_pci_probe(struct pci_dev *pdev)
        acpi_handle dhandle;
        int ret;
 
-       dhandle = DEVICE_ACPI_HANDLE(&pdev->dev);
+       dhandle = ACPI_HANDLE(&pdev->dev);
        if (!dhandle)
                return false;
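
This and the following DEVICE_ACPI_HANDLE -> ACPI_HANDLE conversions (the gpu drivers and ide-acpi) go with the ACPI_COMPANION work visible in the i2c hunks further down: a device carries a struct acpi_device companion, and the handle is derived from it. A hedged sketch of the relationship, assuming the accessors of this era; example_get_handle() is illustrative only:

        #include <linux/acpi.h>

        /* ACPI_HANDLE(dev) boils down to looking up the device's ACPI
         * companion and returning its handle (or NULL when the device
         * has none). DEVICE_ACPI_HANDLE() was an older spelling of the
         * same lookup and is removed by this conversion. */
        static acpi_handle example_get_handle(struct device *dev)
        {
                struct acpi_device *adev = ACPI_COMPANION(dev);

                return adev ? adev->handle : NULL;
        }
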
 
index 1b2f41c..6d69a9b 100644
@@ -638,7 +638,7 @@ static void intel_didl_outputs(struct drm_device *dev)
        u32 temp;
        int i = 0;
 
-       handle = DEVICE_ACPI_HANDLE(&dev->pdev->dev);
+       handle = ACPI_HANDLE(&dev->pdev->dev);
        if (!handle || acpi_bus_get_device(handle, &acpi_dev))
                return;
 
index e286e13..1291204 100644
@@ -116,7 +116,7 @@ mxm_shadow_dsm(struct nouveau_mxm *mxm, u8 version)
        acpi_handle handle;
        int ret;
 
-       handle = DEVICE_ACPI_HANDLE(&device->pdev->dev);
+       handle = ACPI_HANDLE(&device->pdev->dev);
        if (!handle)
                return false;
 
index 07273a2..95c7404 100644
@@ -256,7 +256,7 @@ static int nouveau_dsm_pci_probe(struct pci_dev *pdev)
        acpi_handle dhandle;
        int retval = 0;
 
-       dhandle = DEVICE_ACPI_HANDLE(&pdev->dev);
+       dhandle = ACPI_HANDLE(&pdev->dev);
        if (!dhandle)
                return false;
 
@@ -414,7 +414,7 @@ bool nouveau_acpi_rom_supported(struct pci_dev *pdev)
        if (!nouveau_dsm_priv.dsm_detected && !nouveau_dsm_priv.optimus_detected)
                return false;
 
-       dhandle = DEVICE_ACPI_HANDLE(&pdev->dev);
+       dhandle = ACPI_HANDLE(&pdev->dev);
        if (!dhandle)
                return false;
 
@@ -448,7 +448,7 @@ nouveau_acpi_edid(struct drm_device *dev, struct drm_connector *connector)
                return NULL;
        }
 
-       handle = DEVICE_ACPI_HANDLE(&dev->pdev->dev);
+       handle = ACPI_HANDLE(&dev->pdev->dev);
        if (!handle)
                return NULL;
 
index 10f98c7..98a9074 100644
@@ -369,7 +369,7 @@ int radeon_atif_handler(struct radeon_device *rdev,
                return NOTIFY_DONE;
 
        /* Check pending SBIOS requests */
-       handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev);
+       handle = ACPI_HANDLE(&rdev->pdev->dev);
        count = radeon_atif_get_sbios_requests(handle, &req);
 
        if (count <= 0)
@@ -556,7 +556,7 @@ int radeon_acpi_pcie_notify_device_ready(struct radeon_device *rdev)
        struct radeon_atcs *atcs = &rdev->atcs;
 
        /* Get the device handle */
-       handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev);
+       handle = ACPI_HANDLE(&rdev->pdev->dev);
        if (!handle)
                return -EINVAL;
 
@@ -596,7 +596,7 @@ int radeon_acpi_pcie_performance_request(struct radeon_device *rdev,
        u32 retry = 3;
 
        /* Get the device handle */
-       handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev);
+       handle = ACPI_HANDLE(&rdev->pdev->dev);
        if (!handle)
                return -EINVAL;
 
@@ -699,7 +699,7 @@ int radeon_acpi_init(struct radeon_device *rdev)
        int ret;
 
        /* Get the device handle */
-       handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev);
+       handle = ACPI_HANDLE(&rdev->pdev->dev);
 
        /* No need to proceed if we're sure that ATIF is not supported */
        if (!ASIC_IS_AVIVO(rdev) || !rdev->bios || !handle)
index 6153ec1..9d302ea 100644
@@ -8,8 +8,7 @@
  */
 #include <linux/vga_switcheroo.h>
 #include <linux/slab.h>
-#include <acpi/acpi.h>
-#include <acpi/acpi_bus.h>
+#include <linux/acpi.h>
 #include <linux/pci.h>
 
 #include "radeon_acpi.h"
@@ -447,7 +446,7 @@ static bool radeon_atpx_pci_probe_handle(struct pci_dev *pdev)
        acpi_handle dhandle, atpx_handle;
        acpi_status status;
 
-       dhandle = DEVICE_ACPI_HANDLE(&pdev->dev);
+       dhandle = ACPI_HANDLE(&pdev->dev);
        if (!dhandle)
                return false;
 
@@ -493,7 +492,7 @@ static int radeon_atpx_init(void)
  */
 static int radeon_atpx_get_client_id(struct pci_dev *pdev)
 {
-       if (radeon_atpx_priv.dhandle == DEVICE_ACPI_HANDLE(&pdev->dev))
+       if (radeon_atpx_priv.dhandle == ACPI_HANDLE(&pdev->dev))
                return VGA_SWITCHEROO_IGD;
        else
                return VGA_SWITCHEROO_DIS;
index c155d6f..b3633d9 100644
@@ -185,7 +185,7 @@ static bool radeon_atrm_get_bios(struct radeon_device *rdev)
                return false;
 
        while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) {
-               dhandle = DEVICE_ACPI_HANDLE(&pdev->dev);
+               dhandle = ACPI_HANDLE(&pdev->dev);
                if (!dhandle)
                        continue;
 
index ae48d18..5f7e55f 100644
@@ -1008,7 +1008,7 @@ static int i2c_hid_probe(struct i2c_client *client,
        hid->hid_get_raw_report = i2c_hid_get_raw_report;
        hid->hid_output_raw_report = i2c_hid_output_raw_report;
        hid->dev.parent = &client->dev;
-       ACPI_HANDLE_SET(&hid->dev, ACPI_HANDLE(&client->dev));
+       ACPI_COMPANION_SET(&hid->dev, ACPI_COMPANION(&client->dev));
        hid->bus = BUS_I2C;
        hid->version = le16_to_cpu(ihid->hdesc.bcdVersion);
        hid->vendor = le16_to_cpu(ihid->hdesc.wVendorID);
index 5923cfa..d74c0b3 100644
@@ -615,6 +615,22 @@ void i2c_unlock_adapter(struct i2c_adapter *adapter)
 }
 EXPORT_SYMBOL_GPL(i2c_unlock_adapter);
 
+static void i2c_dev_set_name(struct i2c_adapter *adap,
+                            struct i2c_client *client)
+{
+       struct acpi_device *adev = ACPI_COMPANION(&client->dev);
+
+       if (adev) {
+               dev_set_name(&client->dev, "i2c-%s", acpi_dev_name(adev));
+               return;
+       }
+
+       /* For 10-bit clients, add an arbitrary offset to avoid collisions */
+       dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap),
+                    client->addr | ((client->flags & I2C_CLIENT_TEN)
+                                    ? 0xa000 : 0));
+}
+
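
i2c_dev_set_name() above gives ACPI-enumerated clients stable, firmware-derived names instead of the adapter-number/address form, which can change across boots; legacy clients keep the old scheme. For example (device ids hypothetical): a client whose ACPI companion is named INT33F4:00 becomes "i2c-INT33F4:00", while a 7-bit client at 0x4a on adapter 0 stays "0-004a" and a 10-bit one becomes "0-a04a" via the 0xa000 offset. The legacy formatting alone, as a userspace demo:

        #include <stdio.h>

        int main(void)
        {
                int adapter_id = 0;
                unsigned int addr = 0x4a;
                int is_ten_bit = 1;

                printf("%d-%04x\n", adapter_id,
                       addr | (is_ten_bit ? 0xa000 : 0)); /* "0-a04a" */
                return 0;
        }
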
 /**
  * i2c_new_device - instantiate an i2c device
  * @adap: the adapter managing the device
@@ -671,12 +687,9 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
        client->dev.bus = &i2c_bus_type;
        client->dev.type = &i2c_client_type;
        client->dev.of_node = info->of_node;
-       ACPI_HANDLE_SET(&client->dev, info->acpi_node.handle);
+       ACPI_COMPANION_SET(&client->dev, info->acpi_node.companion);
 
-       /* For 10-bit clients, add an arbitrary offset to avoid collisions */
-       dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap),
-                    client->addr | ((client->flags & I2C_CLIENT_TEN)
-                                    ? 0xa000 : 0));
+       i2c_dev_set_name(adap, client);
        status = device_register(&client->dev);
        if (status)
                goto out_err;
@@ -1100,7 +1113,7 @@ static acpi_status acpi_i2c_add_device(acpi_handle handle, u32 level,
                return AE_OK;
 
        memset(&info, 0, sizeof(info));
-       info.acpi_node.handle = handle;
+       info.acpi_node.companion = adev;
        info.irq = -1;
 
        INIT_LIST_HEAD(&resource_list);
index 140c8ef..d9e1f7c 100644
@@ -7,6 +7,7 @@
  * Copyright (C) 2006 Hannes Reinecke
  */
 
+#include <linux/acpi.h>
 #include <linux/ata.h>
 #include <linux/delay.h>
 #include <linux/device.h>
@@ -19,8 +20,6 @@
 #include <linux/dmi.h>
 #include <linux/module.h>
 
-#include <acpi/acpi_bus.h>
-
 #define REGS_PER_GTF           7
 
 struct GTM_buffer {
@@ -128,7 +127,7 @@ static int ide_get_dev_handle(struct device *dev, acpi_handle *handle,
 
        DEBPRINT("ENTER: pci %02x:%02x.%01x\n", bus, devnum, func);
 
-       dev_handle = DEVICE_ACPI_HANDLE(dev);
+       dev_handle = ACPI_HANDLE(dev);
        if (!dev_handle) {
                DEBPRINT("no acpi handle for device\n");
                goto err;
index 3226ce9..cbd4e9a 100644
@@ -1,7 +1,7 @@
 /*
  * intel_idle.c - native hardware idle loop for modern Intel processors
  *
- * Copyright (c) 2010, Intel Corporation.
+ * Copyright (c) 2013, Intel Corporation.
  * Len Brown <len.brown@intel.com>
  *
  * This program is free software; you can redistribute it and/or modify it
@@ -329,6 +329,22 @@ static struct cpuidle_state atom_cstates[] __initdata = {
        {
                .enter = NULL }
 };
+static struct cpuidle_state avn_cstates[CPUIDLE_STATE_MAX] = {
+       {
+               .name = "C1-AVN",
+               .desc = "MWAIT 0x00",
+               .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID,
+               .exit_latency = 2,
+               .target_residency = 2,
+               .enter = &intel_idle },
+       {
+               .name = "C6-AVN",
+               .desc = "MWAIT 0x51",
+               .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
+               .exit_latency = 15,
+               .target_residency = 45,
+               .enter = &intel_idle },
+};
 
 /**
  * intel_idle
@@ -462,6 +478,11 @@ static const struct idle_cpu idle_cpu_hsw = {
        .disable_promotion_to_c1e = true,
 };
 
+static const struct idle_cpu idle_cpu_avn = {
+       .state_table = avn_cstates,
+       .disable_promotion_to_c1e = true,
+};
+
 #define ICPU(model, cpu) \
        { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu }
 
@@ -483,6 +504,7 @@ static const struct x86_cpu_id intel_idle_ids[] = {
        ICPU(0x3f, idle_cpu_hsw),
        ICPU(0x45, idle_cpu_hsw),
        ICPU(0x46, idle_cpu_hsw),
+       ICPU(0x4D, idle_cpu_avn),
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids);
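
The new Avoton entry (CPU model 0x4D) reuses the standard intel_idle plumbing: each state's flags word carries the raw MWAIT hint, packed into the top byte. The hint's upper nibble is the target C-state minus one and the lower nibble a sub-state, so 0x00 requests C1 and a 0x5n hint requests C6. The packing macros, roughly as defined in intel_idle.c:

        /* Pack/unpack the MWAIT hint in bits 31:24 of the cpuidle flags. */
        #define MWAIT2flg(eax)   (((eax) & 0xFF) << 24)
        #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
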
index 8766eab..b6b7a28 100644
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
        {
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
        { }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
        {
                .procname       = "raid",
                .maxlen         = 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
        { }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
        {
                .procname       = "dev",
                .maxlen         = 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
        goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
 {
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+       mutex_lock(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_is_locked(struct mddev *mddev)
 {
        return mutex_is_locked(&mddev->reconfig_mutex);
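
mddev_lock_nointr() exists because mutex_lock_interruptible() can fail with -EINTR when a signal is pending, and some callers converted in the hunks below (rdev_size_store, md_stop_writes, the stop paths) have no sane way to back out. A sketch of the two calling conventions, with hypothetical example_* functions:

        /* A reconfiguration path that may bail out keeps the
         * interruptible lock and must check for failure... */
        static int example_reconfigure(struct mddev *mddev)
        {
                if (mddev_lock(mddev))
                        return -EINTR;
                /* ... changes that are safe to abandon ... */
                mddev_unlock(mddev);
                return 0;
        }

        /* ...while a must-complete teardown takes the _nointr variant. */
        static void example_stop_writes(struct mddev *mddev)
        {
                mddev_lock_nointr(mddev);
                /* ... work that cannot report failure ... */
                mddev_unlock(mddev);
        }
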
@@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
                for_each_mddev(mddev, tmp) {
                        struct md_rdev *rdev2;
 
-                       mddev_lock(mddev);
+                       mddev_lock_nointr(mddev);
                        rdev_for_each(rdev2, mddev)
                                if (rdev->bdev == rdev2->bdev &&
                                    rdev != rdev2 &&
@@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
                                break;
                        }
                }
-               mddev_lock(my_mddev);
+               mddev_lock_nointr(my_mddev);
                if (overlap) {
                        /* Someone else could have slipped in a size
                         * change here, but doing so is just silly.
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
                mddev->in_sync = 1;
                del_timer_sync(&mddev->safemode_timer);
        }
+       blk_set_stacking_limits(&mddev->queue->limits);
        pers->run(mddev);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        mddev_resume(mddev);
@@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev)
 
 void md_stop_writes(struct mddev *mddev)
 {
-       mddev_lock(mddev);
+       mddev_lock_nointr(mddev);
        __md_stop_writes(mddev);
        mddev_unlock(mddev);
 }
@@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
        int err = 0;
+       int did_freeze = 0;
+
+       if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+               did_freeze = 1;
+               set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+               md_wakeup_thread(mddev->thread);
+       }
+       if (mddev->sync_thread) {
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+               /* Thread might be blocked waiting for a metadata update
+                * which will now never happen */
+               wake_up_process(mddev->sync_thread->tsk);
+       }
+       mddev_unlock(mddev);
+       wait_event(resync_wait, mddev->sync_thread == NULL);
+       mddev_lock_nointr(mddev);
+
        mutex_lock(&mddev->open_mutex);
-       if (atomic_read(&mddev->openers) > !!bdev) {
+       if (atomic_read(&mddev->openers) > !!bdev ||
+           mddev->sync_thread ||
+           (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
                printk("md: %s still in use.\n",mdname(mddev));
+               if (did_freeze) {
+                       clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+                       md_wakeup_thread(mddev->thread);
+               }
                err = -EBUSY;
                goto out;
        }
-       if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-               /* Someone opened the device since we flushed it
-                * so page cache could be dirty and it is too late
-                * to flush.  So abort
-                */
-               mutex_unlock(&mddev->open_mutex);
-               return -EBUSY;
-       }
        if (mddev->pers) {
                __md_stop_writes(mddev);
 
@@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
                set_disk_ro(mddev->gendisk, 1);
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                sysfs_notify_dirent_safe(mddev->sysfs_state);
-               err = 0;        
+               err = 0;
        }
 out:
        mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
        struct gendisk *disk = mddev->gendisk;
        struct md_rdev *rdev;
+       int did_freeze = 0;
+
+       if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+               did_freeze = 1;
+               set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+               md_wakeup_thread(mddev->thread);
+       }
+       if (mddev->sync_thread) {
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+               /* Thread might be blocked waiting for a metadata update
+                * which will now never happen */
+               wake_up_process(mddev->sync_thread->tsk);
+       }
+       mddev_unlock(mddev);
+       wait_event(resync_wait, mddev->sync_thread == NULL);
+       mddev_lock_nointr(mddev);
 
        mutex_lock(&mddev->open_mutex);
        if (atomic_read(&mddev->openers) > !!bdev ||
-           mddev->sysfs_active) {
+           mddev->sysfs_active ||
+           mddev->sync_thread ||
+           (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
                printk("md: %s still in use.\n",mdname(mddev));
                mutex_unlock(&mddev->open_mutex);
-               return -EBUSY;
-       }
-       if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-               /* Someone opened the device since we flushed it
-                * so page cache could be dirty and it is too late
-                * to flush.  So abort
-                */
-               mutex_unlock(&mddev->open_mutex);
+               if (did_freeze) {
+                       clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+                       md_wakeup_thread(mddev->thread);
+               }
                return -EBUSY;
        }
        if (mddev->pers) {
@@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
                                wait_event(mddev->sb_wait,
                                           !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
                                           !test_bit(MD_CHANGE_PENDING, &mddev->flags));
-                               mddev_lock(mddev);
+                               mddev_lock_nointr(mddev);
                        }
                } else {
                        err = -EROFS;
@@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread)
                mddev->curr_resync = 2;
 
        try_again:
-               if (kthread_should_stop())
-                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
                if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                        goto skip;
                for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread)
                                 * be caught by 'softlockup'
                                 */
                                prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
-                               if (!kthread_should_stop() &&
+                               if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
                                    mddev2->curr_resync >= mddev->curr_resync) {
                                        printk(KERN_INFO "md: delaying %s of %s"
                                               " until %s has finished (they"
@@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread)
        last_check = 0;
 
        if (j>2) {
-               printk(KERN_INFO 
+               printk(KERN_INFO
                       "md: resuming %s of %s from checkpoint.\n",
                       desc, mdname(mddev));
                mddev->curr_resync = j;
@@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread)
                        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
                }
 
-               while (j >= mddev->resync_max && !kthread_should_stop()) {
+               while (j >= mddev->resync_max &&
+                      !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                        /* As this condition is controlled by user-space,
                         * we can block indefinitely, so use '_interruptible'
                         * to avoid triggering warnings.
@@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread)
                        flush_signals(current); /* just in case */
                        wait_event_interruptible(mddev->recovery_wait,
                                                 mddev->resync_max > j
-                                                || kthread_should_stop());
+                                                || test_bit(MD_RECOVERY_INTR,
+                                                            &mddev->recovery));
                }
 
-               if (kthread_should_stop())
-                       goto interrupted;
+               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                       break;
 
                sectors = mddev->pers->sync_request(mddev, j, &skipped,
                                                  currspeed < speed_min(mddev));
                if (sectors == 0) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                       goto out;
+                       break;
                }
 
                if (!skipped) { /* actual IO requested */
@@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread)
                        last_mark = next;
                }
 
-
-               if (kthread_should_stop())
-                       goto interrupted;
-
+               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                       break;
 
                /*
                 * this loop exits only when either we are slower than
@@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread)
                        }
                }
        }
-       printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+       printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+              test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+              ? "interrupted" : "done");
        /*
         * this also signals 'finished resyncing' to md_stop
         */
- out:
        blk_finish_plug(&plug);
        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread)
        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        return;
-
- interrupted:
-       /*
-        * got a signal, exit.
-        */
-       printk(KERN_INFO
-              "md: md_do_sync() got signal ... exiting\n");
-       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-       goto out;
-
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
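
Throughout md_do_sync() the patch replaces kthread_should_stop() polling with tests of MD_RECOVERY_INTR, making interruption a property of the mddev that any caller can assert, and funnelling every exit through one cleanup tail instead of the removed "interrupted:" label. A minimal sketch of the resulting loop shape (sectors_remaining() and do_one_chunk() are hypothetical stand-ins for the per-iteration work):

/* Sketch of the control flow only; the real loop is far richer. */
while (sectors_remaining(mddev)) {              /* hypothetical helper */
        if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                break;                          /* no separate exit label */
        do_one_chunk(mddev);                    /* hypothetical helper */
}
/* common tail: runs for both the done and the interrupted case */
printk(KERN_INFO "md: %s: %s\n", mdname(mddev),
       test_bit(MD_RECOVERY_INTR, &mddev->recovery) ? "interrupted" : "done");
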
 
@@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 
        /* resync has finished, collect result */
        md_unregister_thread(&mddev->sync_thread);
+       wake_up(&resync_wait);
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                /* success...*/
index af6681b..1e5a540 100644
@@ -66,7 +66,8 @@
  */
 static int max_queued_requests = 1024;
 
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+                         sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);
 
 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
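
Expanding the new macros shows the window geometry is unchanged in size: RESYNC_WINDOW is still 2 MiB (the old 2048*1024 literal), now derived from RESYNC_BLOCK_SIZE and RESYNC_DEPTH, and the sector forms follow by shifting out the 512-byte sector size. A standalone arithmetic check:

/* Standalone check of the macro expansion above. */
#include <stdio.h>

#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)

int main(void)
{
        printf("RESYNC_SECTORS         = %d\n", RESYNC_SECTORS);         /* 128 */
        printf("RESYNC_WINDOW          = %d\n", RESYNC_WINDOW);          /* 2097152 = 2 MiB */
        printf("RESYNC_WINDOW_SECTORS  = %d\n", RESYNC_WINDOW_SECTORS);  /* 4096 */
        printf("NEXT_NORMALIO_DISTANCE = %d\n", NEXT_NORMALIO_DISTANCE); /* 12288 = 6 MiB */
        return 0;
}
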
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
        struct bio *bio = r1_bio->master_bio;
        int done;
        struct r1conf *conf = r1_bio->mddev->private;
+       sector_t start_next_window = r1_bio->start_next_window;
+       sector_t bi_sector = bio->bi_sector;
 
        if (bio->bi_phys_segments) {
                unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
                bio->bi_phys_segments--;
                done = (bio->bi_phys_segments == 0);
                spin_unlock_irqrestore(&conf->device_lock, flags);
+               /*
+                * make_request() might be waiting for
+                * bi_phys_segments to decrease
+                */
+               wake_up(&conf->wait_barrier);
        } else
                done = 1;
 
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
                 * Wake up any possible resync thread that waits for the device
                 * to go idle.
                 */
-               allow_barrier(conf);
+               allow_barrier(conf, start_next_window, bi_sector);
        }
 }
 
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
  *    there is no normal IO happening.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
        spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
        /* block any new IO from starting */
        conf->barrier++;
 
-       /* Now wait for all pending IO to complete */
+       /* We must wait if any of these conditions holds:
+        * A: the array is in the frozen state.
+        * B: barrier >= RESYNC_DEPTH, meaning resync has reached
+        *    the maximum allowed depth.
+        * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+        *    the next resync would reach into the window that normal
+        *    bios are currently handling.
+        */
        wait_event_lock_irq(conf->wait_barrier,
-                           !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+                           !conf->array_frozen &&
+                           conf->barrier < RESYNC_DEPTH &&
+                           (conf->start_next_window >=
+                            conf->next_resync + RESYNC_SECTORS),
                            conf->resync_lock);
 
        spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
        wake_up(&conf->wait_barrier);
 }
 
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
 {
+       bool wait = false;
+
+       if (conf->array_frozen || !bio)
+               wait = true;
+       else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+               if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+                       wait = true;
+               else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+                               >= bio_end_sector(bio)) ||
+                        (conf->next_resync + NEXT_NORMALIO_DISTANCE
+                               <= bio->bi_sector))
+                       wait = false;
+               else
+                       wait = true;
+       }
+
+       return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+       sector_t sector = 0;
+
        spin_lock_irq(&conf->resync_lock);
-       if (conf->barrier) {
+       if (need_to_wait_for_sync(conf, bio)) {
                conf->nr_waiting++;
                /* Wait for the barrier to drop.
                 * However if there are already pending
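
need_to_wait_for_sync() only stalls writes that could collide with the active resync region: a write wholly behind next_resync - RESYNC_WINDOW_SECTORS, or at least NEXT_NORMALIO_DISTANCE ahead of next_resync, proceeds at once. A standalone restatement of that test with assumed sector numbers (4096 and 12288 follow from the macros above):

/* Worked example of the window test; all sector numbers are assumed. */
#include <stdbool.h>
#include <stdio.h>

static bool write_must_wait(long next_resync, long bi_sector, long bi_end)
{
        const long window = 4096;       /* RESYNC_WINDOW_SECTORS */
        const long distance = 12288;    /* NEXT_NORMALIO_DISTANCE */

        if (next_resync < window)
                return true;            /* resync still near the start */
        if (next_resync - window >= bi_end)
                return false;           /* write is safely behind */
        if (next_resync + distance <= bi_sector)
                return false;           /* write is safely ahead */
        return true;
}

int main(void)
{
        printf("%d\n", write_must_wait(100000, 90000, 90008));   /* 0: behind */
        printf("%d\n", write_must_wait(100000, 99000, 99008));   /* 1: overlaps */
        printf("%d\n", write_must_wait(100000, 113000, 113008)); /* 0: far ahead */
        return 0;
}
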
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
                 * count down.
                 */
                wait_event_lock_irq(conf->wait_barrier,
-                                   !conf->barrier ||
-                                   (conf->nr_pending &&
+                                   !conf->array_frozen &&
+                                   (!conf->barrier ||
+                                   ((conf->start_next_window <
+                                     conf->next_resync + RESYNC_SECTORS) &&
                                     current->bio_list &&
-                                    !bio_list_empty(current->bio_list)),
+                                    !bio_list_empty(current->bio_list))),
                                    conf->resync_lock);
                conf->nr_waiting--;
        }
+
+       if (bio && bio_data_dir(bio) == WRITE) {
+               if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+                   <= bio->bi_sector) {
+                       if (conf->start_next_window == MaxSector)
+                               conf->start_next_window =
+                                       conf->next_resync +
+                                       NEXT_NORMALIO_DISTANCE;
+
+                       if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+                           <= bio->bi_sector)
+                               conf->next_window_requests++;
+                       else
+                               conf->current_window_requests++;
+               }
+               if (bio->bi_sector >= conf->start_next_window)
+                       sector = conf->start_next_window;
+       }
+
        conf->nr_pending++;
        spin_unlock_irq(&conf->resync_lock);
+       return sector;
 }
 
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+                         sector_t bi_sector)
 {
        unsigned long flags;
+
        spin_lock_irqsave(&conf->resync_lock, flags);
        conf->nr_pending--;
+       if (start_next_window) {
+               if (start_next_window == conf->start_next_window) {
+                       if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+                           <= bi_sector)
+                               conf->next_window_requests--;
+                       else
+                               conf->current_window_requests--;
+               } else
+                       conf->current_window_requests--;
+
+               if (!conf->current_window_requests) {
+                       if (conf->next_window_requests) {
+                               conf->current_window_requests =
+                                       conf->next_window_requests;
+                               conf->next_window_requests = 0;
+                               conf->start_next_window +=
+                                       NEXT_NORMALIO_DISTANCE;
+                       } else
+                               conf->start_next_window = MaxSector;
+               }
+       }
        spin_unlock_irqrestore(&conf->resync_lock, flags);
        wake_up(&conf->wait_barrier);
 }
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
        /* stop sync IO and normal IO and wait for everything to
         * go quiet.
-        * We increment barrier and nr_waiting, and then
-        * wait until nr_pending match nr_queued+extra
+        * We wait until nr_pending matches nr_queued+extra
         * This is called in the context of one normal IO request
         * that has failed. Thus any sync request that might be pending
         * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
         * we continue.
         */
        spin_lock_irq(&conf->resync_lock);
-       conf->barrier++;
-       conf->nr_waiting++;
+       conf->array_frozen = 1;
        wait_event_lock_irq_cmd(conf->wait_barrier,
                                conf->nr_pending == conf->nr_queued+extra,
                                conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
-       conf->barrier--;
-       conf->nr_waiting--;
+       conf->array_frozen = 0;
        wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
 }
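
With array_frozen as its own flag, freeze_array()/unfreeze_array() become a self-describing quiesce bracket that no longer perturbs the barrier or waiter counts; later hunks switch stop() and raid1_quiesce() over to it. The usage shape, sketched:

/* Illustrative bracket; the real call sites appear in the hunks below. */
freeze_array(conf, 0);  /* block new normal and sync IO, wait for quiesce */
/* ... array is idle: safe to tear down pools or swap configuration ... */
unfreeze_array(conf);   /* clear array_frozen and wake all waiters */
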
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        int first_clone;
        int sectors_handled;
        int max_sectors;
+       sector_t start_next_window;
 
        /*
         * Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
                finish_wait(&conf->wait_barrier, &w);
        }
 
-       wait_barrier(conf);
+       start_next_window = wait_barrier(conf, bio);
 
        bitmap = mddev->bitmap;
 
@@ -1163,6 +1247,7 @@ read_again:
 
        disks = conf->raid_disks * 2;
  retry_write:
+       r1_bio->start_next_window = start_next_window;
        blocked_rdev = NULL;
        rcu_read_lock();
        max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
        if (unlikely(blocked_rdev)) {
                /* Wait for this device to become unblocked */
                int j;
+               sector_t old = start_next_window;
 
                for (j = 0; j < i; j++)
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
                r1_bio->state = 0;
-               allow_barrier(conf);
+               allow_barrier(conf, start_next_window, bio->bi_sector);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
-               wait_barrier(conf);
+               start_next_window = wait_barrier(conf, bio);
+               /*
+                * We must make sure the multiple r1bios of this bio
+                * all use the same value of bi_phys_segments
+                */
+               if (bio->bi_phys_segments && old &&
+                   old != start_next_window)
+                       /* Wait for the former r1bio(s) to complete */
+                       wait_event(conf->wait_barrier,
+                                  bio->bi_phys_segments == 1);
                goto retry_write;
        }
 
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
 
 static void close_sync(struct r1conf *conf)
 {
-       wait_barrier(conf);
-       allow_barrier(conf);
+       wait_barrier(conf, NULL);
+       allow_barrier(conf, 0, 0);
 
        mempool_destroy(conf->r1buf_pool);
        conf->r1buf_pool = NULL;
+
+       conf->next_resync = 0;
+       conf->start_next_window = MaxSector;
 }
 
 static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        conf->pending_count = 0;
        conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+       conf->start_next_window = MaxSector;
+       conf->current_window_requests = conf->next_window_requests = 0;
+
        err = -EIO;
        for (i = 0; i < conf->raid_disks * 2; i++) {
 
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
                           atomic_read(&bitmap->behind_writes) == 0);
        }
 
-       raise_barrier(conf);
-       lower_barrier(conf);
+       freeze_array(conf, 0);
+       unfreeze_array(conf);
 
        md_unregister_thread(&mddev->thread);
        if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
                wake_up(&conf->wait_barrier);
                break;
        case 1:
-               raise_barrier(conf);
+               freeze_array(conf, 0);
                break;
        case 0:
-               lower_barrier(conf);
+               unfreeze_array(conf);
                break;
        }
 }
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
                mddev->new_chunk_sectors = 0;
                conf = setup_conf(mddev);
                if (!IS_ERR(conf))
-                       conf->barrier = 1;
+                       /* Array must appear to be quiesced */
+                       conf->array_frozen = 1;
                return conf;
        }
        return ERR_PTR(-EINVAL);
index 0ff3715..9bebca7 100644
@@ -41,6 +41,19 @@ struct r1conf {
         */
        sector_t                next_resync;
 
+       /* When raid1 starts a resync, we divide the array into four partitions:
+        * |---------|--------------|---------------------|-------------|
+        *        next_resync   start_next_window       end_window
+        * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
+        * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
+        * current_window_requests is the count of normal IO between
+        *   start_next_window and end_window.
+        * next_window_requests is the count of normal IO after end_window.
+        */
+       sector_t                start_next_window;
+       int                     current_window_requests;
+       int                     next_window_requests;
+
        spinlock_t              device_lock;
 
        /* list of 'struct r1bio' that need to be processed by raid1d,
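
A worked instance of the partition diagram above, assuming next_resync = 0 and NEXT_NORMALIO_DISTANCE = 12288 sectors: start_next_window = 12288 and end_window = 24576, so a normal write at sector 20000 is counted in current_window_requests while one at sector 30000 is counted in next_window_requests. As a standalone classifier:

/* Standalone sketch of the window bookkeeping; numbers are assumed. */
#include <stdio.h>

#define NEXT_NORMALIO_DISTANCE 12288

int main(void)
{
        long next_resync = 0;
        long start_next_window = next_resync + NEXT_NORMALIO_DISTANCE;
        long end_window = start_next_window + NEXT_NORMALIO_DISTANCE;
        long sectors[] = { 20000, 30000 };

        for (int i = 0; i < 2; i++) {
                if (sectors[i] >= end_window)
                        printf("%ld -> next_window_requests\n", sectors[i]);
                else if (sectors[i] >= start_next_window)
                        printf("%ld -> current_window_requests\n", sectors[i]);
                else
                        printf("%ld -> within resync reach\n", sectors[i]);
        }
        return 0;
}
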
@@ -65,6 +78,7 @@ struct r1conf {
        int                     nr_waiting;
        int                     nr_queued;
        int                     barrier;
+       int                     array_frozen;
 
        /* Set to 1 if a full sync is needed, (fresh device added).
         * Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
                                                 * in this BehindIO request
                                                 */
        sector_t                sector;
+       sector_t                start_next_window;
        int                     sectors;
        unsigned long           state;
        struct mddev            *mddev;
index 7c3508a..c504e83 100644
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait, mddev->flags == 0 ||
-                          kthread_should_stop());
+                          test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+                       allow_barrier(conf);
+                       return sectors_done;
+               }
                conf->reshape_safe = mddev->reshape_position;
                allow_barrier(conf);
        }
index 7f0e17a..47da0af 100644
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
        return &conf->stripe_hashtbl[hash];
 }
 
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+       return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+       spin_lock_irq(conf->hash_locks + hash);
+       spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+       spin_unlock(&conf->device_lock);
+       spin_unlock_irq(conf->hash_locks + hash);
+}
+
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+       int i;
+       local_irq_disable();
+       spin_lock(conf->hash_locks);
+       for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+               spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+       spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+       int i;
+       spin_unlock(&conf->device_lock);
+       for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+               spin_unlock(conf->hash_locks + i - 1);
+       local_irq_enable();
+}
+
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
  * a bio could span several devices.
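
The helpers above fix a strict lock order, hash_locks[0..N-1] and then device_lock, with spin_lock_nest_lock() telling lockdep that the nested acquisitions of one lock class are intentional. The hash spreads consecutive stripes across buckets; a standalone check assuming STRIPE_SHIFT = PAGE_SHIFT - 9 = 3 (4 KiB pages) and NR_STRIPE_HASH_LOCKS = 8 from raid5.h (that header is not part of this excerpt):

/* Standalone check of the bucket spread; both constants are assumptions. */
#include <stdio.h>

#define STRIPE_SHIFT 3                  /* PAGE_SHIFT - 9 on 4 KiB pages */
#define STRIPE_HASH_LOCKS_MASK 7        /* NR_STRIPE_HASH_LOCKS - 1 */

static int stripe_hash_locks_hash(long sect)
{
        return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

int main(void)
{
        /* consecutive stripes (8 sectors apart) rotate through all locks */
        for (long sect = 0; sect < 64; sect += 8)
                printf("sector %2ld -> hash lock %d\n",
                       sect, stripe_hash_locks_hash(sect));
        return 0;
}
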
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
        }
 }
 
-static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+                             struct list_head *temp_inactive_list)
 {
        BUG_ON(!list_empty(&sh->lru));
        BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
                            < IO_THRESHOLD)
                                md_wakeup_thread(conf->mddev->thread);
                atomic_dec(&conf->active_stripes);
-               if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-                       list_add_tail(&sh->lru, &conf->inactive_list);
-                       wake_up(&conf->wait_for_stripe);
-                       if (conf->retry_read_aligned)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
+               if (!test_bit(STRIPE_EXPANDING, &sh->state))
+                       list_add_tail(&sh->lru, temp_inactive_list);
        }
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+                            struct list_head *temp_inactive_list)
 {
        if (atomic_dec_and_test(&sh->count))
-               do_release_stripe(conf, sh);
+               do_release_stripe(conf, sh, temp_inactive_list);
+}
+
+/*
+ * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is
+ * an array of per-hash inactive lists.
+ *
+ * Be careful: only one task can add/delete stripes from temp_inactive_list
+ * at a given time. Adding takes only the device lock; deleting takes only
+ * the hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+                                        struct list_head *temp_inactive_list,
+                                        int hash)
+{
+       int size;
+       bool do_wakeup = false;
+       unsigned long flags;
+
+       if (hash == NR_STRIPE_HASH_LOCKS) {
+               size = NR_STRIPE_HASH_LOCKS;
+               hash = NR_STRIPE_HASH_LOCKS - 1;
+       } else
+               size = 1;
+       while (size) {
+               struct list_head *list = &temp_inactive_list[size - 1];
+
+               /*
+                * We don't hold any lock here yet, so get_active_stripe()
+                * might remove stripes from the list.
+                */
+               if (!list_empty_careful(list)) {
+                       spin_lock_irqsave(conf->hash_locks + hash, flags);
+                       if (list_empty(conf->inactive_list + hash) &&
+                           !list_empty(list))
+                               atomic_dec(&conf->empty_inactive_list_nr);
+                       list_splice_tail_init(list, conf->inactive_list + hash);
+                       do_wakeup = true;
+                       spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+               }
+               size--;
+               hash--;
+       }
+
+       if (do_wakeup) {
+               wake_up(&conf->wait_for_stripe);
+               if (conf->retry_read_aligned)
+                       md_wakeup_thread(conf->mddev->thread);
+       }
 }
 
 /* should hold conf->device_lock already */
-static int release_stripe_list(struct r5conf *conf)
+static int release_stripe_list(struct r5conf *conf,
+                              struct list_head *temp_inactive_list)
 {
        struct stripe_head *sh;
        int count = 0;
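
Per the comment above, release_inactive_stripe_list() has two call shapes: a single per-bucket list identified by its hash, or hash == NR_STRIPE_HASH_LOCKS to splice a whole array of per-bucket lists in one pass. Sketched (one_list and temp_inactive_list stand for the callers' local lists):

/* Illustrative call shapes; real call sites appear in the hunks below. */
release_inactive_stripe_list(conf, &one_list, sh->hash_lock_index);
release_inactive_stripe_list(conf, temp_inactive_list, NR_STRIPE_HASH_LOCKS);
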
@@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
        head = llist_del_all(&conf->released_stripes);
        head = llist_reverse_order(head);
        while (head) {
+               int hash;
+
                sh = llist_entry(head, struct stripe_head, release_list);
                head = llist_next(head);
                /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
@@ -313,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
                 * again, the count is always > 1. This is true for
                 * STRIPE_ON_UNPLUG_LIST bit too.
                 */
-               __release_stripe(conf, sh);
+               hash = sh->hash_lock_index;
+               __release_stripe(conf, sh, &temp_inactive_list[hash]);
                count++;
        }
 
@@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
        unsigned long flags;
+       struct list_head list;
+       int hash;
        bool wakeup;
 
-       if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
+       if (unlikely(!conf->mddev->thread) ||
+               test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
                goto slow_path;
        wakeup = llist_add(&sh->release_list, &conf->released_stripes);
        if (wakeup)
@@ -336,8 +424,11 @@ slow_path:
        local_irq_save(flags);
        /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
        if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
-               do_release_stripe(conf, sh);
+               INIT_LIST_HEAD(&list);
+               hash = sh->hash_lock_index;
+               do_release_stripe(conf, sh, &list);
                spin_unlock(&conf->device_lock);
+               release_inactive_stripe_list(conf, &list, hash);
        }
        local_irq_restore(flags);
 }
@@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 
 
 /* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(struct r5conf *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh = NULL;
        struct list_head *first;
 
-       if (list_empty(&conf->inactive_list))
+       if (list_empty(conf->inactive_list + hash))
                goto out;
-       first = conf->inactive_list.next;
+       first = (conf->inactive_list + hash)->next;
        sh = list_entry(first, struct stripe_head, lru);
        list_del_init(first);
        remove_hash(sh);
        atomic_inc(&conf->active_stripes);
+       BUG_ON(hash != sh->hash_lock_index);
+       if (list_empty(conf->inactive_list + hash))
+               atomic_inc(&conf->empty_inactive_list_nr);
 out:
        return sh;
 }
@@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
        struct r5conf *conf = sh->raid_conf;
-       int i;
+       int i, seq;
 
        BUG_ON(atomic_read(&sh->count) != 0);
        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                (unsigned long long)sh->sector);
 
        remove_hash(sh);
-
+retry:
+       seq = read_seqcount_begin(&conf->gen_lock);
        sh->generation = conf->generation - previous;
        sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
        sh->sector = sector;
@@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                dev->flags = 0;
                raid5_build_block(sh, i, previous);
        }
+       if (read_seqcount_retry(&conf->gen_lock, seq))
+               goto retry;
        insert_hash(conf, sh);
        sh->cpu = smp_processor_id();
 }
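
The retry loop added to init_stripe() is the standard seqcount read-side pattern: sample the sequence, read the generation-dependent fields, and redo the work if a reshape writer raced in. The generic shape, as a sketch (writer side elided; conf->gen_lock is seqcount_init'd in a later hunk):

/* Generic seqcount read-retry shape used above. */
unsigned int seq;

do {
        seq = read_seqcount_begin(&conf->gen_lock);
        /* ... read conf->generation and the disk counts it guards ... */
} while (read_seqcount_retry(&conf->gen_lock, seq));
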
@@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                  int previous, int noblock, int noquiesce)
 {
        struct stripe_head *sh;
+       int hash = stripe_hash_locks_hash(sector);
 
        pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 
-       spin_lock_irq(&conf->device_lock);
+       spin_lock_irq(conf->hash_locks + hash);
 
        do {
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0 || noquiesce,
-                                   conf->device_lock);
+                                   *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
                        if (!conf->inactive_blocked)
-                               sh = get_free_stripe(conf);
+                               sh = get_free_stripe(conf, hash);
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
                                conf->inactive_blocked = 1;
-                               wait_event_lock_irq(conf->wait_for_stripe,
-                                                   !list_empty(&conf->inactive_list) &&
-                                                   (atomic_read(&conf->active_stripes)
-                                                    < (conf->max_nr_stripes *3/4)
-                                                    || !conf->inactive_blocked),
-                                                   conf->device_lock);
+                               wait_event_lock_irq(
+                                       conf->wait_for_stripe,
+                                       !list_empty(conf->inactive_list + hash) &&
+                                       (atomic_read(&conf->active_stripes)
+                                        < (conf->max_nr_stripes * 3 / 4)
+                                        || !conf->inactive_blocked),
+                                       *(conf->hash_locks + hash));
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector, previous);
@@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
                                    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
                        } else {
+                               spin_lock(&conf->device_lock);
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
                                if (list_empty(&sh->lru) &&
+                                   !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) &&
                                    !test_bit(STRIPE_EXPANDING, &sh->state))
                                        BUG();
                                list_del_init(&sh->lru);
@@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                        sh->group->stripes_cnt--;
                                        sh->group = NULL;
                                }
+                               spin_unlock(&conf->device_lock);
                        }
                }
        } while (sh == NULL);
@@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
        if (sh)
                atomic_inc(&sh->count);
 
-       spin_unlock_irq(&conf->device_lock);
+       spin_unlock_irq(conf->hash_locks + hash);
        return sh;
 }
 
@@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                bi->bi_sector = (sh->sector
                                                 + rdev->data_offset);
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
-                               bi->bi_rw |= REQ_FLUSH;
+                               bi->bi_rw |= REQ_NOMERGE;
 
                        bi->bi_vcnt = 1;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf)
+static int grow_one_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh;
        sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf)
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
+       sh->hash_lock_index = hash;
        /* we just created an active stripe so... */
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
@@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
+       int hash;
 
        if (conf->mddev->gendisk)
                sprintf(conf->cache_name[0],
@@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num)
                return 1;
        conf->slab_cache = sc;
        conf->pool_size = devs;
-       while (num--)
-               if (!grow_one_stripe(conf))
+       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+       while (num--) {
+               if (!grow_one_stripe(conf, hash))
                        return 1;
+               conf->max_nr_stripes++;
+               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
+       }
        return 0;
 }
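
grow_stripes() now deals stripes onto the hash buckets round-robin, starting at max_nr_stripes % NR_STRIPE_HASH_LOCKS so that repeated grow calls keep the buckets balanced. A standalone check of the distribution (bucket count of 8 assumed):

/* Standalone check of the round-robin placement; 8 buckets assumed. */
#include <stdio.h>

#define NR_STRIPE_HASH_LOCKS 8

int main(void)
{
        int per_hash[NR_STRIPE_HASH_LOCKS] = { 0 };
        int max_nr_stripes = 0;
        int num = 256;
        int hash = max_nr_stripes % NR_STRIPE_HASH_LOCKS;

        while (num--) {
                per_hash[hash]++;
                max_nr_stripes++;
                hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
        }
        for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
                printf("hash %d: %d stripes\n", hash, per_hash[hash]);
        /* every bucket ends up with 256 / 8 = 32 stripes */
        return 0;
}
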
 
@@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        int err;
        struct kmem_cache *sc;
        int i;
+       int hash, cnt;
 
        if (newsize <= conf->pool_size)
                return 0; /* never bother to shrink */
@@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         * OK, we have enough stripes, start collecting inactive
         * stripes and copying them over
         */
+       hash = 0;
+       cnt = 0;
        list_for_each_entry(nsh, &newstripes, lru) {
-               spin_lock_irq(&conf->device_lock);
-               wait_event_lock_irq(conf->wait_for_stripe,
-                                   !list_empty(&conf->inactive_list),
-                                   conf->device_lock);
-               osh = get_free_stripe(conf);
-               spin_unlock_irq(&conf->device_lock);
+               lock_device_hash_lock(conf, hash);
+               wait_event_cmd(conf->wait_for_stripe,
+                                   !list_empty(conf->inactive_list + hash),
+                                   unlock_device_hash_lock(conf, hash),
+                                   lock_device_hash_lock(conf, hash));
+               osh = get_free_stripe(conf, hash);
+               unlock_device_hash_lock(conf, hash);
                atomic_set(&nsh->count, 1);
                for(i=0; i<conf->pool_size; i++)
                        nsh->dev[i].page = osh->dev[i].page;
                for( ; i<newsize; i++)
                        nsh->dev[i].page = NULL;
+               nsh->hash_lock_index = hash;
                kmem_cache_free(conf->slab_cache, osh);
+               cnt++;
+               if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+                   !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+                       hash++;
+                       cnt = 0;
+               }
        }
        kmem_cache_destroy(conf->slab_cache);
 
@@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf)
+static int drop_one_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh;
 
-       spin_lock_irq(&conf->device_lock);
-       sh = get_free_stripe(conf);
-       spin_unlock_irq(&conf->device_lock);
+       spin_lock_irq(conf->hash_locks + hash);
+       sh = get_free_stripe(conf, hash);
+       spin_unlock_irq(conf->hash_locks + hash);
        if (!sh)
                return 0;
        BUG_ON(atomic_read(&sh->count));
@@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf)
 
 static void shrink_stripes(struct r5conf *conf)
 {
-       while (drop_one_stripe(conf))
-               ;
+       int hash;
+       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
+               while (drop_one_stripe(conf, hash))
+                       ;
 
        if (conf->slab_cache)
                kmem_cache_destroy(conf->slab_cache);
@@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
                               mdname(conf->mddev), bdn);
                else
                        retry = 1;
+               if (set_bad && test_bit(In_sync, &rdev->flags)
+                   && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+                       retry = 1;
                if (retry)
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
                                set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
        }
 }
 
-static void activate_bit_delay(struct r5conf *conf)
+static void activate_bit_delay(struct r5conf *conf,
+       struct list_head *temp_inactive_list)
 {
        /* device_lock is held */
        struct list_head head;
@@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf *conf)
        list_del_init(&conf->bitmap_list);
        while (!list_empty(&head)) {
                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+               int hash;
                list_del_init(&sh->lru);
                atomic_inc(&sh->count);
-               __release_stripe(conf, sh);
+               hash = sh->hash_lock_index;
+               __release_stripe(conf, sh, &temp_inactive_list[hash]);
        }
 }
 
@@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
                return 1;
        if (conf->quiesce)
                return 1;
-       if (list_empty_careful(&conf->inactive_list))
+       if (atomic_read(&conf->empty_inactive_list_nr))
                return 1;
 
        return 0;
@@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 struct raid5_plug_cb {
        struct blk_plug_cb      cb;
        struct list_head        list;
+       struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 };
 
 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
        struct mddev *mddev = cb->cb.data;
        struct r5conf *conf = mddev->private;
        int cnt = 0;
+       int hash;
 
        if (cb->list.next && !list_empty(&cb->list)) {
                spin_lock_irq(&conf->device_lock);
@@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
                         * STRIPE_ON_RELEASE_LIST could be set here. In that
                         * case, the count is always > 1 here
                         */
-                       __release_stripe(conf, sh);
+                       hash = sh->hash_lock_index;
+                       __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
                        cnt++;
                }
                spin_unlock_irq(&conf->device_lock);
        }
+       release_inactive_stripe_list(conf, cb->temp_inactive_list,
+                                    NR_STRIPE_HASH_LOCKS);
        if (mddev->queue)
                trace_block_unplug(mddev->queue, cnt, !from_schedule);
        kfree(cb);
@@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev,
 
        cb = container_of(blk_cb, struct raid5_plug_cb, cb);
 
-       if (cb->list.next == NULL)
+       if (cb->list.next == NULL) {
+               int i;
                INIT_LIST_HEAD(&cb->list);
+               for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+                       INIT_LIST_HEAD(cb->temp_inactive_list + i);
+       }
 
        if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
                list_add_tail(&sh->lru, &cb->list);
@@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
            time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
                /* Cannot proceed until we've updated the superblock... */
                wait_event(conf->wait_for_overlap,
-                          atomic_read(&conf->reshape_stripes)==0);
+                          atomic_read(&conf->reshape_stripes)==0
+                          || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+               if (atomic_read(&conf->reshape_stripes) != 0)
+                       return 0;
                mddev->reshape_position = conf->reshape_progress;
                mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait, mddev->flags == 0 ||
-                          kthread_should_stop());
+                          test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                       return 0;
                spin_lock_irq(&conf->device_lock);
                conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
@@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
            >= mddev->resync_max - mddev->curr_resync_completed) {
                /* Cannot proceed until we've updated the superblock... */
                wait_event(conf->wait_for_overlap,
-                          atomic_read(&conf->reshape_stripes) == 0);
+                          atomic_read(&conf->reshape_stripes) == 0
+                          || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+               if (atomic_read(&conf->reshape_stripes) != 0)
+                       goto ret;
                mddev->reshape_position = conf->reshape_progress;
                mddev->curr_resync_completed = sector_nr;
                conf->reshape_checkpoint = jiffies;
@@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
                           !test_bit(MD_CHANGE_DEVS, &mddev->flags)
-                          || kthread_should_stop());
+                          || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+               if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+                       goto ret;
                spin_lock_irq(&conf->device_lock);
                conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        }
+ret:
        return reshape_sectors;
 }
 
@@ -4954,27 +5101,45 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 }
 
 static int handle_active_stripes(struct r5conf *conf, int group,
-                                struct r5worker *worker)
+                                struct r5worker *worker,
+                                struct list_head *temp_inactive_list)
 {
        struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
-       int i, batch_size = 0;
+       int i, batch_size = 0, hash;
+       bool release_inactive = false;
 
        while (batch_size < MAX_STRIPE_BATCH &&
                        (sh = __get_priority_stripe(conf, group)) != NULL)
                batch[batch_size++] = sh;
 
-       if (batch_size == 0)
-               return batch_size;
+       if (batch_size == 0) {
+               for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+                       if (!list_empty(temp_inactive_list + i))
+                               break;
+               if (i == NR_STRIPE_HASH_LOCKS)
+                       return batch_size;
+               release_inactive = true;
+       }
        spin_unlock_irq(&conf->device_lock);
 
+       release_inactive_stripe_list(conf, temp_inactive_list,
+                                    NR_STRIPE_HASH_LOCKS);
+
+       if (release_inactive) {
+               spin_lock_irq(&conf->device_lock);
+               return 0;
+       }
+
        for (i = 0; i < batch_size; i++)
                handle_stripe(batch[i]);
 
        cond_resched();
 
        spin_lock_irq(&conf->device_lock);
-       for (i = 0; i < batch_size; i++)
-               __release_stripe(conf, batch[i]);
+       for (i = 0; i < batch_size; i++) {
+               hash = batch[i]->hash_lock_index;
+               __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+       }
        return batch_size;
 }
 
@@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work)
        while (1) {
                int batch_size, released;
 
-               released = release_stripe_list(conf);
+               released = release_stripe_list(conf, worker->temp_inactive_list);
 
-               batch_size = handle_active_stripes(conf, group_id, worker);
+               batch_size = handle_active_stripes(conf, group_id, worker,
+                                                  worker->temp_inactive_list);
                worker->working = false;
                if (!batch_size && !released)
                        break;
@@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread)
                struct bio *bio;
                int batch_size, released;
 
-               released = release_stripe_list(conf);
+               released = release_stripe_list(conf, conf->temp_inactive_list);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread)
                        bitmap_unplug(mddev->bitmap);
                        spin_lock_irq(&conf->device_lock);
                        conf->seq_write = conf->seq_flush;
-                       activate_bit_delay(conf);
+                       activate_bit_delay(conf, conf->temp_inactive_list);
                }
                raid5_activate_delayed(conf);
 
@@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread)
                        handled++;
                }
 
-               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
+               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+                                                  conf->temp_inactive_list);
                if (!batch_size && !released)
                        break;
                handled += batch_size;
@@ -5096,22 +5263,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
        struct r5conf *conf = mddev->private;
        int err;
+       int hash;
 
        if (size <= 16 || size > 32768)
                return -EINVAL;
+       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
        while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf))
+               if (drop_one_stripe(conf, hash))
                        conf->max_nr_stripes--;
                else
                        break;
+               hash--;
+               if (hash < 0)
+                       hash = NR_STRIPE_HASH_LOCKS - 1;
        }
        err = md_allow_write(mddev);
        if (err)
                return err;
+       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf))
+               if (grow_one_stripe(conf, hash))
                        conf->max_nr_stripes++;
                else break;
+               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
        }
        return 0;
 }
@@ -5199,15 +5373,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
                return 0;
 }
 
-static int alloc_thread_groups(struct r5conf *conf, int cnt);
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+                              int *group_cnt,
+                              int *worker_cnt_per_group,
+                              struct r5worker_group **worker_groups);
 static ssize_t
 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
 {
        struct r5conf *conf = mddev->private;
        unsigned long new;
        int err;
-       struct r5worker_group *old_groups;
-       int old_group_cnt;
+       struct r5worker_group *new_groups, *old_groups;
+       int group_cnt, worker_cnt_per_group;
 
        if (len >= PAGE_SIZE)
                return -EINVAL;
@@ -5223,14 +5400,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
        mddev_suspend(mddev);
 
        old_groups = conf->worker_groups;
-       old_group_cnt = conf->worker_cnt_per_group;
+       if (old_groups)
+               flush_workqueue(raid5_wq);
+
+       err = alloc_thread_groups(conf, new,
+                                 &group_cnt, &worker_cnt_per_group,
+                                 &new_groups);
+       if (!err) {
+               spin_lock_irq(&conf->device_lock);
+               conf->group_cnt = group_cnt;
+               conf->worker_cnt_per_group = worker_cnt_per_group;
+               conf->worker_groups = new_groups;
+               spin_unlock_irq(&conf->device_lock);
 
-       conf->worker_groups = NULL;
-       err = alloc_thread_groups(conf, new);
-       if (err) {
-               conf->worker_groups = old_groups;
-               conf->worker_cnt_per_group = old_group_cnt;
-       } else {
                if (old_groups)
                        kfree(old_groups[0].workers);
                kfree(old_groups);
@@ -5260,40 +5442,47 @@ static struct attribute_group raid5_attrs_group = {
        .attrs = raid5_attrs,
 };
 
-static int alloc_thread_groups(struct r5conf *conf, int cnt)
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+                              int *group_cnt,
+                              int *worker_cnt_per_group,
+                              struct r5worker_group **worker_groups)
 {
-       int i, j;
+       int i, j, k;
        ssize_t size;
        struct r5worker *workers;
 
-       conf->worker_cnt_per_group = cnt;
+       *worker_cnt_per_group = cnt;
        if (cnt == 0) {
-               conf->worker_groups = NULL;
+               *group_cnt = 0;
+               *worker_groups = NULL;
                return 0;
        }
-       conf->group_cnt = num_possible_nodes();
+       *group_cnt = num_possible_nodes();
        size = sizeof(struct r5worker) * cnt;
-       workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
-       conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
-                               conf->group_cnt, GFP_NOIO);
-       if (!conf->worker_groups || !workers) {
+       workers = kzalloc(size * *group_cnt, GFP_NOIO);
+       *worker_groups = kzalloc(sizeof(struct r5worker_group) *
+                               *group_cnt, GFP_NOIO);
+       if (!*worker_groups || !workers) {
                kfree(workers);
-               kfree(conf->worker_groups);
-               conf->worker_groups = NULL;
+               kfree(*worker_groups);
                return -ENOMEM;
        }
 
-       for (i = 0; i < conf->group_cnt; i++) {
+       for (i = 0; i < *group_cnt; i++) {
                struct r5worker_group *group;
 
-               group = &conf->worker_groups[i];
+               group = &(*worker_groups)[i];
                INIT_LIST_HEAD(&group->handle_list);
                group->conf = conf;
                group->workers = workers + i * cnt;
 
                for (j = 0; j < cnt; j++) {
-                       group->workers[j].group = group;
-                       INIT_WORK(&group->workers[j].work, raid5_do_work);
+                       struct r5worker *worker = group->workers + j;
+                       worker->group = group;
+                       INIT_WORK(&worker->work, raid5_do_work);
+
+                       for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+                               INIT_LIST_HEAD(worker->temp_inactive_list + k);
                }
        }
 
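alloc_thread_groups() now returns its results through out-parameters instead of writing conf->group_cnt and friends directly. That lets raid5_store_group_thread_cnt() perform the sleeping allocations with no locks held and publish all three fields atomically under conf->device_lock, so readers never observe a half-updated or NULL worker_groups as they could with the old conf->worker_groups = NULL dance. A minimal sketch of this allocate-then-publish shape, using hypothetical user-space types rather than the kernel structures:

#include <pthread.h>
#include <stdlib.h>

struct groups { int cnt; };

static pthread_mutex_t conf_lock = PTHREAD_MUTEX_INITIALIZER;
static struct groups *live_groups;      /* what readers dereference */

static int alloc_groups(int cnt, struct groups **out)
{
        struct groups *g = malloc(sizeof(*g));  /* may sleep, no lock held */

        if (!g)
                return -1;
        g->cnt = cnt;
        *out = g;               /* hand the result back, touch no globals */
        return 0;
}

int resize_groups(int cnt)
{
        struct groups *newg, *oldg;

        if (alloc_groups(cnt, &newg))
                return -1;
        pthread_mutex_lock(&conf_lock); /* publish in one critical section */
        oldg = live_groups;
        live_groups = newg;
        pthread_mutex_unlock(&conf_lock);
        free(oldg);                     /* old groups torn down afterwards */
        return 0;
}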
@@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        struct md_rdev *rdev;
        struct disk_info *disk;
        char pers_name[6];
+       int i;
+       int group_cnt, worker_cnt_per_group;
+       struct r5worker_group *new_group;
 
        if (mddev->new_level != 5
            && mddev->new_level != 4
@@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        if (conf == NULL)
                goto abort;
        /* Don't enable multi-threading by default */
-       if (alloc_thread_groups(conf, 0))
+       if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
+                                &new_group)) {
+               conf->group_cnt = group_cnt;
+               conf->worker_cnt_per_group = worker_cnt_per_group;
+               conf->worker_groups = new_group;
+       } else
                goto abort;
        spin_lock_init(&conf->device_lock);
        seqcount_init(&conf->gen_lock);
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->hold_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
-       INIT_LIST_HEAD(&conf->inactive_list);
        init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
@@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                goto abort;
 
+       /* We init hash_locks[0] separately so that it can be used
+        * as the reference lock in the spin_lock_nest_lock() call
+        * in lock_all_device_hash_locks_irq in order to convince
+        * lockdep that we know what we are doing.
+        */
+       spin_lock_init(conf->hash_locks);
+       for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+               spin_lock_init(conf->hash_locks + i);
+
+       for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+               INIT_LIST_HEAD(conf->inactive_list + i);
+
+       for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+               INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
        conf->level = mddev->new_level;
        if (raid5_alloc_percpu(conf) != 0)
                goto abort;
@@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        else
                conf->max_degraded = 1;
        conf->algorithm = mddev->new_layout;
-       conf->max_nr_stripes = NR_STRIPES;
        conf->reshape_progress = mddev->reshape_position;
        if (conf->reshape_progress != MaxSector) {
                conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-       if (grow_stripes(conf, conf->max_nr_stripes)) {
+       atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
+       if (grow_stripes(conf, NR_STRIPES)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
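
The comment introduced above refers to lock_all_device_hash_locks_irq(), which does not appear in this excerpt; upstream the helper is roughly the following, with hash_locks[0] acting as the "nest lock" that tells lockdep the remaining seven acquisitions are one ordered batch rather than genuinely nested lock classes:

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
        int i;

        local_irq_disable();
        spin_lock(conf->hash_locks);
        for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
                spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
        spin_lock(&conf->device_lock);
}

Initialising hash_locks[0] on its own line gives it a lock-class key distinct from the locks initialised in the loop, which is what makes the spin_lock_nest_lock() annotation work.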
@@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev)
        if (!mddev->sync_thread) {
                mddev->recovery = 0;
                spin_lock_irq(&conf->device_lock);
+               write_seqcount_begin(&conf->gen_lock);
                mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+               mddev->new_chunk_sectors =
+                       conf->chunk_sectors = conf->prev_chunk_sectors;
+               mddev->new_layout = conf->algorithm = conf->prev_algo;
                rdev_for_each(rdev, mddev)
                        rdev->new_data_offset = rdev->data_offset;
                smp_wmb();
+               conf->generation--;
                conf->reshape_progress = MaxSector;
                mddev->reshape_position = MaxSector;
+               write_seqcount_end(&conf->gen_lock);
                spin_unlock_irq(&conf->device_lock);
                return -EAGAIN;
        }
@@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
                break;
 
        case 1: /* stop all writes */
-               spin_lock_irq(&conf->device_lock);
+               lock_all_device_hash_locks_irq(conf);
                /* '2' tells resync/reshape to pause so that all
                 * active stripes can drain
                 */
                conf->quiesce = 2;
-               wait_event_lock_irq(conf->wait_for_stripe,
+               wait_event_cmd(conf->wait_for_stripe,
                                    atomic_read(&conf->active_stripes) == 0 &&
                                    atomic_read(&conf->active_aligned_reads) == 0,
-                                   conf->device_lock);
+                                   unlock_all_device_hash_locks_irq(conf),
+                                   lock_all_device_hash_locks_irq(conf));
                conf->quiesce = 1;
-               spin_unlock_irq(&conf->device_lock);
+               unlock_all_device_hash_locks_irq(conf);
                /* allow reshape to continue */
                wake_up(&conf->wait_for_overlap);
                break;
 
        case 0: /* re-enable writes */
-               spin_lock_irq(&conf->device_lock);
+               lock_all_device_hash_locks_irq(conf);
                conf->quiesce = 0;
                wake_up(&conf->wait_for_stripe);
                wake_up(&conf->wait_for_overlap);
-               spin_unlock_irq(&conf->device_lock);
+               unlock_all_device_hash_locks_irq(conf);
                break;
        }
 }
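
wait_event_cmd() is needed here because the sleeper must release all eight hash locks before scheduling and retake them before rechecking the condition; wait_event_lock_irq() can only drop and reacquire a single lock. Approximately, the macro behaves like this (simplified from include/linux/wait.h; the real version goes through ___wait_event()):

/*
 *      for (;;) {
 *              prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
 *              if (condition)
 *                      break;
 *              cmd1;           // here: unlock_all_device_hash_locks_irq(conf)
 *              schedule();
 *              cmd2;           // here: lock_all_device_hash_locks_irq(conf)
 *      }
 *      finish_wait(&wq, &__wait);
 */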
index b42e6b4..01ad8ae 100644 (file)
@@ -205,6 +205,7 @@ struct stripe_head {
        short                   pd_idx;         /* parity disk index */
        short                   qd_idx;         /* 'Q' disk index for raid6 */
        short                   ddf_layout;/* use DDF ordering to calculate Q */
+       short                   hash_lock_index;
        unsigned long           state;          /* state flags */
        atomic_t                count;        /* nr of active thread/requests */
        int                     bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
        struct md_rdev  *rdev, *replacement;
 };
 
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
 struct r5worker {
        struct work_struct work;
        struct r5worker_group *group;
+       struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
        bool working;
 };
 
@@ -382,6 +392,8 @@ struct r5worker_group {
 
 struct r5conf {
        struct hlist_head       *stripe_hashtbl;
+       /* only protect corresponding hash list and inactive_list */
+       spinlock_t              hash_locks[NR_STRIPE_HASH_LOCKS];
        struct mddev            *mddev;
        int                     chunk_sectors;
        int                     level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
         * Free stripes pool
         */
        atomic_t                active_stripes;
-       struct list_head        inactive_list;
+       struct list_head        inactive_list[NR_STRIPE_HASH_LOCKS];
+       atomic_t                empty_inactive_list_nr;
        struct llist_head       released_stripes;
        wait_queue_head_t       wait_for_stripe;
        wait_queue_head_t       wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
         * the new thread here until we fully activate the array.
         */
        struct md_thread        *thread;
+       struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
        struct r5worker_group   *worker_groups;
        int                     group_cnt;
        int                     worker_cnt_per_group;
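
The new fields split what used to be one device_lock-protected pool: each of the eight hash_locks covers one slice of stripe_hashtbl plus the matching inactive_list, and empty_inactive_list_nr lets the allocator notice quickly when every slice is drained. A small user-space model of the lock-striping idea (the hash function here is hypothetical; the kernel derives the index from the stripe's sector hash):

#include <stdio.h>

#define NR_HASH                 256     /* stripe_hashtbl buckets, illustrative */
#define NR_STRIPE_HASH_LOCKS    8
#define STRIPE_HASH_LOCKS_MASK  (NR_STRIPE_HASH_LOCKS - 1)

static int lock_index(unsigned long sector)
{
        unsigned long bucket = sector % NR_HASH;        /* stand-in hash */

        return bucket & STRIPE_HASH_LOCKS_MASK;         /* lock for that slice */
}

int main(void)
{
        for (unsigned long s = 0; s < 16; s++)
                printf("sector %2lu -> hash_locks[%d]\n", s, lock_index(s));
        return 0;
}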
index 36513e8..65cab70 100644 (file)
@@ -341,8 +341,7 @@ static void deinterlace_issue_dma(struct deinterlace_ctx *ctx, int op,
        ctx->xt->dir = DMA_MEM_TO_MEM;
        ctx->xt->src_sgl = false;
        ctx->xt->dst_sgl = true;
-       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT |
-               DMA_COMPL_SKIP_DEST_UNMAP | DMA_COMPL_SKIP_SRC_UNMAP;
+       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
 
        tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags);
        if (tx == NULL) {
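
This hunk, and the similar ones in the next several files, are mechanical fallout from the dmaengine unmap rework visible in the ntb_transport hunks further down: DMA_COMPL_SKIP_SRC_UNMAP and DMA_COMPL_SKIP_DEST_UNMAP are gone because the core no longer unmaps anything behind a client's back; it only unmaps what the client registered through a dmaengine_unmap_data. The change is therefore always of this shape:

        /* old: opt out of the core's implicit unmapping */
        flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT |
                DMA_COMPL_SKIP_DEST_UNMAP | DMA_COMPL_SKIP_SRC_UNMAP;

        /* new: nothing to opt out of any more */
        flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;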
index 6a74ce0..ccdadd6 100644 (file)
@@ -565,7 +565,7 @@ static void buffer_queue(struct videobuf_queue *vq, struct videobuf_buffer *vb)
 
        desc = dmaengine_prep_slave_sg(fh->chan,
                buf->sg, sg_elems, DMA_DEV_TO_MEM,
-               DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
+               DMA_PREP_INTERRUPT);
        if (!desc) {
                spin_lock_irq(&fh->queue_lock);
                list_del_init(&vb->queue);
index 08b18f3..9e2b985 100644 (file)
@@ -633,8 +633,7 @@ static int data_submit_dma(struct fpga_device *priv, struct data_buf *buf)
        struct dma_async_tx_descriptor *tx;
        dma_cookie_t cookie;
        dma_addr_t dst, src;
-       unsigned long dma_flags = DMA_COMPL_SKIP_DEST_UNMAP |
-                                 DMA_COMPL_SKIP_SRC_UNMAP;
+       unsigned long dma_flags = 0;
 
        dst_sg = buf->vb.sglist;
        dst_nents = buf->vb.sglen;
index ef89565..157b570 100644 (file)
@@ -308,8 +308,7 @@ static void sdio_acpi_set_handle(struct sdio_func *func)
        struct mmc_host *host = func->card->host;
        u64 addr = (host->slotno << 16) | func->num;
 
-       ACPI_HANDLE_SET(&func->dev,
-                       acpi_get_child(ACPI_HANDLE(host->parent), addr));
+       acpi_preset_companion(&func->dev, ACPI_HANDLE(host->parent), addr);
 }
 #else
 static inline void sdio_acpi_set_handle(struct sdio_func *func) {}
index d78a97d..59f08c4 100644 (file)
@@ -375,8 +375,7 @@ static int atmel_nand_dma_op(struct mtd_info *mtd, void *buf, int len,
 
        dma_dev = host->dma_chan->device;
 
-       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP |
-               DMA_COMPL_SKIP_DEST_UNMAP;
+       flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT;
 
        phys_addr = dma_map_single(dma_dev->dev, p, len, dir);
        if (dma_mapping_error(dma_dev->dev, phys_addr)) {
index 3dc1a75..8b27522 100644 (file)
@@ -573,8 +573,6 @@ static int dma_xfer(struct fsmc_nand_data *host, void *buffer, int len,
        dma_dev = chan->device;
        dma_addr = dma_map_single(dma_dev->dev, buffer, len, direction);
 
-       flags |= DMA_COMPL_SKIP_SRC_UNMAP | DMA_COMPL_SKIP_DEST_UNMAP;
-
        if (direction == DMA_TO_DEVICE) {
                dma_src = dma_addr;
                dma_dst = host->data_pa;
index 0951f7a..822616e 100644 (file)
@@ -459,8 +459,7 @@ static int ks8842_tx_frame_dma(struct sk_buff *skb, struct net_device *netdev)
                sg_dma_len(&ctl->sg) += 4 - sg_dma_len(&ctl->sg) % 4;
 
        ctl->adesc = dmaengine_prep_slave_sg(ctl->chan,
-               &ctl->sg, 1, DMA_MEM_TO_DEV,
-               DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
+               &ctl->sg, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT);
        if (!ctl->adesc)
                return NETDEV_TX_BUSY;
 
@@ -571,8 +570,7 @@ static int __ks8842_start_new_rx_dma(struct net_device *netdev)
                sg_dma_len(sg) = DMA_BUFFER_SIZE;
 
                ctl->adesc = dmaengine_prep_slave_sg(ctl->chan,
-                       sg, 1, DMA_DEV_TO_MEM,
-                       DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
+                       sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);
 
                if (!ctl->adesc)
                        goto out;
index 12a9e83..d0222f1 100644 (file)
@@ -1034,10 +1034,9 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset,
        struct dma_chan *chan = qp->dma_chan;
        struct dma_device *device;
        size_t pay_off, buff_off;
-       dma_addr_t src, dest;
+       struct dmaengine_unmap_data *unmap;
        dma_cookie_t cookie;
        void *buf = entry->buf;
-       unsigned long flags;
 
        entry->len = len;
 
@@ -1045,35 +1044,49 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset,
                goto err;
 
        if (len < copy_bytes)
-               goto err1;
+               goto err_wait;
 
        device = chan->device;
        pay_off = (size_t) offset & ~PAGE_MASK;
        buff_off = (size_t) buf & ~PAGE_MASK;
 
        if (!is_dma_copy_aligned(device, pay_off, buff_off, len))
-               goto err1;
+               goto err_wait;
 
-       dest = dma_map_single(device->dev, buf, len, DMA_FROM_DEVICE);
-       if (dma_mapping_error(device->dev, dest))
-               goto err1;
+       unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
+       if (!unmap)
+               goto err_wait;
 
-       src = dma_map_single(device->dev, offset, len, DMA_TO_DEVICE);
-       if (dma_mapping_error(device->dev, src))
-               goto err2;
+       unmap->len = len;
+       unmap->addr[0] = dma_map_page(device->dev, virt_to_page(offset),
+                                     pay_off, len, DMA_TO_DEVICE);
+       if (dma_mapping_error(device->dev, unmap->addr[0]))
+               goto err_get_unmap;
+
+       unmap->to_cnt = 1;
 
-       flags = DMA_COMPL_DEST_UNMAP_SINGLE | DMA_COMPL_SRC_UNMAP_SINGLE |
-               DMA_PREP_INTERRUPT;
-       txd = device->device_prep_dma_memcpy(chan, dest, src, len, flags);
+       unmap->addr[1] = dma_map_page(device->dev, virt_to_page(buf),
+                                     buff_off, len, DMA_FROM_DEVICE);
+       if (dma_mapping_error(device->dev, unmap->addr[1]))
+               goto err_get_unmap;
+
+       unmap->from_cnt = 1;
+
+       txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
+                                            unmap->addr[0], len,
+                                            DMA_PREP_INTERRUPT);
        if (!txd)
-               goto err3;
+               goto err_get_unmap;
 
        txd->callback = ntb_rx_copy_callback;
        txd->callback_param = entry;
+       dma_set_unmap(txd, unmap);
 
        cookie = dmaengine_submit(txd);
        if (dma_submit_error(cookie))
-               goto err3;
+               goto err_set_unmap;
+
+       dmaengine_unmap_put(unmap);
 
        qp->last_cookie = cookie;
 
@@ -1081,11 +1094,11 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset,
 
        return;
 
-err3:
-       dma_unmap_single(device->dev, src, len, DMA_TO_DEVICE);
-err2:
-       dma_unmap_single(device->dev, dest, len, DMA_FROM_DEVICE);
-err1:
+err_set_unmap:
+       dmaengine_unmap_put(unmap);
+err_get_unmap:
+       dmaengine_unmap_put(unmap);
+err_wait:
        /* If the callbacks come out of order, the writing of the index to the
         * last completed will be out of order.  This may result in the
         * receive stalling forever.
@@ -1245,12 +1258,12 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
        struct dma_chan *chan = qp->dma_chan;
        struct dma_device *device;
        size_t dest_off, buff_off;
-       dma_addr_t src, dest;
+       struct dmaengine_unmap_data *unmap;
+       dma_addr_t dest;
        dma_cookie_t cookie;
        void __iomem *offset;
        size_t len = entry->len;
        void *buf = entry->buf;
-       unsigned long flags;
 
        offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index;
        hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header);
@@ -1273,28 +1286,41 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
        if (!is_dma_copy_aligned(device, buff_off, dest_off, len))
                goto err;
 
-       src = dma_map_single(device->dev, buf, len, DMA_TO_DEVICE);
-       if (dma_mapping_error(device->dev, src))
+       unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
+       if (!unmap)
                goto err;
 
-       flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_PREP_INTERRUPT;
-       txd = device->device_prep_dma_memcpy(chan, dest, src, len, flags);
+       unmap->len = len;
+       unmap->addr[0] = dma_map_page(device->dev, virt_to_page(buf),
+                                     buff_off, len, DMA_TO_DEVICE);
+       if (dma_mapping_error(device->dev, unmap->addr[0]))
+               goto err_get_unmap;
+
+       unmap->to_cnt = 1;
+
+       txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], len,
+                                            DMA_PREP_INTERRUPT);
        if (!txd)
-               goto err1;
+               goto err_get_unmap;
 
        txd->callback = ntb_tx_copy_callback;
        txd->callback_param = entry;
+       dma_set_unmap(txd, unmap);
 
        cookie = dmaengine_submit(txd);
        if (dma_submit_error(cookie))
-               goto err1;
+               goto err_set_unmap;
+
+       dmaengine_unmap_put(unmap);
 
        dma_async_issue_pending(chan);
        qp->tx_async++;
 
        return;
-err1:
-       dma_unmap_single(device->dev, src, len, DMA_TO_DEVICE);
+err_set_unmap:
+       dmaengine_unmap_put(unmap);
+err_get_unmap:
+       dmaengine_unmap_put(unmap);
 err:
        ntb_memcpy_tx(entry, offset);
        qp->tx_memcpy++;
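
The rx and tx conversions above are the template for dmaengine clients after the unmap rework; condensed, the flow looks like the sketch below (identifiers borrowed from the hunks, error labels shortened). Note that dma_set_unmap() takes its own reference on the unmap data, which is why the err_set_unmap path above falls through to a second dmaengine_unmap_put(): the first put balances the descriptor's reference, the second the caller's.

        struct dmaengine_unmap_data *unmap;

        unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
        if (!unmap)
                goto fallback;

        unmap->len = len;
        unmap->addr[0] = dma_map_page(device->dev, src_page, src_off,
                                      len, DMA_TO_DEVICE);
        if (dma_mapping_error(device->dev, unmap->addr[0]))
                goto put_unmap;
        unmap->to_cnt = 1;

        txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0],
                                             len, DMA_PREP_INTERRUPT);
        if (!txd)
                goto put_unmap;

        dma_set_unmap(txd, unmap);              /* descriptor takes a ref */
        cookie = dmaengine_submit(txd);
        if (dma_submit_error(cookie)) {
                dmaengine_unmap_put(unmap);     /* drop the descriptor's ref */
                goto put_unmap;
        }
        dmaengine_unmap_put(unmap);             /* drop the caller's ref */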
index 1ce8ee0..a94d850 100644 (file)
@@ -367,7 +367,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev, u32 flags)
                string = (struct acpi_buffer){ ACPI_ALLOCATE_BUFFER, NULL };
        }
 
-       handle = DEVICE_ACPI_HANDLE(&pdev->dev);
+       handle = ACPI_HANDLE(&pdev->dev);
        if (!handle) {
                /*
                 * This hotplug controller was not listed in the ACPI name
index 26100f5..1592dbe 100644 (file)
@@ -176,7 +176,6 @@ u8 acpiphp_get_latch_status(struct acpiphp_slot *slot);
 u8 acpiphp_get_adapter_status(struct acpiphp_slot *slot);
 
 /* variables */
-extern bool acpiphp_debug;
 extern bool acpiphp_disabled;
 
 #endif /* _ACPIPHP_H */
index ead7c53..cff7cad 100644 (file)
@@ -54,7 +54,7 @@ int pciehp_acpi_slot_detection_check(struct pci_dev *dev)
 {
        if (slot_detection_mode != PCIEHP_DETECT_ACPI)
                return 0;
-       if (acpi_pci_detect_ejectable(DEVICE_ACPI_HANDLE(&dev->dev)))
+       if (acpi_pci_detect_ejectable(ACPI_HANDLE(&dev->dev)))
                return 0;
        return -ENODEV;
 }
@@ -96,7 +96,7 @@ static int __init dummy_probe(struct pcie_device *dev)
                        dup_slot_id++;
        }
        list_add_tail(&slot->list, &dummy_slots);
-       handle = DEVICE_ACPI_HANDLE(&pdev->dev);
+       handle = ACPI_HANDLE(&pdev->dev);
        if (!acpi_slot_detected && acpi_pci_detect_ejectable(handle))
                acpi_slot_detected = 1;
        return -ENODEV;         /* dummy driver always returns error */
index b2781df..5b05a68 100644 (file)
@@ -9,6 +9,7 @@
  * Work to add BIOS PROM support was completed by Mike Habeck.
  */
 
+#include <linux/acpi.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -29,7 +30,6 @@
 #include <asm/sn/sn_feature_sets.h>
 #include <asm/sn/sn_sal.h>
 #include <asm/sn/types.h>
-#include <linux/acpi.h>
 #include <asm/sn/acpi.h>
 
 #include "../pci.h"
@@ -414,7 +414,7 @@ static int enable_slot(struct hotplug_slot *bss_hotplug_slot)
                acpi_handle rethandle;
                acpi_status ret;
 
-               phandle = PCI_CONTROLLER(slot->pci_bus)->acpi_handle;
+               phandle = acpi_device_handle(PCI_CONTROLLER(slot->pci_bus)->companion);
 
                if (acpi_bus_get_device(phandle, &pdevice)) {
                        dev_dbg(&slot->pci_bus->self->dev,
@@ -495,7 +495,7 @@ static int disable_slot(struct hotplug_slot *bss_hotplug_slot)
 
        /* free the ACPI resources for the slot */
        if (SN_ACPI_BASE_SUPPORT() &&
-            PCI_CONTROLLER(slot->pci_bus)->acpi_handle) {
+            PCI_CONTROLLER(slot->pci_bus)->companion) {
                unsigned long long adr;
                struct acpi_device *device;
                acpi_handle phandle;
@@ -504,7 +504,7 @@ static int disable_slot(struct hotplug_slot *bss_hotplug_slot)
                acpi_status ret;
 
                /* Get the rootbus node pointer */
-               phandle = PCI_CONTROLLER(slot->pci_bus)->acpi_handle;
+               phandle = acpi_device_handle(PCI_CONTROLLER(slot->pci_bus)->companion);
 
                acpi_scan_lock_acquire();
                /*
index 1b90579..50ce680 100644 (file)
@@ -37,7 +37,7 @@ static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
        char *type;
        struct resource *res;
 
-       handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       handle = ACPI_HANDLE(&dev->dev);
        if (!handle)
                return -EINVAL;
 
index dfd1f59..f166126 100644 (file)
@@ -173,14 +173,14 @@ static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev)
 
 static bool acpi_pci_power_manageable(struct pci_dev *dev)
 {
-       acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       acpi_handle handle = ACPI_HANDLE(&dev->dev);
 
        return handle ? acpi_bus_power_manageable(handle) : false;
 }
 
 static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 {
-       acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       acpi_handle handle = ACPI_HANDLE(&dev->dev);
        static const u8 state_conv[] = {
                [PCI_D0] = ACPI_STATE_D0,
                [PCI_D1] = ACPI_STATE_D1,
@@ -217,7 +217,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 
 static bool acpi_pci_can_wakeup(struct pci_dev *dev)
 {
-       acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       acpi_handle handle = ACPI_HANDLE(&dev->dev);
 
        return handle ? acpi_bus_can_wakeup(handle) : false;
 }
index edaed6f..d51f45a 100644 (file)
@@ -263,7 +263,7 @@ device_has_dsm(struct device *dev)
        acpi_handle handle;
        struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
 
-       handle = DEVICE_ACPI_HANDLE(dev);
+       handle = ACPI_HANDLE(dev);
 
        if (!handle)
                return FALSE;
@@ -295,7 +295,7 @@ acpilabel_show(struct device *dev, struct device_attribute *attr, char *buf)
        acpi_handle handle;
        int length;
 
-       handle = DEVICE_ACPI_HANDLE(dev);
+       handle = ACPI_HANDLE(dev);
 
        if (!handle)
                return -1;
@@ -316,7 +316,7 @@ acpiindex_show(struct device *dev, struct device_attribute *attr, char *buf)
        acpi_handle handle;
        int length;
 
-       handle = DEVICE_ACPI_HANDLE(dev);
+       handle = ACPI_HANDLE(dev);
 
        if (!handle)
                return -1;
index 605a9be..b9429fb 100644 (file)
@@ -519,7 +519,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 
        gmux_data->power_state = VGA_SWITCHEROO_ON;
 
-       gmux_data->dhandle = DEVICE_ACPI_HANDLE(&pnp->dev);
+       gmux_data->dhandle = ACPI_HANDLE(&pnp->dev);
        if (!gmux_data->dhandle) {
                pr_err("Cannot find acpi handle for pnp device %s\n",
                       dev_name(&pnp->dev));
index 747826d..14655a0 100644 (file)
@@ -89,7 +89,7 @@ static int pnpacpi_set_resources(struct pnp_dev *dev)
 
        pnp_dbg(&dev->dev, "set resources\n");
 
-       handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       handle = ACPI_HANDLE(&dev->dev);
        if (!handle || acpi_bus_get_device(handle, &acpi_dev)) {
                dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__);
                return -ENODEV;
@@ -122,7 +122,7 @@ static int pnpacpi_disable_resources(struct pnp_dev *dev)
 
        dev_dbg(&dev->dev, "disable resources\n");
 
-       handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       handle = ACPI_HANDLE(&dev->dev);
        if (!handle || acpi_bus_get_device(handle, &acpi_dev)) {
                dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__);
                return 0;
@@ -144,7 +144,7 @@ static bool pnpacpi_can_wakeup(struct pnp_dev *dev)
        struct acpi_device *acpi_dev;
        acpi_handle handle;
 
-       handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       handle = ACPI_HANDLE(&dev->dev);
        if (!handle || acpi_bus_get_device(handle, &acpi_dev)) {
                dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__);
                return false;
@@ -159,7 +159,7 @@ static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state)
        acpi_handle handle;
        int error = 0;
 
-       handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       handle = ACPI_HANDLE(&dev->dev);
        if (!handle || acpi_bus_get_device(handle, &acpi_dev)) {
                dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__);
                return 0;
@@ -194,7 +194,7 @@ static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state)
 static int pnpacpi_resume(struct pnp_dev *dev)
 {
        struct acpi_device *acpi_dev;
-       acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev);
+       acpi_handle handle = ACPI_HANDLE(&dev->dev);
        int error = 0;
 
        if (!handle || acpi_bus_get_device(handle, &acpi_dev)) {
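
Every DEVICE_ACPI_HANDLE() -> ACPI_HANDLE() substitution in this series follows from the ACPI companion rework: struct device no longer carries a raw acpi_handle, it points at its companion struct acpi_device, and the handle is derived on demand. In this kernel series the include/linux/acpi.h definitions are roughly:

#define ACPI_COMPANION(dev)             ((dev)->acpi_node.companion)
#define ACPI_COMPANION_SET(dev, adev)   ACPI_COMPANION(dev) = (adev)
#define ACPI_HANDLE(dev)                acpi_device_handle(ACPI_COMPANION(dev))

so the old macro name could be deleted once all callers were converted.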
index 15f166a..0077302 100644 (file)
@@ -626,7 +626,7 @@ comment "Platform RTC drivers"
 
 config RTC_DRV_CMOS
        tristate "PC-style 'CMOS'"
-       depends on X86 || ALPHA || ARM || M32R || ATARI || PPC || MIPS || SPARC64
+       depends on X86 || ARM || M32R || ATARI || PPC || MIPS || SPARC64
        default y if X86
        help
          Say "yes" here to get direct support for the real time clock
@@ -643,6 +643,14 @@ config RTC_DRV_CMOS
          This driver can also be built as a module. If so, the module
          will be called rtc-cmos.
 
+config RTC_DRV_ALPHA
+       bool "Alpha PC-style CMOS"
+       depends on ALPHA
+       default y
+       help
+         Direct support for the real-time clock found on every Alpha
+         system, specifically MC146818 compatibles.  If in doubt, say Y.
+
 config RTC_DRV_VRTC
        tristate "Virtual RTC for Intel MID platforms"
        depends on X86_INTEL_MID
index 8b2cd8a..c0da95e 100644 (file)
@@ -428,6 +428,14 @@ static int __exit at91_rtc_remove(struct platform_device *pdev)
        return 0;
 }
 
+static void at91_rtc_shutdown(struct platform_device *pdev)
+{
+       /* Disable all interrupts */
+       at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD | AT91_RTC_ALARM |
+                                       AT91_RTC_SECEV | AT91_RTC_TIMEV |
+                                       AT91_RTC_CALEV);
+}
+
 #ifdef CONFIG_PM_SLEEP
 
 /* AT91RM9200 RTC Power management control */
@@ -466,6 +474,7 @@ static SIMPLE_DEV_PM_OPS(at91_rtc_pm_ops, at91_rtc_suspend, at91_rtc_resume);
 
 static struct platform_driver at91_rtc_driver = {
        .remove         = __exit_p(at91_rtc_remove),
+       .shutdown       = at91_rtc_shutdown,
        .driver         = {
                .name   = "at91_rtc",
                .owner  = THIS_MODULE,
index b9f0192..6d207af 100644 (file)
@@ -150,7 +150,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change)
                                &dws->tx_sgl,
                                1,
                                DMA_MEM_TO_DEV,
-                               DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP);
+                               DMA_PREP_INTERRUPT);
        txdesc->callback = dw_spi_dma_done;
        txdesc->callback_param = dws;
 
@@ -173,7 +173,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change)
                                &dws->rx_sgl,
                                1,
                                DMA_DEV_TO_MEM,
-                               DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP);
+                               DMA_PREP_INTERRUPT);
        rxdesc->callback = dw_spi_dma_done;
        rxdesc->callback_param = dws;
 
index 8d85ddc..18cc625 100644 (file)
@@ -357,6 +357,19 @@ struct spi_device *spi_alloc_device(struct spi_master *master)
 }
 EXPORT_SYMBOL_GPL(spi_alloc_device);
 
+static void spi_dev_set_name(struct spi_device *spi)
+{
+       struct acpi_device *adev = ACPI_COMPANION(&spi->dev);
+
+       if (adev) {
+               dev_set_name(&spi->dev, "spi-%s", acpi_dev_name(adev));
+               return;
+       }
+
+       dev_set_name(&spi->dev, "%s.%u", dev_name(&spi->master->dev),
+                    spi->chip_select);
+}
+
 /**
  * spi_add_device - Add spi_device allocated with spi_alloc_device
  * @spi: spi_device to register
@@ -383,9 +396,7 @@ int spi_add_device(struct spi_device *spi)
        }
 
        /* Set the bus ID string */
-       dev_set_name(&spi->dev, "%s.%u", dev_name(&spi->master->dev),
-                       spi->chip_select);
-
+       spi_dev_set_name(spi);
 
        /* We need to make sure there's no other device with this
         * chipselect **BEFORE** we call setup(), else we'll trash
@@ -1144,7 +1155,7 @@ static acpi_status acpi_spi_add_device(acpi_handle handle, u32 level,
                return AE_NO_MEMORY;
        }
 
-       ACPI_HANDLE_SET(&spi->dev, handle);
+       ACPI_COMPANION_SET(&spi->dev, adev);
        spi->irq = -1;
 
        INIT_LIST_HEAD(&resource_list);
index 5377502..7d8103c 100644 (file)
@@ -1433,7 +1433,7 @@ static void work_fn_rx(struct work_struct *work)
        desc = s->desc_rx[new];
 
        if (dma_async_is_tx_complete(s->chan_rx, s->active_rx, NULL, NULL) !=
-           DMA_SUCCESS) {
+           DMA_COMPLETE) {
                /* Handle incomplete DMA receive */
                struct dma_chan *chan = s->chan_rx;
                struct shdma_desc *sh_desc = container_of(desc,
index 06cec63..a7c04e2 100644 (file)
@@ -5501,6 +5501,6 @@ acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev,
        if (!hub)
                return NULL;
 
-       return DEVICE_ACPI_HANDLE(&hub->ports[port1 - 1]->dev);
+       return ACPI_HANDLE(&hub->ports[port1 - 1]->dev);
 }
 #endif
index 255c144..4e243c3 100644 (file)
@@ -173,7 +173,7 @@ static int usb_acpi_find_device(struct device *dev, acpi_handle *handle)
                }
 
                /* root hub's parent is the usb hcd. */
-               parent_handle = DEVICE_ACPI_HANDLE(dev->parent);
+               parent_handle = ACPI_HANDLE(dev->parent);
                *handle = acpi_get_child(parent_handle, udev->portnum);
                if (!*handle)
                        return -ENODEV;
@@ -194,7 +194,7 @@ static int usb_acpi_find_device(struct device *dev, acpi_handle *handle)
 
                        raw_port_num = usb_hcd_find_raw_port_number(hcd,
                                port_num);
-                       *handle = acpi_get_child(DEVICE_ACPI_HANDLE(&udev->dev),
+                       *handle = acpi_get_child(ACPI_HANDLE(&udev->dev),
                                raw_port_num);
                        if (!*handle)
                                return -ENODEV;
index d15f6e8..1888251 100644 (file)
@@ -59,12 +59,12 @@ static int xen_add_device(struct device *dev)
                        add.flags = XEN_PCI_DEV_EXTFN;
 
 #ifdef CONFIG_ACPI
-               handle = DEVICE_ACPI_HANDLE(&pci_dev->dev);
+               handle = ACPI_HANDLE(&pci_dev->dev);
                if (!handle && pci_dev->bus->bridge)
-                       handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge);
+                       handle = ACPI_HANDLE(pci_dev->bus->bridge);
 #ifdef CONFIG_PCI_IOV
                if (!handle && pci_dev->is_virtfn)
-                       handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge);
+                       handle = ACPI_HANDLE(physfn->bus->bridge);
 #endif
                if (handle) {
                        acpi_status status;
index f039b10..b03dd23 100644 (file)
 #include "fid.h"
 
 /**
- * v9fs_dentry_delete - called when dentry refcount equals 0
- * @dentry:  dentry in question
- *
- * By returning 1 here we should remove cacheing of unused
- * dentry components.
- *
- */
-
-static int v9fs_dentry_delete(const struct dentry *dentry)
-{
-       p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-                dentry->d_name.name, dentry);
-
-       return 1;
-}
-
-/**
  * v9fs_cached_dentry_delete - called when dentry refcount equals 0
  * @dentry:  dentry in question
  *
@@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
 };
 
 const struct dentry_operations v9fs_dentry_operations = {
-       .d_delete = v9fs_dentry_delete,
+       .d_delete = always_delete_dentry,
        .d_release = v9fs_dentry_release,
 };
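
This hunk, like the configfs one later in the series, replaces a filesystem-private "always return 1" d_delete with a shared VFS helper, so unused dentries are dropped instead of cached. As added to fs/libfs.c in the same series, the helper is simply:

int always_delete_dentry(const struct dentry *dentry)
{
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);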
index 823efcb..08159ed 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,6 +80,8 @@ struct kioctx {
        struct percpu_ref       users;
        atomic_t                dead;
 
+       struct percpu_ref       reqs;
+
        unsigned long           user_id;
 
        struct __percpu kioctx_cpu *cpu;
@@ -107,7 +109,6 @@ struct kioctx {
        struct page             **ring_pages;
        long                    nr_pages;
 
-       struct rcu_head         rcu_head;
        struct work_struct      free_work;
 
        struct {
@@ -250,8 +251,10 @@ static void aio_free_ring(struct kioctx *ctx)
 
        put_aio_ring_file(ctx);
 
-       if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+       if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
                kfree(ctx->ring_pages);
+               ctx->ring_pages = NULL;
+       }
 }
 
 static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
@@ -463,26 +466,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
        return cancel(kiocb);
 }
 
-static void free_ioctx_rcu(struct rcu_head *head)
+static void free_ioctx(struct work_struct *work)
 {
-       struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
 
+       pr_debug("freeing %p\n", ctx);
+
+       aio_free_ring(ctx);
        free_percpu(ctx->cpu);
        kmem_cache_free(kioctx_cachep, ctx);
 }
 
+static void free_ioctx_reqs(struct percpu_ref *ref)
+{
+       struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+       INIT_WORK(&ctx->free_work, free_ioctx);
+       schedule_work(&ctx->free_work);
+}
+
 /*
  * When this function runs, the kioctx has been removed from the "hash table"
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx(struct work_struct *work)
+static void free_ioctx_users(struct percpu_ref *ref)
 {
-       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
-       struct aio_ring *ring;
+       struct kioctx *ctx = container_of(ref, struct kioctx, users);
        struct kiocb *req;
-       unsigned cpu, avail;
-       DEFINE_WAIT(wait);
 
        spin_lock_irq(&ctx->ctx_lock);
 
@@ -496,54 +507,8 @@ static void free_ioctx(struct work_struct *work)
 
        spin_unlock_irq(&ctx->ctx_lock);
 
-       for_each_possible_cpu(cpu) {
-               struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
-
-               atomic_add(kcpu->reqs_available, &ctx->reqs_available);
-               kcpu->reqs_available = 0;
-       }
-
-       while (1) {
-               prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
-               ring = kmap_atomic(ctx->ring_pages[0]);
-               avail = (ring->head <= ring->tail)
-                        ? ring->tail - ring->head
-                        : ctx->nr_events - ring->head + ring->tail;
-
-               atomic_add(avail, &ctx->reqs_available);
-               ring->head = ring->tail;
-               kunmap_atomic(ring);
-
-               if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
-                       break;
-
-               schedule();
-       }
-       finish_wait(&ctx->wait, &wait);
-
-       WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
-
-       aio_free_ring(ctx);
-
-       pr_debug("freeing %p\n", ctx);
-
-       /*
-        * Here the call_rcu() is between the wait_event() for reqs_active to
-        * hit 0, and freeing the ioctx.
-        *
-        * aio_complete() decrements reqs_active, but it has to touch the ioctx
-        * after to issue a wakeup so we use rcu.
-        */
-       call_rcu(&ctx->rcu_head, free_ioctx_rcu);
-}
-
-static void free_ioctx_ref(struct percpu_ref *ref)
-{
-       struct kioctx *ctx = container_of(ref, struct kioctx, users);
-
-       INIT_WORK(&ctx->free_work, free_ioctx);
-       schedule_work(&ctx->free_work);
+       percpu_ref_kill(&ctx->reqs);
+       percpu_ref_put(&ctx->reqs);
 }
 
 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
@@ -602,6 +567,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
        }
 }
 
+static void aio_nr_sub(unsigned nr)
+{
+       spin_lock(&aio_nr_lock);
+       if (WARN_ON(aio_nr - nr > aio_nr))
+               aio_nr = 0;
+       else
+               aio_nr -= nr;
+       spin_unlock(&aio_nr_lock);
+}
+
 /* ioctx_alloc
  *     Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
  */
@@ -639,8 +614,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
        ctx->max_reqs = nr_events;
 
-       if (percpu_ref_init(&ctx->users, free_ioctx_ref))
-               goto out_freectx;
+       if (percpu_ref_init(&ctx->users, free_ioctx_users))
+               goto err;
+
+       if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+               goto err;
 
        spin_lock_init(&ctx->ctx_lock);
        spin_lock_init(&ctx->completion_lock);
@@ -651,10 +629,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
        ctx->cpu = alloc_percpu(struct kioctx_cpu);
        if (!ctx->cpu)
-               goto out_freeref;
+               goto err;
 
        if (aio_setup_ring(ctx) < 0)
-               goto out_freepcpu;
+               goto err;
 
        atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
        ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -666,7 +644,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
        if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
            aio_nr + nr_events < aio_nr) {
                spin_unlock(&aio_nr_lock);
-               goto out_cleanup;
+               err = -EAGAIN;
+               goto err;
        }
        aio_nr += ctx->max_reqs;
        spin_unlock(&aio_nr_lock);
@@ -675,23 +654,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
        err = ioctx_add_table(ctx, mm);
        if (err)
-               goto out_cleanup_put;
+               goto err_cleanup;
 
        pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
                 ctx, ctx->user_id, mm, ctx->nr_events);
        return ctx;
 
-out_cleanup_put:
-       percpu_ref_put(&ctx->users);
-out_cleanup:
-       err = -EAGAIN;
-       aio_free_ring(ctx);
-out_freepcpu:
+err_cleanup:
+       aio_nr_sub(ctx->max_reqs);
+err:
        free_percpu(ctx->cpu);
-out_freeref:
+       free_percpu(ctx->reqs.pcpu_count);
        free_percpu(ctx->users.pcpu_count);
-out_freectx:
-       put_aio_ring_file(ctx);
        kmem_cache_free(kioctx_cachep, ctx);
        pr_debug("error allocating ioctx %d\n", err);
        return ERR_PTR(err);
@@ -726,10 +700,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
                 * -EAGAIN with no ioctxs actually in use (as far as userspace
                 *  could tell).
                 */
-               spin_lock(&aio_nr_lock);
-               BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
-               aio_nr -= ctx->max_reqs;
-               spin_unlock(&aio_nr_lock);
+               aio_nr_sub(ctx->max_reqs);
 
                if (ctx->mmap_size)
                        vm_munmap(ctx->mmap_base, ctx->mmap_size);
@@ -861,6 +832,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
        if (unlikely(!req))
                goto out_put;
 
+       percpu_ref_get(&ctx->reqs);
+
        req->ki_ctx = ctx;
        return req;
 out_put:
@@ -930,12 +903,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
                return;
        }
 
-       /*
-        * Take rcu_read_lock() in case the kioctx is being destroyed, as we
-        * need to issue a wakeup after incrementing reqs_available.
-        */
-       rcu_read_lock();
-
        if (iocb->ki_list.next) {
                unsigned long flags;
 
@@ -1010,7 +977,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
 
-       rcu_read_unlock();
+       percpu_ref_put(&ctx->reqs);
 }
 EXPORT_SYMBOL(aio_complete);
 
@@ -1421,6 +1388,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
        return 0;
 out_put_req:
        put_reqs_available(ctx, 1);
+       percpu_ref_put(&ctx->reqs);
        kiocb_free(req);
        return ret;
 }
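
Taken together, the aio hunks replace the old wait-and-RCU teardown with two percpu refcounts whose kill/put chain drives destruction. Condensed from the code above (every call shown appears in the hunks):

/*
 *   ioctx_alloc():
 *      percpu_ref_init(&ctx->users, free_ioctx_users);
 *      percpu_ref_init(&ctx->reqs,  free_ioctx_reqs);
 *
 *   per request:
 *      aio_get_req()   -> percpu_ref_get(&ctx->reqs);
 *      aio_complete()  -> percpu_ref_put(&ctx->reqs);
 *
 *   last user reference dropped -> free_ioctx_users():
 *      cancel pending kiocbs, then
 *      percpu_ref_kill(&ctx->reqs); percpu_ref_put(&ctx->reqs);
 *
 *   last request reference dropped -> free_ioctx_reqs():
 *      schedule free_ioctx(), which frees the ring, the percpu
 *      counters and the kioctx itself.
 */

This removes the rcu_read_lock() dance in aio_complete(), because a completing request now pins the kioctx with its own reference.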
index 2bdb4e2..33d79a4 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs);
 
 static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                          *page, unsigned int len, unsigned int offset,
-                         unsigned short max_sectors)
+                         unsigned int max_sectors)
 {
        int retried_segments = 0;
        struct bio_vec *bvec;
index f9d5094..aa976ec 100644 (file)
@@ -9,12 +9,17 @@ config BTRFS_FS
        select XOR_BLOCKS
 
        help
-         Btrfs is a new filesystem with extents, writable snapshotting,
-         support for multiple devices and many more features.
+         Btrfs is a general purpose copy-on-write filesystem with extents,
+         writable snapshotting, support for multiple devices and many more
+         features focused on fault tolerance, repair and easy administration.
 
-         Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
-         FINALIZED.  You should say N here unless you are interested in
-         testing Btrfs with non-critical data.
+         The filesystem disk format is no longer unstable, and it's not
+         expected to change unless there are strong reasons to do so. If there
+         is a format change, file systems with an unchanged format will
+         continue to be mountable and usable by newer kernels.
+
+         For more information, please see the web pages at
+         http://btrfs.wiki.kernel.org.
 
          To compile this file system support as a module, choose M here. The
          module will be called btrfs.
index 8aec751..c1e0b0c 100644 (file)
@@ -495,6 +495,7 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
        spin_lock_irq(&workers->lock);
        if (workers->stopping) {
                spin_unlock_irq(&workers->lock);
+               ret = -EINVAL;
                goto fail_kthread;
        }
        list_add_tail(&worker->worker_list, &workers->idle_list);
index e0aab44..b50764b 100644 (file)
  * the integrity of (super)-block write requests, do not
  * enable the config option BTRFS_FS_CHECK_INTEGRITY to
  * include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ *       int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ *       range 12 30
  */
 
 #include <linux/sched.h>
 #define BTRFSIC_PRINT_MASK_INITIAL_DATABASE                    0x00000400
 #define BTRFSIC_PRINT_MASK_NUM_COPIES                          0x00000800
 #define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS               0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE               0x00002000
 
 struct btrfsic_dev_state;
 struct btrfsic_state;
@@ -3015,6 +3025,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
            (rw & WRITE) && NULL != bio->bi_io_vec) {
                unsigned int i;
                u64 dev_bytenr;
+               u64 cur_bytenr;
                int bio_is_patched;
                char **mapped_datav;
 
@@ -3033,6 +3044,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                                       GFP_NOFS);
                if (!mapped_datav)
                        goto leave;
+               cur_bytenr = dev_bytenr;
                for (i = 0; i < bio->bi_vcnt; i++) {
                        BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
                        mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
@@ -3044,16 +3056,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                                kfree(mapped_datav);
                                goto leave;
                        }
-                       if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
-                            BTRFSIC_PRINT_MASK_VERBOSE) ==
-                           (dev_state->state->print_mask &
-                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
-                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                       if (dev_state->state->print_mask &
+                           BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
                                printk(KERN_INFO
-                                      "#%u: page=%p, len=%u, offset=%u\n",
-                                      i, bio->bi_io_vec[i].bv_page,
-                                      bio->bi_io_vec[i].bv_len,
+                                      "#%u: bytenr=%llu, len=%u, offset=%u\n",
+                                      i, cur_bytenr, bio->bi_io_vec[i].bv_len,
                                       bio->bi_io_vec[i].bv_offset);
+                       cur_bytenr += bio->bi_io_vec[i].bv_len;
                }
                btrfsic_process_written_block(dev_state, dev_bytenr,
                                              mapped_datav, bio->bi_vcnt,
index f9aeb27..54ab861 100644 (file)
@@ -3613,9 +3613,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                       struct bio *bio, u64 file_start, int contig);
-int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, struct btrfs_path *path,
-                       u64 isize);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                             struct list_head *list, int search_commit);
 /* inode.c */
@@ -3744,9 +3741,6 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                             int skip_pinned);
-int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
-                              u64 start, u64 end, int skip_pinned,
-                              int modified);
 extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root, struct inode *inode,
index 342f9fd..2cfc3df 100644 (file)
@@ -366,7 +366,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
        dev_replace->tgtdev = tgt_device;
 
        printk_in_rcu(KERN_INFO
-                     "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
+                     "btrfs: dev_replace from %s (devid %llu) to %s started\n",
                      src_device->missing ? "<missing disk>" :
                        rcu_str_deref(src_device->name),
                      src_device->devid,
index 4c4ed0b..8072cfa 100644 (file)
@@ -3517,7 +3517,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 int btrfs_commit_super(struct btrfs_root *root)
 {
        struct btrfs_trans_handle *trans;
-       int ret;
 
        mutex_lock(&root->fs_info->cleaner_mutex);
        btrfs_run_delayed_iputs(root);
@@ -3531,25 +3530,7 @@ int btrfs_commit_super(struct btrfs_root *root)
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-       ret = btrfs_commit_transaction(trans, root);
-       if (ret)
-               return ret;
-       /* run commit again to drop the original snapshot */
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
-       ret = btrfs_commit_transaction(trans, root);
-       if (ret)
-               return ret;
-       ret = btrfs_write_and_wait_transaction(NULL, root);
-       if (ret) {
-               btrfs_error(root->fs_info, ret,
-                           "Failed to sync btree inode to disk.");
-               return ret;
-       }
-
-       ret = write_ctree_super(NULL, root, 0);
-       return ret;
+       return btrfs_commit_transaction(trans, root);
 }
 
 int close_ctree(struct btrfs_root *root)
index 856bc2b..8e457fc 100644 (file)
@@ -1980,6 +1980,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        int ret;
 
+       ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
        BUG_ON(!mirror_num);
 
        /* we can't repair anything in raid56 yet */
@@ -2036,6 +2037,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
        unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
        int ret = 0;
 
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
        for (i = 0; i < num_pages; i++) {
                struct page *p = extent_buffer_page(eb, i);
                ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
@@ -2057,12 +2061,12 @@ static int clean_io_failure(u64 start, struct page *page)
        u64 private;
        u64 private_failure;
        struct io_failure_record *failrec;
-       struct btrfs_fs_info *fs_info;
+       struct inode *inode = page->mapping->host;
+       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct extent_state *state;
        int num_copies;
        int did_repair = 0;
        int ret;
-       struct inode *inode = page->mapping->host;
 
        private = 0;
        ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
@@ -2085,6 +2089,8 @@ static int clean_io_failure(u64 start, struct page *page)
                did_repair = 1;
                goto out;
        }
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               goto out;
 
        spin_lock(&BTRFS_I(inode)->io_tree.lock);
        state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
@@ -2094,7 +2100,6 @@ static int clean_io_failure(u64 start, struct page *page)
 
        if (state && state->start <= failrec->start &&
            state->end >= failrec->start + failrec->len - 1) {
-               fs_info = BTRFS_I(inode)->root->fs_info;
                num_copies = btrfs_num_copies(fs_info, failrec->logical,
                                              failrec->len);
                if (num_copies > 1)  {
index da8d2f6..f1a7744 100644 (file)
@@ -2129,7 +2129,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
                                                  old->extent_offset, fs_info,
                                                  path, record_one_backref,
                                                  old);
-               BUG_ON(ret < 0 && ret != -ENOENT);
+               if (ret < 0 && ret != -ENOENT)
+                       return false;
 
                /* no backref to be processed for this extent */
                if (!old->count) {
@@ -6186,8 +6187,7 @@ insert:
        write_unlock(&em_tree->lock);
 out:
 
-       if (em)
-               trace_btrfs_get_extent(root, em);
+       trace_btrfs_get_extent(root, em);
 
        if (path)
                btrfs_free_path(path);
index 25a8f38..69582d5 100644 (file)
@@ -638,6 +638,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
                        WARN_ON(nr < 0);
                }
        }
+       list_splice_tail(&splice, &fs_info->ordered_roots);
        spin_unlock(&fs_info->ordered_root_lock);
 }
 
@@ -803,7 +804,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
-               if (ordered->file_offset + ordered->len < start) {
+               if (ordered->file_offset + ordered->len <= start) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
index 2544805..561e2f1 100644 (file)
@@ -938,8 +938,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
        }
 
-       if (sctx->readonly && !sctx->is_dev_replace)
-               goto did_not_correct_error;
+       if (sctx->readonly) {
+               ASSERT(!sctx->is_dev_replace);
+               goto out;
+       }
 
        if (!is_metadata && !have_csum) {
                struct scrub_fixup_nodatasum *fixup_nodatasum;
index 57c16b4..c6a872a 100644 (file)
@@ -1480,7 +1480,7 @@ static void do_async_commit(struct work_struct *work)
         * We've got freeze protection passed with the transaction.
         * Tell lockdep about it.
         */
-       if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+       if (ac->newtrans->type & __TRANS_FREEZABLE)
                rwsem_acquire_read(
                     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
                     0, 1, _THIS_IP_);
@@ -1521,7 +1521,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
         * Tell lockdep we've released the freeze rwsem, since the
         * async commit thread will be the one to unlock it.
         */
-       if (trans->type < TRANS_JOIN_NOLOCK)
+       if (ac->newtrans->type & __TRANS_FREEZABLE)
                rwsem_release(
                        &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
                        1, _THIS_IP_);
index 744553c..9f7fc51 100644 (file)
@@ -3697,7 +3697,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                        ret = btrfs_truncate_inode_items(trans, log,
                                                         inode, 0, 0);
                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-                                             &BTRFS_I(inode)->runtime_flags)) {
+                                             &BTRFS_I(inode)->runtime_flags) ||
+                          inode_only == LOG_INODE_EXISTS) {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
                        max_key.type = BTRFS_XATTR_ITEM_KEY;
@@ -3801,7 +3802,7 @@ log_extents:
                        err = ret;
                        goto out_unlock;
                }
-       } else {
+       } else if (inode_only == LOG_INODE_ALL) {
                struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
                struct extent_map *em, *n;
 
index 0db6370..92303f4 100644 (file)
@@ -5394,7 +5394,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
 {
        struct bio_vec *prev;
        struct request_queue *q = bdev_get_queue(bdev);
-       unsigned short max_sectors = queue_max_sectors(q);
+       unsigned int max_sectors = queue_max_sectors(q);
        struct bvec_merge_data bvm = {
                .bi_bdev = bdev,
                .bi_sector = sector,
index 277bd1b..e081acb 100644 (file)
@@ -56,29 +56,28 @@ static void configfs_d_iput(struct dentry * dentry,
        struct configfs_dirent *sd = dentry->d_fsdata;
 
        if (sd) {
-               BUG_ON(sd->s_dentry != dentry);
                /* Coordinate with configfs_readdir */
                spin_lock(&configfs_dirent_lock);
-               sd->s_dentry = NULL;
+               /* Coordinate with configfs_attach_attr(), which will
+                * increase sd->s_count and point sd->s_dentry at the
+                * newly allocated dentry.
+                * Only set sd->s_dentry to NULL when this dentry is
+                * the only sd owner.
+                * Otherwise, configfs_d_iput() may run just after
+                * configfs_attach_attr() and set sd->s_dentry to NULL
+                * even though it is still in use.
+                */
+               if (atomic_read(&sd->s_count) <= 2)
+                       sd->s_dentry = NULL;
+
                spin_unlock(&configfs_dirent_lock);
                configfs_put(sd);
        }
        iput(inode);
 }
 
-/*
- * We _must_ delete our dentries on last dput, as the chain-to-parent
- * behavior is required to clear the parents of default_groups.
- */
-static int configfs_d_delete(const struct dentry *dentry)
-{
-       return 1;
-}
-
 const struct dentry_operations configfs_dentry_ops = {
        .d_iput         = configfs_d_iput,
-       /* simple_delete_dentry() isn't exported */
-       .d_delete       = configfs_d_delete,
+       .d_delete       = always_delete_dentry,
 };
 
 #ifdef CONFIG_LOCKDEP
@@ -426,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
        struct configfs_attribute * attr = sd->s_element;
        int error;
 
+       spin_lock(&configfs_dirent_lock);
        dentry->d_fsdata = configfs_get(sd);
        sd->s_dentry = dentry;
+       spin_unlock(&configfs_dirent_lock);
+
        error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
                                configfs_init_file);
        if (error) {
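
The rule the new comment describes — clear the shared back-pointer only when the caller holds the last reference — can be modeled in a few lines of userspace C. This is a sketch only, with hypothetical names; a pthread mutex stands in for configfs_dirent_lock and a plain int for the atomic s_count:

    #include <pthread.h>
    #include <stdio.h>

    /* Model of a configfs_dirent: a refcount plus a back-pointer that
     * only the last owner may clear. */
    struct dirent_model {
        pthread_mutex_t lock;
        int s_count;
        const char *s_dentry;
    };

    static void attach(struct dirent_model *sd, const char *new_dentry)
    {
        pthread_mutex_lock(&sd->lock);
        sd->s_count++;              /* the new dentry takes a reference */
        sd->s_dentry = new_dentry;  /* and becomes the owner */
        pthread_mutex_unlock(&sd->lock);
    }

    static void release(struct dirent_model *sd)
    {
        pthread_mutex_lock(&sd->lock);
        /* Clear the back-pointer only if no other dentry re-owned it;
         * otherwise we would wipe a pointer that is still in use. */
        if (sd->s_count <= 2)
            sd->s_dentry = NULL;
        sd->s_count--;
        pthread_mutex_unlock(&sd->lock);
    }

    int main(void)
    {
        struct dirent_model sd = { PTHREAD_MUTEX_INITIALIZER, 2, "old" };

        attach(&sd, "new");  /* s_count is now 3 */
        release(&sd);        /* the old dentry drops: pointer survives */
        printf("s_dentry = %s\n", sd.s_dentry ? sd.s_dentry : "(null)");
        return 0;
    }

With the s_count check in place the program prints "s_dentry = new"; without it, release() would have nulled a pointer the new dentry still relies on.
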
index 62406b6..bc3fbcd 100644 (file)
@@ -695,7 +695,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
        while (nr) {
                if (dump_interrupted())
                        return 0;
-               n = vfs_write(file, addr, nr, &pos);
+               n = __kernel_write(file, addr, nr, &pos);
                if (n <= 0)
                        return 0;
                file->f_pos = pos;
@@ -733,7 +733,7 @@ int dump_align(struct coredump_params *cprm, int align)
 {
        unsigned mod = cprm->written & (align - 1);
        if (align & (align - 1))
-               return -EINVAL;
-       return mod ? dump_skip(cprm, align - mod) : 0;
+               return 0;
+       return mod ? dump_skip(cprm, align - mod) : 1;
 }
 EXPORT_SYMBOL(dump_align);
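
The return convention flips here so that 0 consistently means failure, matching dump_emit() and dump_skip() above; the underlying arithmetic is plain power-of-two masking. A runnable sketch of just that math, with hypothetical names:

    #include <stdio.h>

    /* Bytes of padding needed to align 'written' to 'align' (which
     * must be a power of two), mirroring the dump_align() arithmetic:
     * align & (align - 1) is zero exactly for powers of two, and
     * written & (align - 1) is the offset past the last boundary. */
    static unsigned int pad_to_align(unsigned int written, unsigned int align)
    {
        unsigned int mod = written & (align - 1);

        if (align & (align - 1))  /* not a power of two */
            return 0;
        return mod ? align - mod : 0;
    }

    int main(void)
    {
        printf("%u\n", pad_to_align(13, 8));  /* prints 3 */
        printf("%u\n", pad_to_align(16, 8));  /* prints 0 */
        return 0;
    }
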
index 0a38ef8..4bdb300 100644 (file)
@@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
-/**
- * read_seqbegin_or_lock - begin a sequence number check or locking block
- * @lock: sequence lock
- * @seq : sequence number to be checked
- *
- * First try it once optimistically without taking the lock. If that fails,
- * take the lock. The sequence number is also used as a marker for deciding
- * whether to be a reader (even) or writer (odd).
- * N.B. seq must be initialized to an even number to begin with.
- */
-static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
-{
-       if (!(*seq & 1))        /* Even */
-               *seq = read_seqbegin(lock);
-       else                    /* Odd */
-               read_seqlock_excl(lock);
-}
-
-static inline int need_seqretry(seqlock_t *lock, int seq)
-{
-       return !(seq & 1) && read_seqretry(lock, seq);
-}
-
-static inline void done_seqretry(seqlock_t *lock, int seq)
-{
-       if (seq & 1)
-               read_sequnlock_excl(lock);
-}
-
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
@@ -125,8 +96,6 @@ static inline void done_seqretry(seqlock_t *lock, int seq)
  * This hash-function tries to avoid losing too many bits of hash
  * information, yet avoid using a prime hash-size or similar.
  */
-#define D_HASHBITS     d_hash_shift
-#define D_HASHMASK     d_hash_mask
 
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
@@ -137,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
                                        unsigned int hash)
 {
        hash += (unsigned long) parent / L1_CACHE_BYTES;
-       hash = hash + (hash >> D_HASHBITS);
-       return dentry_hashtable + (hash & D_HASHMASK);
+       hash = hash + (hash >> d_hash_shift);
+       return dentry_hashtable + (hash & d_hash_mask);
 }
 
 /* Statistics gathering. */
@@ -469,7 +438,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 {
        list_del(&dentry->d_u.d_child);
        /*
-        * Inform try_to_ascend() that we are no longer attached to the
+        * Inform d_walk() that we are no longer attached to the
         * dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
@@ -1069,34 +1038,6 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
-/*
- * This tries to ascend one level of parenthood, but
- * we can race with renaming, so we need to re-check
- * the parenthood after dropping the lock and check
- * that the sequence number still matches.
- */
-static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
-{
-       struct dentry *new = old->d_parent;
-
-       rcu_read_lock();
-       spin_unlock(&old->d_lock);
-       spin_lock(&new->d_lock);
-
-       /*
-        * might go back up the wrong parent if we have had a rename
-        * or deletion
-        */
-       if (new != old->d_parent ||
-                (old->d_flags & DCACHE_DENTRY_KILLED) ||
-                need_seqretry(&rename_lock, seq)) {
-               spin_unlock(&new->d_lock);
-               new = NULL;
-       }
-       rcu_read_unlock();
-       return new;
-}
-
 /**
  * enum d_walk_ret - action to take during tree walk
  * @D_WALK_CONTINUE:   continue walk
@@ -1185,9 +1126,24 @@ resume:
         */
        if (this_parent != parent) {
                struct dentry *child = this_parent;
-               this_parent = try_to_ascend(this_parent, seq);
-               if (!this_parent)
+               this_parent = child->d_parent;
+
+               rcu_read_lock();
+               spin_unlock(&child->d_lock);
+               spin_lock(&this_parent->d_lock);
+
+               /*
+                * might go back up the wrong parent if we have had a rename
+                * or deletion
+                */
+               if (this_parent != child->d_parent ||
+                        (child->d_flags & DCACHE_DENTRY_KILLED) ||
+                        need_seqretry(&rename_lock, seq)) {
+                       spin_unlock(&this_parent->d_lock);
+                       rcu_read_unlock();
                        goto rename_retry;
+               }
+               rcu_read_unlock();
                next = child->d_u.d_child.next;
                goto resume;
        }
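
The helpers deleted above (read_seqbegin_or_lock() and friends) were promoted to a shared header by this series, and the open-coded ascend now follows the same two-pass discipline: an optimistic lockless walk first (even sequence number), then a retry under the lock (odd). A userspace model of the pattern, simplified but runnable — the atomic counter stands in for rename_lock's seqcount and a mutex for its spinlock:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint seqcount;  /* writers bump this twice per update */
    static pthread_mutex_t write_lock = PTHREAD_MUTEX_INITIALIZER;

    static void read_seqbegin_or_lock(unsigned int *seq)
    {
        if (!(*seq & 1))                         /* even: optimistic */
            *seq = atomic_load(&seqcount) & ~1u;
        else                                     /* odd: exclude writers */
            pthread_mutex_lock(&write_lock);
    }

    static int need_seqretry(unsigned int seq)
    {
        return !(seq & 1) && atomic_load(&seqcount) != seq;
    }

    static void done_seqretry(unsigned int seq)
    {
        if (seq & 1)
            pthread_mutex_unlock(&write_lock);
    }

    int main(void)
    {
        unsigned int seq = 0;

        for (;;) {
            read_seqbegin_or_lock(&seq);
            /* ... walk the data structure here ... */
            if (!need_seqretry(seq))
                break;
            seq = 1;  /* a writer raced with us: retry locked */
        }
        done_seqretry(seq);
        puts("walk completed");
        return 0;
    }
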
index a8766b8..becc725 100644 (file)
@@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
        return 0;
 }
 
-/*
- * Retaining negative dentries for an in-memory filesystem just wastes
- * memory and lookup time: arrange for them to be deleted immediately.
- */
-static int efivarfs_delete_dentry(const struct dentry *dentry)
-{
-       return 1;
-}
-
 static struct dentry_operations efivarfs_d_ops = {
        .d_compare = efivarfs_d_compare,
        .d_hash = efivarfs_d_hash,
-       .d_delete = efivarfs_delete_dentry,
+       .d_delete = always_delete_dentry,
 };
 
 static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
index 977319f..7ea097f 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1380,10 +1380,6 @@ int search_binary_handler(struct linux_binprm *bprm)
        if (retval)
                return retval;
 
-       retval = audit_bprm(bprm);
-       if (retval)
-               return retval;
-
        retval = -ENOENT;
  retry:
        read_lock(&binfmt_lock);
@@ -1431,6 +1427,7 @@ static int exec_binprm(struct linux_binprm *bprm)
 
        ret = search_binary_handler(bprm);
        if (ret >= 0) {
+               audit_bprm(bprm);
                trace_sched_process_exec(current, old_pid, bprm);
                ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
                current->did_exec = 1;
index e66a800..c8420f7 100644 (file)
@@ -1899,7 +1899,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
                        gi->nhash = 0;
                }
        /* Skip entries for other sb and dead entries */
-       } while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref));
+       } while (gi->sdp != gi->gl->gl_sbd ||
+                __lockref_is_dead(&gi->gl->gl_lockref));
 
        return 0;
 }
index 1615df1..7119504 100644 (file)
@@ -1171,8 +1171,11 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
        if (d != NULL)
                dentry = d;
        if (dentry->d_inode) {
-               if (!(*opened & FILE_OPENED))
+               if (!(*opened & FILE_OPENED)) {
+                       if (d == NULL)
+                               dget(dentry);
                        return finish_no_open(file, dentry);
+               }
                dput(d);
                return 0;
        }
index c8423d6..2a6ba06 100644 (file)
@@ -466,19 +466,19 @@ static void gdlm_cancel(struct gfs2_glock *gl)
 static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
                             char *lvb_bits)
 {
-       uint32_t gen;
+       __le32 gen;
        memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
-       memcpy(&gen, lvb_bits, sizeof(uint32_t));
+       memcpy(&gen, lvb_bits, sizeof(__le32));
        *lvb_gen = le32_to_cpu(gen);
 }
 
 static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
                              char *lvb_bits)
 {
-       uint32_t gen;
+       __le32 gen;
        memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
        gen = cpu_to_le32(lvb_gen);
-       memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
+       memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
 }
 
 static int all_jid_bits_clear(char *lvb)
index 453b50e..98236d0 100644 (file)
@@ -667,7 +667,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        struct buffer_head *bh;
        struct page *page;
        void *kaddr, *ptr;
-       struct gfs2_quota q, *qp;
+       struct gfs2_quota q;
        int err, nbytes;
        u64 size;
 
@@ -683,28 +683,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                return err;
 
        err = -EIO;
-       qp = &q;
-       qp->qu_value = be64_to_cpu(qp->qu_value);
-       qp->qu_value += change;
-       qp->qu_value = cpu_to_be64(qp->qu_value);
-       qd->qd_qb.qb_value = qp->qu_value;
+       be64_add_cpu(&q.qu_value, change);
+       qd->qd_qb.qb_value = q.qu_value;
        if (fdq) {
                if (fdq->d_fieldmask & FS_DQ_BSOFT) {
-                       qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
-                       qd->qd_qb.qb_warn = qp->qu_warn;
+                       q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
+                       qd->qd_qb.qb_warn = q.qu_warn;
                }
                if (fdq->d_fieldmask & FS_DQ_BHARD) {
-                       qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
-                       qd->qd_qb.qb_limit = qp->qu_limit;
+                       q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
+                       qd->qd_qb.qb_limit = q.qu_limit;
                }
                if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
-                       qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
-                       qd->qd_qb.qb_value = qp->qu_value;
+                       q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                       qd->qd_qb.qb_value = q.qu_value;
                }
        }
 
        /* Write the quota into the quota file on disk */
-       ptr = qp;
+       ptr = &q;
        nbytes = sizeof(struct gfs2_quota);
 get_a_page:
        page = find_or_create_page(mapping, index, GFP_NOFS);
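
be64_add_cpu() collapses the convert/add/convert dance that the old code routed through qp into one read-modify-write on the big-endian field. Its effect, modeled in portable userspace C using glibc's <endian.h> (names here are illustrative):

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Userspace model of be64_add_cpu(): add a CPU-endian delta to a
     * big-endian on-disk field without a separate native-order copy. */
    static void be64_add_cpu_model(uint64_t *var, int64_t val)
    {
        *var = htobe64(be64toh(*var) + val);
    }

    int main(void)
    {
        uint64_t qu_value = htobe64(100);   /* on-disk representation */

        be64_add_cpu_model(&qu_value, 42);  /* apply the quota change */
        printf("%llu\n", (unsigned long long)be64toh(qu_value)); /* 142 */
        return 0;
    }
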
index 4d83abd..c8d6161 100644 (file)
@@ -1127,7 +1127,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
                rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
                rgd->rd_free_clone = rgd->rd_free;
        }
-       if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
+       if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
                rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
                gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
                                     rgd->rd_bits[0].bi_bh->b_data);
@@ -1161,7 +1161,7 @@ int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
        if (rgd->rd_flags & GFS2_RDF_UPTODATE)
                return 0;
 
-       if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
+       if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
                return gfs2_rgrp_bh_get(rgd);
 
        rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
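
Note that cpu_to_be32(x) and be32_to_cpu(x) are the same bit operation on any host (a byte swap is its own inverse), so these two hunks do not change behaviour: they fix the direction of the conversion so the expression carries the __be32 annotation that matches rl_magic, which is what sparse's endianness checking verifies. A quick userspace confirmation of that equivalence:

    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t magic = 0x01161970u;  /* stand-in for GFS2_MAGIC */

        /* Both directions produce identical bits; only the sparse
         * type annotations (__be32 vs u32) differ in the kernel. */
        printf("cpu_to_be32: %#x\n", htobe32(magic));
        printf("be32_to_cpu: %#x\n", be32toh(magic));
        return 0;
    }
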
index 2543728..db23ce1 100644 (file)
@@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 
 #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
 
-static int hostfs_d_delete(const struct dentry *dentry)
-{
-       return 1;
-}
-
-static const struct dentry_operations hostfs_dentry_ops = {
-       .d_delete               = hostfs_d_delete,
-};
-
 /* Changed in hostfs_args before the kernel starts running */
 static char *root_ino = "";
 static int append = 0;
@@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
        sb->s_blocksize_bits = 10;
        sb->s_magic = HOSTFS_SUPER_MAGIC;
        sb->s_op = &hostfs_sbops;
-       sb->s_d_op = &hostfs_dentry_ops;
+       sb->s_d_op = &simple_dentry_operations;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
 
        /* NULL is printed as <NULL> by sprintf: avoid that. */
index 5de0694..a184424 100644 (file)
@@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs);
  * Retaining negative dentries for an in-memory filesystem just wastes
  * memory and lookup time: arrange for them to be deleted immediately.
  */
-static int simple_delete_dentry(const struct dentry *dentry)
+int always_delete_dentry(const struct dentry *dentry)
 {
        return 1;
 }
+EXPORT_SYMBOL(always_delete_dentry);
+
+const struct dentry_operations simple_dentry_operations = {
+       .d_delete = always_delete_dentry,
+};
+EXPORT_SYMBOL(simple_dentry_operations);
 
 /*
  * Lookup the data. This is trivial - if the dentry didn't already
@@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry)
  */
 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
-       static const struct dentry_operations simple_dentry_operations = {
-               .d_delete = simple_delete_dentry,
-       };
-
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
        if (!dentry->d_sb->s_d_op)
index e029a4c..8f77a8c 100644 (file)
@@ -2435,6 +2435,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  */
 static inline int may_create(struct inode *dir, struct dentry *child)
 {
+       audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
        if (child->d_inode)
                return -EEXIST;
        if (IS_DEADDIR(dir))
index 088de13..ee7237f 100644 (file)
@@ -141,8 +141,8 @@ xdr_error:                                  \
 
 static void next_decode_page(struct nfsd4_compoundargs *argp)
 {
-       argp->pagelist++;
        argp->p = page_address(argp->pagelist[0]);
+       argp->pagelist++;
        if (argp->pagelen < PAGE_SIZE) {
                argp->end = argp->p + (argp->pagelen>>2);
                argp->pagelen = 0;
@@ -1229,6 +1229,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
                len -= pages * PAGE_SIZE;
 
                argp->p = (__be32 *)page_address(argp->pagelist[0]);
+               argp->pagelist++;
                argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
        }
        argp->p += XDR_QUADLEN(len);
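
Both hunks fix the same use-after-advance pattern: the old code bumped the pagelist cursor before (or without) reading element [0], addressing the page after the intended one. The bug in miniature, runnable:

    #include <stdio.h>

    int main(void)
    {
        const char *pages[] = { "page A", "page B" };
        const char **cursor;

        /* Buggy order: advance the cursor, then read [0]. */
        cursor = pages;
        cursor++;
        printf("buggy:   %s\n", cursor[0]);   /* page B - wrong page */

        /* Fixed order: read [0] first, then advance. */
        cursor = pages;
        printf("correct: %s\n", cursor[0]);   /* page A */
        cursor++;
        return 0;
    }
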
index 94b5f5d..7eea63c 100644 (file)
@@ -298,41 +298,12 @@ commit_metadata(struct svc_fh *fhp)
 }
 
 /*
- * Set various file attributes.
- * N.B. After this call fhp needs an fh_put
+ * Go over the attributes and take care of the small differences between
+ * NFS semantics and what Linux expects.
  */
-__be32
-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
-            int check_guard, time_t guardtime)
+static void
+nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
 {
-       struct dentry   *dentry;
-       struct inode    *inode;
-       int             accmode = NFSD_MAY_SATTR;
-       umode_t         ftype = 0;
-       __be32          err;
-       int             host_err;
-       int             size_change = 0;
-
-       if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
-               accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
-       if (iap->ia_valid & ATTR_SIZE)
-               ftype = S_IFREG;
-
-       /* Get inode */
-       err = fh_verify(rqstp, fhp, ftype, accmode);
-       if (err)
-               goto out;
-
-       dentry = fhp->fh_dentry;
-       inode = dentry->d_inode;
-
-       /* Ignore any mode updates on symlinks */
-       if (S_ISLNK(inode->i_mode))
-               iap->ia_valid &= ~ATTR_MODE;
-
-       if (!iap->ia_valid)
-               goto out;
-
        /*
         * NFSv2 does not differentiate between "set-[ac]time-to-now"
         * which only requires access, and "set-[ac]time-to-X" which
@@ -342,8 +313,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
         * convert to "set to now" instead of "set to explicit time"
         *
         * We only call inode_change_ok as the last test as technically
-        * it is not an interface that we should be using.  It is only
-        * valid if the filesystem does not define it's own i_op->setattr.
+        * it is not an interface that we should be using.
         */
 #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
 #define        MAX_TOUCH_TIME_ERROR (30*60)
@@ -369,30 +339,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                        iap->ia_valid &= ~BOTH_TIME_SET;
                }
        }
-           
-       /*
-        * The size case is special.
-        * It changes the file as well as the attributes.
-        */
-       if (iap->ia_valid & ATTR_SIZE) {
-               if (iap->ia_size < inode->i_size) {
-                       err = nfsd_permission(rqstp, fhp->fh_export, dentry,
-                                       NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
-                       if (err)
-                               goto out;
-               }
-
-               host_err = get_write_access(inode);
-               if (host_err)
-                       goto out_nfserr;
-
-               size_change = 1;
-               host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
-               if (host_err) {
-                       put_write_access(inode);
-                       goto out_nfserr;
-               }
-       }
 
        /* sanitize the mode change */
        if (iap->ia_valid & ATTR_MODE) {
@@ -415,32 +361,111 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
                        iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
                }
        }
+}
 
-       /* Change the attributes. */
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+               struct iattr *iap)
+{
+       struct inode *inode = fhp->fh_dentry->d_inode;
+       int host_err;
 
-       iap->ia_valid |= ATTR_CTIME;
+       if (iap->ia_size < inode->i_size) {
+               __be32 err;
 
-       err = nfserr_notsync;
-       if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
-               host_err = nfsd_break_lease(inode);
-               if (host_err)
-                       goto out_nfserr;
-               fh_lock(fhp);
+               err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+                               NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+               if (err)
+                       return err;
+       }
 
-               host_err = notify_change(dentry, iap, NULL);
-               err = nfserrno(host_err);
-               fh_unlock(fhp);
+       host_err = get_write_access(inode);
+       if (host_err)
+               goto out_nfserrno;
+
+       host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+       if (host_err)
+               goto out_put_write_access;
+       return 0;
+
+out_put_write_access:
+       put_write_access(inode);
+out_nfserrno:
+       return nfserrno(host_err);
+}
+
+/*
+ * Set various file attributes.  After this call fhp needs an fh_put.
+ */
+__be32
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+            int check_guard, time_t guardtime)
+{
+       struct dentry   *dentry;
+       struct inode    *inode;
+       int             accmode = NFSD_MAY_SATTR;
+       umode_t         ftype = 0;
+       __be32          err;
+       int             host_err;
+       int             size_change = 0;
+
+       if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+               accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+       if (iap->ia_valid & ATTR_SIZE)
+               ftype = S_IFREG;
+
+       /* Get inode */
+       err = fh_verify(rqstp, fhp, ftype, accmode);
+       if (err)
+               goto out;
+
+       dentry = fhp->fh_dentry;
+       inode = dentry->d_inode;
+
+       /* Ignore any mode updates on symlinks */
+       if (S_ISLNK(inode->i_mode))
+               iap->ia_valid &= ~ATTR_MODE;
+
+       if (!iap->ia_valid)
+               goto out;
+
+       nfsd_sanitize_attrs(inode, iap);
+
+       /*
+        * The size case is special, it changes the file in addition to the
+        * attributes.
+        */
+       if (iap->ia_valid & ATTR_SIZE) {
+               err = nfsd_get_write_access(rqstp, fhp, iap);
+               if (err)
+                       goto out;
+               size_change = 1;
        }
+
+       iap->ia_valid |= ATTR_CTIME;
+
+       if (check_guard && guardtime != inode->i_ctime.tv_sec) {
+               err = nfserr_notsync;
+               goto out_put_write_access;
+       }
+
+       host_err = nfsd_break_lease(inode);
+       if (host_err)
+               goto out_put_write_access_nfserror;
+
+       fh_lock(fhp);
+       host_err = notify_change(dentry, iap, NULL);
+       fh_unlock(fhp);
+
+out_put_write_access_nfserror:
+       err = nfserrno(host_err);
+out_put_write_access:
        if (size_change)
                put_write_access(inode);
        if (!err)
                commit_metadata(fhp);
 out:
        return err;
-
-out_nfserr:
-       err = nfserrno(host_err);
-       goto out;
 }
 
 #if defined(CONFIG_NFSD_V2_ACL) || \
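
The rewrite splits nfsd_setattr() into helpers and converts the error handling to the usual labelled-unwind idiom: acquire in order, release in reverse on the way out. The shape in a generic, runnable sketch (hypothetical resources):

    #include <stdio.h>

    static int acquire_a(void) { return 0; }    /* 0 = success */
    static int acquire_b(void) { return -1; }   /* simulate failure */
    static void release_a(void) { puts("released a"); }

    /* Each fallible step jumps to a label that releases exactly what
     * was already taken, in reverse order of acquisition. */
    static int do_operation(void)
    {
        int err;

        err = acquire_a();
        if (err)
            return err;

        err = acquire_b();
        if (err)
            goto out_release_a;

        /* ... real work ... */
        return 0;

    out_release_a:
        release_a();
        return err;
    }

    int main(void)
    {
        printf("do_operation: %d\n", do_operation());
        return 0;
    }
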
index 1485e38..03c8d74 100644 (file)
@@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                goto out_free_page;
 
        }
-       kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
-       if (!uid_valid(kloginuid)) {
-               length = -EINVAL;
-               goto out_free_page;
+
+       /* is userspace trying to explicitly UNSET the loginuid? */
+       if (loginuid == AUDIT_UID_UNSET) {
+               kloginuid = INVALID_UID;
+       } else {
+               kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+               if (!uid_valid(kloginuid)) {
+                       length = -EINVAL;
+                       goto out_free_page;
+               }
        }
 
        length = audit_set_loginuid(kloginuid);
index 737e156..cca93b6 100644 (file)
@@ -175,22 +175,6 @@ static const struct inode_operations proc_link_inode_operations = {
 };
 
 /*
- * As some entries in /proc are volatile, we want to 
- * get rid of unused dentries.  This could be made 
- * smarter: we could keep a "volatile" flag in the 
- * inode to indicate which ones to keep.
- */
-static int proc_delete_dentry(const struct dentry * dentry)
-{
-       return 1;
-}
-
-static const struct dentry_operations proc_dentry_operations =
-{
-       .d_delete       = proc_delete_dentry,
-};
-
-/*
  * Don't create negative dentries here, return -ENOENT by hand
  * instead.
  */
@@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                        inode = proc_get_inode(dir->i_sb, de);
                        if (!inode)
                                return ERR_PTR(-ENOMEM);
-                       d_set_d_op(dentry, &proc_dentry_operations);
+                       d_set_d_op(dentry, &simple_dentry_operations);
                        d_add(dentry, inode);
                        return NULL;
                }
index 49a7fff..9ae46b8 100644 (file)
@@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = {
        .setattr        = proc_setattr,
 };
 
-static int ns_delete_dentry(const struct dentry *dentry)
-{
-       /* Don't cache namespace inodes when not in use */
-       return 1;
-}
-
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 {
        struct inode *inode = dentry->d_inode;
@@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 
 const struct dentry_operations ns_dentry_operations =
 {
-       .d_delete       = ns_delete_dentry,
+       .d_delete       = always_delete_dentry,
        .d_dname        = ns_dname,
 };
 
index c70111e..b6fa865 100644 (file)
@@ -25,6 +25,78 @@ config SQUASHFS
 
          If unsure, say N.
 
+choice
+       prompt "File decompression options"
+       depends on SQUASHFS
+       help
+         Squashfs now supports two options for decompressing file
+         data.  Traditionally Squashfs has decompressed into an
+         intermediate buffer and then memcopied it into the page cache.
+         It can now also decompress directly into the page cache.
+
+         If unsure, select "Decompress file data into an intermediate buffer"
+
+config SQUASHFS_FILE_CACHE
+       bool "Decompress file data into an intermediate buffer"
+       help
+         Decompress file data into an intermediate buffer and then
+         memcopy it into the page cache.
+
+config SQUASHFS_FILE_DIRECT
+       bool "Decompress files directly into the page cache"
+       help
+         Directly decompress file data into the page cache.
+         Doing so can significantly improve performance because
+         it eliminates a memcpy and it also removes the lock contention
+         on the single buffer.
+
+endchoice
+
+choice
+       prompt "Decompressor parallelisation options"
+       depends on SQUASHFS
+       help
+         Squashfs now supports three parallelisation options for
+         decompression.  Each one exhibits various trade-offs between
+         decompression performance and CPU and memory usage.
+
+         If in doubt, select "Single threaded compression"
+
+config SQUASHFS_DECOMP_SINGLE
+       bool "Single threaded compression"
+       help
+         Traditionally Squashfs has used single-threaded decompression.
+         Only one block (data or metadata) can be decompressed at any
+         one time.  This limits CPU and memory usage to a minimum.
+
+config SQUASHFS_DECOMP_MULTI
+       bool "Use multiple decompressors for parallel I/O"
+       help
+         By default Squashfs uses a single decompressor, which gives
+         poor performance on parallel I/O workloads on multi-CPU
+         machines due to waiting on decompressor availability.
+
+         If you have a parallel I/O workload and your system has enough memory,
+         using this option may improve overall I/O performance.
+
+         This decompressor implementation uses up to two parallel
+         decompressors per core.  It dynamically allocates decompressors
+         on a demand basis.
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+       bool "Use percpu multiple decompressors for parallel I/O"
+       help
+         By default Squashfs uses a single decompressor, which gives
+         poor performance on parallel I/O workloads on multi-CPU
+         machines due to waiting on decompressor availability.
+
+         This decompressor implementation uses a maximum of one
+         decompressor per core.  It uses percpu variables to ensure
+         decompression is load-balanced across the cores.
+
+endchoice
+
 config SQUASHFS_XATTR
        bool "Squashfs XATTR support"
        depends on SQUASHFS
index 110b047..4132520 100644 (file)
@@ -5,6 +5,11 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
 squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
 squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
 squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
index 41d108e..0cea9b9 100644 (file)
@@ -36,6 +36,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 /*
  * Read the metadata block length, this is stored in the first two
@@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb,
  * generated a larger block - this does occasionally happen with compression
  * algorithms).
  */
-int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
-                       int length, u64 *next_index, int srclength, int pages)
+int squashfs_read_data(struct super_block *sb, u64 index, int length,
+               u64 *next_index, struct squashfs_page_actor *output)
 {
        struct squashfs_sb_info *msblk = sb->s_fs_info;
        struct buffer_head **bh;
        int offset = index & ((1 << msblk->devblksize_log2) - 1);
        u64 cur_index = index >> msblk->devblksize_log2;
-       int bytes, compressed, b = 0, k = 0, page = 0, avail;
+       int bytes, compressed, b = 0, k = 0, avail, i;
 
-       bh = kcalloc(((srclength + msblk->devblksize - 1)
+       bh = kcalloc(((output->length + msblk->devblksize - 1)
                >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
        if (bh == NULL)
                return -ENOMEM;
@@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                        *next_index = index + length;
 
                TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
-                       index, compressed ? "" : "un", length, srclength);
+                       index, compressed ? "" : "un", length, output->length);
 
-               if (length < 0 || length > srclength ||
+               if (length < 0 || length > output->length ||
                                (index + length) > msblk->bytes_used)
                        goto read_failure;
 
@@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
                                compressed ? "" : "un", length);
 
-               if (length < 0 || length > srclength ||
+               if (length < 0 || length > output->length ||
                                        (index + length) > msblk->bytes_used)
                        goto block_release;
 
@@ -158,9 +159,15 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                ll_rw_block(READ, b - 1, bh + 1);
        }
 
+       for (i = 0; i < b; i++) {
+               wait_on_buffer(bh[i]);
+               if (!buffer_uptodate(bh[i]))
+                       goto block_release;
+       }
+
        if (compressed) {
-               length = squashfs_decompress(msblk, buffer, bh, b, offset,
-                        length, srclength, pages);
+               length = squashfs_decompress(msblk, bh, b, offset, length,
+                       output);
                if (length < 0)
                        goto read_failure;
        } else {
@@ -168,22 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                 * Block is uncompressed.
                 */
                int in, pg_offset = 0;
+               void *data = squashfs_first_page(output);
 
                for (bytes = length; k < b; k++) {
                        in = min(bytes, msblk->devblksize - offset);
                        bytes -= in;
-                       wait_on_buffer(bh[k]);
-                       if (!buffer_uptodate(bh[k]))
-                               goto block_release;
                        while (in) {
                                if (pg_offset == PAGE_CACHE_SIZE) {
-                                       page++;
+                                       data = squashfs_next_page(output);
                                        pg_offset = 0;
                                }
                                avail = min_t(int, in, PAGE_CACHE_SIZE -
                                                pg_offset);
-                               memcpy(buffer[page] + pg_offset,
-                                               bh[k]->b_data + offset, avail);
+                               memcpy(data + pg_offset, bh[k]->b_data + offset,
+                                               avail);
                                in -= avail;
                                pg_offset += avail;
                                offset += avail;
@@ -191,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
                        offset = 0;
                        put_bh(bh[k]);
                }
+               squashfs_finish_page(output);
        }
 
        kfree(bh);
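
The new squashfs_page_actor hides whether the decompressor's output pages come from an intermediate buffer or straight from the page cache behind a first/next/finish iterator (the real interface lives in page_actor.h, which this diff only references). A minimal userspace model of that contract, with an invented struct layout:

    #include <stdio.h>

    /* Toy page actor: hand out output buffers one at a time. */
    struct page_actor_model {
        char **pages;   /* array of output buffers */
        int npages;
        int next;       /* index of the next buffer to hand out */
    };

    static void *first_page(struct page_actor_model *a)
    {
        a->next = 1;
        return a->pages[0];
    }

    static void *next_page(struct page_actor_model *a)
    {
        return a->next < a->npages ? a->pages[a->next++] : NULL;
    }

    static void finish_page(struct page_actor_model *a)
    {
        a->next = 0;    /* the real code would kunmap/flush here */
    }

    int main(void)
    {
        char p0[16] = "block part 0", p1[16] = "block part 1";
        char *pages[] = { p0, p1 };
        struct page_actor_model actor = { pages, 2, 0 };
        void *p;

        for (p = first_page(&actor); p; p = next_page(&actor))
            puts(p);
        finish_page(&actor);
        return 0;
    }
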
index af0b738..1cb70a0 100644 (file)
@@ -56,6 +56,7 @@
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
+#include "page_actor.h"
 
 /*
  * Look-up block in cache, and increment usage count.  If not in cache, read
@@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
                        entry->error = 0;
                        spin_unlock(&cache->lock);
 
-                       entry->length = squashfs_read_data(sb, entry->data,
-                               block, length, &entry->next_index,
-                               cache->block_size, cache->pages);
+                       entry->length = squashfs_read_data(sb, block, length,
+                               &entry->next_index, entry->actor);
 
                        spin_lock(&cache->lock);
 
@@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
                                kfree(cache->entry[i].data[j]);
                        kfree(cache->entry[i].data);
                }
+               kfree(cache->entry[i].actor);
        }
 
        kfree(cache->entry);
@@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
                                goto cleanup;
                        }
                }
+
+               entry->actor = squashfs_page_actor_init(entry->data,
+                                               cache->pages, 0);
+               if (entry->actor == NULL) {
+                       ERROR("Failed to allocate %s cache entry\n", name);
+                       goto cleanup;
+               }
        }
 
        return cache;
@@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
        int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int i, res;
        void *table, *buffer, **data;
+       struct squashfs_page_actor *actor;
 
        table = buffer = kmalloc(length, GFP_KERNEL);
        if (table == NULL)
@@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
                goto failed;
        }
 
+       actor = squashfs_page_actor_init(data, pages, length);
+       if (actor == NULL) {
+               res = -ENOMEM;
+               goto failed2;
+       }
+
        for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
                data[i] = buffer;
 
-       res = squashfs_read_data(sb, data, block, length |
-               SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
+       res = squashfs_read_data(sb, block, length |
+               SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
 
        kfree(data);
+       kfree(actor);
 
        if (res < 0)
                goto failed;
 
        return table;
 
+failed2:
+       kfree(data);
 failed:
        kfree(table);
        return ERR_PTR(res);
index 3f6271d..ac22fe7 100644 (file)
@@ -30,6 +30,7 @@
 #include "squashfs_fs_sb.h"
 #include "decompressor.h"
 #include "squashfs.h"
+#include "page_actor.h"
 
 /*
  * This file (and decompressor.h) implements a decompressor framework for
  * Squashfs, allowing multiple decompressors to be easily supported
  */
 
 static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
-       NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+       NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
 };
 
 #ifndef CONFIG_SQUASHFS_LZO
 static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
-       NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+       NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
 #endif
 
 #ifndef CONFIG_SQUASHFS_XZ
 static const struct squashfs_decompressor squashfs_xz_comp_ops = {
-       NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+       NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
 };
 #endif
 
 #ifndef CONFIG_SQUASHFS_ZLIB
 static const struct squashfs_decompressor squashfs_zlib_comp_ops = {
-       NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
+       NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
 };
 #endif
 
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
-       NULL, NULL, NULL, 0, "unknown", 0
+       NULL, NULL, NULL, NULL, 0, "unknown", 0
 };
 
 static const struct squashfs_decompressor *decompressor[] = {
@@ -83,10 +84,11 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
 }
 
 
-void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
+static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 {
        struct squashfs_sb_info *msblk = sb->s_fs_info;
-       void *strm, *buffer = NULL;
+       void *buffer = NULL, *comp_opts;
+       struct squashfs_page_actor *actor = NULL;
        int length = 0;
 
        /*
@@ -94,23 +96,46 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
         */
        if (SQUASHFS_COMP_OPTS(flags)) {
                buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
-               if (buffer == NULL)
-                       return ERR_PTR(-ENOMEM);
+               if (buffer == NULL) {
+                       comp_opts = ERR_PTR(-ENOMEM);
+                       goto out;
+               }
+
+               actor = squashfs_page_actor_init(&buffer, 1, 0);
+               if (actor == NULL) {
+                       comp_opts = ERR_PTR(-ENOMEM);
+                       goto out;
+               }
 
-               length = squashfs_read_data(sb, &buffer,
-                       sizeof(struct squashfs_super_block), 0, NULL,
-                       PAGE_CACHE_SIZE, 1);
+               length = squashfs_read_data(sb,
+                       sizeof(struct squashfs_super_block), 0, NULL, actor);
 
                if (length < 0) {
-                       strm = ERR_PTR(length);
-                       goto finished;
+                       comp_opts = ERR_PTR(length);
+                       goto out;
                }
        }
 
-       strm = msblk->decompressor->init(msblk, buffer, length);
+       comp_opts = squashfs_comp_opts(msblk, buffer, length);
 
-finished:
+out:
+       kfree(actor);
        kfree(buffer);
+       return comp_opts;
+}
+
+
+void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
+{
+       struct squashfs_sb_info *msblk = sb->s_fs_info;
+       void *stream, *comp_opts = get_comp_opts(sb, flags);
+
+       if (IS_ERR(comp_opts))
+               return comp_opts;
+
+       stream = squashfs_decompressor_create(msblk, comp_opts);
+       if (IS_ERR(stream))
+               kfree(comp_opts);
 
-       return strm;
+       return stream;
 }
index 330073e..af09853 100644 (file)
  */
 
 struct squashfs_decompressor {
-       void    *(*init)(struct squashfs_sb_info *, void *, int);
+       void    *(*init)(struct squashfs_sb_info *, void *);
+       void    *(*comp_opts)(struct squashfs_sb_info *, void *, int);
        void    (*free)(void *);
-       int     (*decompress)(struct squashfs_sb_info *, void **,
-               struct buffer_head **, int, int, int, int, int);
+       int     (*decompress)(struct squashfs_sb_info *, void *,
+               struct buffer_head **, int, int, int,
+               struct squashfs_page_actor *);
        int     id;
        char    *name;
        int     supported;
 };
 
-static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
-       void *s)
+static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
+                                                       void *buff, int length)
 {
-       if (msblk->decompressor)
-               msblk->decompressor->free(s);
-}
-
-static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
-       void **buffer, struct buffer_head **bh, int b, int offset, int length,
-       int srclength, int pages)
-{
-       return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
-               length, srclength, pages);
+       return msblk->decompressor->comp_opts ?
+               msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
 }
 
 #ifdef CONFIG_SQUASHFS_XZ
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
new file mode 100644 (file)
index 0000000..d6008a6
--- /dev/null
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (c) 2013
+ *  Minchan Kim <minchan@kernel.org>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/cpumask.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression in the
+ * decompressor framework
+ */
+
+
+/*
+ * The reason for multiplying by two is that a CPU can request new I/O
+ * while it is waiting for a previous request to complete.
+ */
+#define MAX_DECOMPRESSOR       (num_online_cpus() * 2)
+
+
+int squashfs_max_decompressors(void)
+{
+       return MAX_DECOMPRESSOR;
+}
+
+
+struct squashfs_stream {
+       void                    *comp_opts;
+       struct list_head        strm_list;
+       struct mutex            mutex;
+       int                     avail_decomp;
+       wait_queue_head_t       wait;
+};
+
+
+struct decomp_stream {
+       void *stream;
+       struct list_head list;
+};
+
+
+static void put_decomp_stream(struct decomp_stream *decomp_strm,
+                               struct squashfs_stream *stream)
+{
+       mutex_lock(&stream->mutex);
+       list_add(&decomp_strm->list, &stream->strm_list);
+       mutex_unlock(&stream->mutex);
+       wake_up(&stream->wait);
+}
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+                               void *comp_opts)
+{
+       struct squashfs_stream *stream;
+       struct decomp_stream *decomp_strm = NULL;
+       int err = -ENOMEM;
+
+       stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+       if (!stream)
+               goto out;
+
+       stream->comp_opts = comp_opts;
+       mutex_init(&stream->mutex);
+       INIT_LIST_HEAD(&stream->strm_list);
+       init_waitqueue_head(&stream->wait);
+
+       /*
+        * We should have at least one decompressor by default, so
+        * that if we fail to allocate a new decompressor dynamically,
+        * we can always fall back to the default one and the file
+        * system keeps working.
+        */
+       decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+       if (!decomp_strm)
+               goto out;
+
+       decomp_strm->stream = msblk->decompressor->init(msblk,
+                                               stream->comp_opts);
+       if (IS_ERR(decomp_strm->stream)) {
+               err = PTR_ERR(decomp_strm->stream);
+               goto out;
+       }
+
+       list_add(&decomp_strm->list, &stream->strm_list);
+       stream->avail_decomp = 1;
+       return stream;
+
+out:
+       kfree(decomp_strm);
+       kfree(stream);
+       return ERR_PTR(err);
+}
+
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+       struct squashfs_stream *stream = msblk->stream;
+       if (stream) {
+               struct decomp_stream *decomp_strm;
+
+               while (!list_empty(&stream->strm_list)) {
+                       decomp_strm = list_entry(stream->strm_list.prev,
+                                               struct decomp_stream, list);
+                       list_del(&decomp_strm->list);
+                       msblk->decompressor->free(decomp_strm->stream);
+                       kfree(decomp_strm);
+                       stream->avail_decomp--;
+               }
+               WARN_ON(stream->avail_decomp);
+               kfree(stream->comp_opts);
+               kfree(stream);
+       }
+}
+
+
+static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
+                                       struct squashfs_stream *stream)
+{
+       struct decomp_stream *decomp_strm;
+
+       while (1) {
+               mutex_lock(&stream->mutex);
+
+               /* There is an available decomp_stream */
+               if (!list_empty(&stream->strm_list)) {
+                       decomp_strm = list_entry(stream->strm_list.prev,
+                               struct decomp_stream, list);
+                       list_del(&decomp_strm->list);
+                       mutex_unlock(&stream->mutex);
+                       break;
+               }
+
+               /*
+                * If no decompressor is available and we are already
+                * at the limit, wait for another user to release one.
+                */
+               if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+                       goto wait;
+
+               /* Let's allocate new decomp */
+               decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+               if (!decomp_strm)
+                       goto wait;
+
+               decomp_strm->stream = msblk->decompressor->init(msblk,
+                                               stream->comp_opts);
+               if (IS_ERR(decomp_strm->stream)) {
+                       kfree(decomp_strm);
+                       goto wait;
+               }
+
+               stream->avail_decomp++;
+               WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+
+               mutex_unlock(&stream->mutex);
+               break;
+wait:
+               /*
+                * If system memory is tight, wait for another user to
+                * release a decompressor instead of pressuring the VM,
+                * which could cause page cache thrashing.
+                */
+               mutex_unlock(&stream->mutex);
+               wait_event(stream->wait,
+                       !list_empty(&stream->strm_list));
+       }
+
+       return decomp_strm;
+}
+
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+       int b, int offset, int length, struct squashfs_page_actor *output)
+{
+       int res;
+       struct squashfs_stream *stream = msblk->stream;
+       struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
+       res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
+               bh, b, offset, length, output);
+       put_decomp_stream(decomp_stream, stream);
+       if (res < 0)
+               ERROR("%s decompression failed, data probably corrupt\n",
+                       msblk->decompressor->name);
+       return res;
+}
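
get_decomp_stream()/put_decomp_stream() above implement a lazily grown, bounded pool: reuse a free stream if one exists, allocate up to the cap, otherwise sleep until a stream is put back. The same shape in runnable userspace C, with a pthread condition variable standing in for the wait queue (all names hypothetical; error handling elided):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define POOL_MAX 4   /* stand-in for MAX_DECOMPRESSOR */

    struct pool {
        pthread_mutex_t lock;
        pthread_cond_t wait;
        void *free_items[POOL_MAX];
        int nfree;       /* items currently on the free list */
        int nalloc;      /* items ever allocated, <= POOL_MAX */
    };

    static void *pool_get(struct pool *p)
    {
        void *item;

        pthread_mutex_lock(&p->lock);
        for (;;) {
            if (p->nfree) {                  /* reuse a free item */
                item = p->free_items[--p->nfree];
                break;
            }
            if (p->nalloc < POOL_MAX) {      /* grow on demand */
                item = malloc(64);
                p->nalloc++;
                break;
            }
            /* Full and busy: sleep until pool_put() signals us. */
            pthread_cond_wait(&p->wait, &p->lock);
        }
        pthread_mutex_unlock(&p->lock);
        return item;
    }

    static void pool_put(struct pool *p, void *item)
    {
        pthread_mutex_lock(&p->lock);
        p->free_items[p->nfree++] = item;
        pthread_mutex_unlock(&p->lock);
        pthread_cond_signal(&p->wait);
    }

    int main(void)
    {
        struct pool p = { PTHREAD_MUTEX_INITIALIZER,
                          PTHREAD_COND_INITIALIZER, {0}, 0, 0 };
        void *a = pool_get(&p), *b = pool_get(&p);

        pool_put(&p, a);
        pool_put(&p, b);
        printf("allocated %d, free %d\n", p.nalloc, p.nfree);
        return 0;
    }
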
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
new file mode 100644 (file)
index 0000000..23a9c28
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression using percpu
+ * variables, one thread per cpu core.
+ */
+
+struct squashfs_stream {
+       void            *stream;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+                                               void *comp_opts)
+{
+       struct squashfs_stream *stream;
+       struct squashfs_stream __percpu *percpu;
+       int err, cpu;
+
+       percpu = alloc_percpu(struct squashfs_stream);
+       if (percpu == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       for_each_possible_cpu(cpu) {
+               stream = per_cpu_ptr(percpu, cpu);
+               stream->stream = msblk->decompressor->init(msblk, comp_opts);
+               if (IS_ERR(stream->stream)) {
+                       err = PTR_ERR(stream->stream);
+                       goto out;
+               }
+       }
+
+       kfree(comp_opts);
+       return (__force void *) percpu;
+
+out:
+       for_each_possible_cpu(cpu) {
+               stream = per_cpu_ptr(percpu, cpu);
+               if (!IS_ERR_OR_NULL(stream->stream))
+                       msblk->decompressor->free(stream->stream);
+       }
+       free_percpu(percpu);
+       return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+       struct squashfs_stream __percpu *percpu =
+                       (struct squashfs_stream __percpu *) msblk->stream;
+       struct squashfs_stream *stream;
+       int cpu;
+
+       if (msblk->stream) {
+               for_each_possible_cpu(cpu) {
+                       stream = per_cpu_ptr(percpu, cpu);
+                       msblk->decompressor->free(stream->stream);
+               }
+               free_percpu(percpu);
+       }
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+       int b, int offset, int length, struct squashfs_page_actor *output)
+{
+       struct squashfs_stream __percpu *percpu =
+                       (struct squashfs_stream __percpu *) msblk->stream;
+       struct squashfs_stream *stream = get_cpu_ptr(percpu);
+       int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+               offset, length, output);
+       put_cpu_ptr(stream);
+
+       if (res < 0)
+               ERROR("%s decompression failed, data probably corrupt\n",
+                       msblk->decompressor->name);
+
+       return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+       return num_possible_cpus();
+}
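
The percpu variant trades memory for zero contention: one stream per possible CPU, with get_cpu_ptr()/put_cpu_ptr() pinning the caller to its CPU's copy while it is in use. Thread-local storage gives a rough userspace analogue — one private copy per thread rather than per CPU (sketch only):

    #include <pthread.h>
    #include <stdio.h>

    /* Rough analogue of the percpu scheme: each thread gets a private
     * stream, so no locking is needed around its use. */
    static _Thread_local struct { int uses; } my_stream;

    static void *worker(void *arg)
    {
        int i;

        (void)arg;
        for (i = 0; i < 1000; i++)
            my_stream.uses++;   /* exclusive: no other thread sees it */
        printf("thread stream used %d times\n", my_stream.uses);
        return NULL;
    }

    int main(void)
    {
        pthread_t t1, t2;

        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }
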
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
new file mode 100644 (file)
index 0000000..a6c7592
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements single-threaded decompression in the
+ * decompressor framework
+ */
+
+struct squashfs_stream {
+       void            *stream;
+       struct mutex    mutex;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+                                               void *comp_opts)
+{
+       struct squashfs_stream *stream;
+       int err = -ENOMEM;
+
+       stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+       if (stream == NULL)
+               goto out;
+
+       stream->stream = msblk->decompressor->init(msblk, comp_opts);
+       if (IS_ERR(stream->stream)) {
+               err = PTR_ERR(stream->stream);
+               goto out;
+       }
+
+       kfree(comp_opts);
+       mutex_init(&stream->mutex);
+       return stream;
+
+out:
+       kfree(stream);
+       return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+       struct squashfs_stream *stream = msblk->stream;
+
+       if (stream) {
+               msblk->decompressor->free(stream->stream);
+               kfree(stream);
+       }
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+       int b, int offset, int length, struct squashfs_page_actor *output)
+{
+       int res;
+       struct squashfs_stream *stream = msblk->stream;
+
+       mutex_lock(&stream->mutex);
+       res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+               offset, length, output);
+       mutex_unlock(&stream->mutex);
+
+       if (res < 0)
+               ERROR("%s decompression failed, data probably corrupt\n",
+                       msblk->decompressor->name);
+
+       return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+       return 1;
+}
index 8ca62c2..e5c9689 100644 (file)
@@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block)
        return le32_to_cpu(size);
 }
 
-
-static int squashfs_readpage(struct file *file, struct page *page)
+/* Copy data into page cache  */
+void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
+       int bytes, int offset)
 {
        struct inode *inode = page->mapping->host;
        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-       int bytes, i, offset = 0, sparse = 0;
-       struct squashfs_cache_entry *buffer = NULL;
        void *pageaddr;
-
-       int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
-       int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
-       int start_index = page->index & ~mask;
-       int end_index = start_index | mask;
-       int file_end = i_size_read(inode) >> msblk->block_log;
-
-       TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
-                               page->index, squashfs_i(inode)->start);
-
-       if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-                                       PAGE_CACHE_SHIFT))
-               goto out;
-
-       if (index < file_end || squashfs_i(inode)->fragment_block ==
-                                       SQUASHFS_INVALID_BLK) {
-               /*
-                * Reading a datablock from disk.  Need to read block list
-                * to get location and block size.
-                */
-               u64 block = 0;
-               int bsize = read_blocklist(inode, index, &block);
-               if (bsize < 0)
-                       goto error_out;
-
-               if (bsize == 0) { /* hole */
-                       bytes = index == file_end ?
-                               (i_size_read(inode) & (msblk->block_size - 1)) :
-                                msblk->block_size;
-                       sparse = 1;
-               } else {
-                       /*
-                        * Read and decompress datablock.
-                        */
-                       buffer = squashfs_get_datablock(inode->i_sb,
-                                                               block, bsize);
-                       if (buffer->error) {
-                               ERROR("Unable to read page, block %llx, size %x"
-                                       "\n", block, bsize);
-                               squashfs_cache_put(buffer);
-                               goto error_out;
-                       }
-                       bytes = buffer->length;
-               }
-       } else {
-               /*
-                * Datablock is stored inside a fragment (tail-end packed
-                * block).
-                */
-               buffer = squashfs_get_fragment(inode->i_sb,
-                               squashfs_i(inode)->fragment_block,
-                               squashfs_i(inode)->fragment_size);
-
-               if (buffer->error) {
-                       ERROR("Unable to read page, block %llx, size %x\n",
-                               squashfs_i(inode)->fragment_block,
-                               squashfs_i(inode)->fragment_size);
-                       squashfs_cache_put(buffer);
-                       goto error_out;
-               }
-               bytes = i_size_read(inode) & (msblk->block_size - 1);
-               offset = squashfs_i(inode)->fragment_offset;
-       }
+       int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+       int start_index = page->index & ~mask, end_index = start_index | mask;
 
        /*
         * Loop copying datablock into pages.  As the datablock likely covers
@@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
        for (i = start_index; i <= end_index && bytes > 0; i++,
                        bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
                struct page *push_page;
-               int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
+               int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
 
                TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
 
@@ -475,11 +413,75 @@ skip_page:
                if (i != page->index)
                        page_cache_release(push_page);
        }
+}
+
+/* Read datablock stored packed inside a fragment (tail-end packed block) */
+static int squashfs_readpage_fragment(struct page *page)
+{
+       struct inode *inode = page->mapping->host;
+       struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+       struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+               squashfs_i(inode)->fragment_block,
+               squashfs_i(inode)->fragment_size);
+       int res = buffer->error;
+
+       if (res)
+               ERROR("Unable to read page, block %llx, size %x\n",
+                       squashfs_i(inode)->fragment_block,
+                       squashfs_i(inode)->fragment_size);
+       else
+               squashfs_copy_cache(page, buffer, i_size_read(inode) &
+                       (msblk->block_size - 1),
+                       squashfs_i(inode)->fragment_offset);
+
+       squashfs_cache_put(buffer);
+       return res;
+}
 
-       if (!sparse)
-               squashfs_cache_put(buffer);
+static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
+{
+       struct inode *inode = page->mapping->host;
+       struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+       int bytes = index == file_end ?
+                       (i_size_read(inode) & (msblk->block_size - 1)) :
+                        msblk->block_size;
 
+       squashfs_copy_cache(page, NULL, bytes, 0);
        return 0;
+}
+
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+       struct inode *inode = page->mapping->host;
+       struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+       int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+       int file_end = i_size_read(inode) >> msblk->block_log;
+       int res;
+       void *pageaddr;
+
+       TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+                               page->index, squashfs_i(inode)->start);
+
+       if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+                                       PAGE_CACHE_SHIFT))
+               goto out;
+
+       if (index < file_end || squashfs_i(inode)->fragment_block ==
+                                       SQUASHFS_INVALID_BLK) {
+               u64 block = 0;
+               int bsize = read_blocklist(inode, index, &block);
+               if (bsize < 0)
+                       goto error_out;
+
+               if (bsize == 0)
+                       res = squashfs_readpage_sparse(page, index, file_end);
+               else
+                       res = squashfs_readpage_block(page, block, bsize);
+       } else
+               res = squashfs_readpage_fragment(page);
+
+       if (!res)
+               return 0;
 
 error_out:
        SetPageError(page);
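
After this refactor squashfs_readpage() only dispatches; the sparse, block and fragment helpers do the work. A hypothetical standalone helper mirroring that decision (pick_read_path() is illustrative, not kernel code):

enum read_path { PATH_SPARSE, PATH_BLOCK, PATH_FRAGMENT };

static enum read_path pick_read_path(int index, int file_end,
                                     int bsize, int has_fragment)
{
        /* Blocks inside the file body, or files with no fragment, use
         * the block list; bsize == 0 marks a hole (sparse region). */
        if (index < file_end || !has_fragment)
                return bsize == 0 ? PATH_SPARSE : PATH_BLOCK;

        /* Otherwise the tail end is packed inside a fragment. */
        return PATH_FRAGMENT;
}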
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
new file mode 100644 (file)
index 0000000..f2310d2
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/* Read separately compressed datablock and memcpy into page cache */
+int squashfs_readpage_block(struct page *page, u64 block, int bsize)
+{
+       struct inode *i = page->mapping->host;
+       struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+               block, bsize);
+       int res = buffer->error;
+
+       if (res)
+               ERROR("Unable to read page, block %llx, size %x\n", block,
+                       bsize);
+       else
+               squashfs_copy_cache(page, buffer, buffer->length, 0);
+
+       squashfs_cache_put(buffer);
+       return res;
+}
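
file_cache.c and its sibling file_direct.c below both define squashfs_readpage_block(); only one of them is built in, selected by CONFIG_SQUASHFS_FILE_DIRECT (see page_actor.h later in this series). The Makefile wiring here is an assumption about how that selection is plumbed, shown only for orientation:

/* Presumed fs/squashfs/Makefile wiring (not shown in this diff):
 *
 *      squashfs-$(CONFIG_SQUASHFS_FILE_CACHE)  += file_cache.o
 *      squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o
 */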
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
new file mode 100644 (file)
index 0000000..2943b2b
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+       int pages, struct page **page);
+
+/* Read separately compressed datablock directly into page cache */
+int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+{
+       struct inode *inode = target_page->mapping->host;
+       struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+
+       int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+       int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+       int start_index = target_page->index & ~mask;
+       int end_index = start_index | mask;
+       int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+       struct page **page;
+       struct squashfs_page_actor *actor;
+       void *pageaddr;
+
+       if (end_index > file_end)
+               end_index = file_end;
+
+       pages = end_index - start_index + 1;
+
+       page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
+       if (page == NULL)
+               return res;
+
+       /*
+        * Create a "page actor" which will kmap and kunmap the
+        * page cache pages appropriately within the decompressor
+        */
+       actor = squashfs_page_actor_init_special(page, pages, 0);
+       if (actor == NULL)
+               goto out;
+
+       /* Try to grab all the pages covered by the Squashfs block */
+       for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+               page[i] = (n == target_page->index) ? target_page :
+                       grab_cache_page_nowait(target_page->mapping, n);
+
+               if (page[i] == NULL) {
+                       missing_pages++;
+                       continue;
+               }
+
+               if (PageUptodate(page[i])) {
+                       unlock_page(page[i]);
+                       page_cache_release(page[i]);
+                       page[i] = NULL;
+                       missing_pages++;
+               }
+       }
+
+       if (missing_pages) {
+               /*
+                * Couldn't get one or more pages.  Either this page has
+                * been VM reclaimed while others are still in the page
+                * cache and uptodate, or we're racing with another thread
+                * in squashfs_readpage also trying to grab them.  Fall
+                * back to using an intermediate buffer.
+                */
+               res = squashfs_read_cache(target_page, block, bsize, pages,
+                                                               page);
+               goto out;
+       }
+
+       /* Decompress directly into the page cache buffers */
+       res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+       if (res < 0)
+               goto mark_errored;
+
+       /* Last page may have trailing bytes not filled */
+       bytes = res % PAGE_CACHE_SIZE;
+       if (bytes) {
+               pageaddr = kmap_atomic(page[pages - 1]);
+               memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+               kunmap_atomic(pageaddr);
+       }
+
+       /* Mark pages as uptodate, unlock and release */
+       for (i = 0; i < pages; i++) {
+               flush_dcache_page(page[i]);
+               SetPageUptodate(page[i]);
+               unlock_page(page[i]);
+               if (page[i] != target_page)
+                       page_cache_release(page[i]);
+       }
+
+       kfree(actor);
+       kfree(page);
+
+       return 0;
+
+mark_errored:
+       /* Decompression failed, mark pages as errored.  The target page
+        * is dealt with by the caller.
+        */
+       for (i = 0; i < pages; i++) {
+               if (page[i] == target_page)
+                       continue;
+               flush_dcache_page(page[i]);
+               SetPageError(page[i]);
+               unlock_page(page[i]);
+               page_cache_release(page[i]);
+       }
+
+out:
+       kfree(actor);
+       kfree(page);
+       return res;
+}
+
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+       int pages, struct page **page)
+{
+       struct inode *i = target_page->mapping->host;
+       struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+                                                block, bsize);
+       int bytes = buffer->length, res = buffer->error, n, offset = 0;
+       void *pageaddr;
+
+       if (res) {
+               ERROR("Unable to read page, block %llx, size %x\n", block,
+                       bsize);
+               goto out;
+       }
+
+       for (n = 0; n < pages && bytes > 0; n++,
+                       bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+               int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+
+               if (page[n] == NULL)
+                       continue;
+
+               pageaddr = kmap_atomic(page[n]);
+               squashfs_copy_data(pageaddr, buffer, offset, avail);
+               memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+               kunmap_atomic(pageaddr);
+               flush_dcache_page(page[n]);
+               SetPageUptodate(page[n]);
+               unlock_page(page[n]);
+               if (page[n] != target_page)
+                       page_cache_release(page[n]);
+       }
+
+out:
+       squashfs_cache_put(buffer);
+       return res;
+}
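
One detail worth tracing in the direct path above: after a successful decompress, only res % PAGE_CACHE_SIZE bytes of the final page are valid, so the remainder must be zeroed. A standalone arithmetic check, assuming 4096-byte pages:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096

int main(void)
{
        int res = 10000;                        /* decompressed bytes */
        int bytes = res % DEMO_PAGE_SIZE;       /* 10000 % 4096 = 1808 */

        if (bytes)
                printf("zero the last %d bytes of the final page\n",
                       DEMO_PAGE_SIZE - bytes); /* 2288 */
        return 0;
}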
index 00f4dfc..244b9fb 100644 (file)
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 struct squashfs_lzo {
        void    *input;
        void    *output;
 };
 
-static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
+static void *lzo_init(struct squashfs_sb_info *msblk, void *buff)
 {
        int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
 
@@ -74,22 +75,16 @@ static void lzo_free(void *strm)
 }
 
 
-static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
-       struct buffer_head **bh, int b, int offset, int length, int srclength,
-       int pages)
+static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
+       struct buffer_head **bh, int b, int offset, int length,
+       struct squashfs_page_actor *output)
 {
-       struct squashfs_lzo *stream = msblk->stream;
-       void *buff = stream->input;
+       struct squashfs_lzo *stream = strm;
+       void *buff = stream->input, *data;
        int avail, i, bytes = length, res;
-       size_t out_len = srclength;
-
-       mutex_lock(&msblk->read_data_mutex);
+       size_t out_len = output->length;
 
        for (i = 0; i < b; i++) {
-               wait_on_buffer(bh[i]);
-               if (!buffer_uptodate(bh[i]))
-                       goto block_release;
-
                avail = min(bytes, msblk->devblksize - offset);
                memcpy(buff, bh[i]->b_data + offset, avail);
                buff += avail;
@@ -104,24 +99,24 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
                goto failed;
 
        res = bytes = (int)out_len;
-       for (i = 0, buff = stream->output; bytes && i < pages; i++) {
-               avail = min_t(int, bytes, PAGE_CACHE_SIZE);
-               memcpy(buffer[i], buff, avail);
-               buff += avail;
-               bytes -= avail;
+       data = squashfs_first_page(output);
+       buff = stream->output;
+       while (data) {
+               if (bytes <= PAGE_CACHE_SIZE) {
+                       memcpy(data, buff, bytes);
+                       break;
+               } else {
+                       memcpy(data, buff, PAGE_CACHE_SIZE);
+                       buff += PAGE_CACHE_SIZE;
+                       bytes -= PAGE_CACHE_SIZE;
+                       data = squashfs_next_page(output);
+               }
        }
+       squashfs_finish_page(output);
 
-       mutex_unlock(&msblk->read_data_mutex);
        return res;
 
-block_release:
-       for (; i < b; i++)
-               put_bh(bh[i]);
-
 failed:
-       mutex_unlock(&msblk->read_data_mutex);
-
-       ERROR("lzo decompression failed, data probably corrupt\n");
        return -EIO;
 }
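
The rewritten lzo copy-out loop walks output pages through the actor instead of indexing a buffer array. A userspace sketch of the same loop shape; the names are illustrative, not kernel API:

#include <string.h>

#define DEMO_PAGE_SIZE 4096

static void copy_to_pages(const char *src, int bytes,
                          char **pages, int npages)
{
        int i = 0;
        char *data = npages ? pages[i++] : NULL;        /* "first page" */

        while (data != NULL) {
                if (bytes <= DEMO_PAGE_SIZE) {
                        memcpy(data, src, bytes);       /* final, partial page */
                        break;
                }
                memcpy(data, src, DEMO_PAGE_SIZE);
                src += DEMO_PAGE_SIZE;
                bytes -= DEMO_PAGE_SIZE;
                data = i < npages ? pages[i++] : NULL;  /* "next page" */
        }
}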
 
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
new file mode 100644 (file)
index 0000000..5a1c11f
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include "page_actor.h"
+
+/*
+ * This file contains implementations of page_actor for decompressing into
+ * an intermediate buffer, and for decompressing directly into the
+ * page cache.
+ *
+ * Calling code should avoid sleeping between calls to squashfs_first_page()
+ * and squashfs_finish_page().
+ */
+
+/* Implementation of page_actor for decompressing into intermediate buffer */
+static void *cache_first_page(struct squashfs_page_actor *actor)
+{
+       actor->next_page = 1;
+       return actor->buffer[0];
+}
+
+static void *cache_next_page(struct squashfs_page_actor *actor)
+{
+       if (actor->next_page == actor->pages)
+               return NULL;
+
+       return actor->buffer[actor->next_page++];
+}
+
+static void cache_finish_page(struct squashfs_page_actor *actor)
+{
+       /* empty */
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+       int pages, int length)
+{
+       struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+       if (actor == NULL)
+               return NULL;
+
+       actor->length = length ? : pages * PAGE_CACHE_SIZE;
+       actor->buffer = buffer;
+       actor->pages = pages;
+       actor->next_page = 0;
+       actor->squashfs_first_page = cache_first_page;
+       actor->squashfs_next_page = cache_next_page;
+       actor->squashfs_finish_page = cache_finish_page;
+       return actor;
+}
+
+/* Implementation of page_actor for decompressing directly into page cache. */
+static void *direct_first_page(struct squashfs_page_actor *actor)
+{
+       actor->next_page = 1;
+       return actor->pageaddr = kmap_atomic(actor->page[0]);
+}
+
+static void *direct_next_page(struct squashfs_page_actor *actor)
+{
+       if (actor->pageaddr)
+               kunmap_atomic(actor->pageaddr);
+
+       return actor->pageaddr = actor->next_page == actor->pages ? NULL :
+               kmap_atomic(actor->page[actor->next_page++]);
+}
+
+static void direct_finish_page(struct squashfs_page_actor *actor)
+{
+       if (actor->pageaddr)
+               kunmap_atomic(actor->pageaddr);
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
+       int pages, int length)
+{
+       struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+       if (actor == NULL)
+               return NULL;
+
+       actor->length = length ? : pages * PAGE_CACHE_SIZE;
+       actor->page = page;
+       actor->pages = pages;
+       actor->next_page = 0;
+       actor->pageaddr = NULL;
+       actor->squashfs_first_page = direct_first_page;
+       actor->squashfs_next_page = direct_next_page;
+       actor->squashfs_finish_page = direct_finish_page;
+       return actor;
+}
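
Taken together, a decompressor drives an actor like this; note that the direct implementation above holds a kmap_atomic() mapping between first and finish, so the walker must not sleep in between. A sketch only; fill_output() is hypothetical:

#include <linux/kernel.h>
#include <linux/string.h>
#include "page_actor.h"

static void fill_output(struct squashfs_page_actor *out,
                        const unsigned char *src, int len)
{
        void *dst = squashfs_first_page(out);   /* may kmap_atomic() */

        while (dst != NULL) {
                int n = min_t(int, len, PAGE_CACHE_SIZE);

                memcpy(dst, src, n);
                if (len <= PAGE_CACHE_SIZE)
                        break;
                src += PAGE_CACHE_SIZE;
                len -= PAGE_CACHE_SIZE;
                dst = squashfs_next_page(out);  /* kunmaps previous page */
        }
        squashfs_finish_page(out);              /* kunmaps the last page */
}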
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
new file mode 100644 (file)
index 0000000..26dd820
--- /dev/null
@@ -0,0 +1,81 @@
+#ifndef PAGE_ACTOR_H
+#define PAGE_ACTOR_H
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef CONFIG_SQUASHFS_FILE_DIRECT
+struct squashfs_page_actor {
+       void    **page;
+       int     pages;
+       int     length;
+       int     next_page;
+};
+
+static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
+       int pages, int length)
+{
+       struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+       if (actor == NULL)
+               return NULL;
+
+       actor->length = length ? : pages * PAGE_CACHE_SIZE;
+       actor->page = page;
+       actor->pages = pages;
+       actor->next_page = 0;
+       return actor;
+}
+
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+       actor->next_page = 1;
+       return actor->page[0];
+}
+
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+       return actor->next_page == actor->pages ? NULL :
+               actor->page[actor->next_page++];
+}
+
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+       /* empty */
+}
+#else
+struct squashfs_page_actor {
+       union {
+               void            **buffer;
+               struct page     **page;
+       };
+       void    *pageaddr;
+       void    *(*squashfs_first_page)(struct squashfs_page_actor *);
+       void    *(*squashfs_next_page)(struct squashfs_page_actor *);
+       void    (*squashfs_finish_page)(struct squashfs_page_actor *);
+       int     pages;
+       int     length;
+       int     next_page;
+};
+
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
+                                                        **, int, int);
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+       return actor->squashfs_first_page(actor);
+}
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+       return actor->squashfs_next_page(actor);
+}
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+       actor->squashfs_finish_page(actor);
+}
+#endif
+#endif
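
Both actor initialisers use the GNU "x ? : y" extension: length if it is non-zero, otherwise pages * PAGE_CACHE_SIZE. The intermediate-buffer variant can stay as plain inlines because it never needs kmapping, while the direct variant uses function pointers to pick per-actor behaviour. A tiny standalone check of the operator (GCC extension):

#include <stdio.h>

int main(void)
{
        int length = 0, pages = 3, page_size = 4096;
        int n = length ? : pages * page_size;   /* GNU a ?: b == a ? a : b */

        printf("%d\n", n);                      /* prints 12288 */
        return 0;
}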
index d126651..9e1bb79 100644 (file)
@@ -28,8 +28,8 @@
 #define WARNING(s, args...)    pr_warning("SQUASHFS: "s, ## args)
 
 /* block.c */
-extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
-                               int, int);
+extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
+                               struct squashfs_page_actor *);
 
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
@@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int);
 
 /* decompressor.c */
 extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
-extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
+extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
+
+/* decompressor_xxx.c */
+extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
+extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
+extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
+       int, int, int, struct squashfs_page_actor *);
+extern int squashfs_max_decompressors(void);
 
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
@@ -59,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
 extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
                                u64, u64, unsigned int);
 
+/* file.c */
+void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
+                               int);
+
+/* file_xxx.c */
+extern int squashfs_readpage_block(struct page *, u64, int);
+
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
 extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
index 52934a2..1da565c 100644 (file)
@@ -50,6 +50,7 @@ struct squashfs_cache_entry {
        wait_queue_head_t       wait_queue;
        struct squashfs_cache   *cache;
        void                    **data;
+       struct squashfs_page_actor      *actor;
 };
 
 struct squashfs_sb_info {
@@ -63,10 +64,9 @@ struct squashfs_sb_info {
        __le64                                  *id_table;
        __le64                                  *fragment_index;
        __le64                                  *xattr_id_table;
-       struct mutex                            read_data_mutex;
        struct mutex                            meta_index_mutex;
        struct meta_index                       *meta_index;
-       void                                    *stream;
+       struct squashfs_stream                  *stream;
        __le64                                  *inode_lookup_table;
        u64                                     inode_table;
        u64                                     directory_table;
index 60553a9..202df63 100644 (file)
@@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
        msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
        msblk->devblksize_log2 = ffz(~msblk->devblksize);
 
-       mutex_init(&msblk->read_data_mutex);
        mutex_init(&msblk->meta_index_mutex);
 
        /*
@@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount;
 
        /* Allocate read_page block */
-       msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
+       msblk->read_page = squashfs_cache_init("data",
+               squashfs_max_decompressors(), msblk->block_size);
        if (msblk->read_page == NULL) {
                ERROR("Failed to allocate read_page block\n");
                goto failed_mount;
        }
 
-       msblk->stream = squashfs_decompressor_init(sb, flags);
+       msblk->stream = squashfs_decompressor_setup(sb, flags);
        if (IS_ERR(msblk->stream)) {
                err = PTR_ERR(msblk->stream);
                msblk->stream = NULL;
@@ -336,7 +336,7 @@ failed_mount:
        squashfs_cache_delete(msblk->block_cache);
        squashfs_cache_delete(msblk->fragment_cache);
        squashfs_cache_delete(msblk->read_page);
-       squashfs_decompressor_free(msblk, msblk->stream);
+       squashfs_decompressor_destroy(msblk);
        kfree(msblk->inode_lookup_table);
        kfree(msblk->fragment_index);
        kfree(msblk->id_table);
@@ -383,7 +383,7 @@ static void squashfs_put_super(struct super_block *sb)
                squashfs_cache_delete(sbi->block_cache);
                squashfs_cache_delete(sbi->fragment_cache);
                squashfs_cache_delete(sbi->read_page);
-               squashfs_decompressor_free(sbi, sbi->stream);
+               squashfs_decompressor_destroy(sbi);
                kfree(sbi->id_table);
                kfree(sbi->fragment_index);
                kfree(sbi->meta_index);
index 1760b7d..c609624 100644 (file)
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
 struct squashfs_xz {
        struct xz_dec *state;
        struct xz_buf buf;
 };
 
-struct comp_opts {
+struct disk_comp_opts {
        __le32 dictionary_size;
        __le32 flags;
 };
 
-static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
-       int len)
+struct comp_opts {
+       int dict_size;
+};
+
+static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
+       void *buff, int len)
 {
-       struct comp_opts *comp_opts = buff;
-       struct squashfs_xz *stream;
-       int dict_size = msblk->block_size;
-       int err, n;
+       struct disk_comp_opts *comp_opts = buff;
+       struct comp_opts *opts;
+       int err = 0, n;
+
+       opts = kmalloc(sizeof(*opts), GFP_KERNEL);
+       if (opts == NULL) {
+               err = -ENOMEM;
+               goto out2;
+       }
 
        if (comp_opts) {
                /* check compressor options are the expected length */
                if (len < sizeof(*comp_opts)) {
                        err = -EIO;
-                       goto failed;
+                       goto out;
                }
 
-               dict_size = le32_to_cpu(comp_opts->dictionary_size);
+               opts->dict_size = le32_to_cpu(comp_opts->dictionary_size);
 
                /* the dictionary size should be 2^n or 2^n+2^(n+1) */
-               n = ffs(dict_size) - 1;
-               if (dict_size != (1 << n) && dict_size != (1 << n) +
+               n = ffs(opts->dict_size) - 1;
+               if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) +
                                                (1 << (n + 1))) {
                        err = -EIO;
-                       goto failed;
+                       goto out;
                }
-       }
+       } else
+               /* use defaults */
+               opts->dict_size = max_t(int, msblk->block_size,
+                                                       SQUASHFS_METADATA_SIZE);
+
+       return opts;
+
+out:
+       kfree(opts);
+out2:
+       return ERR_PTR(err);
+}
+
 
-       dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff)
+{
+       struct comp_opts *comp_opts = buff;
+       struct squashfs_xz *stream;
+       int err;
 
        stream = kmalloc(sizeof(*stream), GFP_KERNEL);
        if (stream == NULL) {
@@ -77,7 +103,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
                goto failed;
        }
 
-       stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
+       stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size);
        if (stream->state == NULL) {
                kfree(stream);
                err = -ENOMEM;
@@ -103,42 +129,37 @@ static void squashfs_xz_free(void *strm)
 }
 
 
-static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
-       struct buffer_head **bh, int b, int offset, int length, int srclength,
-       int pages)
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
+       struct buffer_head **bh, int b, int offset, int length,
+       struct squashfs_page_actor *output)
 {
        enum xz_ret xz_err;
-       int avail, total = 0, k = 0, page = 0;
-       struct squashfs_xz *stream = msblk->stream;
-
-       mutex_lock(&msblk->read_data_mutex);
+       int avail, total = 0, k = 0;
+       struct squashfs_xz *stream = strm;
 
        xz_dec_reset(stream->state);
        stream->buf.in_pos = 0;
        stream->buf.in_size = 0;
        stream->buf.out_pos = 0;
        stream->buf.out_size = PAGE_CACHE_SIZE;
-       stream->buf.out = buffer[page++];
+       stream->buf.out = squashfs_first_page(output);
 
        do {
                if (stream->buf.in_pos == stream->buf.in_size && k < b) {
                        avail = min(length, msblk->devblksize - offset);
                        length -= avail;
-                       wait_on_buffer(bh[k]);
-                       if (!buffer_uptodate(bh[k]))
-                               goto release_mutex;
-
                        stream->buf.in = bh[k]->b_data + offset;
                        stream->buf.in_size = avail;
                        stream->buf.in_pos = 0;
                        offset = 0;
                }
 
-               if (stream->buf.out_pos == stream->buf.out_size
-                                                       && page < pages) {
-                       stream->buf.out = buffer[page++];
-                       stream->buf.out_pos = 0;
-                       total += PAGE_CACHE_SIZE;
+               if (stream->buf.out_pos == stream->buf.out_size) {
+                       stream->buf.out = squashfs_next_page(output);
+                       if (stream->buf.out != NULL) {
+                               stream->buf.out_pos = 0;
+                               total += PAGE_CACHE_SIZE;
+                       }
                }
 
                xz_err = xz_dec_run(stream->state, &stream->buf);
@@ -147,23 +168,14 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
                        put_bh(bh[k++]);
        } while (xz_err == XZ_OK);
 
-       if (xz_err != XZ_STREAM_END) {
-               ERROR("xz_dec_run error, data probably corrupt\n");
-               goto release_mutex;
-       }
-
-       if (k < b) {
-               ERROR("xz_uncompress error, input remaining\n");
-               goto release_mutex;
-       }
+       squashfs_finish_page(output);
 
-       total += stream->buf.out_pos;
-       mutex_unlock(&msblk->read_data_mutex);
-       return total;
+       if (xz_err != XZ_STREAM_END || k < b)
+               goto out;
 
-release_mutex:
-       mutex_unlock(&msblk->read_data_mutex);
+       return total + stream->buf.out_pos;
 
+out:
        for (; k < b; k++)
                put_bh(bh[k]);
 
@@ -172,6 +184,7 @@ release_mutex:
 
 const struct squashfs_decompressor squashfs_xz_comp_ops = {
        .init = squashfs_xz_init,
+       .comp_opts = squashfs_xz_comp_opts,
        .free = squashfs_xz_free,
        .decompress = squashfs_xz_uncompress,
        .id = XZ_COMPRESSION,
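
The new comp-opts parser keeps the rule that an xz dictionary must be 2^n or 2^n + 2^(n+1) (i.e. 3*2^n). A standalone version of that check:

#include <stdio.h>
#include <strings.h>    /* ffs() */

static int dict_size_valid(int size)
{
        int n = ffs(size) - 1;  /* index of lowest set bit */

        return size == (1 << n) || size == (1 << n) + (1 << (n + 1));
}

int main(void)
{
        printf("%d %d %d\n",
               dict_size_valid(8192),   /* 1: 2^13 */
               dict_size_valid(12288),  /* 1: 2^12 + 2^13 */
               dict_size_valid(10000)); /* 0 */
        return 0;
}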
index 55d918f..8727cab 100644 (file)
@@ -32,8 +32,9 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "page_actor.h"
 
-static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
+static void *zlib_init(struct squashfs_sb_info *dummy, void *buff)
 {
        z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
        if (stream == NULL)
@@ -61,44 +62,37 @@ static void zlib_free(void *strm)
 }
 
 
-static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
-       struct buffer_head **bh, int b, int offset, int length, int srclength,
-       int pages)
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
+       struct buffer_head **bh, int b, int offset, int length,
+       struct squashfs_page_actor *output)
 {
-       int zlib_err, zlib_init = 0;
-       int k = 0, page = 0;
-       z_stream *stream = msblk->stream;
-
-       mutex_lock(&msblk->read_data_mutex);
+       int zlib_err, zlib_init = 0, k = 0;
+       z_stream *stream = strm;
 
-       stream->avail_out = 0;
+       stream->avail_out = PAGE_CACHE_SIZE;
+       stream->next_out = squashfs_first_page(output);
        stream->avail_in = 0;
 
        do {
                if (stream->avail_in == 0 && k < b) {
                        int avail = min(length, msblk->devblksize - offset);
                        length -= avail;
-                       wait_on_buffer(bh[k]);
-                       if (!buffer_uptodate(bh[k]))
-                               goto release_mutex;
-
                        stream->next_in = bh[k]->b_data + offset;
                        stream->avail_in = avail;
                        offset = 0;
                }
 
-               if (stream->avail_out == 0 && page < pages) {
-                       stream->next_out = buffer[page++];
-                       stream->avail_out = PAGE_CACHE_SIZE;
+               if (stream->avail_out == 0) {
+                       stream->next_out = squashfs_next_page(output);
+                       if (stream->next_out != NULL)
+                               stream->avail_out = PAGE_CACHE_SIZE;
                }
 
                if (!zlib_init) {
                        zlib_err = zlib_inflateInit(stream);
                        if (zlib_err != Z_OK) {
-                               ERROR("zlib_inflateInit returned unexpected "
-                                       "result 0x%x, srclength %d\n",
-                                       zlib_err, srclength);
-                               goto release_mutex;
+                               squashfs_finish_page(output);
+                               goto out;
                        }
                        zlib_init = 1;
                }
@@ -109,29 +103,21 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
                        put_bh(bh[k++]);
        } while (zlib_err == Z_OK);
 
-       if (zlib_err != Z_STREAM_END) {
-               ERROR("zlib_inflate error, data probably corrupt\n");
-               goto release_mutex;
-       }
+       squashfs_finish_page(output);
 
-       zlib_err = zlib_inflateEnd(stream);
-       if (zlib_err != Z_OK) {
-               ERROR("zlib_inflate error, data probably corrupt\n");
-               goto release_mutex;
-       }
+       if (zlib_err != Z_STREAM_END)
+               goto out;
 
-       if (k < b) {
-               ERROR("zlib_uncompress error, data remaining\n");
-               goto release_mutex;
-       }
+       zlib_err = zlib_inflateEnd(stream);
+       if (zlib_err != Z_OK)
+               goto out;
 
-       length = stream->total_out;
-       mutex_unlock(&msblk->read_data_mutex);
-       return length;
+       if (k < b)
+               goto out;
 
-release_mutex:
-       mutex_unlock(&msblk->read_data_mutex);
+       return stream->total_out;
 
+out:
        for (; k < b; k++)
                put_bh(bh[k]);
 
index 1c02da8..3ef11b2 100644 (file)
@@ -1137,6 +1137,7 @@ xfs_bmap_add_attrfork(
        int                     committed;      /* xaction was committed */
        int                     logflags;       /* logging flags */
        int                     error;          /* error return value */
+       int                     cancel_flags = 0;
 
        ASSERT(XFS_IFORK_Q(ip) == 0);
 
@@ -1147,19 +1148,20 @@ xfs_bmap_add_attrfork(
        if (rsvd)
                tp->t_flags |= XFS_TRANS_RESERVE;
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
-       if (error)
-               goto error0;
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                        XFS_QMOPT_RES_REGBLKS);
-       if (error) {
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
-               return error;
-       }
+       if (error)
+               goto trans_cancel;
+       cancel_flags |= XFS_TRANS_ABORT;
        if (XFS_IFORK_Q(ip))
-               goto error1;
+               goto trans_cancel;
        if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
                /*
                 * For inodes coming from pre-6.2 filesystems.
@@ -1169,7 +1171,7 @@ xfs_bmap_add_attrfork(
        }
        ASSERT(ip->i_d.di_anextents == 0);
 
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
        switch (ip->i_d.di_format) {
@@ -1191,7 +1193,7 @@ xfs_bmap_add_attrfork(
        default:
                ASSERT(0);
                error = XFS_ERROR(EINVAL);
-               goto error1;
+               goto trans_cancel;
        }
 
        ASSERT(ip->i_afp == NULL);
@@ -1219,7 +1221,7 @@ xfs_bmap_add_attrfork(
        if (logflags)
                xfs_trans_log_inode(tp, ip, logflags);
        if (error)
-               goto error2;
+               goto bmap_cancel;
        if (!xfs_sb_version_hasattr(&mp->m_sb) ||
           (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
                __int64_t sbfields = 0;
@@ -1242,14 +1244,16 @@ xfs_bmap_add_attrfork(
 
        error = xfs_bmap_finish(&tp, &flist, &committed);
        if (error)
-               goto error2;
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-error2:
+               goto bmap_cancel;
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+
+bmap_cancel:
        xfs_bmap_cancel(&flist);
-error1:
+trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        return error;
 }
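
The xfs_bmap_add_attrfork() rework replaces three error labels with one ladder that accumulates the cancel flags as the transaction gains state. A simplified standalone sketch of the pattern; the flag values and step functions are stand-ins, not XFS API:

#include <stdio.h>

#define RELEASE_LOG_RES 0x1     /* stand-ins for XFS_TRANS_* flags */
#define ABORT           0x2

static int step_ok(void) { return 0; }          /* hypothetical steps */
static void cancel(int flags) { printf("cancel(%#x)\n", flags); }

static int demo(void)
{
        int cancel_flags = 0, error;

        error = step_ok();                      /* reserve log space */
        if (error) {
                cancel(0);                      /* nothing held yet */
                return error;
        }
        cancel_flags = RELEASE_LOG_RES;         /* log space now held */

        error = step_ok();                      /* reserve quota blocks */
        if (error)
                goto out_cancel;
        cancel_flags |= ABORT;                  /* tx is now dirty */

        return 0;                               /* commit path */
out_cancel:
        cancel(cancel_flags);                   /* always the right flags */
        return error;
}

int main(void) { return demo(); }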
 
index da88f16..02df7b4 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_fsops.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_dinode.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -718,8 +719,22 @@ xfs_mountfs(
         * Set the inode cluster size.
         * This may still be overridden by the file system
         * block size if it is larger than the chosen cluster size.
+        *
+        * For v5 filesystems, scale the cluster size with the inode size to
+        * keep a constant ratio of inodes per cluster buffer, but only if mkfs
+        * has set the inode alignment value appropriately for larger cluster
+        * sizes.
         */
        mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               int     new_size = mp->m_inode_cluster_size;
+
+               new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+               if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+                       mp->m_inode_cluster_size = new_size;
+               xfs_info(mp, "Using inode cluster size of %d bytes",
+                        mp->m_inode_cluster_size);
+       }
 
        /*
         * Set inode alignment fields
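
A worked instance of the scaling above, assuming 512-byte v5 inodes and the 256-byte XFS_DINODE_MIN_SIZE: the cluster buffer doubles so it still holds the same number of inodes:

#include <stdio.h>

int main(void)
{
        int cluster = 8192;     /* XFS_INODE_BIG_CLUSTER_SIZE */
        int inodesize = 512;    /* assumed v5 inode size */
        int min_size = 256;     /* XFS_DINODE_MIN_SIZE */

        cluster *= inodesize / min_size;        /* 8192 * 2 = 16384 */
        printf("%d bytes, %d inodes per buffer\n",
               cluster, cluster / inodesize);   /* 16384 bytes, 32 inodes */
        return 0;
}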
index 1d8101a..a466c5e 100644 (file)
@@ -112,7 +112,7 @@ typedef struct xfs_mount {
        __uint8_t               m_blkbb_log;    /* blocklog - BBSHIFT */
        __uint8_t               m_agno_log;     /* log #ag's */
        __uint8_t               m_agino_log;    /* #bits for agino in inum */
-       __uint16_t              m_inode_cluster_size;/* min inode buf size */
+       uint                    m_inode_cluster_size;/* min inode buf size */
        uint                    m_blockmask;    /* sb_blocksize-1 */
        uint                    m_blockwsize;   /* sb_blocksize in words */
        uint                    m_blockwmask;   /* blockwsize-1 */
index 1bba7f6..50c3f56 100644 (file)
@@ -111,12 +111,14 @@ xfs_trans_log_inode(
 
        /*
         * First time we log the inode in a transaction, bump the inode change
-        * counter if it is configured for this to occur.
+        * counter if it is configured for this to occur. We don't use
+        * inode_inc_version() because there is no need for extra locking around
+        * i_version as we already hold the inode locked exclusively for
+        * metadata modification.
         */
        if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
            IS_I_VERSION(VFS_I(ip))) {
-               inode_inc_iversion(VFS_I(ip));
-               ip->i_d.di_changecount = VFS_I(ip)->i_version;
+               ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
                flags |= XFS_ILOG_CORE;
        }
 
index d53d9f0..2fd59c0 100644 (file)
@@ -385,8 +385,7 @@ xfs_calc_ifree_reservation(
                xfs_calc_inode_res(mp, 1) +
                xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
                xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-               MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
-                   XFS_INODE_CLUSTER_SIZE(mp)) +
+               max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
                xfs_calc_buf_res(1, 0) +
                xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
                                 mp->m_in_maxlevels, 0) +
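
The max_t(uint, ...) change above drops a __uint16_t cast; with large filesystem blocks such a cast silently wraps to zero, which is the hazard being avoided. A standalone demonstration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int fsb_bytes = 65536;                  /* 64k filesystem block */
        uint16_t truncated = (uint16_t)fsb_bytes;

        printf("%u\n", truncated);              /* prints 0: 65536 wraps */
        return 0;
}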
index 89c60b0..7b2de02 100644 (file)
@@ -431,9 +431,9 @@ static inline acpi_handle acpi_get_child(acpi_handle handle, u64 addr)
 {
        return acpi_find_child(handle, addr, false);
 }
+void acpi_preset_companion(struct device *dev, acpi_handle parent, u64 addr);
 int acpi_is_root_bridge(acpi_handle);
 struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle);
-#define DEVICE_ACPI_HANDLE(dev) ((acpi_handle)ACPI_HANDLE(dev))
 
 int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state);
 int acpi_disable_wakeup_device_power(struct acpi_device *dev);
diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h
new file mode 100644 (file)
index 0000000..e1e5a3e
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _CRYPTO_HASH_INFO_H
+#define _CRYPTO_HASH_INFO_H
+
+#include <crypto/sha.h>
+#include <crypto/md5.h>
+
+#include <uapi/linux/hash_info.h>
+
+/* not defined in include/crypto/ */
+#define RMD128_DIGEST_SIZE      16
+#define RMD160_DIGEST_SIZE     20
+#define RMD256_DIGEST_SIZE      32
+#define RMD320_DIGEST_SIZE      40
+
+/* not defined in include/crypto/ */
+#define WP512_DIGEST_SIZE      64
+#define WP384_DIGEST_SIZE      48
+#define WP256_DIGEST_SIZE      32
+
+/* not defined in include/crypto/ */
+#define TGR128_DIGEST_SIZE 16
+#define TGR160_DIGEST_SIZE 20
+#define TGR192_DIGEST_SIZE 24
+
+extern const char *const hash_algo_name[HASH_ALGO__LAST];
+extern const int hash_digest_size[HASH_ALGO__LAST];
+
+#endif /* _CRYPTO_HASH_INFO_H */
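
A sketch of how callers consume the two lookup tables declared above; enum hash_algo comes from the new uapi/linux/hash_info.h, and demo_digest_size() is hypothetical:

#include <crypto/hash_info.h>
#include <linux/errno.h>

static int demo_digest_size(enum hash_algo algo)
{
        if (algo >= HASH_ALGO__LAST)
                return -EINVAL;
        return hash_digest_size[algo];  /* e.g. HASH_ALGO_SHA256 -> 32 */
}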
index f5b0224..fc09732 100644 (file)
@@ -15,6 +15,7 @@
 #define _LINUX_PUBLIC_KEY_H
 
 #include <linux/mpi.h>
+#include <crypto/hash_info.h>
 
 enum pkey_algo {
        PKEY_ALGO_DSA,
@@ -22,21 +23,11 @@ enum pkey_algo {
        PKEY_ALGO__LAST
 };
 
-extern const char *const pkey_algo[PKEY_ALGO__LAST];
+extern const char *const pkey_algo_name[PKEY_ALGO__LAST];
+extern const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST];
 
-enum pkey_hash_algo {
-       PKEY_HASH_MD4,
-       PKEY_HASH_MD5,
-       PKEY_HASH_SHA1,
-       PKEY_HASH_RIPE_MD_160,
-       PKEY_HASH_SHA256,
-       PKEY_HASH_SHA384,
-       PKEY_HASH_SHA512,
-       PKEY_HASH_SHA224,
-       PKEY_HASH__LAST
-};
-
-extern const char *const pkey_hash_algo[PKEY_HASH__LAST];
+/* asymmetric key implementation supports only up to SHA224 */
+#define PKEY_HASH__LAST                (HASH_ALGO_SHA224 + 1)
 
 enum pkey_id_type {
        PKEY_ID_PGP,            /* OpenPGP generated key ID */
@@ -44,7 +35,7 @@ enum pkey_id_type {
        PKEY_ID_TYPE__LAST
 };
 
-extern const char *const pkey_id_type[PKEY_ID_TYPE__LAST];
+extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST];
 
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
@@ -59,6 +50,7 @@ struct public_key {
 #define PKEY_CAN_DECRYPT       0x02
 #define PKEY_CAN_SIGN          0x04
 #define PKEY_CAN_VERIFY                0x08
+       enum pkey_algo pkey_algo : 8;
        enum pkey_id_type id_type : 8;
        union {
                MPI     mpi[5];
@@ -88,7 +80,8 @@ struct public_key_signature {
        u8 *digest;
        u8 digest_size;                 /* Number of bytes in digest */
        u8 nr_mpi;                      /* Occupancy of mpi[] */
-       enum pkey_hash_algo pkey_hash_algo : 8;
+       enum pkey_algo pkey_algo : 8;
+       enum hash_algo pkey_hash_algo : 8;
        union {
                MPI mpi[2];
                struct {
diff --git a/include/keys/big_key-type.h b/include/keys/big_key-type.h
new file mode 100644 (file)
index 0000000..d69bc8a
--- /dev/null
@@ -0,0 +1,25 @@
+/* Big capacity key type.
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _KEYS_BIG_KEY_TYPE_H
+#define _KEYS_BIG_KEY_TYPE_H
+
+#include <linux/key-type.h>
+
+extern struct key_type key_type_big_key;
+
+extern int big_key_instantiate(struct key *key, struct key_preparsed_payload *prep);
+extern void big_key_revoke(struct key *key);
+extern void big_key_destroy(struct key *key);
+extern void big_key_describe(const struct key *big_key, struct seq_file *m);
+extern long big_key_read(const struct key *key, char __user *buffer, size_t buflen);
+
+#endif /* _KEYS_BIG_KEY_TYPE_H */
index cf49159..fca5c62 100644 (file)
@@ -1,6 +1,6 @@
 /* Keyring key type
  *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008, 2013 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
 #define _KEYS_KEYRING_TYPE_H
 
 #include <linux/key.h>
-#include <linux/rcupdate.h>
-
-/*
- * the keyring payload contains a list of the keys to which the keyring is
- * subscribed
- */
-struct keyring_list {
-       struct rcu_head rcu;            /* RCU deletion hook */
-       unsigned short  maxkeys;        /* max keys this list can hold */
-       unsigned short  nkeys;          /* number of keys currently held */
-       unsigned short  delkey;         /* key to be unlinked by RCU */
-       struct key __rcu *keys[0];
-};
-
+#include <linux/assoc_array.h>
 
 #endif /* _KEYS_KEYRING_TYPE_H */
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
new file mode 100644 (file)
index 0000000..8dabc39
--- /dev/null
@@ -0,0 +1,23 @@
+/* System keyring containing trusted public keys.
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _KEYS_SYSTEM_KEYRING_H
+#define _KEYS_SYSTEM_KEYRING_H
+
+#ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
+
+#include <linux/key.h>
+
+extern struct key *system_trusted_keyring;
+
+#endif
+
+#endif /* _KEYS_SYSTEM_KEYRING_H */
index b0972c4..d9099b1 100644 (file)
 #include <acpi/acpi_numa.h>
 #include <asm/acpi.h>
 
+static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
+{
+       return adev ? adev->handle : NULL;
+}
+
+#define ACPI_COMPANION(dev)            ((dev)->acpi_node.companion)
+#define ACPI_COMPANION_SET(dev, adev)  ACPI_COMPANION(dev) = (adev)
+#define ACPI_HANDLE(dev)               acpi_device_handle(ACPI_COMPANION(dev))
+
+static inline const char *acpi_dev_name(struct acpi_device *adev)
+{
+       return dev_name(&adev->dev);
+}
+
 enum acpi_irq_model_id {
        ACPI_IRQ_MODEL_PIC = 0,
        ACPI_IRQ_MODEL_IOAPIC,
@@ -401,6 +415,15 @@ static inline bool acpi_driver_match_device(struct device *dev,
 
 #define acpi_disabled 1
 
+#define ACPI_COMPANION(dev)            (NULL)
+#define ACPI_COMPANION_SET(dev, adev)  do { } while (0)
+#define ACPI_HANDLE(dev)               (NULL)
+
+static inline const char *acpi_dev_name(struct acpi_device *adev)
+{
+       return NULL;
+}
+
 static inline void acpi_early_init(void) { }
 
 static inline int early_acpi_boot_init(void)
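
A sketch of how a driver might use the new companion accessors; demo_probe() and its device are hypothetical. With CONFIG_ACPI unset, the stubs above make the same code compile and simply take the -ENODEV branch:

#include <linux/acpi.h>
#include <linux/platform_device.h>

static int demo_probe(struct platform_device *pdev)
{
        struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);

        if (!adev)
                return -ENODEV;         /* no ACPI companion bound */

        dev_info(&pdev->dev, "ACPI companion: %s\n", acpi_dev_name(adev));
        return 0;
}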
diff --git a/include/linux/assoc_array.h b/include/linux/assoc_array.h
new file mode 100644 (file)
index 0000000..9a193b8
--- /dev/null
@@ -0,0 +1,92 @@
+/* Generic associative array implementation.
+ *
+ * See Documentation/assoc_array.txt for information.
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_ASSOC_ARRAY_H
+#define _LINUX_ASSOC_ARRAY_H
+
+#ifdef CONFIG_ASSOCIATIVE_ARRAY
+
+#include <linux/types.h>
+
+#define ASSOC_ARRAY_KEY_CHUNK_SIZE BITS_PER_LONG /* Key data retrieved in chunks of this size */
+
+/*
+ * Generic associative array.
+ */
+struct assoc_array {
+       struct assoc_array_ptr  *root;          /* The node at the root of the tree */
+       unsigned long           nr_leaves_on_tree;
+};
+
+/*
+ * Operations on objects and index keys for use by array manipulation routines.
+ */
+struct assoc_array_ops {
+       /* Method to get a chunk of an index key from caller-supplied data */
+       unsigned long (*get_key_chunk)(const void *index_key, int level);
+
+       /* Method to get a piece of an object's index key */
+       unsigned long (*get_object_key_chunk)(const void *object, int level);
+
+       /* Is this the object we're looking for? */
+       bool (*compare_object)(const void *object, const void *index_key);
+
+       /* At what bit position do two objects' index keys first differ?
+        * (or -1 if they're the same)
+        */
+       int (*diff_objects)(const void *a, const void *b);
+
+       /* Method to free an object. */
+       void (*free_object)(void *object);
+};
+
+/*
+ * Access and manipulation functions.
+ */
+struct assoc_array_edit;
+
+static inline void assoc_array_init(struct assoc_array *array)
+{
+       array->root = NULL;
+       array->nr_leaves_on_tree = 0;
+}
+
+extern int assoc_array_iterate(const struct assoc_array *array,
+                              int (*iterator)(const void *object,
+                                              void *iterator_data),
+                              void *iterator_data);
+extern void *assoc_array_find(const struct assoc_array *array,
+                             const struct assoc_array_ops *ops,
+                             const void *index_key);
+extern void assoc_array_destroy(struct assoc_array *array,
+                               const struct assoc_array_ops *ops);
+extern struct assoc_array_edit *assoc_array_insert(struct assoc_array *array,
+                                                  const struct assoc_array_ops *ops,
+                                                  const void *index_key,
+                                                  void *object);
+extern void assoc_array_insert_set_object(struct assoc_array_edit *edit,
+                                         void *object);
+extern struct assoc_array_edit *assoc_array_delete(struct assoc_array *array,
+                                                  const struct assoc_array_ops *ops,
+                                                  const void *index_key);
+extern struct assoc_array_edit *assoc_array_clear(struct assoc_array *array,
+                                                 const struct assoc_array_ops *ops);
+extern void assoc_array_apply_edit(struct assoc_array_edit *edit);
+extern void assoc_array_cancel_edit(struct assoc_array_edit *edit);
+extern int assoc_array_gc(struct assoc_array *array,
+                         const struct assoc_array_ops *ops,
+                         bool (*iterator)(void *object, void *iterator_data),
+                         void *iterator_data);
+
+#endif /* CONFIG_ASSOCIATIVE_ARRAY */
+#endif /* _LINUX_ASSOC_ARRAY_H */
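
The API above separates preallocation (assoc_array_insert) from commitment (assoc_array_apply_edit), so the commit step itself cannot fail and readers can proceed under RCU while writers are serialised by the caller. A sketch of that two-phase pattern; my_ops and the key/object types are caller-supplied as described in Documentation/assoc_array.txt:

#include <linux/assoc_array.h>
#include <linux/err.h>

static int demo_insert(struct assoc_array *array,
                       const struct assoc_array_ops *my_ops,
                       const void *index_key, void *object)
{
        struct assoc_array_edit *edit;

        edit = assoc_array_insert(array, my_ops, index_key, object);
        if (IS_ERR(edit))
                return PTR_ERR(edit);   /* preallocation failed */

        assoc_array_apply_edit(edit);   /* cannot fail; publishes the change */
        return 0;
}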
diff --git a/include/linux/assoc_array_priv.h b/include/linux/assoc_array_priv.h
new file mode 100644 (file)
index 0000000..711275e
--- /dev/null
@@ -0,0 +1,182 @@
+/* Private definitions for the generic associative array implementation.
+ *
+ * See Documentation/assoc_array.txt for information.
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_ASSOC_ARRAY_PRIV_H
+#define _LINUX_ASSOC_ARRAY_PRIV_H
+
+#ifdef CONFIG_ASSOCIATIVE_ARRAY
+
+#include <linux/assoc_array.h>
+
+#define ASSOC_ARRAY_FAN_OUT            16      /* Number of slots per node */
+#define ASSOC_ARRAY_FAN_MASK           (ASSOC_ARRAY_FAN_OUT - 1)
+#define ASSOC_ARRAY_LEVEL_STEP         (ilog2(ASSOC_ARRAY_FAN_OUT))
+#define ASSOC_ARRAY_LEVEL_STEP_MASK    (ASSOC_ARRAY_LEVEL_STEP - 1)
+#define ASSOC_ARRAY_KEY_CHUNK_MASK     (ASSOC_ARRAY_KEY_CHUNK_SIZE - 1)
+#define ASSOC_ARRAY_KEY_CHUNK_SHIFT    (ilog2(BITS_PER_LONG))
+
+/*
+ * Undefined type representing a pointer with type information in the bottom
+ * two bits.
+ */
+struct assoc_array_ptr;
+
+/*
+ * An N-way node in the tree.
+ *
+ * Each slot contains one of four things:
+ *
+ *     (1) Nothing (NULL).
+ *
+ *     (2) A leaf object (pointer types 0).
+ *
+ *     (3) A next-level node (pointer type 1, subtype 0).
+ *
+ *     (4) A shortcut (pointer type 1, subtype 1).
+ *
+ * The tree is optimised for search-by-ID, but also permits reasonable
+ * iteration.
+ *
+ * The tree is navigated by constructing an index key consisting of an array of
+ * segments, where each segment is ilog2(ASSOC_ARRAY_FAN_OUT) bits in size.
+ *
+ * The segments correspond to levels of the tree (the first segment is used at
+ * level 0, the second at level 1, etc.).
+ */
+struct assoc_array_node {
+       struct assoc_array_ptr  *back_pointer;
+       u8                      parent_slot;
+       struct assoc_array_ptr  *slots[ASSOC_ARRAY_FAN_OUT];
+       unsigned long           nr_leaves_on_branch;
+};
+
+/*
+ * A shortcut through the index space out to where a collection of nodes/leaves
+ * with the same IDs live.
+ */
+struct assoc_array_shortcut {
+       struct assoc_array_ptr  *back_pointer;
+       int                     parent_slot;
+       int                     skip_to_level;
+       struct assoc_array_ptr  *next_node;
+       unsigned long           index_key[];
+};
+
+/*
+ * Preallocation cache.
+ */
+struct assoc_array_edit {
+       struct rcu_head                 rcu;
+       struct assoc_array              *array;
+       const struct assoc_array_ops    *ops;
+       const struct assoc_array_ops    *ops_for_excised_subtree;
+       struct assoc_array_ptr          *leaf;
+       struct assoc_array_ptr          **leaf_p;
+       struct assoc_array_ptr          *dead_leaf;
+       struct assoc_array_ptr          *new_meta[3];
+       struct assoc_array_ptr          *excised_meta[1];
+       struct assoc_array_ptr          *excised_subtree;
+       struct assoc_array_ptr          **set_backpointers[ASSOC_ARRAY_FAN_OUT];
+       struct assoc_array_ptr          *set_backpointers_to;
+       struct assoc_array_node         *adjust_count_on;
+       long                            adjust_count_by;
+       struct {
+               struct assoc_array_ptr  **ptr;
+               struct assoc_array_ptr  *to;
+       } set[2];
+       struct {
+               u8                      *p;
+               u8                      to;
+       } set_parent_slot[1];
+       u8                              segment_cache[ASSOC_ARRAY_FAN_OUT + 1];
+};
+
+/*
+ * Internal tree member pointers are marked in the bottom one or two bits to
+ * indicate what type they are so that we don't have to look behind every
+ * pointer to see what it points to.
+ *
+ * We provide functions to test type annotations and to create and translate
+ * the annotated pointers.
+ */
+#define ASSOC_ARRAY_PTR_TYPE_MASK 0x1UL
+#define ASSOC_ARRAY_PTR_LEAF_TYPE 0x0UL        /* Points to leaf (or nowhere) */
+#define ASSOC_ARRAY_PTR_META_TYPE 0x1UL        /* Points to node or shortcut */
+#define ASSOC_ARRAY_PTR_SUBTYPE_MASK   0x2UL
+#define ASSOC_ARRAY_PTR_NODE_SUBTYPE   0x0UL
+#define ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE 0x2UL
+
+static inline bool assoc_array_ptr_is_meta(const struct assoc_array_ptr *x)
+{
+       return (unsigned long)x & ASSOC_ARRAY_PTR_TYPE_MASK;
+}
+static inline bool assoc_array_ptr_is_leaf(const struct assoc_array_ptr *x)
+{
+       return !assoc_array_ptr_is_meta(x);
+}
+static inline bool assoc_array_ptr_is_shortcut(const struct assoc_array_ptr *x)
+{
+       return (unsigned long)x & ASSOC_ARRAY_PTR_SUBTYPE_MASK;
+}
+static inline bool assoc_array_ptr_is_node(const struct assoc_array_ptr *x)
+{
+       return !assoc_array_ptr_is_shortcut(x);
+}
+
+static inline void *assoc_array_ptr_to_leaf(const struct assoc_array_ptr *x)
+{
+       return (void *)((unsigned long)x & ~ASSOC_ARRAY_PTR_TYPE_MASK);
+}
+
+static inline
+unsigned long __assoc_array_ptr_to_meta(const struct assoc_array_ptr *x)
+{
+       return (unsigned long)x &
+               ~(ASSOC_ARRAY_PTR_SUBTYPE_MASK | ASSOC_ARRAY_PTR_TYPE_MASK);
+}
+static inline
+struct assoc_array_node *assoc_array_ptr_to_node(const struct assoc_array_ptr *x)
+{
+       return (struct assoc_array_node *)__assoc_array_ptr_to_meta(x);
+}
+static inline
+struct assoc_array_shortcut *assoc_array_ptr_to_shortcut(const struct assoc_array_ptr *x)
+{
+       return (struct assoc_array_shortcut *)__assoc_array_ptr_to_meta(x);
+}
+
+static inline
+struct assoc_array_ptr *__assoc_array_x_to_ptr(const void *p, unsigned long t)
+{
+       return (struct assoc_array_ptr *)((unsigned long)p | t);
+}
+static inline
+struct assoc_array_ptr *assoc_array_leaf_to_ptr(const void *p)
+{
+       return __assoc_array_x_to_ptr(p, ASSOC_ARRAY_PTR_LEAF_TYPE);
+}
+static inline
+struct assoc_array_ptr *assoc_array_node_to_ptr(const struct assoc_array_node *p)
+{
+       return __assoc_array_x_to_ptr(
+               p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_NODE_SUBTYPE);
+}
+static inline
+struct assoc_array_ptr *assoc_array_shortcut_to_ptr(const struct assoc_array_shortcut *p)
+{
+       return __assoc_array_x_to_ptr(
+               p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE);
+}
+
+#endif /* CONFIG_ASSOCIATIVE_ARRAY */
+#endif /* _LINUX_ASSOC_ARRAY_PRIV_H */
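The type-in-low-bits encoding above relies only on pointer alignment: nodes and shortcuts are allocated with at least 4-byte alignment, so the bottom two address bits are guaranteed zero and free to carry type information. A stand-alone user-space sketch of the same trick — the mask values mirror the header, everything else is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define PTR_TYPE_MASK    0x1UL	/* leaf (0) vs metadata (1) */
#define PTR_META_TYPE    0x1UL
#define PTR_SUBTYPE_MASK 0x2UL	/* node (0) vs shortcut (2) */

int main(void)
{
	/* malloc/kmalloc results are at least 4-byte aligned, so the two
	 * low address bits are guaranteed zero and free to carry type. */
	void *node = malloc(64);
	uintptr_t tagged = (uintptr_t)node | PTR_META_TYPE;

	assert(tagged & PTR_TYPE_MASK);		/* metadata, not a leaf */
	assert(!(tagged & PTR_SUBTYPE_MASK));	/* a node, not a shortcut */

	void *untagged = (void *)(tagged & ~(PTR_TYPE_MASK | PTR_SUBTYPE_MASK));
	assert(untagged == node);

	free(node);
	return 0;
}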
index 729a4d1..a406419 100644 (file)
@@ -73,6 +73,8 @@ struct audit_field {
        void                            *lsm_rule;
 };
 
+extern int is_audit_feature_set(int which);
+
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
@@ -207,7 +209,7 @@ static inline int audit_get_sessionid(struct task_struct *tsk)
 
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
-extern int __audit_bprm(struct linux_binprm *bprm);
+extern void __audit_bprm(struct linux_binprm *bprm);
 extern int __audit_socketcall(int nargs, unsigned long *args);
 extern int __audit_sockaddr(int len, void *addr);
 extern void __audit_fd_pair(int fd1, int fd2);
@@ -236,11 +238,10 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
        if (unlikely(!audit_dummy_context()))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
-static inline int audit_bprm(struct linux_binprm *bprm)
+static inline void audit_bprm(struct linux_binprm *bprm)
 {
        if (unlikely(!audit_dummy_context()))
-               return __audit_bprm(bprm);
-       return 0;
+               __audit_bprm(bprm);
 }
 static inline int audit_socketcall(int nargs, unsigned long *args)
 {
@@ -367,10 +368,8 @@ static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                        gid_t gid, umode_t mode)
 { }
-static inline int audit_bprm(struct linux_binprm *bprm)
-{
-       return 0;
-}
+static inline void audit_bprm(struct linux_binprm *bprm)
+{ }
 static inline int audit_socketcall(int nargs, unsigned long *args)
 {
        return 0;
index f26ec20..1b135d4 100644 (file)
@@ -505,6 +505,9 @@ struct request_queue {
                                 (1 << QUEUE_FLAG_SAME_COMP)    |       \
                                 (1 << QUEUE_FLAG_ADD_RANDOM))
 
+#define QUEUE_FLAG_MQ_DEFAULT  ((1 << QUEUE_FLAG_IO_STAT) |            \
+                                (1 << QUEUE_FLAG_SAME_COMP))
+
 static inline void queue_lockdep_assert_held(struct request_queue *q)
 {
        if (q->queue_lock)
index b025925..952b010 100644 (file)
@@ -644,9 +644,11 @@ struct device_dma_parameters {
        unsigned long segment_boundary_mask;
 };
 
+struct acpi_device;
+
 struct acpi_dev_node {
 #ifdef CONFIG_ACPI
-       void    *handle;
+       struct acpi_device *companion;
 #endif
 };
 
@@ -790,14 +792,6 @@ static inline struct device *kobj_to_dev(struct kobject *kobj)
        return container_of(kobj, struct device, kobj);
 }
 
-#ifdef CONFIG_ACPI
-#define ACPI_HANDLE(dev)       ((dev)->acpi_node.handle)
-#define ACPI_HANDLE_SET(dev, _handle_) (dev)->acpi_node.handle = (_handle_)
-#else
-#define ACPI_HANDLE(dev)       (NULL)
-#define ACPI_HANDLE_SET(dev, _handle_) do { } while (0)
-#endif
-
 /* Get the wakeup routines, which depend on struct device */
 #include <linux/pm_wakeup.h>
 
index 0bc7275..41cf0c3 100644 (file)
@@ -45,13 +45,13 @@ static inline int dma_submit_error(dma_cookie_t cookie)
 
 /**
  * enum dma_status - DMA transaction status
- * @DMA_SUCCESS: transaction completed successfully
+ * @DMA_COMPLETE: transaction completed
  * @DMA_IN_PROGRESS: transaction not yet processed
  * @DMA_PAUSED: transaction is paused
  * @DMA_ERROR: transaction failed
  */
 enum dma_status {
-       DMA_SUCCESS,
+       DMA_COMPLETE,
        DMA_IN_PROGRESS,
        DMA_PAUSED,
        DMA_ERROR,
@@ -171,12 +171,6 @@ struct dma_interleaved_template {
  * @DMA_CTRL_ACK - if clear, the descriptor cannot be reused until the client
  *  acknowledges receipt, i.e. has a chance to establish any dependency
  *  chains
- * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s)
- * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s)
- * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single
- *     (if not set, do the source dma-unmapping as page)
- * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single
- *     (if not set, do the destination dma-unmapping as page)
  * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q
  * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P
  * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
@@ -188,14 +182,10 @@ struct dma_interleaved_template {
 enum dma_ctrl_flags {
        DMA_PREP_INTERRUPT = (1 << 0),
        DMA_CTRL_ACK = (1 << 1),
-       DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2),
-       DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3),
-       DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4),
-       DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5),
-       DMA_PREP_PQ_DISABLE_P = (1 << 6),
-       DMA_PREP_PQ_DISABLE_Q = (1 << 7),
-       DMA_PREP_CONTINUE = (1 << 8),
-       DMA_PREP_FENCE = (1 << 9),
+       DMA_PREP_PQ_DISABLE_P = (1 << 2),
+       DMA_PREP_PQ_DISABLE_Q = (1 << 3),
+       DMA_PREP_CONTINUE = (1 << 4),
+       DMA_PREP_FENCE = (1 << 5),
 };
 
 /**
@@ -413,6 +403,17 @@ void dma_chan_cleanup(struct kref *kref);
 typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param);
 
 typedef void (*dma_async_tx_callback)(void *dma_async_param);
+
+struct dmaengine_unmap_data {
+       u8 to_cnt;
+       u8 from_cnt;
+       u8 bidi_cnt;
+       struct device *dev;
+       struct kref kref;
+       size_t len;
+       dma_addr_t addr[0];
+};
+
 /**
  * struct dma_async_tx_descriptor - async transaction descriptor
  * ---dma generic offload fields---
@@ -438,6 +439,7 @@ struct dma_async_tx_descriptor {
        dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
        dma_async_tx_callback callback;
        void *callback_param;
+       struct dmaengine_unmap_data *unmap;
 #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH
        struct dma_async_tx_descriptor *next;
        struct dma_async_tx_descriptor *parent;
@@ -445,6 +447,40 @@ struct dma_async_tx_descriptor {
 #endif
 };
 
+#ifdef CONFIG_DMA_ENGINE
+static inline void dma_set_unmap(struct dma_async_tx_descriptor *tx,
+                                struct dmaengine_unmap_data *unmap)
+{
+       kref_get(&unmap->kref);
+       tx->unmap = unmap;
+}
+
+struct dmaengine_unmap_data *
+dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags);
+void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap);
+#else
+static inline void dma_set_unmap(struct dma_async_tx_descriptor *tx,
+                                struct dmaengine_unmap_data *unmap)
+{
+}
+static inline struct dmaengine_unmap_data *
+dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags)
+{
+       return NULL;
+}
+static inline void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap)
+{
+}
+#endif
+
+static inline void dma_descriptor_unmap(struct dma_async_tx_descriptor *tx)
+{
+       if (tx->unmap) {
+               dmaengine_unmap_put(tx->unmap);
+               tx->unmap = NULL;
+       }
+}
+
 #ifndef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH
 static inline void txd_lock(struct dma_async_tx_descriptor *txd)
 {
@@ -979,10 +1015,10 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie,
 {
        if (last_complete <= last_used) {
                if ((cookie <= last_complete) || (cookie > last_used))
-                       return DMA_SUCCESS;
+                       return DMA_COMPLETE;
        } else {
                if ((cookie <= last_complete) && (cookie > last_used))
-                       return DMA_SUCCESS;
+                       return DMA_COMPLETE;
        }
        return DMA_IN_PROGRESS;
 }
@@ -1013,11 +1049,11 @@ static inline struct dma_chan *dma_find_channel(enum dma_transaction_type tx_typ
 }
 static inline enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
 {
-       return DMA_SUCCESS;
+       return DMA_COMPLETE;
 }
 static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
 {
-       return DMA_SUCCESS;
+       return DMA_COMPLETE;
 }
 static inline void dma_issue_pending_all(void)
 {
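The dma_async_is_complete() test above treats the signed cookie counter as a circular window: a cookie is complete when it falls outside the (last_complete, last_used] in-flight range, allowing for wrap-around. A stand-alone sketch of the same comparison (names are illustrative):

#include <assert.h>

typedef int dma_cookie_t;

/* Mirrors the window test in dma_async_is_complete() above. */
static int cookie_complete(dma_cookie_t cookie,
			   dma_cookie_t last_complete, dma_cookie_t last_used)
{
	if (last_complete <= last_used)		/* window has not wrapped */
		return cookie <= last_complete || cookie > last_used;
	return cookie <= last_complete && cookie > last_used;
}

int main(void)
{
	assert(cookie_complete(3, 5, 10));	/* retired before the window */
	assert(!cookie_complete(7, 5, 10));	/* still in flight */
	assert(cookie_complete(6, 2147483647, -2147483640)); /* wrapped window */
	return 0;
}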
index bf5d574..121f11f 100644 (file)
@@ -2622,7 +2622,9 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping,
 extern int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata);
+extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
+extern const struct dentry_operations simple_dentry_operations;
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
index acd2010..9649ff0 100644 (file)
@@ -31,6 +31,7 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 int PageHuge(struct page *page);
+int PageHeadHuge(struct page *page_head);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -69,7 +70,6 @@ int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 bool is_hugepage_active(struct page *page);
-void copy_huge_page(struct page *dst, struct page *src);
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
@@ -104,6 +104,11 @@ static inline int PageHuge(struct page *page)
        return 0;
 }
 
+static inline int PageHeadHuge(struct page *page_head)
+{
+       return 0;
+}
+
 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
@@ -140,9 +145,6 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page)
 #define isolate_huge_page(p, l) false
 #define putback_active_hugepage(p)     do {} while (0)
 #define is_hugepage_active(x)  false
-static inline void copy_huge_page(struct page *dst, struct page *src)
-{
-}
 
 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
index 518a53a..a74c3a8 100644 (file)
@@ -45,6 +45,7 @@ struct key_preparsed_payload {
        const void      *data;          /* Raw data */
        size_t          datalen;        /* Raw datalen */
        size_t          quotalen;       /* Quota length for proposed payload */
+       bool            trusted;        /* True if key is trusted */
 };
 
 typedef int (*request_key_actor_t)(struct key_construction *key,
@@ -63,6 +64,11 @@ struct key_type {
         */
        size_t def_datalen;
 
+       /* Default key search algorithm. */
+       unsigned def_lookup_type;
+#define KEYRING_SEARCH_LOOKUP_DIRECT   0x0000  /* Direct lookup by description. */
+#define KEYRING_SEARCH_LOOKUP_ITERATE  0x0001  /* Iterative search. */
+
        /* vet a description */
        int (*vet_description)(const char *description);
 
index 4dfde11..80d6774 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/sysctl.h>
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
+#include <linux/assoc_array.h>
 
 #ifdef __KERNEL__
 #include <linux/uidgid.h>
@@ -82,6 +83,12 @@ struct key_owner;
 struct keyring_list;
 struct keyring_name;
 
+struct keyring_index_key {
+       struct key_type         *type;
+       const char              *description;
+       size_t                  desc_len;
+};
+
 /*****************************************************************************/
 /*
  * key reference with possession attribute handling
@@ -99,7 +106,7 @@ struct keyring_name;
 typedef struct __key_reference_with_attributes *key_ref_t;
 
 static inline key_ref_t make_key_ref(const struct key *key,
-                                    unsigned long possession)
+                                    bool possession)
 {
        return (key_ref_t) ((unsigned long) key | possession);
 }
@@ -109,7 +116,7 @@ static inline struct key *key_ref_to_ptr(const key_ref_t key_ref)
        return (struct key *) ((unsigned long) key_ref & ~1UL);
 }
 
-static inline unsigned long is_key_possessed(const key_ref_t key_ref)
+static inline bool is_key_possessed(const key_ref_t key_ref)
 {
        return (unsigned long) key_ref & 1UL;
 }
@@ -129,7 +136,6 @@ struct key {
                struct list_head graveyard_link;
                struct rb_node  serial_node;
        };
-       struct key_type         *type;          /* type of key */
        struct rw_semaphore     sem;            /* change vs change sem */
        struct key_user         *user;          /* owner of this key */
        void                    *security;      /* security data for this key */
@@ -162,13 +168,21 @@ struct key {
 #define KEY_FLAG_NEGATIVE      5       /* set if key is negative */
 #define KEY_FLAG_ROOT_CAN_CLEAR        6       /* set if key can be cleared by root without permission */
 #define KEY_FLAG_INVALIDATED   7       /* set if key has been invalidated */
+#define KEY_FLAG_TRUSTED       8       /* set if key is trusted */
+#define KEY_FLAG_TRUSTED_ONLY  9       /* set if keyring only accepts links to trusted keys */
 
-       /* the description string
-        * - this is used to match a key against search criteria
-        * - this should be a printable string
+       /* the key type and key description string
+        * - the desc is used to match a key against search criteria
+        * - it should be a printable string
         * - eg: for krb5 AFS, this might be "afs@REDHAT.COM"
         */
-       char                    *description;
+       union {
+               struct keyring_index_key index_key;
+               struct {
+                       struct key_type *type;          /* type of key */
+                       char            *description;
+               };
+       };
 
        /* type specific data
         * - this is used by the keyring type to index the name
@@ -185,11 +199,14 @@ struct key {
         *   whatever
         */
        union {
-               unsigned long           value;
-               void __rcu              *rcudata;
-               void                    *data;
-               struct keyring_list __rcu *subscriptions;
-       } payload;
+               union {
+                       unsigned long           value;
+                       void __rcu              *rcudata;
+                       void                    *data;
+                       void                    *data2[2];
+               } payload;
+               struct assoc_array keys;
+       };
 };
 
 extern struct key *key_alloc(struct key_type *type,
@@ -203,18 +220,23 @@ extern struct key *key_alloc(struct key_type *type,
 #define KEY_ALLOC_IN_QUOTA     0x0000  /* add to quota, reject if would overrun */
 #define KEY_ALLOC_QUOTA_OVERRUN        0x0001  /* add to quota, permit even if overrun */
 #define KEY_ALLOC_NOT_IN_QUOTA 0x0002  /* not in quota */
+#define KEY_ALLOC_TRUSTED      0x0004  /* Key should be flagged as trusted */
 
 extern void key_revoke(struct key *key);
 extern void key_invalidate(struct key *key);
 extern void key_put(struct key *key);
 
-static inline struct key *key_get(struct key *key)
+static inline struct key *__key_get(struct key *key)
 {
-       if (key)
-               atomic_inc(&key->usage);
+       atomic_inc(&key->usage);
        return key;
 }
 
+static inline struct key *key_get(struct key *key)
+{
+       return key ? __key_get(key) : key;
+}
+
 static inline void key_ref_put(key_ref_t key_ref)
 {
        key_put(key_ref_to_ptr(key_ref));
index 0548eb2..1cedd00 100644 (file)
@@ -1318,7 +1318,6 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 
 #if USE_SPLIT_PTE_PTLOCKS
 #if BLOATED_SPINLOCKS
-void __init ptlock_cache_init(void);
 extern bool ptlock_alloc(struct page *page);
 extern void ptlock_free(struct page *page);
 
@@ -1327,7 +1326,6 @@ static inline spinlock_t *ptlock_ptr(struct page *page)
        return page->ptl;
 }
 #else /* BLOATED_SPINLOCKS */
-static inline void ptlock_cache_init(void) {}
 static inline bool ptlock_alloc(struct page *page)
 {
        return true;
@@ -1380,17 +1378,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
 {
        return &mm->page_table_lock;
 }
-static inline void ptlock_cache_init(void) {}
 static inline bool ptlock_init(struct page *page) { return true; }
 static inline void pte_lock_deinit(struct page *page) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
-static inline void pgtable_init(void)
-{
-       ptlock_cache_init();
-       pgtable_cache_init();
-}
-
 static inline bool pgtable_page_ctor(struct page *page)
 {
        inc_zone_page_state(page, NR_PAGETABLE);
index 10f5a72..bd29941 100644 (file)
@@ -44,18 +44,22 @@ struct page {
        /* First double word block */
        unsigned long flags;            /* Atomic flags, some possibly
                                         * updated asynchronously */
-       struct address_space *mapping;  /* If low bit clear, points to
-                                        * inode address_space, or NULL.
-                                        * If page mapped as anonymous
-                                        * memory, low bit is set, and
-                                        * it points to anon_vma object:
-                                        * see PAGE_MAPPING_ANON below.
-                                        */
+       union {
+               struct address_space *mapping;  /* If low bit clear, points to
+                                                * inode address_space, or NULL.
+                                                * If page mapped as anonymous
+                                                * memory, low bit is set, and
+                                                * it points to anon_vma object:
+                                                * see PAGE_MAPPING_ANON below.
+                                                */
+               void *s_mem;                    /* slab first object */
+       };
+
        /* Second double word */
        struct {
                union {
                        pgoff_t index;          /* Our offset within mapping. */
-                       void *freelist;         /* slub/slob first free object */
+                       void *freelist;         /* sl[aou]b first free object */
                        bool pfmemalloc;        /* If set by the page allocator,
                                                 * ALLOC_NO_WATERMARKS was set
                                                 * and the low watermark was not
@@ -65,9 +69,6 @@ struct page {
                                                 * this page is only used to
                                                 * free other pages.
                                                 */
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
-               pgtable_t pmd_huge_pte; /* protected by page->ptl */
-#endif
                };
 
                union {
@@ -114,6 +115,7 @@ struct page {
                                };
                                atomic_t _count;                /* Usage count, see below. */
                        };
+                       unsigned int active;    /* SLAB */
                };
        };
 
@@ -135,6 +137,12 @@ struct page {
 
                struct list_head list;  /* slobs list of pages */
                struct slab *slab_page; /* slab fields */
+               struct rcu_head rcu_head;       /* Used by SLAB
+                                                * when destroying via RCU
+                                                */
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
+               pgtable_t pmd_huge_pte; /* protected by page->ptl */
+#endif
        };
 
        /* Remainder is not double word aligned */
index d006f0c..5a462c4 100644 (file)
@@ -27,7 +27,7 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
        while (!pci_is_root_bus(pbus))
                pbus = pbus->parent;
 
-       return DEVICE_ACPI_HANDLE(pbus->bridge);
+       return ACPI_HANDLE(pbus->bridge);
 }
 
 static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus)
@@ -39,7 +39,7 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus)
        else
                dev = &pbus->self->dev;
 
-       return DEVICE_ACPI_HANDLE(dev);
+       return ACPI_HANDLE(dev);
 }
 
 void acpi_pci_add_bus(struct pci_bus *bus);
index 179fb91..f50821c 100644 (file)
@@ -67,10 +67,10 @@ struct edmacc_param {
 #define ITCCHEN                BIT(23)
 
 /* possible values of the ch_status parameter of the callback function */
-#define DMA_COMPLETE 1
-#define DMA_CC_ERROR 2
-#define DMA_TC1_ERROR 3
-#define DMA_TC2_ERROR 4
+#define EDMA_DMA_COMPLETE 1
+#define EDMA_DMA_CC_ERROR 2
+#define EDMA_DMA_TC1_ERROR 3
+#define EDMA_DMA_TC2_ERROR 4
 
 enum address_mode {
        INCR = 0,
index 9d37e2b..5623a7f 100644 (file)
@@ -1052,17 +1052,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * @xfrm_policy_delete_security:
  *     @ctx contains the xfrm_sec_ctx.
  *     Authorize deletion of xp->security.
- * @xfrm_state_alloc_security:
+ * @xfrm_state_alloc:
  *     @x contains the xfrm_state being added to the Security Association
  *     Database by the XFRM system.
  *     @sec_ctx contains the security context information being provided by
  *     the user-level SA generation program (e.g., setkey or racoon).
- *     @secid contains the secid from which to take the mls portion of the context.
  *     Allocate a security structure to the x->security field; the security
  *     field is initialized to NULL when the xfrm_state is allocated. Set the
- *     context to correspond to either sec_ctx or polsec, with the mls portion
- *     taken from secid in the latter case.
- *     Return 0 if operation was successful (memory to allocate, legal context).
+ *     context to correspond to sec_ctx. Return 0 if the operation was
+ *     successful (memory was available to allocate, and the context was legal).
+ * @xfrm_state_alloc_acquire:
+ *     @x contains the xfrm_state being added to the Security Association
+ *     Database by the XFRM system.
+ *     @polsec contains the policy's security context.
+ *     @secid contains the secid from which to take the mls portion of the
+ *     context.
+ *     Allocate a security structure to the x->security field; the security
+ *     field is initialized to NULL when the xfrm_state is allocated. Set the
+ *     context to correspond to secid. Return 0 if the operation was
+ *     successful (memory was available to allocate, and the context was legal).
  * @xfrm_state_free_security:
  *     @x contains the xfrm_state.
  *     Deallocate x->security.
@@ -1679,9 +1687,11 @@ struct security_operations {
        int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx);
        void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx);
        int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx);
-       int (*xfrm_state_alloc_security) (struct xfrm_state *x,
-               struct xfrm_user_sec_ctx *sec_ctx,
-               u32 secid);
+       int (*xfrm_state_alloc) (struct xfrm_state *x,
+                                struct xfrm_user_sec_ctx *sec_ctx);
+       int (*xfrm_state_alloc_acquire) (struct xfrm_state *x,
+                                        struct xfrm_sec_ctx *polsec,
+                                        u32 secid);
        void (*xfrm_state_free_security) (struct xfrm_state *x);
        int (*xfrm_state_delete_security) (struct xfrm_state *x);
        int (*xfrm_policy_lookup) (struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir);
index 1e8a8b6..cf87a24 100644 (file)
@@ -354,6 +354,35 @@ static inline void read_sequnlock_excl(seqlock_t *sl)
        spin_unlock(&sl->lock);
 }
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq: sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a lockless reader (even) or a locking reader (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+       if (!(*seq & 1))        /* Even */
+               *seq = read_seqbegin(lock);
+       else                    /* Odd */
+               read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+       return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+       if (seq & 1)
+               read_sequnlock_excl(lock);
+}
+
 static inline void read_seqlock_excl_bh(seqlock_t *sl)
 {
        spin_lock_bh(&sl->lock);
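A sketch of the retry loop the three helpers above are designed for: the first pass runs locklessly with an even seq, and if the sequence check fails, seq is forced odd so that the next pass takes the lock outright. The seqlock and the protected value here are illustrative assumptions, not part of this patch:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(example_lock);
static int shared_value;	/* data protected by example_lock */

static int read_stable(void)
{
	int seq = 0;	/* must start even: begin as a lockless reader */
	int val;

retry:
	read_seqbegin_or_lock(&example_lock, &seq);
	val = shared_value;
	if (need_seqretry(&example_lock, seq)) {
		seq = 1;	/* odd: take the lock on the next pass */
		goto retry;
	}
	done_seqretry(&example_lock, seq);
	return val;
}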
index 74f1058..c2bba24 100644 (file)
  *  }
  *  rcu_read_unlock();
  *
- * See also the comment on struct slab_rcu in mm/slab.c.
+ * This is useful if we need to approach a kernel structure obliquely,
+ * from its address obtained without the usual locking. We can lock the
+ * structure to stabilize it and check that it is still at the given address,
+ * but only if we can be sure the memory has not meanwhile been reused for
+ * some other kind of object (which our subsystem's lock might corrupt).
+ *
+ * Take rcu_read_lock() before reading the address, then rcu_read_unlock()
+ * after taking the spinlock within the structure expected at that address.
  */
 #define SLAB_DESTROY_BY_RCU    0x00080000UL    /* Defer freeing slabs to RCU */
 #define SLAB_MEM_SPREAD                0x00100000UL    /* Spread some memory over cpuset */
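A sketch of the stabilise-and-recheck pattern the comment above describes, for a cache created with SLAB_DESTROY_BY_RCU; the object type and the lockless lookup are hypothetical:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct obj {
	spinlock_t	lock;
	unsigned int	key;
};

/* Hypothetical lockless lookup over a SLAB_DESTROY_BY_RCU cache. */
static struct obj *find_in_hash(unsigned int key);

static struct obj *lookup_stable(unsigned int key)
{
	struct obj *o;

	rcu_read_lock();
	o = find_in_hash(key);
	if (o) {
		/* Within the RCU read-side section the memory cannot be
		 * returned to the page allocator, only reused as another
		 * struct obj, so &o->lock is always a valid spinlock. */
		spin_lock(&o->lock);
		if (o->key != key) {
			/* Object was freed and reused: drop it. */
			spin_unlock(&o->lock);
			o = NULL;
		}
	}
	rcu_read_unlock();
	return o;	/* locked and identity-checked, or NULL */
}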
index e9346b4..09bfffb 100644 (file)
@@ -27,8 +27,8 @@ struct kmem_cache {
 
        size_t colour;                  /* cache colouring range */
        unsigned int colour_off;        /* colour offset */
-       struct kmem_cache *slabp_cache;
-       unsigned int slab_size;
+       struct kmem_cache *freelist_cache;
+       unsigned int freelist_size;
 
        /* constructor func */
        void (*ctor)(void *obj);
index cc0b67e..f56bfa9 100644 (file)
@@ -11,7 +11,7 @@
 enum stat_item {
        ALLOC_FASTPATH,         /* Allocation from cpu slab */
        ALLOC_SLOWPATH,         /* Allocation by getting a new cpu slab */
-       FREE_FASTPATH,          /* Free to cpu slub */
+       FREE_FASTPATH,          /* Free to cpu slab */
        FREE_SLOWPATH,          /* Freeing not to cpu slab */
        FREE_FROZEN,            /* Freeing to frozen slab */
        FREE_ADD_PARTIAL,       /* Freeing moves slab to partial list */
index 4db2985..4836ba3 100644 (file)
@@ -27,6 +27,12 @@ struct user_namespace {
        kuid_t                  owner;
        kgid_t                  group;
        unsigned int            proc_inum;
+
+       /* Register of per-UID persistent keyrings for this namespace */
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+       struct key              *persistent_keyring_register;
+       struct rw_semaphore     persistent_keyring_register_sem;
+#endif
 };
 
 extern struct user_namespace init_user_ns;
index 61939ba..eaa00b1 100644 (file)
@@ -278,6 +278,31 @@ do {                                                                       \
        __ret;                                                          \
 })
 
+#define __wait_event_cmd(wq, condition, cmd1, cmd2)                    \
+       (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,  \
+                           cmd1; schedule(); cmd2)
+
+/**
+ * wait_event_cmd - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @cmd1: the command to be executed before sleep
+ * @cmd2: the command to be executed after sleep
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event_cmd(wq, condition, cmd1, cmd2)                      \
+do {                                                                   \
+       if (condition)                                                  \
+               break;                                                  \
+       __wait_event_cmd(wq, condition, cmd1, cmd2);                    \
+} while (0)
+
 #define __wait_event_interruptible(wq, condition)                      \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0,          \
                      schedule())
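A sketch of the intended use of wait_event_cmd(): cmd1 and cmd2 bracket the schedule() call, so a lock can be dropped while sleeping and retaken before the condition is rechecked. The device structure here is an illustrative assumption:

#include <linux/spinlock.h>
#include <linux/wait.h>

struct my_dev {
	spinlock_t		lock;
	wait_queue_head_t	wq;
	int			free_slots;	/* protected by lock */
};

static void take_slot(struct my_dev *dev)
{
	spin_lock(&dev->lock);
	wait_event_cmd(dev->wq, dev->free_slots > 0,
		       spin_unlock(&dev->lock),	/* cmd1: before schedule() */
		       spin_lock(&dev->lock));	/* cmd2: after waking */
	dev->free_slots--;	/* lock held and condition true here */
	spin_unlock(&dev->lock);
}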
index f18b3b7..4832d75 100644 (file)
@@ -162,12 +162,14 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
                { EXTENT_FLAG_LOGGING,          "LOGGING"       },      \
                { EXTENT_FLAG_FILLING,          "FILLING"       })
 
-TRACE_EVENT(btrfs_get_extent,
+TRACE_EVENT_CONDITION(btrfs_get_extent,
 
        TP_PROTO(struct btrfs_root *root, struct extent_map *map),
 
        TP_ARGS(root, map),
 
+       TP_CONDITION(map),
+
        TP_STRUCT__entry(
                __field(        u64,  root_objectid     )
                __field(        u64,  start             )
index db0b825..44b05a0 100644 (file)
@@ -68,6 +68,9 @@
 #define AUDIT_MAKE_EQUIV       1015    /* Append to watched tree */
 #define AUDIT_TTY_GET          1016    /* Get TTY auditing status */
 #define AUDIT_TTY_SET          1017    /* Set TTY auditing status */
+#define AUDIT_SET_FEATURE      1018    /* Turn an audit feature on or off */
+#define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
+#define AUDIT_FEATURE_CHANGE   1020    /* audit log listing feature changes */
 
 #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC         1107    /* We filter this differently */
@@ -357,6 +360,12 @@ enum {
 #define AUDIT_PERM_READ                4
 #define AUDIT_PERM_ATTR                8
 
+/* MAX_AUDIT_MESSAGE_LENGTH is set in audit:lib/libaudit.h as:
+ * 8970 // PATH_MAX*2+CONTEXT_SIZE*2+11+256+1
+ * max header+body+trailer: 44 + 29 + 32 + 262 + 7 + pad
+ */
+#define AUDIT_MESSAGE_TEXT_MAX 8560
+
 struct audit_status {
        __u32           mask;           /* Bit mask for valid entries */
        __u32           enabled;        /* 1 = enabled, 0 = disabled */
@@ -368,11 +377,28 @@ struct audit_status {
        __u32           backlog;        /* messages waiting in queue */
 };
 
+struct audit_features {
+#define AUDIT_FEATURE_VERSION  1
+       __u32   vers;
+       __u32   mask;           /* which bits we are dealing with */
+       __u32   features;       /* which feature to enable/disable */
+       __u32   lock;           /* which features to lock */
+};
+
+#define AUDIT_FEATURE_ONLY_UNSET_LOGINUID      0
+#define AUDIT_FEATURE_LOGINUID_IMMUTABLE       1
+#define AUDIT_LAST_FEATURE                     AUDIT_FEATURE_LOGINUID_IMMUTABLE
+
+#define audit_feature_valid(x)         ((x) >= 0 && (x) <= AUDIT_LAST_FEATURE)
+#define AUDIT_FEATURE_TO_MASK(x)       (1 << ((x) & 31)) /* mask for __u32 */
+
 struct audit_tty_status {
        __u32           enabled;        /* 1 = enabled, 0 = disabled */
        __u32           log_passwd;     /* 1 = enabled, 0 = disabled */
 };
 
+#define AUDIT_UID_UNSET (unsigned int)-1
+
 /* audit_rule_data supports filter rules with both integer and string
  * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
  * AUDIT_LIST_RULES requests.
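A sketch of how a userspace tool might build an AUDIT_SET_FEATURE request with the structures above: the bit for the feature being changed goes into .mask, its new value into .features, and optionally into .lock to make the setting immutable. Netlink transport is omitted, and the updated <linux/audit.h> uapi header is assumed:

#include <linux/audit.h>
#include <string.h>

/* Build (but do not send) a request that enables one feature and
 * optionally locks it; "which" is e.g. AUDIT_FEATURE_LOGINUID_IMMUTABLE. */
static struct audit_features make_feature_request(int which, int enable,
						  int lock_it)
{
	struct audit_features uaf;

	memset(&uaf, 0, sizeof(uaf));
	uaf.vers = AUDIT_FEATURE_VERSION;
	uaf.mask = AUDIT_FEATURE_TO_MASK(which);  /* "I am changing this bit" */
	if (enable)
		uaf.features = AUDIT_FEATURE_TO_MASK(which);
	if (lock_it)
		uaf.lock = AUDIT_FEATURE_TO_MASK(which);
	return uaf;	/* payload for an AUDIT_SET_FEATURE netlink message */
}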
diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h
new file mode 100644 (file)
index 0000000..ca18c45
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Hash Info: Hash algorithms information
+ *
+ * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _UAPI_LINUX_HASH_INFO_H
+#define _UAPI_LINUX_HASH_INFO_H
+
+enum hash_algo {
+       HASH_ALGO_MD4,
+       HASH_ALGO_MD5,
+       HASH_ALGO_SHA1,
+       HASH_ALGO_RIPE_MD_160,
+       HASH_ALGO_SHA256,
+       HASH_ALGO_SHA384,
+       HASH_ALGO_SHA512,
+       HASH_ALGO_SHA224,
+       HASH_ALGO_RIPE_MD_128,
+       HASH_ALGO_RIPE_MD_256,
+       HASH_ALGO_RIPE_MD_320,
+       HASH_ALGO_WP_256,
+       HASH_ALGO_WP_384,
+       HASH_ALGO_WP_512,
+       HASH_ALGO_TGR_128,
+       HASH_ALGO_TGR_160,
+       HASH_ALGO_TGR_192,
+       HASH_ALGO__LAST
+};
+
+#endif /* _UAPI_LINUX_HASH_INFO_H */
index c9b7f4f..840cb99 100644 (file)
@@ -56,5 +56,6 @@
 #define KEYCTL_REJECT                  19      /* reject a partially constructed key */
 #define KEYCTL_INSTANTIATE_IOV         20      /* instantiate a partially constructed key */
 #define KEYCTL_INVALIDATE              21      /* invalidate a key */
+#define KEYCTL_GET_PERSISTENT          22      /* get a user's persistent keyring */
 
 #endif /*  _LINUX_KEYCTL_H */
index fe1a540..f7cf7f3 100644 (file)
@@ -16,6 +16,7 @@
 #define _MD_P_H
 
 #include <linux/types.h>
+#include <asm/byteorder.h>
 
 /*
  * RAID superblock.
index 3fc8a2f..79383d3 100644 (file)
@@ -301,20 +301,6 @@ config AUDIT_TREE
        depends on AUDITSYSCALL
        select FSNOTIFY
 
-config AUDIT_LOGINUID_IMMUTABLE
-       bool "Make audit loginuid immutable"
-       depends on AUDIT
-       help
-         The config option toggles if a task setting its loginuid requires
-         CAP_SYS_AUDITCONTROL or if that task should require no special permissions
-         but should instead only allow setting its loginuid if it was never
-         previously set.  On systems which use systemd or a similar central
-         process to restart login services this should be set to true.  On older
-         systems in which an admin would typically have to directly stop and
-         start processes this should be set to false.  Setting this to true allows
-         one to drop potentially dangerous capabilites from the login tasks,
-         but may not be backwards compatible with older init systems.
-
 source "kernel/irq/Kconfig"
 source "kernel/time/Kconfig"
 
@@ -1669,6 +1655,18 @@ config BASE_SMALL
        default 0 if BASE_FULL
        default 1 if !BASE_FULL
 
+config SYSTEM_TRUSTED_KEYRING
+       bool "Provide system-wide ring of trusted keys"
+       depends on KEYS
+       help
+         Provide a system keyring to which trusted keys can be added.  Keys in
+         the keyring are considered to be trusted.  Keys may be added at will
+         by the kernel from compiled-in data and from hardware key stores, but
+         userspace may only add extra keys if those keys can be verified by
+         keys already in the keyring.
+
+         Keys in this keyring are used by module signature checking.
+
 menuconfig MODULES
        bool "Enable loadable module support"
        option modules
@@ -1742,6 +1740,7 @@ config MODULE_SRCVERSION_ALL
 config MODULE_SIG
        bool "Module signature verification"
        depends on MODULES
+       select SYSTEM_TRUSTED_KEYRING
        select KEYS
        select CRYPTO
        select ASYMMETRIC_KEY_TYPE
index 01573fd..febc511 100644 (file)
@@ -476,7 +476,7 @@ static void __init mm_init(void)
        mem_init();
        kmem_cache_init();
        percpu_init_late();
-       pgtable_init();
+       pgtable_cache_init();
        vmalloc_init();
 }
 
index d697396..7a51443 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -208,15 +208,18 @@ static void shm_open(struct vm_area_struct *vma)
  */
 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 {
+       struct file *shm_file;
+
+       shm_file = shp->shm_file;
+       shp->shm_file = NULL;
        ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
        shm_rmid(ns, shp);
        shm_unlock(shp);
-       if (!is_file_hugepages(shp->shm_file))
-               shmem_lock(shp->shm_file, 0, shp->mlock_user);
+       if (!is_file_hugepages(shm_file))
+               shmem_lock(shm_file, 0, shp->mlock_user);
        else if (shp->mlock_user)
-               user_shm_unlock(file_inode(shp->shm_file)->i_size,
-                                               shp->mlock_user);
-       fput (shp->shm_file);
+               user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user);
+       fput(shm_file);
        ipc_rcu_putref(shp, shm_rcu_free);
 }
 
@@ -974,15 +977,25 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
                ipc_lock_object(&shp->shm_perm);
                if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
                        kuid_t euid = current_euid();
-                       err = -EPERM;
                        if (!uid_eq(euid, shp->shm_perm.uid) &&
-                           !uid_eq(euid, shp->shm_perm.cuid))
+                           !uid_eq(euid, shp->shm_perm.cuid)) {
+                               err = -EPERM;
                                goto out_unlock0;
-                       if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))
+                       }
+                       if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
+                               err = -EPERM;
                                goto out_unlock0;
+                       }
                }
 
                shm_file = shp->shm_file;
+
+               /* check if shm_destroy() is tearing down shp */
+               if (shm_file == NULL) {
+                       err = -EIDRM;
+                       goto out_unlock0;
+               }
+
                if (is_file_hugepages(shm_file))
                        goto out_unlock0;
 
@@ -1101,6 +1114,14 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
                goto out_unlock;
 
        ipc_lock_object(&shp->shm_perm);
+
+       /* check if shm_destroy() is tearing down shp */
+       if (shp->shm_file == NULL) {
+               ipc_unlock_object(&shp->shm_perm);
+               err = -EIDRM;
+               goto out_unlock;
+       }
+
        path = shp->shm_file->f_path;
        path_get(&path);
        shp->shm_nattch++;
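The hunks above close a race by having shm_destroy() clear shp->shm_file under the object lock and making every other locked path re-check the pointer before use. A sketch of that teardown-flag pattern in isolation (the structure here is illustrative):

#include <linux/errno.h>
#include <linux/spinlock.h>

struct file;

struct resource_obj {
	spinlock_t	lock;
	struct file	*file;	/* set to NULL once teardown has begun */
};

static int use_resource(struct resource_obj *r)
{
	int err = 0;

	spin_lock(&r->lock);
	if (r->file == NULL) {
		/* shm_destroy()-style teardown already in progress */
		err = -EIDRM;
	} else {
		/* ... safe to use r->file here: the destroyer clears it
		 * under this same lock before dropping its reference ... */
	}
	spin_unlock(&r->lock);
	return err;
}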
index 09a9c94..bbaf7d5 100644 (file)
@@ -41,8 +41,9 @@ ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
 obj-$(CONFIG_UID16) += uid16.o
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -122,19 +123,52 @@ targets += timeconst.h
 $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
        $(call if_changed,bc)
 
-ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# Roll all the X.509 certificates that we can find together and pull them into
+# the kernel so that they get loaded into the system trusted keyring during
+# boot.
 #
-# Pull the signing certificate and any extra certificates into the kernel
+# We look in the source root and the build root for all files whose name ends
+# in ".x509".  Unfortunately, this will generate duplicate filenames, so we
+# have make canonicalise the pathnames and then sort them to discard the
+# duplicates.
 #
+###############################################################################
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
+X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509
+X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
+                               $(or $(realpath $(CERT)),$(CERT))))
+
+ifeq ($(X509_CERTIFICATES),)
+$(warning *** No X.509 certificates found ***)
+endif
+
+ifneq ($(wildcard $(obj)/.x509.list),)
+ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
+$(info X.509 certificate list changed)
+$(shell rm $(obj)/.x509.list)
+endif
+endif
+
+kernel/system_certificates.o: $(obj)/x509_certificate_list
 
-quiet_cmd_touch = TOUCH   $@
-      cmd_touch = touch   $@
+quiet_cmd_x509certs  = CERTS   $@
+      cmd_x509certs  = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo "  - Including cert $(X509)")
 
-extra_certificates:
-       $(call cmd,touch)
+targets += $(obj)/x509_certificate_list
+$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
+       $(call if_changed,x509certs)
 
-kernel/modsign_certificate.o: signing_key.x509 extra_certificates
+targets += $(obj)/.x509.list
+$(obj)/.x509.list:
+       @echo $(X509_CERTIFICATES) >$@
 
+clean-files := x509_certificate_list .x509.list
+endif
+
+ifeq ($(CONFIG_MODULE_SIG),y)
 ###############################################################################
 #
 # If module signing is requested, say by allyesconfig, but a key has not been
index 7b0e23a..906ae5a 100644 (file)
@@ -60,7 +60,6 @@
 #ifdef CONFIG_SECURITY
 #include <linux/security.h>
 #endif
-#include <net/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
 #include <linux/pid_namespace.h>
@@ -140,6 +139,17 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
+static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
+                                  .mask = -1,
+                                  .features = 0,
+                                  .lock = 0,};
+
+static char *audit_feature_names[2] = {
+       "only_unset_loginuid",
+       "loginuid_immutable",
+};
+
+
 /* Serialize requests from userspace. */
 DEFINE_MUTEX(audit_cmd_mutex);
 
@@ -584,6 +594,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
                return -EOPNOTSUPP;
        case AUDIT_GET:
        case AUDIT_SET:
+       case AUDIT_GET_FEATURE:
+       case AUDIT_SET_FEATURE:
        case AUDIT_LIST_RULES:
        case AUDIT_ADD_RULE:
        case AUDIT_DEL_RULE:
@@ -613,7 +625,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
        int rc = 0;
        uid_t uid = from_kuid(&init_user_ns, current_uid());
 
-       if (!audit_enabled) {
+       if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
                *ab = NULL;
                return rc;
        }
@@ -628,6 +640,94 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
        return rc;
 }
 
+int is_audit_feature_set(int i)
+{
+       return af.features & AUDIT_FEATURE_TO_MASK(i);
+}
+
+
+static int audit_get_feature(struct sk_buff *skb)
+{
+       u32 seq;
+
+       seq = nlmsg_hdr(skb)->nlmsg_seq;
+
+       audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
+                        &af, sizeof(af));
+
+       return 0;
+}
+
+static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
+                                    u32 old_lock, u32 new_lock, int res)
+{
+       struct audit_buffer *ab;
+
+       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+       audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d",
+                        audit_feature_names[which], !!old_feature, !!new_feature,
+                        !!old_lock, !!new_lock, res);
+       audit_log_end(ab);
+}
+
+static int audit_set_feature(struct sk_buff *skb)
+{
+       struct audit_features *uaf;
+       int i;
+
+       BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0]));
+       uaf = nlmsg_data(nlmsg_hdr(skb));
+
+       /* if there is ever a version 2 we should handle that here */
+
+       for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+               u32 feature = AUDIT_FEATURE_TO_MASK(i);
+               u32 old_feature, new_feature, old_lock, new_lock;
+
+               /* if we are not changing this feature, move along */
+               if (!(feature & uaf->mask))
+                       continue;
+
+               old_feature = af.features & feature;
+               new_feature = uaf->features & feature;
+               new_lock = (uaf->lock | af.lock) & feature;
+               old_lock = af.lock & feature;
+
+               /* are we changing a locked feature? */
+               if ((af.lock & feature) && (new_feature != old_feature)) {
+                       audit_log_feature_change(i, old_feature, new_feature,
+                                                old_lock, new_lock, 0);
+                       return -EPERM;
+               }
+       }
+       /* nothing invalid, do the changes */
+       for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
+               u32 feature = AUDIT_FEATURE_TO_MASK(i);
+               u32 old_feature, new_feature, old_lock, new_lock;
+
+               /* if we are not changing this feature, move along */
+               if (!(feature & uaf->mask))
+                       continue;
+
+               old_feature = af.features & feature;
+               new_feature = uaf->features & feature;
+               old_lock = af.lock & feature;
+               new_lock = (uaf->lock | af.lock) & feature;
+
+               if (new_feature != old_feature)
+                       audit_log_feature_change(i, old_feature, new_feature,
+                                                old_lock, new_lock, 1);
+
+               if (new_feature)
+                       af.features |= feature;
+               else
+                       af.features &= ~feature;
+               af.lock |= new_lock;
+       }
+
+       return 0;
+}
+
 static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
        u32                     seq;
@@ -659,6 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
        switch (msg_type) {
        case AUDIT_GET:
+               memset(&status_set, 0, sizeof(status_set));
                status_set.enabled       = audit_enabled;
                status_set.failure       = audit_failure;
                status_set.pid           = audit_pid;
@@ -670,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                                 &status_set, sizeof(status_set));
                break;
        case AUDIT_SET:
-               if (nlh->nlmsg_len < sizeof(struct audit_status))
+               if (nlmsg_len(nlh) < sizeof(struct audit_status))
                        return -EINVAL;
                status_get   = (struct audit_status *)data;
                if (status_get->mask & AUDIT_STATUS_ENABLED) {
@@ -699,6 +800,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
                        err = audit_set_backlog_limit(status_get->backlog_limit);
                break;
+       case AUDIT_GET_FEATURE:
+               err = audit_get_feature(skb);
+               if (err)
+                       return err;
+               break;
+       case AUDIT_SET_FEATURE:
+               err = audit_set_feature(skb);
+               if (err)
+                       return err;
+               break;
        case AUDIT_USER:
        case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
        case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
@@ -715,7 +826,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                        }
                        audit_log_common_recv_msg(&ab, msg_type);
                        if (msg_type != AUDIT_USER_TTY)
-                               audit_log_format(ab, " msg='%.1024s'",
+                               audit_log_format(ab, " msg='%.*s'",
+                                                AUDIT_MESSAGE_TEXT_MAX,
                                                 (char *)data);
                        else {
                                int size;
@@ -818,7 +930,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                struct task_struct *tsk = current;
 
                spin_lock(&tsk->sighand->siglock);
-               s.enabled = tsk->signal->audit_tty != 0;
+               s.enabled = tsk->signal->audit_tty;
                s.log_passwd = tsk->signal->audit_tty_log_passwd;
                spin_unlock(&tsk->sighand->siglock);
 
@@ -832,7 +944,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
                memset(&s, 0, sizeof(s));
                /* guard against past and future API changes */
-               memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len));
+               memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
                if ((s.enabled != 0 && s.enabled != 1) ||
                    (s.log_passwd != 0 && s.log_passwd != 1))
                        return -EINVAL;
@@ -1067,13 +1179,6 @@ static void wait_for_auditd(unsigned long sleep_time)
        remove_wait_queue(&audit_backlog_wait, &wait);
 }
 
-/* Obtain an audit buffer.  This routine does locking to obtain the
- * audit buffer, but then no locking is required for calls to
- * audit_log_*format.  If the tsk is a task that is currently in a
- * syscall, then the syscall is marked as auditable and an audit record
- * will be written at syscall exit.  If there is no associated task, tsk
- * should be NULL. */
-
 /**
  * audit_log_start - obtain an audit buffer
  * @ctx: audit_context (may be NULL)
@@ -1389,7 +1494,7 @@ void audit_log_session_info(struct audit_buffer *ab)
        u32 sessionid = audit_get_sessionid(current);
        uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
 
-       audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid);
+       audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
 }
 
 void audit_log_key(struct audit_buffer *ab, char *key)
@@ -1536,6 +1641,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
                }
        }
 
+       /* log the audit_names record type */
+       audit_log_format(ab, " nametype=");
+       switch(n->type) {
+       case AUDIT_TYPE_NORMAL:
+               audit_log_format(ab, "NORMAL");
+               break;
+       case AUDIT_TYPE_PARENT:
+               audit_log_format(ab, "PARENT");
+               break;
+       case AUDIT_TYPE_CHILD_DELETE:
+               audit_log_format(ab, "DELETE");
+               break;
+       case AUDIT_TYPE_CHILD_CREATE:
+               audit_log_format(ab, "CREATE");
+               break;
+       default:
+               audit_log_format(ab, "UNKNOWN");
+               break;
+       }
+
        audit_log_fcaps(ab, n);
        audit_log_end(ab);
 }
index 123c9b7..b779642 100644 (file)
@@ -197,6 +197,9 @@ struct audit_context {
                        int                     fd;
                        int                     flags;
                } mmap;
+               struct {
+                       int                     argc;
+               } execve;
        };
        int fds[2];
 
index f7aee8b..51f3fd4 100644 (file)
@@ -343,6 +343,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
        case AUDIT_DEVMINOR:
        case AUDIT_EXIT:
        case AUDIT_SUCCESS:
+       case AUDIT_INODE:
                /* bit ops are only useful on syscall args */
                if (f->op == Audit_bitmask || f->op == Audit_bittest)
                        return -EINVAL;
@@ -423,7 +424,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                f->lsm_rule = NULL;
 
                /* Support legacy tests for a valid loginuid */
-               if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
+               if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
                        f->type = AUDIT_LOGINUID_SET;
                        f->val = 0;
                }
index 9845cb3..90594c9 100644 (file)
@@ -95,13 +95,6 @@ struct audit_aux_data {
 /* Number of target pids per aux struct. */
 #define AUDIT_AUX_PIDS 16
 
-struct audit_aux_data_execve {
-       struct audit_aux_data   d;
-       int argc;
-       int envc;
-       struct mm_struct *mm;
-};
-
 struct audit_aux_data_pids {
        struct audit_aux_data   d;
        pid_t                   target_pid[AUDIT_AUX_PIDS];
@@ -121,12 +114,6 @@ struct audit_aux_data_bprm_fcaps {
        struct audit_cap_data   new_pcap;
 };
 
-struct audit_aux_data_capset {
-       struct audit_aux_data   d;
-       pid_t                   pid;
-       struct audit_cap_data   cap;
-};
-
 struct audit_tree_refs {
        struct audit_tree_refs *next;
        struct audit_chunk *c[31];
@@ -566,7 +553,7 @@ static int audit_filter_rules(struct task_struct *tsk,
                        break;
                case AUDIT_INODE:
                        if (name)
-                               result = (name->ino == f->val);
+                               result = audit_comparator(name->ino, f->op, f->val);
                        else if (ctx) {
                                list_for_each_entry(n, &ctx->names_list, list) {
                                        if (audit_comparator(n->ino, f->op, f->val)) {
@@ -943,8 +930,10 @@ int audit_alloc(struct task_struct *tsk)
                return 0; /* Return if not auditing. */
 
        state = audit_filter_task(tsk, &key);
-       if (state == AUDIT_DISABLED)
+       if (state == AUDIT_DISABLED) {
+               clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
                return 0;
+       }
 
        if (!(context = audit_alloc_context(state))) {
                kfree(key);
@@ -1149,20 +1138,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 }
 
 static void audit_log_execve_info(struct audit_context *context,
-                                 struct audit_buffer **ab,
-                                 struct audit_aux_data_execve *axi)
+                                 struct audit_buffer **ab)
 {
        int i, len;
        size_t len_sent = 0;
        const char __user *p;
        char *buf;
 
-       if (axi->mm != current->mm)
-               return; /* execve failed, no additional info */
-
-       p = (const char __user *)axi->mm->arg_start;
+       p = (const char __user *)current->mm->arg_start;
 
-       audit_log_format(*ab, "argc=%d", axi->argc);
+       audit_log_format(*ab, "argc=%d", context->execve.argc);
 
        /*
         * we need some kernel buffer to hold the userspace args.  Just
@@ -1176,7 +1161,7 @@ static void audit_log_execve_info(struct audit_context *context,
                return;
        }
 
-       for (i = 0; i < axi->argc; i++) {
+       for (i = 0; i < context->execve.argc; i++) {
                len = audit_log_single_execve_arg(context, ab, i,
                                                  &len_sent, p, buf);
                if (len <= 0)
@@ -1279,6 +1264,9 @@ static void show_special(struct audit_context *context, int *call_panic)
                audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
                                 context->mmap.flags);
                break; }
+       case AUDIT_EXECVE: {
+               audit_log_execve_info(context, &ab);
+               break; }
        }
        audit_log_end(ab);
 }
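
With the EXECVE data now carried in the audit_context and emitted via show_special(), the record itself keeps its usual shape, roughly (argument values invented for illustration):

        type=EXECVE msg=audit(1385134655.123:42): argc=3 a0="ls" a1="-l" a2="/tmp"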
@@ -1325,11 +1313,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 
                switch (aux->type) {
 
-               case AUDIT_EXECVE: {
-                       struct audit_aux_data_execve *axi = (void *)aux;
-                       audit_log_execve_info(context, &ab, axi);
-                       break; }
-
                case AUDIT_BPRM_FCAPS: {
                        struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
                        audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1964,6 +1947,43 @@ int auditsc_get_stamp(struct audit_context *ctx,
 /* global counter which is incremented every time something logs in */
 static atomic_t session_id = ATOMIC_INIT(0);
 
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+       /* if we are unset, we don't need privs */
+       if (!audit_loginuid_set(current))
+               return 0;
+       /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE is set, never allow a change */
+       if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+               return -EPERM;
+       /* it is set, you need permission */
+       if (!capable(CAP_AUDIT_CONTROL))
+               return -EPERM;
+       /* reject if this would set the loginuid and only an unset is allowed */
+       if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
+               return -EPERM;
+       return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+                                  unsigned int oldsessionid, unsigned int sessionid,
+                                  int rc)
+{
+       struct audit_buffer *ab;
+       uid_t uid, ologinuid, nloginuid;
+
+       uid = from_kuid(&init_user_ns, task_uid(current));
+       ologinuid = from_kuid(&init_user_ns, koldloginuid);
+       nloginuid = from_kuid(&init_user_ns, kloginuid);
+
+       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+       if (!ab)
+               return;
+       audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old "
+                        "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid,
+                        nloginuid, oldsessionid, sessionid, !rc);
+       audit_log_end(ab);
+}
+
 /**
  * audit_set_loginuid - set current task's audit_context loginuid
  * @loginuid: loginuid value
@@ -1975,37 +1995,26 @@ static atomic_t session_id = ATOMIC_INIT(0);
 int audit_set_loginuid(kuid_t loginuid)
 {
        struct task_struct *task = current;
-       struct audit_context *context = task->audit_context;
-       unsigned int sessionid;
+       unsigned int oldsessionid, sessionid = (unsigned int)-1;
+       kuid_t oldloginuid;
+       int rc;
 
-#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
-       if (audit_loginuid_set(task))
-               return -EPERM;
-#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
-       if (!capable(CAP_AUDIT_CONTROL))
-               return -EPERM;
-#endif  /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+       oldloginuid = audit_get_loginuid(current);
+       oldsessionid = audit_get_sessionid(current);
 
-       sessionid = atomic_inc_return(&session_id);
-       if (context && context->in_syscall) {
-               struct audit_buffer *ab;
+       rc = audit_set_loginuid_perm(loginuid);
+       if (rc)
+               goto out;
+
+       /* are we setting or clearing? */
+       if (uid_valid(loginuid))
+               sessionid = atomic_inc_return(&session_id);
 
-               ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
-               if (ab) {
-                       audit_log_format(ab, "login pid=%d uid=%u "
-                               "old auid=%u new auid=%u"
-                               " old ses=%u new ses=%u",
-                               task->pid,
-                               from_kuid(&init_user_ns, task_uid(task)),
-                               from_kuid(&init_user_ns, task->loginuid),
-                               from_kuid(&init_user_ns, loginuid),
-                               task->sessionid, sessionid);
-                       audit_log_end(ab);
-               }
-       }
        task->sessionid = sessionid;
        task->loginuid = loginuid;
-       return 0;
+out:
+       audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+       return rc;
 }
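
The permission checks above are reached from userspace by writing a decimal UID to /proc/self/loginuid, as pam_loginuid does at login. A minimal sketch, assuming an audit-enabled kernel: the second write is expected to fail with EPERM unless the caller holds CAP_AUDIT_CONTROL, and to fail unconditionally if the loginuid-immutable feature is enabled.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/self/loginuid", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Succeeds while the loginuid is still unset (-1). */
        if (pwrite(fd, "1000", 4, 0) != 4)
                perror("first write");
        /* Changing an already-set loginuid needs CAP_AUDIT_CONTROL. */
        if (pwrite(fd, "1001", 4, 0) != 4)
                perror("second write");
        close(fd);
        return 0;
}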
 
 /**
@@ -2126,22 +2135,12 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
        context->ipc.has_perm = 1;
 }
 
-int __audit_bprm(struct linux_binprm *bprm)
+void __audit_bprm(struct linux_binprm *bprm)
 {
-       struct audit_aux_data_execve *ax;
        struct audit_context *context = current->audit_context;
 
-       ax = kmalloc(sizeof(*ax), GFP_KERNEL);
-       if (!ax)
-               return -ENOMEM;
-
-       ax->argc = bprm->argc;
-       ax->envc = bprm->envc;
-       ax->mm = bprm->mm;
-       ax->d.type = AUDIT_EXECVE;
-       ax->d.next = context->aux;
-       context->aux = (void *)ax;
-       return 0;
+       context->type = AUDIT_EXECVE;
+       context->execve.argc = bprm->argc;
 }
 
 
index e0839bc..4c62513 100644 (file)
@@ -895,11 +895,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
        iput(inode);
 }
 
-static int cgroup_delete(const struct dentry *d)
-{
-       return 1;
-}
-
 static void remove_dir(struct dentry *d)
 {
        struct dentry *parent = dget(d->d_parent);
@@ -1486,7 +1481,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
 {
        static const struct dentry_operations cgroup_dops = {
                .d_iput = cgroup_diput,
-               .d_delete = cgroup_delete,
+               .d_delete = always_delete_dentry,
        };
 
        struct inode *inode =
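
cgroup_delete() was an open-coded "always return 1" ->d_delete() operation; it is replaced by the generic VFS helper introduced for such users, whose definition in fs/dcache.c is essentially:

int always_delete_dentry(const struct dentry *dentry)
{
        return 1;       /* never keep unused dentries in the dcache */
}
EXPORT_SYMBOL(always_delete_dentry);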
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
deleted file mode 100644 (file)
index 4a9a86d..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <linux/export.h>
-
-#define GLOBAL(name)   \
-       .globl VMLINUX_SYMBOL(name);    \
-       VMLINUX_SYMBOL(name):
-
-       .section ".init.data","aw"
-
-GLOBAL(modsign_certificate_list)
-       .incbin "signing_key.x509"
-       .incbin "extra_certificates"
-GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
deleted file mode 100644 (file)
index 7cbd450..0000000
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Public keys for module signature verification
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include "module-internal.h"
-
-struct key *modsign_keyring;
-
-extern __initconst const u8 modsign_certificate_list[];
-extern __initconst const u8 modsign_certificate_list_end[];
-
-/*
- * We need to make sure ccache doesn't cache the .o file as it doesn't notice
- * if modsign.pub changes.
- */
-static __initconst const char annoy_ccache[] = __TIME__ "foo";
-
-/*
- * Load the compiled-in keys
- */
-static __init int module_verify_init(void)
-{
-       pr_notice("Initialise module verification\n");
-
-       modsign_keyring = keyring_alloc(".module_sign",
-                                       KUIDT_INIT(0), KGIDT_INIT(0),
-                                       current_cred(),
-                                       ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-                                        KEY_USR_VIEW | KEY_USR_READ),
-                                       KEY_ALLOC_NOT_IN_QUOTA, NULL);
-       if (IS_ERR(modsign_keyring))
-               panic("Can't allocate module signing keyring\n");
-
-       return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(module_verify_init);
-
-/*
- * Load the compiled-in keys
- */
-static __init int load_module_signing_keys(void)
-{
-       key_ref_t key;
-       const u8 *p, *end;
-       size_t plen;
-
-       pr_notice("Loading module verification certificates\n");
-
-       end = modsign_certificate_list_end;
-       p = modsign_certificate_list;
-       while (p < end) {
-               /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
-                * than 256 bytes in size.
-                */
-               if (end - p < 4)
-                       goto dodgy_cert;
-               if (p[0] != 0x30 &&
-                   p[1] != 0x82)
-                       goto dodgy_cert;
-               plen = (p[2] << 8) | p[3];
-               plen += 4;
-               if (plen > end - p)
-                       goto dodgy_cert;
-
-               key = key_create_or_update(make_key_ref(modsign_keyring, 1),
-                                          "asymmetric",
-                                          NULL,
-                                          p,
-                                          plen,
-                                          (KEY_POS_ALL & ~KEY_POS_SETATTR) |
-                                          KEY_USR_VIEW,
-                                          KEY_ALLOC_NOT_IN_QUOTA);
-               if (IS_ERR(key))
-                       pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n",
-                              PTR_ERR(key));
-               else
-                       pr_notice("MODSIGN: Loaded cert '%s'\n",
-                                 key_ref_to_ptr(key)->description);
-               p += plen;
-       }
-
-       return 0;
-
-dodgy_cert:
-       pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n");
-       return 0;
-}
-late_initcall(load_module_signing_keys);
index 24f9247..915e123 100644 (file)
@@ -9,6 +9,4 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
-extern struct key *modsign_keyring;
-
 extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
index f2970bd..be5b8fa 100644 (file)
@@ -14,6 +14,7 @@
 #include <crypto/public_key.h>
 #include <crypto/hash.h>
 #include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
 #include "module-internal.h"
 
 /*
@@ -28,7 +29,7 @@
  */
 struct module_signature {
        u8      algo;           /* Public-key crypto algorithm [enum pkey_algo] */
-       u8      hash;           /* Digest algorithm [enum pkey_hash_algo] */
+       u8      hash;           /* Digest algorithm [enum hash_algo] */
        u8      id_type;        /* Key identifier type [enum pkey_id_type] */
        u8      signer_len;     /* Length of signer's name */
        u8      key_id_len;     /* Length of key identifier */
@@ -39,7 +40,7 @@ struct module_signature {
 /*
  * Digest the module contents.
  */
-static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
+static struct public_key_signature *mod_make_digest(enum hash_algo hash,
                                                    const void *mod,
                                                    unsigned long modlen)
 {
@@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,
        /* Allocate the hashing algorithm we're going to need and find out how
         * big the hash operational data will be.
         */
-       tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0);
+       tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
        if (IS_ERR(tfm))
                return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
 
@@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
 
        pr_debug("Look up: \"%s\"\n", id);
 
-       key = keyring_search(make_key_ref(modsign_keyring, 1),
+       key = keyring_search(make_key_ref(system_trusted_keyring, 1),
                             &key_type_asymmetric, id);
        if (IS_ERR(key))
                pr_warn("Request for unknown module key '%s' err %ld\n",
@@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
                return -ENOPKG;
 
        if (ms.hash >= PKEY_HASH__LAST ||
-           !pkey_hash_algo[ms.hash])
+           !hash_algo_name[ms.hash])
                return -ENOPKG;
 
        key = request_asymmetric_key(sig, ms.signer_len,
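
For orientation: the signature information parsed above sits at the very end of the module image, so a signed module is laid out roughly as

        [ ELF module payload                     ]
        [ signer's name                          ]
        [ key identifier                         ]
        [ signature data                         ]
        [ struct module_signature                ]
        [ "~Module signature appended~\n" magic  ]

with the module loader checking and stripping the magic string, after which mod_verify_sig() parses struct module_signature from the tail and works backwards.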
index 10c22ca..b38109e 100644 (file)
@@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void)
 {
        struct memory_bitmap *bm1, *bm2;
 
-       BUG_ON(!(forbidden_pages_map && free_pages_map));
+       if (WARN_ON(!(forbidden_pages_map && free_pages_map)))
+               return;
 
        bm1 = forbidden_pages_map;
        bm2 = free_pages_map;
index 2485027..98d3575 100644 (file)
@@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
                data->swap = swsusp_resume_device ?
                        swap_type_of(swsusp_resume_device, 0, NULL) : -1;
                data->mode = O_RDONLY;
+               data->free_bitmaps = false;
                error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
                if (error)
                        pm_notifier_call_chain(PM_POST_HIBERNATION);
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
new file mode 100644 (file)
index 0000000..4aef390
--- /dev/null
@@ -0,0 +1,10 @@
+#include <linux/export.h>
+#include <linux/init.h>
+
+       __INITRODATA
+
+       .globl VMLINUX_SYMBOL(system_certificate_list)
+VMLINUX_SYMBOL(system_certificate_list):
+       .incbin "kernel/x509_certificate_list"
+       .globl VMLINUX_SYMBOL(system_certificate_list_end)
+VMLINUX_SYMBOL(system_certificate_list_end):
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
new file mode 100644 (file)
index 0000000..564dd93
--- /dev/null
@@ -0,0 +1,105 @@
+/* System trusted keyring for trusted public keys
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+#include "module-internal.h"
+
+struct key *system_trusted_keyring;
+EXPORT_SYMBOL_GPL(system_trusted_keyring);
+
+extern __initconst const u8 system_certificate_list[];
+extern __initconst const u8 system_certificate_list_end[];
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int system_trusted_keyring_init(void)
+{
+       pr_notice("Initialise system trusted keyring\n");
+
+       system_trusted_keyring =
+               keyring_alloc(".system_keyring",
+                             KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
+                             ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                             KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
+                             KEY_ALLOC_NOT_IN_QUOTA, NULL);
+       if (IS_ERR(system_trusted_keyring))
+               panic("Can't allocate system trusted keyring\n");
+
+       set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
+       return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(system_trusted_keyring_init);
+
+/*
+ * Load the compiled-in list of X.509 certificates.
+ */
+static __init int load_system_certificate_list(void)
+{
+       key_ref_t key;
+       const u8 *p, *end;
+       size_t plen;
+
+       pr_notice("Loading compiled-in X.509 certificates\n");
+
+       end = system_certificate_list_end;
+       p = system_certificate_list;
+       while (p < end) {
+               /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+                * than 256 bytes in size.
+                */
+               if (end - p < 4)
+                       goto dodgy_cert;
+               if (p[0] != 0x30 &&
+                   p[1] != 0x82)
+                       goto dodgy_cert;
+               plen = (p[2] << 8) | p[3];
+               plen += 4;
+               if (plen > end - p)
+                       goto dodgy_cert;
+
+               key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
+                                          "asymmetric",
+                                          NULL,
+                                          p,
+                                          plen,
+                                          ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                                          KEY_USR_VIEW | KEY_USR_READ),
+                                          KEY_ALLOC_NOT_IN_QUOTA |
+                                          KEY_ALLOC_TRUSTED);
+               if (IS_ERR(key)) {
+                       pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
+                              PTR_ERR(key));
+               } else {
+                       pr_notice("Loaded X.509 cert '%s'\n",
+                                 key_ref_to_ptr(key)->description);
+                       key_ref_put(key);
+               }
+               p += plen;
+       }
+
+       return 0;
+
+dodgy_cert:
+       pr_err("Problem parsing in-kernel X.509 certificate list\n");
+       return 0;
+}
+late_initcall(load_system_certificate_list);
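
The length computation in the parser above follows the DER long-form encoding: each certificate starts 0x30 0x82 <len-hi> <len-lo>, so, for example:

        p[0..3] = 30 82 04 c6
        plen    = ((0x04 << 8) | 0xc6) + 4 = 1222 + 4 = 1226 bytes

Advancing p by plen steps over the whole certificate, including its 4-byte header, to the start of the next concatenated certificate.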
index 5bbb919..a3a0dbf 100644 (file)
@@ -51,6 +51,10 @@ struct user_namespace init_user_ns = {
        .owner = GLOBAL_ROOT_UID,
        .group = GLOBAL_ROOT_GID,
        .proc_inum = PROC_USER_INIT_INO,
+#ifdef CONFIG_KEYS_KERBEROS_CACHE
+       .krb_cache_register_sem =
+       __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem),
+#endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
index 13fb113..240fb62 100644 (file)
@@ -101,6 +101,9 @@ int create_user_ns(struct cred *new)
 
        set_cred_user_ns(new, ns);
 
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+       init_rwsem(&ns->persistent_keyring_register_sem);
+#endif
        return 0;
 }
 
@@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns)
 
        do {
                parent = ns->parent;
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+               key_put(ns->persistent_keyring_register);
+#endif
                proc_free_inum(ns->proc_inum);
                kmem_cache_free(user_ns_cachep, ns);
                ns = parent;
index 06dc742..991c98b 100644 (file)
@@ -322,6 +322,20 @@ config TEXTSEARCH_FSM
 config BTREE
        boolean
 
+config ASSOCIATIVE_ARRAY
+       bool
+       help
+         Generic associative array.  Can be searched and iterated over whilst
+         it is being modified.  It is also reasonably quick to search and
+         modify.  The algorithms are non-recursive, and the trees are highly
+         capacious.
+
+         See:
+
+               Documentation/assoc_array.txt
+
+         for more information.
+
 config HAS_IOMEM
        boolean
        depends on !NO_IOMEM
index d480a8c..b46065f 100644 (file)
@@ -47,6 +47,7 @@ CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
+obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 obj-$(CONFIG_DEBUG_LIST) += list_debug.o
 obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
new file mode 100644 (file)
index 0000000..17edeaf
--- /dev/null
@@ -0,0 +1,1746 @@
+/* Generic associative array implementation.
+ *
+ * See Documentation/assoc_array.txt for information.
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+//#define DEBUG
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/assoc_array_priv.h>
+
+/*
+ * Iterate over an associative array.  The caller must hold the RCU read lock
+ * or better.
+ */
+static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root,
+                                      const struct assoc_array_ptr *stop,
+                                      int (*iterator)(const void *leaf,
+                                                      void *iterator_data),
+                                      void *iterator_data)
+{
+       const struct assoc_array_shortcut *shortcut;
+       const struct assoc_array_node *node;
+       const struct assoc_array_ptr *cursor, *ptr, *parent;
+       unsigned long has_meta;
+       int slot, ret;
+
+       cursor = root;
+
+begin_node:
+       if (assoc_array_ptr_is_shortcut(cursor)) {
+               /* Descend through a shortcut */
+               shortcut = assoc_array_ptr_to_shortcut(cursor);
+               smp_read_barrier_depends();
+               cursor = ACCESS_ONCE(shortcut->next_node);
+       }
+
+       node = assoc_array_ptr_to_node(cursor);
+       smp_read_barrier_depends();
+       slot = 0;
+
+       /* We perform two passes of each node.
+        *
+        * The first pass does all the leaves in this node.  This means we
+        * don't miss any leaves if the node is split up by insertion whilst
+        * we're iterating over the branches rooted here (we may, however, see
+        * some leaves twice).
+        */
+       has_meta = 0;
+       for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               ptr = ACCESS_ONCE(node->slots[slot]);
+               has_meta |= (unsigned long)ptr;
+               if (ptr && assoc_array_ptr_is_leaf(ptr)) {
+                       /* We need a barrier between the read of the pointer
+                        * and dereferencing the pointer - but only if we are
+                        * actually going to dereference it.
+                        */
+                       smp_read_barrier_depends();
+
+                       /* Invoke the callback */
+                       ret = iterator(assoc_array_ptr_to_leaf(ptr),
+                                      iterator_data);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       /* The second pass attends to all the metadata pointers.  If we follow
+        * one of these we may find that we don't come back here, but rather go
+        * back to a replacement node with the leaves in a different layout.
+        *
+        * We are guaranteed to make progress, however, as the slot number for
+        * a particular portion of the key space cannot change - and we
+        * continue at the back pointer + 1.
+        */
+       if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE))
+               goto finished_node;
+       slot = 0;
+
+continue_node:
+       node = assoc_array_ptr_to_node(cursor);
+       smp_read_barrier_depends();
+
+       for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               ptr = ACCESS_ONCE(node->slots[slot]);
+               if (assoc_array_ptr_is_meta(ptr)) {
+                       cursor = ptr;
+                       goto begin_node;
+               }
+       }
+
+finished_node:
+       /* Move up to the parent (may need to skip back over a shortcut) */
+       parent = ACCESS_ONCE(node->back_pointer);
+       slot = node->parent_slot;
+       if (parent == stop)
+               return 0;
+
+       if (assoc_array_ptr_is_shortcut(parent)) {
+               shortcut = assoc_array_ptr_to_shortcut(parent);
+               smp_read_barrier_depends();
+               cursor = parent;
+               parent = ACCESS_ONCE(shortcut->back_pointer);
+               slot = shortcut->parent_slot;
+               if (parent == stop)
+                       return 0;
+       }
+
+       /* Ascend to next slot in parent node */
+       cursor = parent;
+       slot++;
+       goto continue_node;
+}
+
+/**
+ * assoc_array_iterate - Pass all objects in the array to a callback
+ * @array: The array to iterate over.
+ * @iterator: The callback function.
+ * @iterator_data: Private data for the callback function.
+ *
+ * Iterate over all the objects in an associative array.  Each one will be
+ * presented to the iterator function.
+ *
+ * If the array is being modified concurrently with the iteration then it is
+ * possible that some objects in the array will be passed to the iterator
+ * callback more than once - though every object should be passed at least
+ * once.  If this is undesirable then the caller must lock against modification
+ * for the duration of this function.
+ *
+ * The function will return 0 if no objects were in the array or else it will
+ * return the result of the last iterator function called.  Iteration stops
+ * immediately if any call to the iteration function results in a non-zero
+ * return.
+ *
+ * The caller should hold the RCU read lock or better if concurrent
+ * modification is possible.
+ */
+int assoc_array_iterate(const struct assoc_array *array,
+                       int (*iterator)(const void *object,
+                                       void *iterator_data),
+                       void *iterator_data)
+{
+       struct assoc_array_ptr *root = ACCESS_ONCE(array->root);
+
+       if (!root)
+               return 0;
+       return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data);
+}
+
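
A minimal usage sketch (hypothetical caller, not part of this file): counting the objects in an array under the RCU read lock, as the kerneldoc above requires.

static int count_object(const void *object, void *iterator_data)
{
        (*(size_t *)iterator_data)++;
        return 0;       /* a non-zero return would stop the iteration */
}

static size_t count_objects(const struct assoc_array *array)
{
        size_t count = 0;

        rcu_read_lock();
        assoc_array_iterate(array, count_object, &count);
        rcu_read_unlock();
        return count;
}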
+enum assoc_array_walk_status {
+       assoc_array_walk_tree_empty,
+       assoc_array_walk_found_terminal_node,
+       assoc_array_walk_found_wrong_shortcut,
+};
+
+struct assoc_array_walk_result {
+       struct {
+               struct assoc_array_node *node;  /* Node in which leaf might be found */
+               int             level;
+               int             slot;
+       } terminal_node;
+       struct {
+               struct assoc_array_shortcut *shortcut;
+               int             level;
+               int             sc_level;
+               unsigned long   sc_segments;
+               unsigned long   dissimilarity;
+       } wrong_shortcut;
+};
+
+/*
+ * Navigate through the internal tree looking for the closest node to the key.
+ */
+static enum assoc_array_walk_status
+assoc_array_walk(const struct assoc_array *array,
+                const struct assoc_array_ops *ops,
+                const void *index_key,
+                struct assoc_array_walk_result *result)
+{
+       struct assoc_array_shortcut *shortcut;
+       struct assoc_array_node *node;
+       struct assoc_array_ptr *cursor, *ptr;
+       unsigned long sc_segments, dissimilarity;
+       unsigned long segments;
+       int level, sc_level, next_sc_level;
+       int slot;
+
+       pr_devel("-->%s()\n", __func__);
+
+       cursor = ACCESS_ONCE(array->root);
+       if (!cursor)
+               return assoc_array_walk_tree_empty;
+
+       level = 0;
+
+       /* Use segments from the key for the new leaf to navigate through the
+        * internal tree, skipping through nodes and shortcuts that are on
+        * route to the destination.  Eventually we'll come to a slot that is
+        * either empty or contains a leaf at which point we've found a node in
+        * which the leaf we're looking for might be found or into which it
+        * should be inserted.
+        */
+jumped:
+       segments = ops->get_key_chunk(index_key, level);
+       pr_devel("segments[%d]: %lx\n", level, segments);
+
+       if (assoc_array_ptr_is_shortcut(cursor))
+               goto follow_shortcut;
+
+consider_node:
+       node = assoc_array_ptr_to_node(cursor);
+       smp_read_barrier_depends();
+
+       slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK);
+       slot &= ASSOC_ARRAY_FAN_MASK;
+       ptr = ACCESS_ONCE(node->slots[slot]);
+
+       pr_devel("consider slot %x [ix=%d type=%lu]\n",
+                slot, level, (unsigned long)ptr & 3);
+
+       if (!assoc_array_ptr_is_meta(ptr)) {
+               /* The node doesn't have a node/shortcut pointer in the slot
+                * corresponding to the index key that we have to follow.
+                */
+               result->terminal_node.node = node;
+               result->terminal_node.level = level;
+               result->terminal_node.slot = slot;
+               pr_devel("<--%s() = terminal_node\n", __func__);
+               return assoc_array_walk_found_terminal_node;
+       }
+
+       if (assoc_array_ptr_is_node(ptr)) {
+               /* There is a pointer to a node in the slot corresponding to
+                * this index key segment, so we need to follow it.
+                */
+               cursor = ptr;
+               level += ASSOC_ARRAY_LEVEL_STEP;
+               if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0)
+                       goto consider_node;
+               goto jumped;
+       }
+
+       /* There is a shortcut in the slot corresponding to the index key
+        * segment.  We follow the shortcut if its partial index key matches
+        * this leaf's.  Otherwise we need to split the shortcut.
+        */
+       cursor = ptr;
+follow_shortcut:
+       shortcut = assoc_array_ptr_to_shortcut(cursor);
+       smp_read_barrier_depends();
+       pr_devel("shortcut to %d\n", shortcut->skip_to_level);
+       sc_level = level + ASSOC_ARRAY_LEVEL_STEP;
+       BUG_ON(sc_level > shortcut->skip_to_level);
+
+       do {
+               /* Check the leaf against the shortcut's index key a word at a
+                * time, trimming the final word (the shortcut stores the index
+                * key completely from the root to the shortcut's target).
+                */
+               if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0)
+                       segments = ops->get_key_chunk(index_key, sc_level);
+
+               sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT];
+               dissimilarity = segments ^ sc_segments;
+
+               if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) {
+                       /* Trim segments that are beyond the shortcut */
+                       int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK;
+                       dissimilarity &= ~(ULONG_MAX << shift);
+                       next_sc_level = shortcut->skip_to_level;
+               } else {
+                       next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE;
+                       next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE);
+               }
+
+               if (dissimilarity != 0) {
+                       /* This shortcut points elsewhere */
+                       result->wrong_shortcut.shortcut = shortcut;
+                       result->wrong_shortcut.level = level;
+                       result->wrong_shortcut.sc_level = sc_level;
+                       result->wrong_shortcut.sc_segments = sc_segments;
+                       result->wrong_shortcut.dissimilarity = dissimilarity;
+                       return assoc_array_walk_found_wrong_shortcut;
+               }
+
+               sc_level = next_sc_level;
+       } while (sc_level < shortcut->skip_to_level);
+
+       /* The shortcut matches the leaf's index to this point. */
+       cursor = ACCESS_ONCE(shortcut->next_node);
+       if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) {
+               level = sc_level;
+               goto jumped;
+       } else {
+               level = sc_level;
+               goto consider_node;
+       }
+}
+
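
To make the slot selection in the walk concrete, assuming the kernel's fan-out of 16 (an ASSOC_ARRAY_LEVEL_STEP of 4 bits): at level 8 with a key chunk of 0xabc,

        slot = (0xabc >> (8 & ASSOC_ARRAY_KEY_CHUNK_MASK)) & ASSOC_ARRAY_FAN_MASK
             = (0xabc >> 8) & 0xf = 0xa

i.e. each step down the tree consumes the next 4-bit nibble of the index key.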
+/**
+ * assoc_array_find - Find an object by index key
+ * @array: The associative array to search.
+ * @ops: The operations to use.
+ * @index_key: The key to the object.
+ *
+ * Find an object in an associative array by walking through the internal tree
+ * to the node that should contain the object and then searching the leaves
+ * there.  NULL is returned if the requested object was not found in the array.
+ *
+ * The caller must hold the RCU read lock or better.
+ */
+void *assoc_array_find(const struct assoc_array *array,
+                      const struct assoc_array_ops *ops,
+                      const void *index_key)
+{
+       struct assoc_array_walk_result result;
+       const struct assoc_array_node *node;
+       const struct assoc_array_ptr *ptr;
+       const void *leaf;
+       int slot;
+
+       if (assoc_array_walk(array, ops, index_key, &result) !=
+           assoc_array_walk_found_terminal_node)
+               return NULL;
+
+       node = result.terminal_node.node;
+       smp_read_barrier_depends();
+
+       /* If the target key is available to us, it has to be pointed to by
+        * the terminal node.
+        */
+       for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               ptr = ACCESS_ONCE(node->slots[slot]);
+               if (ptr && assoc_array_ptr_is_leaf(ptr)) {
+                       /* We need a barrier between the read of the pointer
+                        * and dereferencing the pointer - but only if we are
+                        * actually going to dereference it.
+                        */
+                       leaf = assoc_array_ptr_to_leaf(ptr);
+                       smp_read_barrier_depends();
+                       if (ops->compare_object(leaf, index_key))
+                               return (void *)leaf;
+               }
+       }
+
+       return NULL;
+}
+
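
Both the walk and assoc_array_find() are driven entirely by a caller-supplied assoc_array_ops table. A hypothetical skeleton for objects keyed by a single unsigned long (names and object layout invented for illustration; a real user such as the keyring code supplies richer callbacks):

#include <linux/assoc_array.h>
#include <linux/slab.h>

struct demo_object {
        unsigned long   key;
        /* payload ... */
};

/* The key fits in one chunk, so 'level' can be ignored here. */
static unsigned long demo_get_key_chunk(const void *index_key, int level)
{
        return *(const unsigned long *)index_key;
}

static unsigned long demo_get_object_key_chunk(const void *object, int level)
{
        return ((const struct demo_object *)object)->key;
}

static bool demo_compare_object(const void *object, const void *index_key)
{
        return ((const struct demo_object *)object)->key ==
                *(const unsigned long *)index_key;
}

/* First bit position at which two objects' keys differ, or -1 if equal
 * (the insertion code below calls this on two leaves). */
static int demo_diff_objects(const void *object, const void *object2)
{
        unsigned long bits = ((const struct demo_object *)object)->key ^
                             ((const struct demo_object *)object2)->key;

        return bits ? (int)__ffs(bits) : -1;
}

static void demo_free_object(void *object)
{
        kfree(object);
}

static const struct assoc_array_ops demo_ops = {
        .get_key_chunk          = demo_get_key_chunk,
        .get_object_key_chunk   = demo_get_object_key_chunk,
        .compare_object         = demo_compare_object,
        .diff_objects           = demo_diff_objects,
        .free_object            = demo_free_object,
};

An assoc_array_destroy(&array, &demo_ops) with these ops would then kfree() every remaining leaf.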
+/*
+ * Destructively iterate over an associative array.  The caller must prevent
+ * other simultaneous accesses.
+ */
+static void assoc_array_destroy_subtree(struct assoc_array_ptr *root,
+                                       const struct assoc_array_ops *ops)
+{
+       struct assoc_array_shortcut *shortcut;
+       struct assoc_array_node *node;
+       struct assoc_array_ptr *cursor, *parent = NULL;
+       int slot = -1;
+
+       pr_devel("-->%s()\n", __func__);
+
+       cursor = root;
+       if (!cursor) {
+               pr_devel("empty\n");
+               return;
+       }
+
+move_to_meta:
+       if (assoc_array_ptr_is_shortcut(cursor)) {
+               /* Descend through a shortcut */
+               pr_devel("[%d] shortcut\n", slot);
+               BUG_ON(!assoc_array_ptr_is_shortcut(cursor));
+               shortcut = assoc_array_ptr_to_shortcut(cursor);
+               BUG_ON(shortcut->back_pointer != parent);
+               BUG_ON(slot != -1 && shortcut->parent_slot != slot);
+               parent = cursor;
+               cursor = shortcut->next_node;
+               slot = -1;
+               BUG_ON(!assoc_array_ptr_is_node(cursor));
+       }
+
+       pr_devel("[%d] node\n", slot);
+       node = assoc_array_ptr_to_node(cursor);
+       BUG_ON(node->back_pointer != parent);
+       BUG_ON(slot != -1 && node->parent_slot != slot);
+       slot = 0;
+
+continue_node:
+       pr_devel("Node %p [back=%p]\n", node, node->back_pointer);
+       for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               struct assoc_array_ptr *ptr = node->slots[slot];
+               if (!ptr)
+                       continue;
+               if (assoc_array_ptr_is_meta(ptr)) {
+                       parent = cursor;
+                       cursor = ptr;
+                       goto move_to_meta;
+               }
+
+               if (ops) {
+                       pr_devel("[%d] free leaf\n", slot);
+                       ops->free_object(assoc_array_ptr_to_leaf(ptr));
+               }
+       }
+
+       parent = node->back_pointer;
+       slot = node->parent_slot;
+       pr_devel("free node\n");
+       kfree(node);
+       if (!parent)
+               return; /* Done */
+
+       /* Move back up to the parent (may need to free a shortcut on
+        * the way up) */
+       if (assoc_array_ptr_is_shortcut(parent)) {
+               shortcut = assoc_array_ptr_to_shortcut(parent);
+               BUG_ON(shortcut->next_node != cursor);
+               cursor = parent;
+               parent = shortcut->back_pointer;
+               slot = shortcut->parent_slot;
+               pr_devel("free shortcut\n");
+               kfree(shortcut);
+               if (!parent)
+                       return;
+
+               BUG_ON(!assoc_array_ptr_is_node(parent));
+       }
+
+       /* Ascend to next slot in parent node */
+       pr_devel("ascend to %p[%d]\n", parent, slot);
+       cursor = parent;
+       node = assoc_array_ptr_to_node(cursor);
+       slot++;
+       goto continue_node;
+}
+
+/**
+ * assoc_array_destroy - Destroy an associative array
+ * @array: The array to destroy.
+ * @ops: The operations to use.
+ *
+ * Discard all metadata and free all objects in an associative array.  The
+ * array will be empty and ready to use again upon completion.  This function
+ * cannot fail.
+ *
+ * The caller must prevent all other accesses whilst this takes place as no
+ * attempt is made to adjust pointers gracefully to permit RCU readlock-holding
+ * accesses to continue.  On the other hand, no memory allocation is required.
+ */
+void assoc_array_destroy(struct assoc_array *array,
+                        const struct assoc_array_ops *ops)
+{
+       assoc_array_destroy_subtree(array->root, ops);
+       array->root = NULL;
+}
+
+/*
+ * Handle insertion into an empty tree.
+ */
+static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit)
+{
+       struct assoc_array_node *new_n0;
+
+       pr_devel("-->%s()\n", __func__);
+
+       new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
+       if (!new_n0)
+               return false;
+
+       edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
+       edit->leaf_p = &new_n0->slots[0];
+       edit->adjust_count_on = new_n0;
+       edit->set[0].ptr = &edit->array->root;
+       edit->set[0].to = assoc_array_node_to_ptr(new_n0);
+
+       pr_devel("<--%s() = ok [no root]\n", __func__);
+       return true;
+}
+
+/*
+ * Handle insertion into a terminal node.
+ */
+static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit,
+                                                 const struct assoc_array_ops *ops,
+                                                 const void *index_key,
+                                                 struct assoc_array_walk_result *result)
+{
+       struct assoc_array_shortcut *shortcut, *new_s0;
+       struct assoc_array_node *node, *new_n0, *new_n1, *side;
+       struct assoc_array_ptr *ptr;
+       unsigned long dissimilarity, base_seg, blank;
+       size_t keylen;
+       bool have_meta;
+       int level, diff;
+       int slot, next_slot, free_slot, i, j;
+
+       node    = result->terminal_node.node;
+       level   = result->terminal_node.level;
+       edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot;
+
+       pr_devel("-->%s()\n", __func__);
+
+       /* We arrived at a node which doesn't have an onward node or shortcut
+        * pointer that we have to follow.  This means that (a) the leaf we
+        * want must go here (either by insertion or replacement) or (b) we
+        * need to split this node and insert in one of the fragments.
+        */
+       free_slot = -1;
+
+       /* Firstly, we have to check the leaves in this node to see if there's
+        * a matching one we should replace in place.
+        */
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               ptr = node->slots[i];
+               if (!ptr) {
+                       free_slot = i;
+                       continue;
+               }
+               if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) {
+                       pr_devel("replace in slot %d\n", i);
+                       edit->leaf_p = &node->slots[i];
+                       edit->dead_leaf = node->slots[i];
+                       pr_devel("<--%s() = ok [replace]\n", __func__);
+                       return true;
+               }
+       }
+
+       /* If there is a free slot in this node then we can just insert the
+        * leaf here.
+        */
+       if (free_slot >= 0) {
+               pr_devel("insert in free slot %d\n", free_slot);
+               edit->leaf_p = &node->slots[free_slot];
+               edit->adjust_count_on = node;
+               pr_devel("<--%s() = ok [insert]\n", __func__);
+               return true;
+       }
+
+       /* The node has no spare slots - so we're either going to have to split
+        * it or insert another node before it.
+        *
+        * Whatever, we're going to need at least two new nodes - so allocate
+        * those now.  We may also need a new shortcut, but we deal with that
+        * when we need it.
+        */
+       new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
+       if (!new_n0)
+               return false;
+       edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
+       new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
+       if (!new_n1)
+               return false;
+       edit->new_meta[1] = assoc_array_node_to_ptr(new_n1);
+
+       /* We need to find out how similar the leaves are. */
+       pr_devel("no spare slots\n");
+       have_meta = false;
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               ptr = node->slots[i];
+               if (assoc_array_ptr_is_meta(ptr)) {
+                       edit->segment_cache[i] = 0xff;
+                       have_meta = true;
+                       continue;
+               }
+               base_seg = ops->get_object_key_chunk(
+                       assoc_array_ptr_to_leaf(ptr), level);
+               base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK;
+               edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK;
+       }
+
+       if (have_meta) {
+               pr_devel("have meta\n");
+               goto split_node;
+       }
+
+       /* The node contains only leaves */
+       dissimilarity = 0;
+       base_seg = edit->segment_cache[0];
+       for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++)
+               dissimilarity |= edit->segment_cache[i] ^ base_seg;
+
+       pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity);
+
+       if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) {
+               /* The old leaves all cluster in the same slot.  We will need
+                * to insert a shortcut if the new leaf wants to cluster with them.
+                */
+               if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0)
+                       goto all_leaves_cluster_together;
+
+               /* Otherwise we can just insert a new node ahead of the old
+                * one.
+                */
+               goto present_leaves_cluster_but_not_new_leaf;
+       }
+
+split_node:
+       pr_devel("split node\n");
+
+       /* We need to split the current node; we know that the node doesn't
+        * simply contain a full set of leaves that cluster together (it
+        * contains meta pointers and/or non-clustering leaves).
+        *
+        * We need to expel at least two leaves out of a set consisting of the
+        * leaves in the node and the new leaf.
+        *
+        * We need a new node (n0) to replace the current one and a new node to
+        * take the expelled nodes (n1).
+        */
+       edit->set[0].to = assoc_array_node_to_ptr(new_n0);
+       new_n0->back_pointer = node->back_pointer;
+       new_n0->parent_slot = node->parent_slot;
+       new_n1->back_pointer = assoc_array_node_to_ptr(new_n0);
+       new_n1->parent_slot = -1; /* Need to calculate this */
+
+do_split_node:
+       pr_devel("do_split_node\n");
+
+       new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch;
+       new_n1->nr_leaves_on_branch = 0;
+
+       /* Begin by finding two matching leaves.  There have to be at least two
+        * that match - even if there are meta pointers - because any leaf that
+        * would match a slot with a meta pointer in it must be somewhere
+        * behind that meta pointer and cannot be here.  Further, given N
+        * remaining leaf slots, we now have N+1 leaves to go in them.
+        */
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               slot = edit->segment_cache[i];
+               if (slot != 0xff)
+                       for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++)
+                               if (edit->segment_cache[j] == slot)
+                                       goto found_slot_for_multiple_occupancy;
+       }
+found_slot_for_multiple_occupancy:
+       pr_devel("same slot: %x %x [%02x]\n", i, j, slot);
+       BUG_ON(i >= ASSOC_ARRAY_FAN_OUT);
+       BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1);
+       BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT);
+
+       new_n1->parent_slot = slot;
+
+       /* Metadata pointers cannot change slot */
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++)
+               if (assoc_array_ptr_is_meta(node->slots[i]))
+                       new_n0->slots[i] = node->slots[i];
+               else
+                       new_n0->slots[i] = NULL;
+       BUG_ON(new_n0->slots[slot] != NULL);
+       new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1);
+
+       /* Filter the leaf pointers between the new nodes */
+       free_slot = -1;
+       next_slot = 0;
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               if (assoc_array_ptr_is_meta(node->slots[i]))
+                       continue;
+               if (edit->segment_cache[i] == slot) {
+                       new_n1->slots[next_slot++] = node->slots[i];
+                       new_n1->nr_leaves_on_branch++;
+               } else {
+                       do {
+                               free_slot++;
+                       } while (new_n0->slots[free_slot] != NULL);
+                       new_n0->slots[free_slot] = node->slots[i];
+               }
+       }
+
+       pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot);
+
+       if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) {
+               do {
+                       free_slot++;
+               } while (new_n0->slots[free_slot] != NULL);
+               edit->leaf_p = &new_n0->slots[free_slot];
+               edit->adjust_count_on = new_n0;
+       } else {
+               edit->leaf_p = &new_n1->slots[next_slot++];
+               edit->adjust_count_on = new_n1;
+       }
+
+       BUG_ON(next_slot <= 1);
+
+       edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0);
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               if (edit->segment_cache[i] == 0xff) {
+                       ptr = node->slots[i];
+                       BUG_ON(assoc_array_ptr_is_leaf(ptr));
+                       if (assoc_array_ptr_is_node(ptr)) {
+                               side = assoc_array_ptr_to_node(ptr);
+                               edit->set_backpointers[i] = &side->back_pointer;
+                       } else {
+                               shortcut = assoc_array_ptr_to_shortcut(ptr);
+                               edit->set_backpointers[i] = &shortcut->back_pointer;
+                       }
+               }
+       }
+
+       ptr = node->back_pointer;
+       if (!ptr)
+               edit->set[0].ptr = &edit->array->root;
+       else if (assoc_array_ptr_is_node(ptr))
+               edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot];
+       else
+               edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node;
+       edit->excised_meta[0] = assoc_array_node_to_ptr(node);
+       pr_devel("<--%s() = ok [split node]\n", __func__);
+       return true;
+
+present_leaves_cluster_but_not_new_leaf:
+       /* All the old leaves cluster in the same slot, but the new leaf wants
+        * to go into a different slot, so we create a new node to hold the new
+        * leaf and a pointer to a new node holding all the old leaves.
+        */
+       pr_devel("present leaves cluster but not new leaf\n");
+
+       new_n0->back_pointer = node->back_pointer;
+       new_n0->parent_slot = node->parent_slot;
+       new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch;
+       new_n1->back_pointer = assoc_array_node_to_ptr(new_n0);
+       new_n1->parent_slot = edit->segment_cache[0];
+       new_n1->nr_leaves_on_branch = node->nr_leaves_on_branch;
+       edit->adjust_count_on = new_n0;
+
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++)
+               new_n1->slots[i] = node->slots[i];
+
+       new_n0->slots[edit->segment_cache[0]] = assoc_array_node_to_ptr(new_n1);
+       edit->leaf_p = &new_n0->slots[edit->segment_cache[ASSOC_ARRAY_FAN_OUT]];
+
+       edit->set[0].ptr = &assoc_array_ptr_to_node(node->back_pointer)->slots[node->parent_slot];
+       edit->set[0].to = assoc_array_node_to_ptr(new_n0);
+       edit->excised_meta[0] = assoc_array_node_to_ptr(node);
+       pr_devel("<--%s() = ok [insert node before]\n", __func__);
+       return true;
+
+all_leaves_cluster_together:
+       /* All the leaves, new and old, want to cluster together in this node
+        * in the same slot, so we have to replace this node with a shortcut to
+        * skip over the identical parts of the key and then place a pair of
+        * nodes, one inside the other, at the end of the shortcut and
+        * distribute the keys between them.
+        *
+        * Firstly we need to work out where the leaves start diverging as a
+        * bit position into their keys so that we know how big the shortcut
+        * needs to be.
+        *
+        * We only need to make a single pass of N of the N+1 leaves because if
+        * any keys differ between themselves at bit X then at least one of
+        * them must also differ with the base key at bit X or before.
+        */
+       pr_devel("all leaves cluster together\n");
+       diff = INT_MAX;
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               int x = ops->diff_objects(assoc_array_ptr_to_leaf(edit->leaf),
+                                         assoc_array_ptr_to_leaf(node->slots[i]));
+               if (x < diff) {
+                       BUG_ON(x < 0);
+                       diff = x;
+               }
+       }
+       BUG_ON(diff == INT_MAX);
+       BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP);
+
+       keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE);
+       keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;
+
+       new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) +
+                        keylen * sizeof(unsigned long), GFP_KERNEL);
+       if (!new_s0)
+               return false;
+       edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0);
+
+       edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0);
+       new_s0->back_pointer = node->back_pointer;
+       new_s0->parent_slot = node->parent_slot;
+       new_s0->next_node = assoc_array_node_to_ptr(new_n0);
+       new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0);
+       new_n0->parent_slot = 0;
+       new_n1->back_pointer = assoc_array_node_to_ptr(new_n0);
+       new_n1->parent_slot = -1; /* Need to calculate this */
+
+       new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK;
+       pr_devel("skip_to_level = %d [diff %d]\n", level, diff);
+       BUG_ON(level <= 0);
+
+       for (i = 0; i < keylen; i++)
+               new_s0->index_key[i] =
+                       ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE);
+
+       blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK);
+       pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank);
+       new_s0->index_key[keylen - 1] &= ~blank;
+
+       /* This now reduces to a node splitting exercise for which we'll need
+        * to regenerate the disparity table.
+        */
+       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+               ptr = node->slots[i];
+               base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr),
+                                                    level);
+               base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK;
+               edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK;
+       }
+
+       base_seg = ops->get_key_chunk(index_key, level);
+       base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK;
+       edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK;
+       goto do_split_node;
+}
+
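
A worked example of the shortcut sizing in all_leaves_cluster_together, assuming 64-bit longs and 4-bit levels: if the closest pair of leaves first differs at bit 27, then

        diff          = 27
        skip_to_level = 27 & ~3 = 24
        keylen        = round_up(27, 64) >> 6 = 1 word
        blank         = ULONG_MAX << (24 & 63)

and the masking step clears bits 24..63, so the new shortcut stores a single index-key word holding bits 0..23 of the common prefix.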
+/*
+ * Handle insertion into the middle of a shortcut.
+ */
+static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit,
+                                           const struct assoc_array_ops *ops,
+                                           struct assoc_array_walk_result *result)
+{
+       struct assoc_array_shortcut *shortcut, *new_s0, *new_s1;
+       struct assoc_array_node *node, *new_n0, *side;
+       unsigned long sc_segments, dissimilarity, blank;
+       size_t keylen;
+       int level, sc_level, diff;
+       int sc_slot;
+
+       shortcut        = result->wrong_shortcut.shortcut;
+       level           = result->wrong_shortcut.level;
+       sc_level        = result->wrong_shortcut.sc_level;
+       sc_segments     = result->wrong_shortcut.sc_segments;
+       dissimilarity   = result->wrong_shortcut.dissimilarity;
+
+       pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n",
+                __func__, level, dissimilarity, sc_level);
+
+       /* We need to split a shortcut and insert a node between the two
+        * pieces.  Zero-length pieces will be dispensed with entirely.
+        *
+        * First of all, we need to work out the level at which the first
+        * difference occurs.
+        */
+       diff = __ffs(dissimilarity);
+       diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK;
+       diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK;
+       pr_devel("diff=%d\n", diff);
+
+       if (!shortcut->back_pointer) {
+               edit->set[0].ptr = &edit->array->root;
+       } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) {
+               node = assoc_array_ptr_to_node(shortcut->back_pointer);
+               edit->set[0].ptr = &node->slots[shortcut->parent_slot];
+       } else {
+               BUG();
+       }
+
+       edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut);
+
+       /* Create a new node now since we're going to need it anyway */
+       new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
+       if (!new_n0)
+               return false;
+       edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
+       edit->adjust_count_on = new_n0;
+
+       /* Insert a new shortcut before the new node if this segment isn't of
+        * zero length - otherwise we just connect the new node directly to the
+        * parent.
+        */
+       level += ASSOC_ARRAY_LEVEL_STEP;
+       if (diff > level) {
+               pr_devel("pre-shortcut %d...%d\n", level, diff);
+               keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE);
+               keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;
+
+               new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) +
+                                keylen * sizeof(unsigned long), GFP_KERNEL);
+               if (!new_s0)
+                       return false;
+               edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0);
+               edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0);
+               new_s0->back_pointer = shortcut->back_pointer;
+               new_s0->parent_slot = shortcut->parent_slot;
+               new_s0->next_node = assoc_array_node_to_ptr(new_n0);
+               new_s0->skip_to_level = diff;
+
+               new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0);
+               new_n0->parent_slot = 0;
+
+               memcpy(new_s0->index_key, shortcut->index_key,
+                      keylen * sizeof(unsigned long));
+
+               blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK);
+               pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank);
+               new_s0->index_key[keylen - 1] &= ~blank;
+       } else {
+               pr_devel("no pre-shortcut\n");
+               edit->set[0].to = assoc_array_node_to_ptr(new_n0);
+               new_n0->back_pointer = shortcut->back_pointer;
+               new_n0->parent_slot = shortcut->parent_slot;
+       }
+
+       side = assoc_array_ptr_to_node(shortcut->next_node);
+       new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch;
+
+       /* We need to know which slot in the new node is going to take a
+        * metadata pointer.
+        */
+       sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK);
+       sc_slot &= ASSOC_ARRAY_FAN_MASK;
+
+       pr_devel("new slot %lx >> %d -> %d\n",
+                sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot);
+
+       /* Determine whether we need to follow the new node with a replacement
+        * for the current shortcut.  We could in theory reuse the current
+        * shortcut if its parent slot number doesn't change - but that's a
+        * 1-in-16 chance so not worth expending the code upon.
+        */
+       level = diff + ASSOC_ARRAY_LEVEL_STEP;
+       if (level < shortcut->skip_to_level) {
+               pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level);
+               keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE);
+               keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;
+
+               new_s1 = kzalloc(sizeof(struct assoc_array_shortcut) +
+                                keylen * sizeof(unsigned long), GFP_KERNEL);
+               if (!new_s1)
+                       return false;
+               edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1);
+
+               new_s1->back_pointer = assoc_array_node_to_ptr(new_n0);
+               new_s1->parent_slot = sc_slot;
+               new_s1->next_node = shortcut->next_node;
+               new_s1->skip_to_level = shortcut->skip_to_level;
+
+               new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1);
+
+               memcpy(new_s1->index_key, shortcut->index_key,
+                      keylen * sizeof(unsigned long));
+
+               edit->set[1].ptr = &side->back_pointer;
+               edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1);
+       } else {
+               pr_devel("no post-shortcut\n");
+
+               /* We don't have to replace the pointed-to node as long as we
+                * use memory barriers to make sure the parent slot number is
+                * changed before the back pointer (the parent slot number is
+                * irrelevant to the old parent shortcut).
+                */
+               new_n0->slots[sc_slot] = shortcut->next_node;
+               edit->set_parent_slot[0].p = &side->parent_slot;
+               edit->set_parent_slot[0].to = sc_slot;
+               edit->set[1].ptr = &side->back_pointer;
+               edit->set[1].to = assoc_array_node_to_ptr(new_n0);
+       }
+
+       /* Install the new leaf in a spare slot in the new node. */
+       if (sc_slot == 0)
+               edit->leaf_p = &new_n0->slots[1];
+       else
+               edit->leaf_p = &new_n0->slots[0];
+
+       pr_devel("<--%s() = ok [split shortcut]\n", __func__);
+       return true;
+}
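
A hedged illustration of the sc_slot extraction above, again assuming 64-bit
key chunks and a fan-out of 16 (ASSOC_ARRAY_FAN_MASK == 0xf):

	/* Hypothetical: the keys first differ at bit diff = 40 */
	sc_slot = (sc_segments >> (40 & 63)) & 0xf;
	/* the 4-bit segment at bits 40-43 of the shortcut's key chunk
	 * selects the slot in new_n0 that keeps the old subtree
	 */
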
+
+/**
+ * assoc_array_insert - Script insertion of an object into an associative array
+ * @array: The array to insert into.
+ * @ops: The operations to use.
+ * @index_key: The key to insert at.
+ * @object: The object to insert.
+ *
+ * Precalculate and preallocate a script for the insertion or replacement of an
+ * object in an associative array.  This results in an edit script that can
+ * either be applied or cancelled.
+ *
+ * The function returns a pointer to an edit script or -ENOMEM.
+ *
+ * The caller should lock against other modifications and must continue to hold
+ * the lock until assoc_array_apply_edit() has been called.
+ *
+ * Accesses to the tree may take place concurrently with this function,
+ * provided they hold the RCU read lock.
+ */
+struct assoc_array_edit *assoc_array_insert(struct assoc_array *array,
+                                           const struct assoc_array_ops *ops,
+                                           const void *index_key,
+                                           void *object)
+{
+       struct assoc_array_walk_result result;
+       struct assoc_array_edit *edit;
+
+       pr_devel("-->%s()\n", __func__);
+
+       /* The leaf pointer we're given must not have the bottom bit set as we
+        * use those for type-marking the pointer.  NULL pointers normally
+        * indicate an empty slot and so aren't valid objects either, but we
+        * have to tolerate one here as the object can be replaced later via
+        * assoc_array_insert_set_object().
+        */
+       BUG_ON(assoc_array_ptr_is_meta(object));
+
+       edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
+       if (!edit)
+               return ERR_PTR(-ENOMEM);
+       edit->array = array;
+       edit->ops = ops;
+       edit->leaf = assoc_array_leaf_to_ptr(object);
+       edit->adjust_count_by = 1;
+
+       switch (assoc_array_walk(array, ops, index_key, &result)) {
+       case assoc_array_walk_tree_empty:
+               /* Allocate a root node if there isn't one yet */
+               if (!assoc_array_insert_in_empty_tree(edit))
+                       goto enomem;
+               return edit;
+
+       case assoc_array_walk_found_terminal_node:
+               /* We found a node that doesn't have a node/shortcut pointer in
+                * the slot corresponding to the index key that we have to
+                * follow.
+                */
+               if (!assoc_array_insert_into_terminal_node(edit, ops, index_key,
+                                                          &result))
+                       goto enomem;
+               return edit;
+
+       case assoc_array_walk_found_wrong_shortcut:
+               /* We found a shortcut that didn't match our key in a slot we
+                * needed to follow.
+                */
+               if (!assoc_array_insert_mid_shortcut(edit, ops, &result))
+                       goto enomem;
+               return edit;
+       }
+
+enomem:
+       /* Clean up after an out of memory error */
+       pr_devel("enomem\n");
+       assoc_array_cancel_edit(edit);
+       return ERR_PTR(-ENOMEM);
+}
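
A minimal caller sketch (my_array, my_ops and my_key are illustrative
assumptions; see the Documentation/assoc_array.txt file added elsewhere in
this merge for the full API contract):

	struct assoc_array_edit *edit;

	edit = assoc_array_insert(&my_array, &my_ops, &my_key, object);
	if (IS_ERR(edit))
		return PTR_ERR(edit);	/* -ENOMEM */
	assoc_array_apply_edit(edit);	/* still under the write lock */
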
+
+/**
+ * assoc_array_insert_set_object - Set the new object pointer in an edit script
+ * @edit: The edit script to modify.
+ * @object: The object pointer to set.
+ *
+ * Change the object to be inserted in an edit script.  The old object it
+ * replaces is not freed.  This must be done prior to applying the script.
+ */
+void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object)
+{
+       BUG_ON(!object);
+       edit->leaf = assoc_array_leaf_to_ptr(object);
+}
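
A hedged sketch of swapping the object in between scripting and applying
(old_object and new_object are hypothetical):

	edit = assoc_array_insert(&my_array, &my_ops, &my_key, old_object);
	if (!IS_ERR(edit)) {
		assoc_array_insert_set_object(edit, new_object);
		assoc_array_apply_edit(edit);
	}
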
+
+struct assoc_array_delete_collapse_context {
+       struct assoc_array_node *node;
+       const void              *skip_leaf;
+       int                     slot;
+};
+
+/*
+ * Subtree collapse to node iterator.
+ */
+static int assoc_array_delete_collapse_iterator(const void *leaf,
+                                               void *iterator_data)
+{
+       struct assoc_array_delete_collapse_context *collapse = iterator_data;
+
+       if (leaf == collapse->skip_leaf)
+               return 0;
+
+       BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT);
+
+       collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf);
+       return 0;
+}
+
+/**
+ * assoc_array_delete - Script deletion of an object from an associative array
+ * @array: The array to search.
+ * @ops: The operations to use.
+ * @index_key: The key to the object.
+ *
+ * Precalculate and preallocate a script for the deletion of an object from an
+ * associative array.  This results in an edit script that can either be
+ * applied or cancelled.
+ *
+ * The function returns a pointer to an edit script if the object was found,
+ * NULL if the object was not found, or -ENOMEM on allocation failure.
+ *
+ * The caller should lock against other modifications and must continue to hold
+ * the lock until assoc_array_apply_edit() has been called.
+ *
+ * Accesses to the tree may take place concurrently with this function,
+ * provided they hold the RCU read lock.
+ */
+struct assoc_array_edit *assoc_array_delete(struct assoc_array *array,
+                                           const struct assoc_array_ops *ops,
+                                           const void *index_key)
+{
+       struct assoc_array_delete_collapse_context collapse;
+       struct assoc_array_walk_result result;
+       struct assoc_array_node *node, *new_n0;
+       struct assoc_array_edit *edit;
+       struct assoc_array_ptr *ptr;
+       bool has_meta;
+       int slot, i;
+
+       pr_devel("-->%s()\n", __func__);
+
+       edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
+       if (!edit)
+               return ERR_PTR(-ENOMEM);
+       edit->array = array;
+       edit->ops = ops;
+       edit->adjust_count_by = -1;
+
+       switch (assoc_array_walk(array, ops, index_key, &result)) {
+       case assoc_array_walk_found_terminal_node:
+               /* We found a node that should contain the leaf we've been
+                * asked to remove - *if* it's in the tree.
+                */
+               pr_devel("terminal_node\n");
+               node = result.terminal_node.node;
+
+               for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+                       ptr = node->slots[slot];
+                       if (ptr &&
+                           assoc_array_ptr_is_leaf(ptr) &&
+                           ops->compare_object(assoc_array_ptr_to_leaf(ptr),
+                                               index_key))
+                               goto found_leaf;
+               }
+       case assoc_array_walk_tree_empty:
+       case assoc_array_walk_found_wrong_shortcut:
+       default:
+               assoc_array_cancel_edit(edit);
+               pr_devel("not found\n");
+               return NULL;
+       }
+
+found_leaf:
+       BUG_ON(array->nr_leaves_on_tree <= 0);
+
+       /* In the simplest form of deletion we just clear the slot and release
+        * the leaf after a suitable interval.
+        */
+       edit->dead_leaf = node->slots[slot];
+       edit->set[0].ptr = &node->slots[slot];
+       edit->set[0].to = NULL;
+       edit->adjust_count_on = node;
+
+       /* If that concludes erasure of the last leaf, then delete the entire
+        * internal array.
+        */
+       if (array->nr_leaves_on_tree == 1) {
+               edit->set[1].ptr = &array->root;
+               edit->set[1].to = NULL;
+               edit->adjust_count_on = NULL;
+               edit->excised_subtree = array->root;
+               pr_devel("all gone\n");
+               return edit;
+       }
+
+       /* However, we'd also like to clear up some metadata blocks if we
+        * possibly can.
+        *
+        * We go for a simple algorithm of: if this node has FAN_OUT or fewer
+        * leaves in it, then attempt to collapse it - and attempt to
+        * recursively collapse up the tree.
+        *
+        * We could also try collapsing partially filled subtrees into this
+        * node to use up its free slots.
+        */
+       if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) {
+               struct assoc_array_node *parent, *grandparent;
+               struct assoc_array_ptr *ptr;
+
+               /* First of all, we need to know if this node has metadata so
+                * that we don't try collapsing if all the leaves are already
+                * here.
+                */
+               has_meta = false;
+               for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+                       ptr = node->slots[i];
+                       if (assoc_array_ptr_is_meta(ptr)) {
+                               has_meta = true;
+                               break;
+                       }
+               }
+
+               pr_devel("leaves: %ld [m=%d]\n",
+                        node->nr_leaves_on_branch - 1, has_meta);
+
+               /* Look further up the tree to see if we can collapse this node
+                * into a more proximal node too.
+                */
+               parent = node;
+       collapse_up:
+               pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch);
+
+               ptr = parent->back_pointer;
+               if (!ptr)
+                       goto do_collapse;
+               if (assoc_array_ptr_is_shortcut(ptr)) {
+                       struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr);
+                       ptr = s->back_pointer;
+                       if (!ptr)
+                               goto do_collapse;
+               }
+
+               grandparent = assoc_array_ptr_to_node(ptr);
+               if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) {
+                       parent = grandparent;
+                       goto collapse_up;
+               }
+
+       do_collapse:
+               /* There's no point collapsing if the original node has no meta
+                * pointers to discard and if we didn't merge into one of that
+                * node's ancestors.
+                */
+               if (has_meta || parent != node) {
+                       node = parent;
+
+                       /* Create a new node to collapse into */
+                       new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
+                       if (!new_n0)
+                               goto enomem;
+                       edit->new_meta[0] = assoc_array_node_to_ptr(new_n0);
+
+                       new_n0->back_pointer = node->back_pointer;
+                       new_n0->parent_slot = node->parent_slot;
+                       new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch;
+                       edit->adjust_count_on = new_n0;
+
+                       collapse.node = new_n0;
+                       collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf);
+                       collapse.slot = 0;
+                       assoc_array_subtree_iterate(assoc_array_node_to_ptr(node),
+                                                   node->back_pointer,
+                                                   assoc_array_delete_collapse_iterator,
+                                                   &collapse);
+                       pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch);
+                       BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1);
+
+                       if (!node->back_pointer) {
+                               edit->set[1].ptr = &array->root;
+                       } else if (assoc_array_ptr_is_leaf(node->back_pointer)) {
+                               BUG();
+                       } else if (assoc_array_ptr_is_node(node->back_pointer)) {
+                               struct assoc_array_node *p =
+                                       assoc_array_ptr_to_node(node->back_pointer);
+                               edit->set[1].ptr = &p->slots[node->parent_slot];
+                       } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) {
+                               struct assoc_array_shortcut *s =
+                                       assoc_array_ptr_to_shortcut(node->back_pointer);
+                               edit->set[1].ptr = &s->next_node;
+                       }
+                       edit->set[1].to = assoc_array_node_to_ptr(new_n0);
+                       edit->excised_subtree = assoc_array_node_to_ptr(node);
+               }
+       }
+
+       return edit;
+
+enomem:
+       /* Clean up after an out of memory error */
+       pr_devel("enomem\n");
+       assoc_array_cancel_edit(edit);
+       return ERR_PTR(-ENOMEM);
+}
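
Unlike insertion, deletion has a three-way result; a hedged caller sketch:

	edit = assoc_array_delete(&my_array, &my_ops, &my_key);
	if (IS_ERR(edit))
		return PTR_ERR(edit);	/* -ENOMEM */
	if (!edit)
		return -ENOENT;		/* no matching object */
	assoc_array_apply_edit(edit);	/* object freed after RCU grace */
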
+
+/**
+ * assoc_array_clear - Script deletion of all objects from an associative array
+ * @array: The array to clear.
+ * @ops: The operations to use.
+ *
+ * Precalculate and preallocate a script for the deletion of all the objects
+ * from an associative array.  This results in an edit script that can either
+ * be applied or cancelled.
+ *
+ * The function returns a pointer to an edit script if there are objects to be
+ * deleted, NULL if there are no objects in the array, or -ENOMEM on
+ * allocation failure.
+ *
+ * The caller should lock against other modifications and must continue to hold
+ * the lock until assoc_array_apply_edit() has been called.
+ *
+ * Accesses to the tree may take place concurrently with this function,
+ * provided they hold the RCU read lock.
+ */
+struct assoc_array_edit *assoc_array_clear(struct assoc_array *array,
+                                          const struct assoc_array_ops *ops)
+{
+       struct assoc_array_edit *edit;
+
+       pr_devel("-->%s()\n", __func__);
+
+       if (!array->root)
+               return NULL;
+
+       edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
+       if (!edit)
+               return ERR_PTR(-ENOMEM);
+       edit->array = array;
+       edit->ops = ops;
+       edit->set[1].ptr = &array->root;
+       edit->set[1].to = NULL;
+       edit->excised_subtree = array->root;
+       edit->ops_for_excised_subtree = ops;
+       pr_devel("all gone\n");
+       return edit;
+}
+
+/*
+ * Handle the deferred destruction after an applied edit.
+ */
+static void assoc_array_rcu_cleanup(struct rcu_head *head)
+{
+       struct assoc_array_edit *edit =
+               container_of(head, struct assoc_array_edit, rcu);
+       int i;
+
+       pr_devel("-->%s()\n", __func__);
+
+       if (edit->dead_leaf)
+               edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf));
+       for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++)
+               if (edit->excised_meta[i])
+                       kfree(assoc_array_ptr_to_node(edit->excised_meta[i]));
+
+       if (edit->excised_subtree) {
+               BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree));
+               if (assoc_array_ptr_is_node(edit->excised_subtree)) {
+                       struct assoc_array_node *n =
+                               assoc_array_ptr_to_node(edit->excised_subtree);
+                       n->back_pointer = NULL;
+               } else {
+                       struct assoc_array_shortcut *s =
+                               assoc_array_ptr_to_shortcut(edit->excised_subtree);
+                       s->back_pointer = NULL;
+               }
+               assoc_array_destroy_subtree(edit->excised_subtree,
+                                           edit->ops_for_excised_subtree);
+       }
+
+       kfree(edit);
+}
+
+/**
+ * assoc_array_apply_edit - Apply an edit script to an associative array
+ * @edit: The script to apply.
+ *
+ * Apply an edit script to an associative array to effect an insertion,
+ * deletion or clearance.  As the edit script includes preallocated memory,
+ * this is guaranteed not to fail.
+ *
+ * The edit script, dead objects and dead metadata will be scheduled for
+ * destruction after an RCU grace period to permit those doing read-only
+ * accesses on the array to continue to do so under the RCU read lock whilst
+ * the edit is taking place.
+ */
+void assoc_array_apply_edit(struct assoc_array_edit *edit)
+{
+       struct assoc_array_shortcut *shortcut;
+       struct assoc_array_node *node;
+       struct assoc_array_ptr *ptr;
+       int i;
+
+       pr_devel("-->%s()\n", __func__);
+
+       smp_wmb();
+       if (edit->leaf_p)
+               *edit->leaf_p = edit->leaf;
+
+       smp_wmb();
+       for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++)
+               if (edit->set_parent_slot[i].p)
+                       *edit->set_parent_slot[i].p = edit->set_parent_slot[i].to;
+
+       smp_wmb();
+       for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++)
+               if (edit->set_backpointers[i])
+                       *edit->set_backpointers[i] = edit->set_backpointers_to;
+
+       smp_wmb();
+       for (i = 0; i < ARRAY_SIZE(edit->set); i++)
+               if (edit->set[i].ptr)
+                       *edit->set[i].ptr = edit->set[i].to;
+
+       if (edit->array->root == NULL) {
+               edit->array->nr_leaves_on_tree = 0;
+       } else if (edit->adjust_count_on) {
+               node = edit->adjust_count_on;
+               for (;;) {
+                       node->nr_leaves_on_branch += edit->adjust_count_by;
+
+                       ptr = node->back_pointer;
+                       if (!ptr)
+                               break;
+                       if (assoc_array_ptr_is_shortcut(ptr)) {
+                               shortcut = assoc_array_ptr_to_shortcut(ptr);
+                               ptr = shortcut->back_pointer;
+                               if (!ptr)
+                                       break;
+                       }
+                       BUG_ON(!assoc_array_ptr_is_node(ptr));
+                       node = assoc_array_ptr_to_node(ptr);
+               }
+
+               edit->array->nr_leaves_on_tree += edit->adjust_count_by;
+       }
+
+       call_rcu(&edit->rcu, assoc_array_rcu_cleanup);
+}
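
The smp_wmb() ladder above orders the pointer installations; lookups pair
with it purely under the RCU read lock. A hedged reader sketch, using the
assoc_array_find() helper from earlier in this file (take_ref() is a
hypothetical pinning helper):

	rcu_read_lock();
	obj = assoc_array_find(&my_array, &my_ops, &my_key);
	if (obj)
		take_ref(obj);	/* pin before dropping the read lock */
	rcu_read_unlock();
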
+
+/**
+ * assoc_array_cancel_edit - Discard an edit script.
+ * @edit: The script to discard.
+ *
+ * Free an edit script and all the preallocated data it holds without making
+ * any changes to the associative array it was intended for.
+ *
+ * NOTE!  In the case of an insertion script, this does _not_ release the leaf
+ * that was to be inserted.  That is left to the caller.
+ */
+void assoc_array_cancel_edit(struct assoc_array_edit *edit)
+{
+       struct assoc_array_ptr *ptr;
+       int i;
+
+       pr_devel("-->%s()\n", __func__);
+
+       /* Clean up after an out of memory error */
+       for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) {
+               ptr = edit->new_meta[i];
+               if (ptr) {
+                       if (assoc_array_ptr_is_node(ptr))
+                               kfree(assoc_array_ptr_to_node(ptr));
+                       else
+                               kfree(assoc_array_ptr_to_shortcut(ptr));
+               }
+       }
+       kfree(edit);
+}
+
+/**
+ * assoc_array_gc - Garbage collect an associative array.
+ * @array: The array to clean.
+ * @ops: The operations to use.
+ * @iterator: A callback function to pass judgement on each object.
+ * @iterator_data: Private data for the callback function.
+ *
+ * Collect garbage from an associative array and pack down the internal tree to
+ * save memory.
+ *
+ * The iterator function is asked to pass judgement upon each object in the
+ * array.  If it returns false, the object is discarded; if it returns true,
+ * the object is kept, and the iterator must increment the object's usage
+ * count (or whatever it needs to do to retain it) before returning.
+ *
+ * This function returns 0 if successful or -ENOMEM if out of memory.  In the
+ * latter case, the array is not changed.
+ *
+ * The caller should lock against other modifications and must continue to hold
+ * the lock until assoc_array_apply_edit() has been called.
+ *
+ * Accesses to the tree may take place concurrently with this function,
+ * provided they hold the RCU read lock.
+ */
+int assoc_array_gc(struct assoc_array *array,
+                  const struct assoc_array_ops *ops,
+                  bool (*iterator)(void *object, void *iterator_data),
+                  void *iterator_data)
+{
+       struct assoc_array_shortcut *shortcut, *new_s;
+       struct assoc_array_node *node, *new_n;
+       struct assoc_array_edit *edit;
+       struct assoc_array_ptr *cursor, *ptr;
+       struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp;
+       unsigned long nr_leaves_on_tree;
+       int keylen, slot, nr_free, next_slot, i;
+
+       pr_devel("-->%s()\n", __func__);
+
+       if (!array->root)
+               return 0;
+
+       edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL);
+       if (!edit)
+               return -ENOMEM;
+       edit->array = array;
+       edit->ops = ops;
+       edit->ops_for_excised_subtree = ops;
+       edit->set[0].ptr = &array->root;
+       edit->excised_subtree = array->root;
+
+       new_root = new_parent = NULL;
+       new_ptr_pp = &new_root;
+       cursor = array->root;
+
+descend:
+       /* If this point is a shortcut, then we need to duplicate it and
+        * advance the target cursor.
+        */
+       if (assoc_array_ptr_is_shortcut(cursor)) {
+               shortcut = assoc_array_ptr_to_shortcut(cursor);
+               keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE);
+               keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT;
+               new_s = kmalloc(sizeof(struct assoc_array_shortcut) +
+                               keylen * sizeof(unsigned long), GFP_KERNEL);
+               if (!new_s)
+                       goto enomem;
+               pr_devel("dup shortcut %p -> %p\n", shortcut, new_s);
+               memcpy(new_s, shortcut, (sizeof(struct assoc_array_shortcut) +
+                                        keylen * sizeof(unsigned long)));
+               new_s->back_pointer = new_parent;
+               new_s->parent_slot = shortcut->parent_slot;
+               *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s);
+               new_ptr_pp = &new_s->next_node;
+               cursor = shortcut->next_node;
+       }
+
+       /* Duplicate the node at this position */
+       node = assoc_array_ptr_to_node(cursor);
+       new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL);
+       if (!new_n)
+               goto enomem;
+       pr_devel("dup node %p -> %p\n", node, new_n);
+       new_n->back_pointer = new_parent;
+       new_n->parent_slot = node->parent_slot;
+       *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n);
+       new_ptr_pp = NULL;
+       slot = 0;
+
+continue_node:
+       /* Filter across any leaves and gc any subtrees */
+       for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               ptr = node->slots[slot];
+               if (!ptr)
+                       continue;
+
+               if (assoc_array_ptr_is_leaf(ptr)) {
+                       if (iterator(assoc_array_ptr_to_leaf(ptr),
+                                    iterator_data))
+                               /* The iterator will have done any reference
+                                * counting on the object for us.
+                                */
+                               new_n->slots[slot] = ptr;
+                       continue;
+               }
+
+               new_ptr_pp = &new_n->slots[slot];
+               cursor = ptr;
+               goto descend;
+       }
+
+       pr_devel("-- compress node %p --\n", new_n);
+
+       /* Count up the number of empty slots in this node and work out the
+        * subtree leaf count.
+        */
+       new_n->nr_leaves_on_branch = 0;
+       nr_free = 0;
+       for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               ptr = new_n->slots[slot];
+               if (!ptr)
+                       nr_free++;
+               else if (assoc_array_ptr_is_leaf(ptr))
+                       new_n->nr_leaves_on_branch++;
+       }
+       pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch);
+
+       /* See what we can fold in */
+       next_slot = 0;
+       for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               struct assoc_array_shortcut *s;
+               struct assoc_array_node *child;
+
+               ptr = new_n->slots[slot];
+               if (!ptr || assoc_array_ptr_is_leaf(ptr))
+                       continue;
+
+               s = NULL;
+               if (assoc_array_ptr_is_shortcut(ptr)) {
+                       s = assoc_array_ptr_to_shortcut(ptr);
+                       ptr = s->next_node;
+               }
+
+               child = assoc_array_ptr_to_node(ptr);
+               new_n->nr_leaves_on_branch += child->nr_leaves_on_branch;
+
+               if (child->nr_leaves_on_branch <= nr_free + 1) {
+                       /* Fold the child node into this one */
+                       pr_devel("[%d] fold node %lu/%d [nx %d]\n",
+                                slot, child->nr_leaves_on_branch, nr_free + 1,
+                                next_slot);
+
+                       /* We would already have reaped an intervening shortcut
+                        * on the way back up the tree.
+                        */
+                       BUG_ON(s);
+
+                       new_n->slots[slot] = NULL;
+                       nr_free++;
+                       if (slot < next_slot)
+                               next_slot = slot;
+                       for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) {
+                               struct assoc_array_ptr *p = child->slots[i];
+                               if (!p)
+                                       continue;
+                               BUG_ON(assoc_array_ptr_is_meta(p));
+                               while (new_n->slots[next_slot])
+                                       next_slot++;
+                               BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT);
+                               new_n->slots[next_slot++] = p;
+                               nr_free--;
+                       }
+                       kfree(child);
+               } else {
+                       pr_devel("[%d] retain node %lu/%d [nx %d]\n",
+                                slot, child->nr_leaves_on_branch, nr_free + 1,
+                                next_slot);
+               }
+       }
+
+       pr_devel("after: %lu\n", new_n->nr_leaves_on_branch);
+
+       nr_leaves_on_tree = new_n->nr_leaves_on_branch;
+
+       /* Excise this node if it is singly occupied by a shortcut */
+       if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) {
+               for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++)
+                       if ((ptr = new_n->slots[slot]))
+                               break;
+
+               if (assoc_array_ptr_is_meta(ptr) &&
+                   assoc_array_ptr_is_shortcut(ptr)) {
+                       pr_devel("excise node %p with 1 shortcut\n", new_n);
+                       new_s = assoc_array_ptr_to_shortcut(ptr);
+                       new_parent = new_n->back_pointer;
+                       slot = new_n->parent_slot;
+                       kfree(new_n);
+                       if (!new_parent) {
+                               new_s->back_pointer = NULL;
+                               new_s->parent_slot = 0;
+                               new_root = ptr;
+                               goto gc_complete;
+                       }
+
+                       if (assoc_array_ptr_is_shortcut(new_parent)) {
+                               /* We can discard any preceding shortcut also */
+                               struct assoc_array_shortcut *s =
+                                       assoc_array_ptr_to_shortcut(new_parent);
+
+                               pr_devel("excise preceding shortcut\n");
+
+                               new_parent = new_s->back_pointer = s->back_pointer;
+                               slot = new_s->parent_slot = s->parent_slot;
+                               kfree(s);
+                               if (!new_parent) {
+                                       new_s->back_pointer = NULL;
+                                       new_s->parent_slot = 0;
+                                       new_root = ptr;
+                                       goto gc_complete;
+                               }
+                       }
+
+                       new_s->back_pointer = new_parent;
+                       new_s->parent_slot = slot;
+                       new_n = assoc_array_ptr_to_node(new_parent);
+                       new_n->slots[slot] = ptr;
+                       goto ascend_old_tree;
+               }
+       }
+
+       /* Excise any shortcuts we might encounter that point to nodes that
+        * only contain leaves.
+        */
+       ptr = new_n->back_pointer;
+       if (!ptr)
+               goto gc_complete;
+
+       if (assoc_array_ptr_is_shortcut(ptr)) {
+               new_s = assoc_array_ptr_to_shortcut(ptr);
+               new_parent = new_s->back_pointer;
+               slot = new_s->parent_slot;
+
+               if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) {
+                       struct assoc_array_node *n;
+
+                       pr_devel("excise shortcut\n");
+                       new_n->back_pointer = new_parent;
+                       new_n->parent_slot = slot;
+                       kfree(new_s);
+                       if (!new_parent) {
+                               new_root = assoc_array_node_to_ptr(new_n);
+                               goto gc_complete;
+                       }
+
+                       n = assoc_array_ptr_to_node(new_parent);
+                       n->slots[slot] = assoc_array_node_to_ptr(new_n);
+               }
+       } else {
+               new_parent = ptr;
+       }
+       new_n = assoc_array_ptr_to_node(new_parent);
+
+ascend_old_tree:
+       ptr = node->back_pointer;
+       if (assoc_array_ptr_is_shortcut(ptr)) {
+               shortcut = assoc_array_ptr_to_shortcut(ptr);
+               slot = shortcut->parent_slot;
+               cursor = shortcut->back_pointer;
+       } else {
+               slot = node->parent_slot;
+               cursor = ptr;
+       }
+       BUG_ON(!ptr);
+       node = assoc_array_ptr_to_node(cursor);
+       slot++;
+       goto continue_node;
+
+gc_complete:
+       edit->set[0].to = new_root;
+       assoc_array_apply_edit(edit);
+       edit->array->nr_leaves_on_tree = nr_leaves_on_tree;
+       return 0;
+
+enomem:
+       pr_devel("enomem\n");
+       assoc_array_destroy_subtree(new_root, edit->ops);
+       kfree(edit);
+       return -ENOMEM;
+}
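
A hedged sketch of driving the garbage collector (object_is_live() and
take_ref() are assumptions standing in for subsystem-specific logic):

	static bool keep_object(void *object, void *iterator_data)
	{
		if (!object_is_live(object))
			return false;	/* discard */
		take_ref(object);	/* keep: pin the object */
		return true;
	}

	/* under the modification lock: */
	if (assoc_array_gc(&my_array, &my_ops, keep_object, NULL) < 0)
		goto fail;	/* -ENOMEM: the array is unchanged */
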
index 657979f..bf076d2 100644
@@ -121,3 +121,6 @@ void mpi_free(MPI a)
        kfree(a);
 }
 EXPORT_SYMBOL_GPL(mpi_free);
+
+MODULE_DESCRIPTION("Multiprecision maths library");
+MODULE_LICENSE("GPL");
index 7d57af2..dee6cf4 100644
@@ -476,40 +476,6 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
        return 0;
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src)
-{
-       int i;
-       struct hstate *h = page_hstate(src);
-       struct page *dst_base = dst;
-       struct page *src_base = src;
-
-       for (i = 0; i < pages_per_huge_page(h); ) {
-               cond_resched();
-               copy_highpage(dst, src);
-
-               i++;
-               dst = mem_map_next(dst, dst_base, i);
-               src = mem_map_next(src, src_base, i);
-       }
-}
-
-void copy_huge_page(struct page *dst, struct page *src)
-{
-       int i;
-       struct hstate *h = page_hstate(src);
-
-       if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-               copy_gigantic_page(dst, src);
-               return;
-       }
-
-       might_sleep();
-       for (i = 0; i < pages_per_huge_page(h); i++) {
-               cond_resched();
-               copy_highpage(dst + i, src + i);
-       }
-}
-
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
        int nid = page_to_nid(page);
@@ -736,6 +702,23 @@ int PageHuge(struct page *page)
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
+/*
+ * PageHeadHuge() only returns true for a hugetlbfs head page, not for
+ * normal pages or transparent huge pages.
+ */
+int PageHeadHuge(struct page *page_head)
+{
+       compound_page_dtor *dtor;
+
+       if (!PageHead(page_head))
+               return 0;
+
+       dtor = get_compound_page_dtor(page_head);
+
+       return dtor == free_huge_page;
+}
+EXPORT_SYMBOL_GPL(PageHeadHuge);
+
 pgoff_t __basepage_index(struct page *page)
 {
        struct page *page_head = compound_head(page);
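
A hedged sketch of the distinction the new helper draws, assuming page is a
compound head page (the handle_*() helpers are hypothetical):

	if (PageHeadHuge(page))		/* hugetlbfs head page only */
		handle_hugetlbfs(page);
	else if (PageTransHuge(page))	/* transparent huge page head */
		handle_thp(page);
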
index 0409e8f..5d9025f 100644
@@ -4272,13 +4272,6 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 #if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
-static struct kmem_cache *page_ptl_cachep;
-void __init ptlock_cache_init(void)
-{
-       page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
-                       SLAB_PANIC, NULL);
-}
-
 bool ptlock_alloc(struct page *page)
 {
        spinlock_t *ptl;
index c4403cd..eca4a31 100644
@@ -2950,7 +2950,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
                return;
        }
 
-       p += snprintf(p, maxlen, policy_modes[mode]);
+       p += snprintf(p, maxlen, "%s", policy_modes[mode]);
 
        if (flags & MPOL_MODE_FLAGS) {
                p += snprintf(p, buffer + maxlen - p, "=");
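
The mpol_to_str() change above is standard format-string hygiene: data passed
as the format argument would have any '%' in it parsed as a conversion. The
policy mode names are fixed strings today, so this is defensive; a hedged
illustration:

	char buf[32];
	const char *name = "inter%sleave";	/* imagine a stray '%' */

	snprintf(buf, sizeof(buf), name);	/* wrong: parsed as a format */
	snprintf(buf, sizeof(buf), "%s", name);	/* right: copied as data */
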
index 316e720..bb94004 100644
@@ -442,6 +442,54 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 }
 
 /*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page.  We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+                               int nr_pages)
+{
+       int i;
+       struct page *dst_base = dst;
+       struct page *src_base = src;
+
+       for (i = 0; i < nr_pages; ) {
+               cond_resched();
+               copy_highpage(dst, src);
+
+               i++;
+               dst = mem_map_next(dst, dst_base, i);
+               src = mem_map_next(src, src_base, i);
+       }
+}
+
+static void copy_huge_page(struct page *dst, struct page *src)
+{
+       int i;
+       int nr_pages;
+
+       if (PageHuge(src)) {
+               /* hugetlbfs page */
+               struct hstate *h = page_hstate(src);
+               nr_pages = pages_per_huge_page(h);
+
+               if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+                       __copy_gigantic_page(dst, src, nr_pages);
+                       return;
+               }
+       } else {
+               /* thp page */
+               BUG_ON(!PageTransHuge(src));
+               nr_pages = hpage_nr_pages(src);
+       }
+
+       for (i = 0; i < nr_pages; i++) {
+               cond_resched();
+               copy_highpage(dst + i, src + i);
+       }
+}
+
+/*
  * Copy the page to its new location
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
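
Hedged arithmetic behind the MAX_ORDER_NR_PAGES test above, assuming common
x86-64 defaults (MAX_ORDER = 11, 4 KiB base pages):

	/* MAX_ORDER_NR_PAGES = 1 << (11 - 1) = 1024 pages (4 MiB)       */
	/* 2 MiB hugepage  ->    512 struct pages: plain dst + i works   */
	/* 1 GiB gigantic  -> 262144 struct pages: may cross mem_map     */
	/* sections, so step with mem_map_next() instead of dst + i      */
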
index 0c8967b..eb043bf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
 static bool pfmemalloc_active __read_mostly;
 
 /*
- * kmem_bufctl_t:
- *
- * Bufctl's are used for linking objs within a slab
- * linked offsets.
- *
- * This implementation relies on "struct page" for locating the cache &
- * slab an object belongs to.
- * This allows the bufctl structure to be small (one int), but limits
- * the number of objects a slab (not a cache) can contain when off-slab
- * bufctls are used. The limit is the size of the largest general cache
- * that does not use off-slab slabs.
- * For 32bit archs with 4 kB pages, is this 56.
- * This is not serious, as it is only for large objects, when it is unwise
- * to have too many per slab.
- * Note: This limit can be raised by introducing a general cache whose size
- * is less than 512 (PAGE_SIZE<<3), but greater than 256.
- */
-
-typedef unsigned int kmem_bufctl_t;
-#define BUFCTL_END     (((kmem_bufctl_t)(~0U))-0)
-#define BUFCTL_FREE    (((kmem_bufctl_t)(~0U))-1)
-#define        BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
-#define        SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
-
-/*
- * struct slab_rcu
- *
- * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
- * arrange for kmem_freepages to be called via RCU.  This is useful if
- * we need to approach a kernel structure obliquely, from its address
- * obtained without the usual locking.  We can lock the structure to
- * stabilize it and check it's still at the given address, only if we
- * can be sure that the memory has not been meanwhile reused for some
- * other kind of object (which our subsystem's lock might corrupt).
- *
- * rcu_read_lock before reading the address, then rcu_read_unlock after
- * taking the spinlock within the structure expected at that address.
- */
-struct slab_rcu {
-       struct rcu_head head;
-       struct kmem_cache *cachep;
-       void *addr;
-};
-
-/*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
-       union {
-               struct {
-                       struct list_head list;
-                       unsigned long colouroff;
-                       void *s_mem;            /* including colour offset */
-                       unsigned int inuse;     /* num of objs active in slab */
-                       kmem_bufctl_t free;
-                       unsigned short nodeid;
-               };
-               struct slab_rcu __slab_cover_slab_rcu;
-       };
-};
-
-/*
  * struct array_cache
  *
  * Purpose:
@@ -456,18 +390,10 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
        return page->slab_cache;
 }
 
-static inline struct slab *virt_to_slab(const void *obj)
-{
-       struct page *page = virt_to_head_page(obj);
-
-       VM_BUG_ON(!PageSlab(page));
-       return page->slab_page;
-}
-
-static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
+static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
                                 unsigned int idx)
 {
-       return slab->s_mem + cache->size * idx;
+       return page->s_mem + cache->size * idx;
 }
 
 /*
@@ -477,9 +403,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
  */
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-                                       const struct slab *slab, void *obj)
+                                       const struct page *page, void *obj)
 {
-       u32 offset = (obj - slab->s_mem);
+       u32 offset = (obj - page->s_mem);
        return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
@@ -641,7 +567,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 
 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 {
-       return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
+       return ALIGN(nr_objs * sizeof(unsigned int), align);
 }
 
 /*
@@ -660,8 +586,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
         * on it. For the latter case, the memory allocated for a
         * slab is used for:
         *
-        * - The struct slab
-        * - One kmem_bufctl_t for each object
+        * - One unsigned int for each object
         * - Padding to respect alignment of @align
         * - @buffer_size bytes for each object
         *
@@ -674,8 +599,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                mgmt_size = 0;
                nr_objs = slab_size / buffer_size;
 
-               if (nr_objs > SLAB_LIMIT)
-                       nr_objs = SLAB_LIMIT;
        } else {
                /*
                 * Ignore padding for the initial guess. The padding
@@ -685,8 +608,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                 * into the memory allocation when taking the padding
                 * into account.
                 */
-               nr_objs = (slab_size - sizeof(struct slab)) /
-                         (buffer_size + sizeof(kmem_bufctl_t));
+               nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
 
                /*
                 * This calculated number will be either the right
@@ -696,9 +618,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                       > slab_size)
                        nr_objs--;
 
-               if (nr_objs > SLAB_LIMIT)
-                       nr_objs = SLAB_LIMIT;
-
                mgmt_size = slab_mgmt_size(nr_objs, align);
        }
        *num = nr_objs;
@@ -829,10 +748,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
        return nc;
 }
 
-static inline bool is_slab_pfmemalloc(struct slab *slabp)
+static inline bool is_slab_pfmemalloc(struct page *page)
 {
-       struct page *page = virt_to_page(slabp->s_mem);
-
        return PageSlabPfmemalloc(page);
 }
 
@@ -841,23 +758,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
                                                struct array_cache *ac)
 {
        struct kmem_cache_node *n = cachep->node[numa_mem_id()];
-       struct slab *slabp;
+       struct page *page;
        unsigned long flags;
 
        if (!pfmemalloc_active)
                return;
 
        spin_lock_irqsave(&n->list_lock, flags);
-       list_for_each_entry(slabp, &n->slabs_full, list)
-               if (is_slab_pfmemalloc(slabp))
+       list_for_each_entry(page, &n->slabs_full, lru)
+               if (is_slab_pfmemalloc(page))
                        goto out;
 
-       list_for_each_entry(slabp, &n->slabs_partial, list)
-               if (is_slab_pfmemalloc(slabp))
+       list_for_each_entry(page, &n->slabs_partial, lru)
+               if (is_slab_pfmemalloc(page))
                        goto out;
 
-       list_for_each_entry(slabp, &n->slabs_free, list)
-               if (is_slab_pfmemalloc(slabp))
+       list_for_each_entry(page, &n->slabs_free, lru)
+               if (is_slab_pfmemalloc(page))
                        goto out;
 
        pfmemalloc_active = false;
@@ -897,8 +814,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
                 */
                n = cachep->node[numa_mem_id()];
                if (!list_empty(&n->slabs_free) && force_refill) {
-                       struct slab *slabp = virt_to_slab(objp);
-                       ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
+                       struct page *page = virt_to_head_page(objp);
+                       ClearPageSlabPfmemalloc(page);
                        clear_obj_pfmemalloc(&objp);
                        recheck_pfmemalloc_active(cachep, ac);
                        return objp;
@@ -1099,8 +1016,7 @@ static void drain_alien_cache(struct kmem_cache *cachep,
 
 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 {
-       struct slab *slabp = virt_to_slab(objp);
-       int nodeid = slabp->nodeid;
+       int nodeid = page_to_nid(virt_to_page(objp));
        struct kmem_cache_node *n;
        struct array_cache *alien = NULL;
        int node;
@@ -1111,7 +1027,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
         * Make sure we are not freeing a object from another node to the array
         * cache on this cpu.
         */
-       if (likely(slabp->nodeid == node))
+       if (likely(nodeid == node))
                return 0;
 
        n = cachep->node[node];
@@ -1512,6 +1428,8 @@ void __init kmem_cache_init(void)
 {
        int i;
 
+       BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
+                                       sizeof(struct rcu_head));
        kmem_cache = &kmem_cache_boot;
        setup_node_pointer(kmem_cache);
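
The BUILD_BUG_ON above guards the rcu_head-over-lru overlay used later in
slab_destroy(); hedged sizes for a typical 64-bit build:

	/* sizeof(struct list_head) = 2 pointers = 16 bytes (page->lru)  */
	/* sizeof(struct rcu_head)  = 2 pointers = 16 bytes              */
	/* so the rcu_head fits in lru space the slab page no longer uses */
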
 
@@ -1687,7 +1605,7 @@ static noinline void
 slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
 {
        struct kmem_cache_node *n;
-       struct slab *slabp;
+       struct page *page;
        unsigned long flags;
        int node;
 
@@ -1706,15 +1624,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
                        continue;
 
                spin_lock_irqsave(&n->list_lock, flags);
-               list_for_each_entry(slabp, &n->slabs_full, list) {
+               list_for_each_entry(page, &n->slabs_full, lru) {
                        active_objs += cachep->num;
                        active_slabs++;
                }
-               list_for_each_entry(slabp, &n->slabs_partial, list) {
-                       active_objs += slabp->inuse;
+               list_for_each_entry(page, &n->slabs_partial, lru) {
+                       active_objs += page->active;
                        active_slabs++;
                }
-               list_for_each_entry(slabp, &n->slabs_free, list)
+               list_for_each_entry(page, &n->slabs_free, lru)
                        num_slabs++;
 
                free_objects += n->free_objects;
@@ -1736,19 +1654,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
+                                                               int nodeid)
 {
        struct page *page;
        int nr_pages;
-       int i;
-
-#ifndef CONFIG_MMU
-       /*
-        * Nommu uses slab's for process anonymous memory allocations, and thus
-        * requires __GFP_COMP to properly refcount higher order allocations
-        */
-       flags |= __GFP_COMP;
-#endif
 
        flags |= cachep->allocflags;
        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1772,12 +1682,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
        else
                add_zone_page_state(page_zone(page),
                        NR_SLAB_UNRECLAIMABLE, nr_pages);
-       for (i = 0; i < nr_pages; i++) {
-               __SetPageSlab(page + i);
-
-               if (page->pfmemalloc)
-                       SetPageSlabPfmemalloc(page + i);
-       }
+       __SetPageSlab(page);
+       if (page->pfmemalloc)
+               SetPageSlabPfmemalloc(page);
        memcg_bind_pages(cachep, cachep->gfporder);
 
        if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1789,17 +1696,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
                        kmemcheck_mark_unallocated_pages(page, nr_pages);
        }
 
-       return page_address(page);
+       return page;
 }
 
 /*
  * Interface to system's page release.
  */
-static void kmem_freepages(struct kmem_cache *cachep, void *addr)
+static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
 {
-       unsigned long i = (1 << cachep->gfporder);
-       struct page *page = virt_to_page(addr);
-       const unsigned long nr_freed = i;
+       const unsigned long nr_freed = (1 << cachep->gfporder);
 
        kmemcheck_free_shadow(page, cachep->gfporder);
 
@@ -1809,27 +1714,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
        else
                sub_zone_page_state(page_zone(page),
                                NR_SLAB_UNRECLAIMABLE, nr_freed);
-       while (i--) {
-               BUG_ON(!PageSlab(page));
-               __ClearPageSlabPfmemalloc(page);
-               __ClearPageSlab(page);
-               page++;
-       }
+
+       BUG_ON(!PageSlab(page));
+       __ClearPageSlabPfmemalloc(page);
+       __ClearPageSlab(page);
+       page_mapcount_reset(page);
+       page->mapping = NULL;
 
        memcg_release_pages(cachep, cachep->gfporder);
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += nr_freed;
-       free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
+       __free_memcg_kmem_pages(page, cachep->gfporder);
 }
 
 static void kmem_rcu_free(struct rcu_head *head)
 {
-       struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
-       struct kmem_cache *cachep = slab_rcu->cachep;
+       struct kmem_cache *cachep;
+       struct page *page;
 
-       kmem_freepages(cachep, slab_rcu->addr);
-       if (OFF_SLAB(cachep))
-               kmem_cache_free(cachep->slabp_cache, slab_rcu);
+       page = container_of(head, struct page, rcu_head);
+       cachep = page->slab_cache;
+
+       kmem_freepages(cachep, page);
 }
 
 #if DEBUG
@@ -1978,19 +1884,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
                /* Print some data about the neighboring objects, if they
                 * exist:
                 */
-               struct slab *slabp = virt_to_slab(objp);
+               struct page *page = virt_to_head_page(objp);
                unsigned int objnr;
 
-               objnr = obj_to_index(cachep, slabp, objp);
+               objnr = obj_to_index(cachep, page, objp);
                if (objnr) {
-                       objp = index_to_obj(cachep, slabp, objnr - 1);
+                       objp = index_to_obj(cachep, page, objnr - 1);
                        realobj = (char *)objp + obj_offset(cachep);
                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
                               realobj, size);
                        print_objinfo(cachep, objp, 2);
                }
                if (objnr + 1 < cachep->num) {
-                       objp = index_to_obj(cachep, slabp, objnr + 1);
+                       objp = index_to_obj(cachep, page, objnr + 1);
                        realobj = (char *)objp + obj_offset(cachep);
                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
                               realobj, size);
@@ -2001,11 +1907,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 #endif
 
 #if DEBUG
-static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep,
+                                               struct page *page)
 {
        int i;
        for (i = 0; i < cachep->num; i++) {
-               void *objp = index_to_obj(cachep, slabp, i);
+               void *objp = index_to_obj(cachep, page, i);
 
                if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -2030,7 +1937,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
        }
 }
 #else
-static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep,
+                                               struct page *page)
 {
 }
 #endif
@@ -2044,23 +1952,34 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
  * Before calling the slab must have been unlinked from the cache.  The
  * cache-lock is not held/needed.
  */
-static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy(struct kmem_cache *cachep, struct page *page)
 {
-       void *addr = slabp->s_mem - slabp->colouroff;
+       void *freelist;
 
-       slab_destroy_debugcheck(cachep, slabp);
+       freelist = page->freelist;
+       slab_destroy_debugcheck(cachep, page);
        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
-               struct slab_rcu *slab_rcu;
+               struct rcu_head *head;
+
+               /*
+                * RCU free overloads the RCU head over the LRU.
+                * slab_page has been overloaded over the LRU,
+                * but it is no longer used from this point on,
+                * so we can reuse the field safely.
+                */
+               head = (void *)&page->rcu_head;
+               call_rcu(head, kmem_rcu_free);
 
-               slab_rcu = (struct slab_rcu *)slabp;
-               slab_rcu->cachep = cachep;
-               slab_rcu->addr = addr;
-               call_rcu(&slab_rcu->head, kmem_rcu_free);
        } else {
-               kmem_freepages(cachep, addr);
-               if (OFF_SLAB(cachep))
-                       kmem_cache_free(cachep->slabp_cache, slabp);
+               kmem_freepages(cachep, page);
        }
+
+       /*
+        * From now on the freelist is no longer referenced,
+        * even though the page itself may only be freed
+        * later, in RCU context.
+        */
+       if (OFF_SLAB(cachep))
+               kmem_cache_free(cachep->freelist_cache, freelist);
 }
 
 /**
@@ -2097,8 +2016,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
                         * use off-slab slabs. Needed to avoid a possible
                         * looping condition in cache_grow().
                         */
-                       offslab_limit = size - sizeof(struct slab);
-                       offslab_limit /= sizeof(kmem_bufctl_t);
+                       offslab_limit = size;
+                       offslab_limit /= sizeof(unsigned int);
 
                        if (num > offslab_limit)
                                break;
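
With struct slab gone, an off-slab freelist is nothing but an array of unsigned int object indices, so the descriptor header no longer eats into the size budget. A toy calculation of how the per-slab object cap loosens; the 32-byte sizeof(struct slab) below is an assumed figure for illustration, not taken from any real config:

#include <stdio.h>

int main(void)
{
        unsigned int size = 256;        /* freelist allocation size */
        unsigned int slab_hdr = 32;     /* assumed old sizeof(struct slab) */

        unsigned int old_limit = (size - slab_hdr) / sizeof(unsigned int);
        unsigned int new_limit = size / sizeof(unsigned int);

        printf("old limit: %u objects, new limit: %u objects\n",
               old_limit, new_limit);   /* 56 vs 64 */
        return 0;
}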
@@ -2220,7 +2139,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 int
 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 {
-       size_t left_over, slab_size, ralign;
+       size_t left_over, freelist_size, ralign;
        gfp_t gfp;
        int err;
        size_t size = cachep->size;
@@ -2339,22 +2258,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
        if (!cachep->num)
                return -E2BIG;
 
-       slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
-                         + sizeof(struct slab), cachep->align);
+       freelist_size =
+               ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
 
        /*
         * If the slab has been placed off-slab, and we have enough space then
         * move it on-slab. This is at the expense of any extra colouring.
         */
-       if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
+       if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
                flags &= ~CFLGS_OFF_SLAB;
-               left_over -= slab_size;
+               left_over -= freelist_size;
        }
 
        if (flags & CFLGS_OFF_SLAB) {
                /* really off slab. No need for manual alignment */
-               slab_size =
-                   cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+               freelist_size = cachep->num * sizeof(unsigned int);
 
 #ifdef CONFIG_PAGE_POISONING
                /* If we're going to use the generic kernel_map_pages()
@@ -2371,16 +2289,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
        if (cachep->colour_off < cachep->align)
                cachep->colour_off = cachep->align;
        cachep->colour = left_over / cachep->colour_off;
-       cachep->slab_size = slab_size;
+       cachep->freelist_size = freelist_size;
        cachep->flags = flags;
-       cachep->allocflags = 0;
+       cachep->allocflags = __GFP_COMP;
        if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
                cachep->allocflags |= GFP_DMA;
        cachep->size = size;
        cachep->reciprocal_buffer_size = reciprocal_value(size);
 
        if (flags & CFLGS_OFF_SLAB) {
-               cachep->slabp_cache = kmalloc_slab(slab_size, 0u);
+               cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
                /*
                 * This is a possibility for one of the malloc_sizes caches.
                 * But since we go off slab only for object size greater than
@@ -2388,7 +2306,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
                 * this should not happen at all.
                 * But leave a BUG_ON for some lucky dude.
                 */
-               BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
+               BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
        }
 
        err = setup_cpu_cache(cachep, gfp);
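
freelist_size above is just that index array rounded up to the cache's alignment. A quick check of the arithmetic with a kernel-style ALIGN() macro (values are illustrative; the alignment must be a power of two):

#include <stdio.h>

#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long num = 30, align = 64;     /* illustrative values */
        unsigned long freelist_size = ALIGN(num * sizeof(unsigned int),
                                            align);

        printf("%lu entries -> %lu bytes\n", num, freelist_size); /* 120 -> 128 */
        return 0;
}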
@@ -2494,7 +2412,7 @@ static int drain_freelist(struct kmem_cache *cache,
 {
        struct list_head *p;
        int nr_freed;
-       struct slab *slabp;
+       struct page *page;
 
        nr_freed = 0;
        while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
@@ -2506,18 +2424,18 @@ static int drain_freelist(struct kmem_cache *cache,
                        goto out;
                }
 
-               slabp = list_entry(p, struct slab, list);
+               page = list_entry(p, struct page, lru);
 #if DEBUG
-               BUG_ON(slabp->inuse);
+               BUG_ON(page->active);
 #endif
-               list_del(&slabp->list);
+               list_del(&page->lru);
                /*
                 * Safe to drop the lock. The slab is no longer linked
                 * to the cache.
                 */
                n->free_objects -= cache->num;
                spin_unlock_irq(&n->list_lock);
-               slab_destroy(cache, slabp);
+               slab_destroy(cache, page);
                nr_freed++;
        }
 out:
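
drain_freelist() now walks the node's free list as struct page entries linked through page->lru, recovering each page from its embedded list node via list_entry(), which is container_of() in disguise. A self-contained model of that recovery, with a toy struct page rather than the kernel's:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };
struct page { int active; struct list_head lru; };

#define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
        struct list_head slabs_free;
        struct page pg = { .active = 0 };

        /* a one-element circular list: slabs_free <-> pg.lru */
        slabs_free.next = slabs_free.prev = &pg.lru;
        pg.lru.next = pg.lru.prev = &slabs_free;

        struct page *p = list_entry(slabs_free.next, struct page, lru);
        printf("slab to drain has active=%d\n", p->active); /* 0: all free */
        return 0;
}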
@@ -2600,52 +2518,42 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
  * descriptors in kmem_cache_create, we search through the malloc_sizes array.
  * If we are creating a malloc_sizes cache here it would not be visible to
  * kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have slabp_cache same as the original cache.
+ * Hence freelist_cache cannot be the same as the original cache.
  */
-static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
-                                  int colour_off, gfp_t local_flags,
-                                  int nodeid)
+static void *alloc_slabmgmt(struct kmem_cache *cachep,
+                                  struct page *page, int colour_off,
+                                  gfp_t local_flags, int nodeid)
 {
-       struct slab *slabp;
+       void *freelist;
+       void *addr = page_address(page);
 
        if (OFF_SLAB(cachep)) {
                /* Slab management obj is off-slab. */
-               slabp = kmem_cache_alloc_node(cachep->slabp_cache,
+               freelist = kmem_cache_alloc_node(cachep->freelist_cache,
                                              local_flags, nodeid);
-               /*
-                * If the first object in the slab is leaked (it's allocated
-                * but no one has a reference to it), we want to make sure
-                * kmemleak does not treat the ->s_mem pointer as a reference
-                * to the object. Otherwise we will not report the leak.
-                */
-               kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
-                                  local_flags);
-               if (!slabp)
+               if (!freelist)
                        return NULL;
        } else {
-               slabp = objp + colour_off;
-               colour_off += cachep->slab_size;
+               freelist = addr + colour_off;
+               colour_off += cachep->freelist_size;
        }
-       slabp->inuse = 0;
-       slabp->colouroff = colour_off;
-       slabp->s_mem = objp + colour_off;
-       slabp->nodeid = nodeid;
-       slabp->free = 0;
-       return slabp;
+       page->active = 0;
+       page->s_mem = addr + colour_off;
+       return freelist;
 }
 
-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+static inline unsigned int *slab_freelist(struct page *page)
 {
-       return (kmem_bufctl_t *) (slabp + 1);
+       return (unsigned int *)(page->freelist);
 }
 
 static void cache_init_objs(struct kmem_cache *cachep,
-                           struct slab *slabp)
+                           struct page *page)
 {
        int i;
 
        for (i = 0; i < cachep->num; i++) {
-               void *objp = index_to_obj(cachep, slabp, i);
+               void *objp = index_to_obj(cachep, page, i);
 #if DEBUG
                /* need to poison the objs? */
                if (cachep->flags & SLAB_POISON)
@@ -2681,9 +2589,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
                if (cachep->ctor)
                        cachep->ctor(objp);
 #endif
-               slab_bufctl(slabp)[i] = i + 1;
+               slab_freelist(page)[i] = i;
        }
-       slab_bufctl(slabp)[i - 1] = BUFCTL_END;
 }
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
@@ -2696,41 +2603,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
        }
 }
 
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
                                int nodeid)
 {
-       void *objp = index_to_obj(cachep, slabp, slabp->free);
-       kmem_bufctl_t next;
+       void *objp;
 
-       slabp->inuse++;
-       next = slab_bufctl(slabp)[slabp->free];
+       objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
+       page->active++;
 #if DEBUG
-       slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-       WARN_ON(slabp->nodeid != nodeid);
+       WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
 #endif
-       slabp->free = next;
 
        return objp;
 }
 
-static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
                                void *objp, int nodeid)
 {
-       unsigned int objnr = obj_to_index(cachep, slabp, objp);
-
+       unsigned int objnr = obj_to_index(cachep, page, objp);
 #if DEBUG
+       unsigned int i;
+
        /* Verify that the slab belongs to the intended node */
-       WARN_ON(slabp->nodeid != nodeid);
+       WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
 
-       if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
-               printk(KERN_ERR "slab: double free detected in cache "
-                               "'%s', objp %p\n", cachep->name, objp);
-               BUG();
+       /* Verify double free bug */
+       for (i = page->active; i < cachep->num; i++) {
+               if (slab_freelist(page)[i] == objnr) {
+                       printk(KERN_ERR "slab: double free detected in cache "
+                                       "'%s', objp %p\n", cachep->name, objp);
+                       BUG();
+               }
        }
 #endif
-       slab_bufctl(slabp)[objnr] = slabp->free;
-       slabp->free = objnr;
-       slabp->inuse--;
+       page->active--;
+       slab_freelist(page)[page->active] = objnr;
 }
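
Taken together, cache_init_objs(), slab_get_obj() and slab_put_obj() treat the new freelist as a stack of object indices with page->active as the stack pointer: init fills it with every index, allocation pops the next free index, free pushes one back, and the DEBUG double-free check scans the free region [active, num) for the index being returned. A runnable model of the scheme; NUM and the driver in main() are invented for illustration:

#include <stdio.h>

#define NUM 4                           /* assumed objects per slab */

static unsigned int freelist[NUM];
static unsigned int active;             /* plays page->active */

static void init_objs(void)
{
        for (unsigned int i = 0; i < NUM; i++)
                freelist[i] = i;        /* every object starts out free */
        active = 0;
}

static unsigned int get_obj(void)
{
        return freelist[active++];      /* pop the next free index */
}

static void put_obj(unsigned int objnr)
{
        for (unsigned int i = active; i < NUM; i++)     /* the DEBUG scan */
                if (freelist[i] == objnr)
                        printf("double free of object %u!\n", objnr);
        freelist[--active] = objnr;     /* push the index back */
}

int main(void)
{
        init_objs();
        unsigned int a = get_obj(), b = get_obj();
        put_obj(a);
        printf("got %u and %u, freed %u, next up: %u\n",
               a, b, a, freelist[active]);
        return 0;
}

The old kmem_bufctl_t array threaded the free objects as a linked list terminated by BUFCTL_END; the stack form needs no terminator and no pointer chasing, which is also why the BUFCTL_END based full-slab tests elsewhere in this patch become page->active == cachep->num.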
 
 /*
@@ -2738,23 +2645,11 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
  * for the slab allocator to be able to lookup the cache and slab of a
  * virtual address for kfree, ksize, and slab debugging.
  */
-static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
-                          void *addr)
+static void slab_map_pages(struct kmem_cache *cache, struct page *page,
+                          void *freelist)
 {
-       int nr_pages;
-       struct page *page;
-
-       page = virt_to_page(addr);
-
-       nr_pages = 1;
-       if (likely(!PageCompound(page)))
-               nr_pages <<= cache->gfporder;
-
-       do {
-               page->slab_cache = cache;
-               page->slab_page = slab;
-               page++;
-       } while (--nr_pages);
+       page->slab_cache = cache;
+       page->freelist = freelist;
 }
 
 /*
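
slab_map_pages() shrinks to two stores because allocflags now carries __GFP_COMP (set in __kmem_cache_create() above): the 2^order pages form a compound page whose tail pages all point back at the head, so virt_to_head_page() on any object reaches the single page that holds slab_cache and freelist. A toy model of why storing the metadata once suffices, using hand-rolled head pointers instead of real compound pages:

#include <stdio.h>

struct page { struct page *head; void *slab_cache; };

/* stand-in for virt_to_head_page(): follow the tail's head pointer */
static struct page *head_page(struct page *p)
{
        return p->head;
}

int main(void)
{
        struct page pages[8];                   /* one order-3 "compound" */

        for (int i = 0; i < 8; i++)
                pages[i].head = &pages[0];      /* tails point at the head */
        pages[0].slab_cache = (void *)0xc0ffee; /* stored exactly once */

        printf("cache seen from page 5: %p\n",
               head_page(&pages[5])->slab_cache);
        return 0;
}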
@@ -2762,9 +2657,9 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
 static int cache_grow(struct kmem_cache *cachep,
-               gfp_t flags, int nodeid, void *objp)
+               gfp_t flags, int nodeid, struct page *page)
 {
-       struct slab *slabp;
+       void *freelist;
        size_t offset;
        gfp_t local_flags;
        struct kmem_cache_node *n;
@@ -2805,20 +2700,20 @@ static int cache_grow(struct kmem_cache *cachep,
         * Get mem for the objs.  Attempt to allocate a physical page from
         * 'nodeid'.
         */
-       if (!objp)
-               objp = kmem_getpages(cachep, local_flags, nodeid);
-       if (!objp)
+       if (!page)
+               page = kmem_getpages(cachep, local_flags, nodeid);
+       if (!page)
                goto failed;
 
        /* Get slab management. */
-       slabp = alloc_slabmgmt(cachep, objp, offset,
+       freelist = alloc_slabmgmt(cachep, page, offset,
                        local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
-       if (!slabp)
+       if (!freelist)
                goto opps1;
 
-       slab_map_pages(cachep, slabp, objp);
+       slab_map_pages(cachep, page, freelist);
 
-       cache_init_objs(cachep, slabp);
+       cache_init_objs(cachep, page);
 
        if (local_flags & __GFP_WAIT)
                local_irq_disable();
@@ -2826,13 +2721,13 @@ static int cache_grow(struct kmem_cache *cachep,
        spin_lock(&n->list_lock);
 
        /* Make slab active. */
-       list_add_tail(&slabp->list, &(n->slabs_free));
+       list_add_tail(&page->lru, &(n->slabs_free));
        STATS_INC_GROWN(cachep);
        n->free_objects += cachep->num;
        spin_unlock(&n->list_lock);
        return 1;
 opps1:
-       kmem_freepages(cachep, objp);
+       kmem_freepages(cachep, page);
 failed:
        if (local_flags & __GFP_WAIT)
                local_irq_disable();
@@ -2880,9 +2775,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
                                   unsigned long caller)
 {
-       struct page *page;
        unsigned int objnr;
-       struct slab *slabp;
+       struct page *page;
 
        BUG_ON(virt_to_cache(objp) != cachep);
 
@@ -2890,8 +2784,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
        kfree_debugcheck(objp);
        page = virt_to_head_page(objp);
 
-       slabp = page->slab_page;
-
        if (cachep->flags & SLAB_RED_ZONE) {
                verify_redzone_free(cachep, objp);
                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
@@ -2900,14 +2792,11 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
        if (cachep->flags & SLAB_STORE_USER)
                *dbg_userword(cachep, objp) = (void *)caller;
 
-       objnr = obj_to_index(cachep, slabp, objp);
+       objnr = obj_to_index(cachep, page, objp);
 
        BUG_ON(objnr >= cachep->num);
-       BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
+       BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-       slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
-#endif
        if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
                if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2924,33 +2813,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
        return objp;
 }
 
-static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
-{
-       kmem_bufctl_t i;
-       int entries = 0;
-
-       /* Check slab's freelist to see if this obj is there. */
-       for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
-               entries++;
-               if (entries > cachep->num || i >= cachep->num)
-                       goto bad;
-       }
-       if (entries != cachep->num - slabp->inuse) {
-bad:
-               printk(KERN_ERR "slab: Internal list corruption detected in "
-                       "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
-                       cachep->name, cachep->num, slabp, slabp->inuse,
-                       print_tainted());
-               print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
-                       sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
-                       1);
-               BUG();
-       }
-}
 #else
 #define kfree_debugcheck(x) do { } while(0)
 #define cache_free_debugcheck(x,objp,z) (objp)
-#define check_slabp(x,y) do { } while(0)
 #endif
 
 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
@@ -2989,7 +2854,7 @@ retry:
 
        while (batchcount > 0) {
                struct list_head *entry;
-               struct slab *slabp;
+               struct page *page;
                /* Get slab alloc is to come from. */
                entry = n->slabs_partial.next;
                if (entry == &n->slabs_partial) {
@@ -2999,8 +2864,7 @@ retry:
                                goto must_grow;
                }
 
-               slabp = list_entry(entry, struct slab, list);
-               check_slabp(cachep, slabp);
+               page = list_entry(entry, struct page, lru);
                check_spinlock_acquired(cachep);
 
                /*
@@ -3008,24 +2872,23 @@ retry:
                 * there must be at least one object available for
                 * allocation.
                 */
-               BUG_ON(slabp->inuse >= cachep->num);
+               BUG_ON(page->active >= cachep->num);
 
-               while (slabp->inuse < cachep->num && batchcount--) {
+               while (page->active < cachep->num && batchcount--) {
                        STATS_INC_ALLOCED(cachep);
                        STATS_INC_ACTIVE(cachep);
                        STATS_SET_HIGH(cachep);
 
-                       ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
+                       ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
                                                                        node));
                }
-               check_slabp(cachep, slabp);
 
                /* move slabp to correct slabp list: */
-               list_del(&slabp->list);
-               if (slabp->free == BUFCTL_END)
-                       list_add(&slabp->list, &n->slabs_full);
+               list_del(&page->lru);
+               if (page->active == cachep->num)
+                       list_add(&page->lru, &n->slabs_full);
                else
-                       list_add(&slabp->list, &n->slabs_partial);
+                       list_add(&page->lru, &n->slabs_partial);
        }
 
 must_grow:
@@ -3097,16 +2960,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                *dbg_redzone1(cachep, objp) = RED_ACTIVE;
                *dbg_redzone2(cachep, objp) = RED_ACTIVE;
        }
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-       {
-               struct slab *slabp;
-               unsigned objnr;
-
-               slabp = virt_to_head_page(objp)->slab_page;
-               objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
-               slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
-       }
-#endif
        objp += obj_offset(cachep);
        if (cachep->ctor && cachep->flags & SLAB_POISON)
                cachep->ctor(objp);
@@ -3248,18 +3101,20 @@ retry:
                 * We may trigger various forms of reclaim on the allowed
                 * set and go into memory reserves if necessary.
                 */
+               struct page *page;
+
                if (local_flags & __GFP_WAIT)
                        local_irq_enable();
                kmem_flagcheck(cache, flags);
-               obj = kmem_getpages(cache, local_flags, numa_mem_id());
+               page = kmem_getpages(cache, local_flags, numa_mem_id());
                if (local_flags & __GFP_WAIT)
                        local_irq_disable();
-               if (obj) {
+               if (page) {
                        /*
                         * Insert into the appropriate per node queues
                         */
-                       nid = page_to_nid(virt_to_page(obj));
-                       if (cache_grow(cache, flags, nid, obj)) {
+                       nid = page_to_nid(page);
+                       if (cache_grow(cache, flags, nid, page)) {
                                obj = ____cache_alloc_node(cache,
                                        flags | GFP_THISNODE, nid);
                                if (!obj)
@@ -3288,7 +3143,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
                                int nodeid)
 {
        struct list_head *entry;
-       struct slab *slabp;
+       struct page *page;
        struct kmem_cache_node *n;
        void *obj;
        int x;
@@ -3308,26 +3163,24 @@ retry:
                        goto must_grow;
        }
 
-       slabp = list_entry(entry, struct slab, list);
+       page = list_entry(entry, struct page, lru);
        check_spinlock_acquired_node(cachep, nodeid);
-       check_slabp(cachep, slabp);
 
        STATS_INC_NODEALLOCS(cachep);
        STATS_INC_ACTIVE(cachep);
        STATS_SET_HIGH(cachep);
 
-       BUG_ON(slabp->inuse == cachep->num);
+       BUG_ON(page->active == cachep->num);
 
-       obj = slab_get_obj(cachep, slabp, nodeid);
-       check_slabp(cachep, slabp);
+       obj = slab_get_obj(cachep, page, nodeid);
        n->free_objects--;
        /* move slabp to correct slabp list: */
-       list_del(&slabp->list);
+       list_del(&page->lru);
 
-       if (slabp->free == BUFCTL_END)
-               list_add(&slabp->list, &n->slabs_full);
+       if (page->active == cachep->num)
+               list_add(&page->lru, &n->slabs_full);
        else
-               list_add(&slabp->list, &n->slabs_partial);
+               list_add(&page->lru, &n->slabs_partial);
 
        spin_unlock(&n->list_lock);
        goto done;
@@ -3477,23 +3330,21 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
 
        for (i = 0; i < nr_objects; i++) {
                void *objp;
-               struct slab *slabp;
+               struct page *page;
 
                clear_obj_pfmemalloc(&objpp[i]);
                objp = objpp[i];
 
-               slabp = virt_to_slab(objp);
+               page = virt_to_head_page(objp);
                n = cachep->node[node];
-               list_del(&slabp->list);
+               list_del(&page->lru);
                check_spinlock_acquired_node(cachep, node);
-               check_slabp(cachep, slabp);
-               slab_put_obj(cachep, slabp, objp, node);
+               slab_put_obj(cachep, page, objp, node);
                STATS_DEC_ACTIVE(cachep);
                n->free_objects++;
-               check_slabp(cachep, slabp);
 
                /* fixup slab chains */
-               if (slabp->inuse == 0) {
+               if (page->active == 0) {
                        if (n->free_objects > n->free_limit) {
                                n->free_objects -= cachep->num;
                                /* No need to drop any previously held
@@ -3502,16 +3353,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
                                 * a different cache, refer to comments before
                                 * alloc_slabmgmt.
                                 */
-                               slab_destroy(cachep, slabp);
+                               slab_destroy(cachep, page);
                        } else {
-                               list_add(&slabp->list, &n->slabs_free);
+                               list_add(&page->lru, &n->slabs_free);
                        }
                } else {
                        /* Unconditionally move a slab to the end of the
                         * partial list on free - maximum time for the
                         * other objects to be freed, too.
                         */
-                       list_add_tail(&slabp->list, &n->slabs_partial);
+                       list_add_tail(&page->lru, &n->slabs_partial);
                }
        }
 }
@@ -3551,10 +3402,10 @@ free_done:
 
                p = n->slabs_free.next;
                while (p != &(n->slabs_free)) {
-                       struct slab *slabp;
+                       struct page *page;
 
-                       slabp = list_entry(p, struct slab, list);
-                       BUG_ON(slabp->inuse);
+                       page = list_entry(p, struct page, lru);
+                       BUG_ON(page->active);
 
                        i++;
                        p = p->next;
@@ -4158,7 +4009,7 @@ out:
 #ifdef CONFIG_SLABINFO
 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 {
-       struct slab *slabp;
+       struct page *page;
        unsigned long active_objs;
        unsigned long num_objs;
        unsigned long active_slabs = 0;
@@ -4178,23 +4029,23 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
                check_irq_on();
                spin_lock_irq(&n->list_lock);
 
-               list_for_each_entry(slabp, &n->slabs_full, list) {
-                       if (slabp->inuse != cachep->num && !error)
+               list_for_each_entry(page, &n->slabs_full, lru) {
+                       if (page->active != cachep->num && !error)
                                error = "slabs_full accounting error";
                        active_objs += cachep->num;
                        active_slabs++;
                }
-               list_for_each_entry(slabp, &n->slabs_partial, list) {
-                       if (slabp->inuse == cachep->num && !error)
-                               error = "slabs_partial inuse accounting error";
-                       if (!slabp->inuse && !error)
-                               error = "slabs_partial/inuse accounting error";
-                       active_objs += slabp->inuse;
+               list_for_each_entry(page, &n->slabs_partial, lru) {
+                       if (page->active == cachep->num && !error)
+                               error = "slabs_partial accounting error";
+                       if (!page->active && !error)
+                               error = "slabs_partial accounting error";
+                       active_objs += page->active;
                        active_slabs++;
                }
-               list_for_each_entry(slabp, &n->slabs_free, list) {
-                       if (slabp->inuse && !error)
-                               error = "slabs_free/inuse accounting error";
+               list_for_each_entry(page, &n->slabs_free, lru) {
+                       if (page->active && !error)
+                               error = "slabs_free accounting error";
                        num_slabs++;
                }
                free_objects += n->free_objects;
@@ -4346,15 +4197,27 @@ static inline int add_caller(unsigned long *n, unsigned long v)
        return 1;
 }
 
-static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+static void handle_slab(unsigned long *n, struct kmem_cache *c,
+                                               struct page *page)
 {
        void *p;
-       int i;
+       int i, j;
+
        if (n[0] == n[1])
                return;
-       for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
-               if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
+       for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
+               bool active = true;
+
+               for (j = page->active; j < c->num; j++) {
+                       /* Skip freed item */
+                       if (slab_freelist(page)[j] == i) {
+                               active = false;
+                               break;
+                       }
+               }
+               if (!active)
                        continue;
+
                if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
                        return;
        }
@@ -4379,7 +4242,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
 static int leaks_show(struct seq_file *m, void *p)
 {
        struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
-       struct slab *slabp;
+       struct page *page;
        struct kmem_cache_node *n;
        const char *name;
        unsigned long *x = m->private;
@@ -4403,10 +4266,10 @@ static int leaks_show(struct seq_file *m, void *p)
                check_irq_on();
                spin_lock_irq(&n->list_lock);
 
-               list_for_each_entry(slabp, &n->slabs_full, list)
-                       handle_slab(x, cachep, slabp);
-               list_for_each_entry(slabp, &n->slabs_partial, list)
-                       handle_slab(x, cachep, slabp);
+               list_for_each_entry(page, &n->slabs_full, lru)
+                       handle_slab(x, cachep, page);
+               list_for_each_entry(page, &n->slabs_partial, lru)
+                       handle_slab(x, cachep, page);
                spin_unlock_irq(&n->list_lock);
        }
        name = cachep->name;
index 7e8bd8d..545a170 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 /*
  * Maximum number of desirable partial slabs.
  * The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in the.
+ * sort the partial list by the number of objects in use.
  */
 #define MAX_PARTIAL 10
 
@@ -933,6 +933,16 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
  * Hooks for other subsystems that check memory allocations. In a typical
  * production configuration these hooks all should produce no code at all.
  */
+static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+{
+       kmemleak_alloc(ptr, size, 1, flags);
+}
+
+static inline void kfree_hook(const void *x)
+{
+       kmemleak_free(x);
+}
+
 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 {
        flags &= gfp_allowed_mask;
@@ -1217,8 +1227,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,
        /*
         * Enable debugging if selected on the kernel commandline.
         */
-       if (slub_debug && (!slub_debug_slabs ||
-               !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
+       if (slub_debug && (!slub_debug_slabs || (name &&
+               !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
                flags |= slub_debug;
 
        return flags;
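
The added name check short-circuits the comparison so that a cache created without a name can no longer feed NULL into strncmp(). The predicate modeled stand-alone; the function and parameter names here are mine, not SLUB's:

#include <stdio.h>
#include <string.h>

static int debug_this_cache(const char *debug_slabs, const char *name)
{
        /* no slab list given: debug every cache; otherwise require a
         * named cache whose name starts with the listed prefix */
        return !debug_slabs ||
               (name && !strncmp(debug_slabs, name, strlen(debug_slabs)));
}

int main(void)
{
        printf("%d %d %d\n",
               debug_this_cache(NULL, NULL),                /* 1 */
               debug_this_cache("kmalloc", NULL),           /* 0: was an oops */
               debug_this_cache("kmalloc", "kmalloc-64"));  /* 1 */
        return 0;
}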
@@ -1260,13 +1270,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
 static inline void dec_slabs_node(struct kmem_cache *s, int node,
                                                        int objects) {}
 
+static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+{
+       kmemleak_alloc(ptr, size, 1, flags);
+}
+
+static inline void kfree_hook(const void *x)
+{
+       kmemleak_free(x);
+}
+
 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
                                                        { return 0; }
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
-               void *object) {}
+               void *object)
+{
+       kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
+               flags & gfp_allowed_mask);
+}
 
-static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+       kmemleak_free_recursive(x, s->flags);
+}
 
 #endif /* CONFIG_SLUB_DEBUG */
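
Note that the hook bodies are identical in the CONFIG_SLUB_DEBUG and !CONFIG_SLUB_DEBUG halves: kmemleak must keep seeing allocations and frees even when SLUB's own debugging is compiled out, while the call sites stay free of #ifdefs. The shape of that pattern, reduced to a compilable toy where DEBUG stands in for the real config option:

#include <stdio.h>

#ifdef DEBUG
static inline void kfree_hook(const void *x)
{
        printf("leak tracker notified of free %p (debug build)\n", x);
}
#else
static inline void kfree_hook(const void *x)
{
        printf("leak tracker notified of free %p (non-debug build)\n", x);
}
#endif

int main(void)
{
        int obj;
        kfree_hook(&obj);       /* the caller never needs an #ifdef */
        return 0;
}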
 
@@ -2829,8 +2856,8 @@ static struct kmem_cache *kmem_cache_node;
  * slab on the node for this slabcache. There are no concurrent accesses
  * possible.
  *
- * Note that this function only works on the kmalloc_node_cache
- * when allocating for the kmalloc_node_cache. This is used for bootstrapping
+ * Note that this function only works on the kmem_cache_node
+ * when allocating for the kmem_cache_node. This is used for bootstrapping
  * memory on a fresh node that has no slab structures yet.
  */
 static void early_kmem_cache_node_alloc(int node)
@@ -3272,7 +3299,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
        if (page)
                ptr = page_address(page);
 
-       kmemleak_alloc(ptr, size, 1, flags);
+       kmalloc_large_node_hook(ptr, size, flags);
        return ptr;
 }
 
@@ -3336,7 +3363,7 @@ void kfree(const void *x)
        page = virt_to_head_page(x);
        if (unlikely(!PageSlab(page))) {
                BUG_ON(!PageCompound(page));
-               kmemleak_free(x);
+               kfree_hook(x);
                __free_memcg_kmem_pages(page, compound_order(page));
                return;
        }
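
kfree() distinguishes two kinds of pointers: objects carved out of a slab page, and large allocations that went straight to the page allocator as compound pages; only the latter take the kfree_hook() plus __free_memcg_kmem_pages() path in the hunk above. A sketch of that dispatch with mocked-up page state (the types and helpers are stand-ins, not kernel APIs):

#include <stdio.h>
#include <stdbool.h>

struct page { bool slab; int order; };

static void kfree_model(struct page *page)
{
        if (!page->slab) {
                /* large allocation: a compound page from the page
                 * allocator, freed by order, never via a cache */
                printf("free order-%d compound page\n", page->order);
                return;
        }
        printf("hand object back to its slab cache\n");
}

int main(void)
{
        struct page big = { .slab = false, .order = 3 };
        struct page small = { .slab = true, .order = 0 };

        kfree_model(&big);
        kfree_model(&small);
        return 0;
}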
index 7a9f80d..84b26aa 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page)
 
 static void put_compound_page(struct page *page)
 {
-       /*
-        * hugetlbfs pages cannot be split from under us.  If this is a
-        * hugetlbfs page, check refcount on head page and release the page if
-        * the refcount becomes zero.
-        */
-       if (PageHuge(page)) {
-               page = compound_head(page);
-               if (put_page_testzero(page))
-                       __put_compound_page(page);
-
-               return;
-       }
-
        if (unlikely(PageTail(page))) {
                /* __split_huge_page_refcount can run under us */
                struct page *page_head = compound_trans_head(page);
@@ -111,14 +98,31 @@ static void put_compound_page(struct page *page)
                         * still hot on arches that do not support
                         * this_cpu_cmpxchg_double().
                         */
-                       if (PageSlab(page_head)) {
-                               if (PageTail(page)) {
+                       if (PageSlab(page_head) || PageHeadHuge(page_head)) {
+                               if (likely(PageTail(page))) {
+                                       /*
+                                        * __split_huge_page_refcount
+                                        * cannot race here.
+                                        */
+                                       VM_BUG_ON(!PageHead(page_head));
+                                       atomic_dec(&page->_mapcount);
                                        if (put_page_testzero(page_head))
                                                VM_BUG_ON(1);
-
-                                       atomic_dec(&page->_mapcount);
-                                       goto skip_lock_tail;
+                                       if (put_page_testzero(page_head))
+                                               __put_compound_page(page_head);
+                                       return;
                                } else
+                                       /*
+                                        * __split_huge_page_refcount
+                                        * run before us, "page" was a
+                                        * THP tail. The split
+                                        * page_head has been freed
+                                        * and reallocated as slab or
+                                        * hugetlbfs page of smaller
+                                        * order (only possible if
+                                        * reallocated as slab on
+                                        * x86).
+                                        */
                                        goto skip_lock;
                        }
                        /*
@@ -132,8 +136,27 @@ static void put_compound_page(struct page *page)
                                /* __split_huge_page_refcount run before us */
                                compound_unlock_irqrestore(page_head, flags);
 skip_lock:
-                               if (put_page_testzero(page_head))
-                                       __put_single_page(page_head);
+                               if (put_page_testzero(page_head)) {
+                                       /*
+                                        * The head page may have been
+                                        * freed and reallocated as a
+                                        * compound page of smaller
+                                        * order and then freed again.
+                                        * All we know is that it
+                                        * cannot have become: a THP
+                                        * page, a compound page of
+                                        * higher order, a tail page.
+                                        * That is because we still
+                                        * hold the refcount of the
+                                        * split THP tail and
+                                        * page_head was the THP head
+                                        * before the split.
+                                        */
+                                       if (PageHead(page_head))
+                                               __put_compound_page(page_head);
+                                       else
+                                               __put_single_page(page_head);
+                               }
 out_put_single:
                                if (put_page_testzero(page))
                                        __put_single_page(page);
@@ -155,7 +178,6 @@ out_put_single:
                        VM_BUG_ON(atomic_read(&page->_count) != 0);
                        compound_unlock_irqrestore(page_head, flags);
 
-skip_lock_tail:
                        if (put_page_testzero(page_head)) {
                                if (PageHead(page_head))
                                        __put_compound_page(page_head);
@@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page)
         * proper PT lock that already serializes against
         * split_huge_page().
         */
+       unsigned long flags;
        bool got = false;
-       struct page *page_head;
-
-       /*
-        * If this is a hugetlbfs page it cannot be split under us.  Simply
-        * increment refcount for the head page.
-        */
-       if (PageHuge(page)) {
-               page_head = compound_head(page);
-               atomic_inc(&page_head->_count);
-               got = true;
-       } else {
-               unsigned long flags;
+       struct page *page_head = compound_trans_head(page);
 
-               page_head = compound_trans_head(page);
-               if (likely(page != page_head &&
-                                       get_page_unless_zero(page_head))) {
-
-                       /* Ref to put_compound_page() comment. */
-                       if (PageSlab(page_head)) {
-                               if (likely(PageTail(page))) {
-                                       __get_page_tail_foll(page, false);
-                                       return true;
-                               } else {
-                                       put_page(page_head);
-                                       return false;
-                               }
-                       }
-
-                       /*
-                        * page_head wasn't a dangling pointer but it
-                        * may not be a head page anymore by the time
-                        * we obtain the lock. That is ok as long as it
-                        * can't be freed from under us.
-                        */
-                       flags = compound_lock_irqsave(page_head);
-                       /* here __split_huge_page_refcount won't run anymore */
+       if (likely(page != page_head && get_page_unless_zero(page_head))) {
+               /* Ref to put_compound_page() comment. */
+               if (PageSlab(page_head) || PageHeadHuge(page_head)) {
                        if (likely(PageTail(page))) {
+                               /*
+                                * This is a hugetlbfs page or a slab
+                                * page. __split_huge_page_refcount
+                                * cannot race here.
+                                */
+                               VM_BUG_ON(!PageHead(page_head));
                                __get_page_tail_foll(page, false);
-                               got = true;
-                       }
-                       compound_unlock_irqrestore(page_head, flags);
-                       if (unlikely(!got))
+                               return true;
+                       } else {
+                               /*
+                                * __split_huge_page_refcount run
+                                * before us, "page" was a THP
+                                * tail. The split page_head has been
+                                * freed and reallocated as slab or
+                                * hugetlbfs page of smaller order
+                                * (only possible if reallocated as
+                                * slab on x86).
+                                */
                                put_page(page_head);
+                               return false;
+                       }
+               }
+
+               /*
+                * page_head wasn't a dangling pointer but it
+                * may not be a head page anymore by the time
+                * we obtain the lock. That is ok as long as it
+                * can't be freed from under us.
+                */
+               flags = compound_lock_irqsave(page_head);
+               /* here __split_huge_page_refcount won't run anymore */
+               if (likely(PageTail(page))) {
+                       __get_page_tail_foll(page, false);
+                       got = true;
                }
+               compound_unlock_irqrestore(page_head, flags);
+               if (unlikely(!got))
+                       put_page(page_head);
        }
        return got;
 }
index 0715db6..d334678 100644 (file)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -224,7 +224,7 @@ source "net/hsr/Kconfig"
 
 config RPS
        boolean
-       depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
+       depends on SMP && SYSFS
        default y
 
 config RFS_ACCEL
@@ -235,7 +235,7 @@ config RFS_ACCEL
 
 config XPS
        boolean
-       depends on SMP && USE_GENERIC_SMP_HELPERS
+       depends on SMP
        default y
 
 config NETPRIO_CGROUP
index 3dc0c6c..c4638e6 100644 (file)
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1425,7 +1425,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
        do {
                if (dma_async_is_tx_complete(tp->ucopy.dma_chan,
                                              last_issued, &done,
-                                             &used) == DMA_SUCCESS) {
+                                             &used) == DMA_COMPLETE) {
                        /* Safe to free early-copied skbs now */
                        __skb_queue_purge(&sk->sk_async_wait_queue);
                        break;
@@ -1433,7 +1433,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
                        struct sk_buff *skb;
                        while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
                               (dma_async_is_complete(skb->dma_cookie, done,
-                                                     used) == DMA_SUCCESS)) {
+                                                     used) == DMA_COMPLETE)) {
                                __skb_dequeue(&sk->sk_async_wait_queue);
                                kfree_skb(skb);
                        }
index d0d14a0..bf04b30 100644 (file)
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -471,15 +471,6 @@ struct rpc_filelist {
        umode_t mode;
 };
 
-static int rpc_delete_dentry(const struct dentry *dentry)
-{
-       return 1;
-}
-
-static const struct dentry_operations rpc_dentry_operations = {
-       .d_delete = rpc_delete_dentry,
-};
-
 static struct inode *
 rpc_get_inode(struct super_block *sb, umode_t mode)
 {
@@ -1266,7 +1257,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
        sb->s_magic = RPCAUTH_GSSMAGIC;
        sb->s_op = &s_ops;
-       sb->s_d_op = &rpc_dentry_operations;
+       sb->s_d_op = &simple_dentry_operations;
        sb->s_time_gran = 1;
 
        inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
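
The deleted rpc_delete_dentry() unconditionally returned 1, which is exactly the behaviour the shared simple_dentry_operations from fs/libfs.c provides, so the private copy can go. The deduplication pattern in miniature, with toy structs standing in for struct dentry_operations:

#include <stdio.h>

struct dops { int (*d_delete)(const void *dentry); };

static int always_delete_dentry(const void *dentry)
{
        (void)dentry;
        return 1;       /* never keep unused dentries cached */
}

/* one shared instance, as libfs exports simple_dentry_operations */
static const struct dops simple_dentry_ops = {
        .d_delete = always_delete_dentry,
};

int main(void)
{
        /* each filesystem points at the shared ops instead of its own */
        const struct dops *s_d_op = &simple_dentry_ops;
        printf("d_delete -> %d\n", s_d_op->d_delete(NULL));
        return 0;
}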
index db0e5cd..91c4117 100644 (file)
--- a/scripts/asn1_compiler.c
+++ b/scripts/asn1_compiler.c
@@ -1353,6 +1353,8 @@ static void render_out_of_line_list(FILE *out)
                        render_opcode(out, "ASN1_OP_END_SET_OF%s,\n", act);
                        render_opcode(out, "_jump_target(%u),\n", entry);
                        break;
+               default:
+                       break;
                }
                if (e->action)
                        render_opcode(out, "_action(ACT_%s),\n",
index 61090e0..9c98100 100755 (executable)
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3289,6 +3289,7 @@ sub process {
                        }
                }
                if (!defined $suppress_whiletrailers{$linenr} &&
+                   defined($stat) && defined($cond) &&
                    $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) {
                        my ($s, $c) = ($stat, $cond);
 
index c26c81e..a5918e0 100644 (file)
--- a/security/Makefile
+++ b/security/Makefile
@@ -16,7 +16,6 @@ obj-$(CONFIG_MMU)                     += min_addr.o
 # Object file lists
 obj-$(CONFIG_SECURITY)                 += security.o capability.o
 obj-$(CONFIG_SECURITYFS)               += inode.o
-# Must precede capability.o in order to stack properly.
 obj-$(CONFIG_SECURITY_SELINUX)         += selinux/built-in.o
 obj-$(CONFIG_SECURITY_SMACK)           += smack/built-in.o
 obj-$(CONFIG_AUDIT)                    += lsm_audit.o
index 031d2d9..89c7865 100644 (file)
--- a/security/apparmor/audit.c
+++ b/security/apparmor/audit.c
@@ -111,7 +111,6 @@ static const char *const aa_audit_type[] = {
 static void audit_pre(struct audit_buffer *ab, void *ca)
 {
        struct common_audit_data *sa = ca;
-       struct task_struct *tsk = sa->aad->tsk ? sa->aad->tsk : current;
 
        if (aa_g_audit_header) {
                audit_log_format(ab, "apparmor=");
@@ -132,11 +131,6 @@ static void audit_pre(struct audit_buffer *ab, void *ca)
 
        if (sa->aad->profile) {
                struct aa_profile *profile = sa->aad->profile;
-               pid_t pid;
-               rcu_read_lock();
-               pid = rcu_dereference(tsk->real_parent)->pid;
-               rcu_read_unlock();
-               audit_log_format(ab, " parent=%d", pid);
                if (profile->ns != root_ns) {
                        audit_log_format(ab, " namespace=");
                        audit_log_untrustedstring(ab, profile->ns->base.hname);
@@ -149,12 +143,6 @@ static void audit_pre(struct audit_buffer *ab, void *ca)
                audit_log_format(ab, " name=");
                audit_log_untrustedstring(ab, sa->aad->name);
        }
-
-       if (sa->aad->tsk) {
-               audit_log_format(ab, " pid=%d comm=", tsk->pid);
-               audit_log_untrustedstring(ab, tsk->comm);
-       }
-
 }
 
 /**
@@ -212,7 +200,7 @@ int aa_audit(int type, struct aa_profile *profile, gfp_t gfp,
 
        if (sa->aad->type == AUDIT_APPARMOR_KILL)
                (void)send_sig_info(SIGKILL, NULL,
-                                   sa->aad->tsk ?  sa->aad->tsk : current);
+                                   sa->u.tsk ?  sa->u.tsk : current);
 
        if (sa->aad->type == AUDIT_APPARMOR_ALLOWED)
                return complain_error(sa->aad->error);
index 84d1f5f..1101c6f 100644 (file)
--- a/security/apparmor/capability.c
+++ b/security/apparmor/capability.c
@@ -53,8 +53,7 @@ static void audit_cb(struct audit_buffer *ab, void *va)
 
 /**
  * audit_caps - audit a capability
- * @profile: profile confining task (NOT NULL)
- * @task: task capability test was performed against (NOT NULL)
+ * @profile: profile being tested for confinement (NOT NULL)
  * @cap: capability tested
  * @error: error code returned by test
  *
@@ -63,8 +62,7 @@ static void audit_cb(struct audit_buffer *ab, void *va)
  *
  * Returns: 0 or sa->error on success,  error code on failure
  */
-static int audit_caps(struct aa_profile *profile, struct task_struct *task,
-                     int cap, int error)
+static int audit_caps(struct aa_profile *profile, int cap, int error)
 {
        struct audit_cache *ent;
        int type = AUDIT_APPARMOR_AUTO;
@@ -73,7 +71,6 @@ static int audit_caps(struct aa_profile *profile, struct task_struct *task,
        sa.type = LSM_AUDIT_DATA_CAP;
        sa.aad = &aad;
        sa.u.cap = cap;
-       sa.aad->tsk = task;
        sa.aad->op = OP_CAPABLE;
        sa.aad->error = error;
 
@@ -124,8 +121,7 @@ static int profile_capable(struct aa_profile *profile, int cap)
 
 /**
  * aa_capable - test permission to use capability
- * @task: task doing capability test against (NOT NULL)
- * @profile: profile confining @task (NOT NULL)
+ * @profile: profile being tested against (NOT NULL)
  * @cap: capability to be tested
  * @audit: whether an audit record should be generated
  *
@@ -133,8 +129,7 @@ static int profile_capable(struct aa_profile *profile, int cap)
  *
  * Returns: 0 on success, or else an error code.
  */
-int aa_capable(struct task_struct *task, struct aa_profile *profile, int cap,
-              int audit)
+int aa_capable(struct aa_profile *profile, int cap, int audit)
 {
        int error = profile_capable(profile, cap);
 
@@ -144,5 +139,5 @@ int aa_capable(struct task_struct *task, struct aa_profile *profile, int cap,
                return error;
        }
 
-       return audit_caps(profile, task, cap, error);
+       return audit_caps(profile, cap, error);
 }
index 26c607c..452567d 100644 (file)
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -50,23 +50,21 @@ void aa_free_domain_entries(struct aa_domain *domain)
 
 /**
  * may_change_ptraced_domain - check if can change profile on ptraced task
- * @task: task we want to change profile of   (NOT NULL)
  * @to_profile: profile to change to  (NOT NULL)
  *
- * Check if the task is ptraced and if so if the tracing task is allowed
+ * Check if current is ptraced and if so if the tracing task is allowed
  * to trace the new domain
  *
  * Returns: %0 or error if change not allowed
  */
-static int may_change_ptraced_domain(struct task_struct *task,
-                                    struct aa_profile *to_profile)
+static int may_change_ptraced_domain(struct aa_profile *to_profile)
 {
        struct task_struct *tracer;
        struct aa_profile *tracerp = NULL;
        int error = 0;
 
        rcu_read_lock();
-       tracer = ptrace_parent(task);
+       tracer = ptrace_parent(current);
        if (tracer)
                /* released below */
                tracerp = aa_get_task_profile(tracer);
@@ -75,7 +73,7 @@ static int may_change_ptraced_domain(struct task_struct *task,
        if (!tracer || unconfined(tracerp))
                goto out;
 
-       error = aa_may_ptrace(tracer, tracerp, to_profile, PTRACE_MODE_ATTACH);
+       error = aa_may_ptrace(tracerp, to_profile, PTRACE_MODE_ATTACH);
 
 out:
        rcu_read_unlock();
@@ -477,7 +475,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
        }
 
        if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
-               error = may_change_ptraced_domain(current, new_profile);
+               error = may_change_ptraced_domain(new_profile);
                if (error) {
                        aa_put_profile(new_profile);
                        goto audit;
@@ -690,7 +688,7 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest)
                        }
                }
 
-               error = may_change_ptraced_domain(current, hat);
+               error = may_change_ptraced_domain(hat);
                if (error) {
                        info = "ptraced";
                        error = -EPERM;
@@ -829,7 +827,7 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec,
        }
 
        /* check if tracing task is allowed to trace target domain */
-       error = may_change_ptraced_domain(current, target);
+       error = may_change_ptraced_domain(target);
        if (error) {
                info = "ptrace prevents transition";
                goto audit;
index 30e8d76..ba3dfd1 100644 (file)
--- a/security/apparmor/include/audit.h
+++ b/security/apparmor/include/audit.h
@@ -109,7 +109,6 @@ struct apparmor_audit_data {
        void *profile;
        const char *name;
        const char *info;
-       struct task_struct *tsk;
        union {
                void *target;
                struct {
index 2e7c9d6..fc3fa38 100644 (file)
--- a/security/apparmor/include/capability.h
+++ b/security/apparmor/include/capability.h
@@ -4,7 +4,7 @@
  * This file contains AppArmor capability mediation definitions.
  *
  * Copyright (C) 1998-2008 Novell/SUSE
- * Copyright 2009-2010 Canonical Ltd.
+ * Copyright 2009-2013 Canonical Ltd.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -38,8 +38,7 @@ struct aa_caps {
 
 extern struct aa_fs_entry aa_fs_entry_caps[];
 
-int aa_capable(struct task_struct *task, struct aa_profile *profile, int cap,
-              int audit);
+int aa_capable(struct aa_profile *profile, int cap, int audit);
 
 static inline void aa_free_cap_rules(struct aa_caps *caps)
 {
index aeda0fb..288ca76 100644 (file)
--- a/security/apparmor/include/ipc.h
+++ b/security/apparmor/include/ipc.h
@@ -19,8 +19,8 @@
 
 struct aa_profile;
 
-int aa_may_ptrace(struct task_struct *tracer_task, struct aa_profile *tracer,
-                 struct aa_profile *tracee, unsigned int mode);
+int aa_may_ptrace(struct aa_profile *tracer, struct aa_profile *tracee,
+                 unsigned int mode);
 
 int aa_ptrace(struct task_struct *tracer, struct task_struct *tracee,
              unsigned int mode);
index c51d226..777ac1c 100644 (file)
--- a/security/apparmor/ipc.c
+++ b/security/apparmor/ipc.c
@@ -54,15 +54,14 @@ static int aa_audit_ptrace(struct aa_profile *profile,
 
 /**
  * aa_may_ptrace - test if tracer task can trace the tracee
- * @tracer_task: task who will do the tracing  (NOT NULL)
  * @tracer: profile of the task doing the tracing  (NOT NULL)
  * @tracee: task to be traced
  * @mode: whether PTRACE_MODE_READ || PTRACE_MODE_ATTACH
  *
  * Returns: %0 else error code if permission denied or error
  */
-int aa_may_ptrace(struct task_struct *tracer_task, struct aa_profile *tracer,
-                 struct aa_profile *tracee, unsigned int mode)
+int aa_may_ptrace(struct aa_profile *tracer, struct aa_profile *tracee,
+                 unsigned int mode)
 {
        /* TODO: currently only based on capability, not extended ptrace
         *       rules,
@@ -72,7 +71,7 @@ int aa_may_ptrace(struct task_struct *tracer_task, struct aa_profile *tracer,
        if (unconfined(tracer) || tracer == tracee)
                return 0;
        /* log this capability request */
-       return aa_capable(tracer_task, tracer, CAP_SYS_PTRACE, 1);
+       return aa_capable(tracer, CAP_SYS_PTRACE, 1);
 }
 
 /**
@@ -101,7 +100,7 @@ int aa_ptrace(struct task_struct *tracer, struct task_struct *tracee,
        if (!unconfined(tracer_p)) {
                struct aa_profile *tracee_p = aa_get_task_profile(tracee);
 
-               error = aa_may_ptrace(tracer, tracer_p, tracee_p, mode);
+               error = aa_may_ptrace(tracer_p, tracee_p, mode);
                error = aa_audit_ptrace(tracer_p, tracee_p, error);
 
                aa_put_profile(tracee_p);
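The aa_capable() prototype change seen earlier follows the same pattern. A hedged sketch of the resulting body, assuming audit_caps() now builds its record against current rather than a threaded-through task pointer:

    int aa_capable(struct aa_profile *profile, int cap, int audit)
    {
            int error = profile_capable(profile, cap);

            if (!audit) {
                    if (COMPLAIN_MODE(profile))
                            return complain_error(error);
                    return error;
            }

            /* the audit record is logged against current */
            return audit_caps(profile, cap, error);
    }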
index fb99e18..4257b7e 100644 (file)
@@ -145,7 +145,7 @@ static int apparmor_capable(const struct cred *cred, struct user_namespace *ns,
        if (!error) {
                profile = aa_cred_profile(cred);
                if (!unconfined(profile))
-                       error = aa_capable(current, profile, cap, audit);
+                       error = aa_capable(profile, cap, audit);
        }
        return error;
 }
index dbeb9bc..8b4f24a 100644 (file)
@@ -777,9 +777,15 @@ static int cap_xfrm_policy_delete_security(struct xfrm_sec_ctx *ctx)
        return 0;
 }
 
-static int cap_xfrm_state_alloc_security(struct xfrm_state *x,
-                                        struct xfrm_user_sec_ctx *sec_ctx,
-                                        u32 secid)
+static int cap_xfrm_state_alloc(struct xfrm_state *x,
+                               struct xfrm_user_sec_ctx *sec_ctx)
+{
+       return 0;
+}
+
+static int cap_xfrm_state_alloc_acquire(struct xfrm_state *x,
+                                       struct xfrm_sec_ctx *polsec,
+                                       u32 secid)
 {
        return 0;
 }
@@ -1101,7 +1107,8 @@ void __init security_fixup_ops(struct security_operations *ops)
        set_to_cap_if_null(ops, xfrm_policy_clone_security);
        set_to_cap_if_null(ops, xfrm_policy_free_security);
        set_to_cap_if_null(ops, xfrm_policy_delete_security);
-       set_to_cap_if_null(ops, xfrm_state_alloc_security);
+       set_to_cap_if_null(ops, xfrm_state_alloc);
+       set_to_cap_if_null(ops, xfrm_state_alloc_acquire);
        set_to_cap_if_null(ops, xfrm_state_free_security);
        set_to_cap_if_null(ops, xfrm_state_delete_security);
        set_to_cap_if_null(ops, xfrm_policy_lookup);
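The hunk above reflects a hook split in struct security_operations: the old xfrm_state_alloc_security served both the user-requested and the kernel-acquired paths, distinguished by its secid argument; they are now two hooks with distinct signatures. A sketch of how an LSM would wire them (the example_* names are hypothetical):

    static struct security_operations example_ops = {
            .name                     = "example",
            /* user-requested state: sec_ctx supplied from userspace */
            .xfrm_state_alloc         = example_xfrm_state_alloc,
            /* kernel-acquired state: context cloned from the matching policy */
            .xfrm_state_alloc_acquire = example_xfrm_state_alloc_acquire,
    };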
index 0b759e1..77ca965 100644 (file)
@@ -13,7 +13,9 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/err.h>
+#include <linux/sched.h>
 #include <linux/rbtree.h>
+#include <linux/cred.h>
 #include <linux/key-type.h>
 #include <linux/digsig.h>
 
 
 static struct key *keyring[INTEGRITY_KEYRING_MAX];
 
+#ifdef CONFIG_IMA_TRUSTED_KEYRING
+static const char *keyring_name[INTEGRITY_KEYRING_MAX] = {
+       ".evm",
+       ".module",
+       ".ima",
+};
+#else
 static const char *keyring_name[INTEGRITY_KEYRING_MAX] = {
        "_evm",
        "_module",
        "_ima",
 };
+#endif
 
 int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
-                                       const char *digest, int digestlen)
+                           const char *digest, int digestlen)
 {
        if (id >= INTEGRITY_KEYRING_MAX)
                return -EINVAL;
 
        if (!keyring[id]) {
                keyring[id] =
-                       request_key(&key_type_keyring, keyring_name[id], NULL);
+                   request_key(&key_type_keyring, keyring_name[id], NULL);
                if (IS_ERR(keyring[id])) {
                        int err = PTR_ERR(keyring[id]);
                        pr_err("no %s keyring: %d\n", keyring_name[id], err);
@@ -44,9 +54,10 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
                }
        }
 
-       switch (sig[0]) {
+       switch (sig[1]) {
        case 1:
-               return digsig_verify(keyring[id], sig, siglen,
+               /* the v1 API expects the signature without the xattr type byte */
+               return digsig_verify(keyring[id], sig + 1, siglen - 1,
                                     digest, digestlen);
        case 2:
                return asymmetric_verify(keyring[id], sig, siglen,
@@ -55,3 +66,21 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 
        return -EOPNOTSUPP;
 }
+
+int integrity_init_keyring(const unsigned int id)
+{
+       const struct cred *cred = current_cred();
+       const struct user_struct *user = cred->user;
+
+       keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0),
+                                   KGIDT_INIT(0), cred,
+                                   ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                                    KEY_USR_VIEW | KEY_USR_READ),
+                                   KEY_ALLOC_NOT_IN_QUOTA, user->uid_keyring);
+       if (!IS_ERR(keyring[id]))
+               set_bit(KEY_FLAG_TRUSTED_ONLY, &keyring[id]->flags);
+       else
+               pr_info("Can't allocate %s keyring (%ld)\n",
+                       keyring_name[id], PTR_ERR(keyring[id]));
+       return 0;
+}
index b475466..9eae480 100644 (file)
 #include "integrity.h"
 
 /*
- * signature format v2 - for using with asymmetric keys
- */
-struct signature_v2_hdr {
-       uint8_t version;        /* signature format version */
-       uint8_t hash_algo;      /* Digest algorithm [enum pkey_hash_algo] */
-       uint32_t keyid;         /* IMA key identifier - not X509/PGP specific*/
-       uint16_t sig_size;      /* signature size */
-       uint8_t sig[0];         /* signature payload */
-} __packed;
-
-/*
  * Request an asymmetric key.
  */
 static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid)
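The v2 header deleted above moves into integrity.h (not part of this excerpt) so that ima_appraise.c below can cast a raw security.ima xattr to it. For that cast to line up with the sig[1] dispatch in digsig.c, the relocated struct presumably carries the xattr type as its first byte; a hedged reconstruction:

    struct signature_v2_hdr {
            uint8_t type;           /* xattr type: EVM_IMA_XATTR_DIGSIG */
            uint8_t version;        /* signature format version */
            uint8_t hash_algo;      /* digest algorithm [enum pkey_hash_algo] */
            uint32_t keyid;         /* IMA key identifier - not X509/PGP specific */
            uint16_t sig_size;      /* signature size */
            uint8_t sig[0];         /* signature payload */
    } __packed;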
index af9b685..336b3dd 100644 (file)
@@ -123,7 +123,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
                goto out;
        }
 
-       xattr_len = rc - 1;
+       xattr_len = rc;
 
        /* check value type */
        switch (xattr_data->type) {
@@ -143,7 +143,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
                if (rc)
                        break;
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM,
-                                       xattr_data->digest, xattr_len,
+                                       (const char *)xattr_data, xattr_len,
                                        calc.digest, sizeof(calc.digest));
                if (!rc) {
                        /* we probably want to replace rsa with hmac here */
index b1753e9..46408b9 100644 (file)
@@ -11,8 +11,9 @@
 
 #include <linux/module.h>
 #include <linux/xattr.h>
+#include <linux/evm.h>
 
-int posix_xattr_acl(char *xattr)
+int posix_xattr_acl(const char *xattr)
 {
        int xattr_len = strlen(xattr);
 
index 74522db..c49d3f1 100644 (file)
@@ -70,6 +70,8 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode)
 
 static void iint_free(struct integrity_iint_cache *iint)
 {
+       kfree(iint->ima_hash);
+       iint->ima_hash = NULL;
        iint->version = 0;
        iint->flags = 0UL;
        iint->ima_file_status = INTEGRITY_UNKNOWN;
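The new kfree() works because iint->ima_hash is now a dynamically sized record rather than a fixed 20-byte array. Its definition lives in integrity.h, outside this excerpt; reconstructed from the accesses in ima_appraise.c below (xattr.sha1.type, xattr.ng.algo, xattr.data[offset]), it is roughly:

    struct ima_digest_data {
            u8 algo;                        /* enum hash_algo */
            u8 length;                      /* digest size in bytes */
            union {
                    struct {
                            u8 unused;
                            u8 type;        /* legacy IMA_XATTR_DIGEST */
                    } sha1;
                    struct {
                            u8 type;        /* IMA_XATTR_DIGEST_NG */
                            u8 algo;
                    } ng;
                    u8 data[2];
            } xattr;
            u8 digest[0];                   /* 'length' bytes of hash */
    };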
index 39196ab..dad8d4c 100644 (file)
@@ -9,6 +9,7 @@ config IMA
        select CRYPTO_HMAC
        select CRYPTO_MD5
        select CRYPTO_SHA1
+       select CRYPTO_HASH_INFO
        select TCG_TPM if HAS_IOMEM && !UML
        select TCG_TIS if TCG_TPM && X86
        select TCG_IBMVTPM if TCG_TPM && PPC64
@@ -45,6 +46,69 @@ config IMA_LSM_RULES
        help
          Disabling this option will disregard LSM based policy rules.
 
+choice
+       prompt "Default template"
+       default IMA_NG_TEMPLATE
+       depends on IMA
+       help
+         Select the default IMA measurement template.
+
+         The original 'ima' measurement list template contains a
+         hash, defined as 20 bytes, and a null terminated pathname,
+         limited to 255 characters.  The 'ima-ng' measurement list
+         template permits both larger hash digests and longer
+         pathnames.
+
+       config IMA_TEMPLATE
+               bool "ima"
+       config IMA_NG_TEMPLATE
+               bool "ima-ng (default)"
+       config IMA_SIG_TEMPLATE
+               bool "ima-sig"
+endchoice
+
+config IMA_DEFAULT_TEMPLATE
+       string
+       depends on IMA
+       default "ima" if IMA_TEMPLATE
+       default "ima-ng" if IMA_NG_TEMPLATE
+       default "ima-sig" if IMA_SIG_TEMPLATE
+
+choice
+       prompt "Default integrity hash algorithm"
+       default IMA_DEFAULT_HASH_SHA1
+       depends on IMA
+       help
+         Select the default hash algorithm used for the measurement
+         list, integrity appraisal and audit log.  The compiled-in
+         default can be overridden at boot with the kernel command
+         line option 'ima_hash='.
+
+       config IMA_DEFAULT_HASH_SHA1
+               bool "SHA1 (default)"
+               depends on CRYPTO_SHA1
+
+       config IMA_DEFAULT_HASH_SHA256
+               bool "SHA256"
+               depends on CRYPTO_SHA256 && !IMA_TEMPLATE
+
+       config IMA_DEFAULT_HASH_SHA512
+               bool "SHA512"
+               depends on CRYPTO_SHA512 && !IMA_TEMPLATE
+
+       config IMA_DEFAULT_HASH_WP512
+               bool "WP512"
+               depends on CRYPTO_WP512 && !IMA_TEMPLATE
+endchoice
+
+config IMA_DEFAULT_HASH
+       string
+       depends on IMA
+       default "sha1" if IMA_DEFAULT_HASH_SHA1
+       default "sha256" if IMA_DEFAULT_HASH_SHA256
+       default "sha512" if IMA_DEFAULT_HASH_SHA512
+       default "wp512" if IMA_DEFAULT_HASH_WP512
+
 config IMA_APPRAISE
        bool "Appraise integrity measurements"
        depends on IMA
@@ -59,3 +123,11 @@ config IMA_APPRAISE
          For more information on integrity appraisal refer to:
          <http://linux-ima.sourceforge.net>
          If unsure, say N.
+
+config IMA_TRUSTED_KEYRING
+       bool "Require all keys on the .ima keyring be signed"
+       depends on IMA_APPRAISE && SYSTEM_TRUSTED_KEYRING
+       default y
+       help
+         This option requires that all keys added to the .ima
+         keyring be signed by a key on the system trusted keyring.
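Both defaults end up as plain strings consumed at boot: init_ima() in ima_main.c below replays CONFIG_IMA_DEFAULT_HASH through hash_setup(), while an 'ima_hash=' option on the kernel command line reaches the same parser much earlier via __setup(). In outline (the hash_setup_done guard appears in the ima_main.c hunk further down):

    static int hash_setup_done;

    static int __init hash_setup(char *str)
    {
            if (hash_setup_done)            /* the command line already chose */
                    return 1;
            /* ... map 'str' onto ima_hash_algo ... */
            hash_setup_done = 1;
            return 1;
    }
    __setup("ima_hash=", hash_setup);       /* runs during early param parsing */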
index 56dfee7..d79263d 100644 (file)
@@ -6,5 +6,5 @@
 obj-$(CONFIG_IMA) += ima.o
 
 ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \
-        ima_policy.o
+        ima_policy.o ima_template.o ima_template_lib.o
 ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o
index b3dd616..bf03c6a 100644 (file)
@@ -36,23 +36,48 @@ enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 };
 #define IMA_HASH_BITS 9
 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS)
 
+#define IMA_TEMPLATE_FIELD_ID_MAX_LEN  16
+#define IMA_TEMPLATE_NUM_FIELDS_MAX    15
+
+#define IMA_TEMPLATE_IMA_NAME "ima"
+#define IMA_TEMPLATE_IMA_FMT "d|n"
+
 /* set during initialization */
 extern int ima_initialized;
 extern int ima_used_chip;
-extern char *ima_hash;
+extern int ima_hash_algo;
 extern int ima_appraise;
 
-/* IMA inode template definition */
-struct ima_template_data {
-       u8 digest[IMA_DIGEST_SIZE];     /* sha1/md5 measurement hash */
-       char file_name[IMA_EVENT_NAME_LEN_MAX + 1];     /* name + \0 */
+/* IMA template field data definition */
+struct ima_field_data {
+       u8 *data;
+       u32 len;
+};
+
+/* IMA template field definition */
+struct ima_template_field {
+       const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN];
+       int (*field_init) (struct integrity_iint_cache *iint, struct file *file,
+                          const unsigned char *filename,
+                          struct evm_ima_xattr_data *xattr_value,
+                          int xattr_len, struct ima_field_data *field_data);
+       void (*field_show) (struct seq_file *m, enum ima_show_type show,
+                           struct ima_field_data *field_data);
+};
+
+/* IMA template descriptor definition */
+struct ima_template_desc {
+       char *name;
+       char *fmt;
+       int num_fields;
+       struct ima_template_field **fields;
 };
 
 struct ima_template_entry {
-       u8 digest[IMA_DIGEST_SIZE];     /* sha1 or md5 measurement hash */
-       const char *template_name;
-       int template_len;
-       struct ima_template_data template;
+       u8 digest[TPM_DIGEST_SIZE];     /* sha1 or md5 measurement hash */
+       struct ima_template_desc *template_desc; /* template descriptor */
+       u32 template_data_len;
+       struct ima_field_data template_data[0]; /* template related data */
 };
 
 struct ima_queue_entry {
@@ -69,13 +94,21 @@ int ima_fs_init(void);
 void ima_fs_cleanup(void);
 int ima_inode_alloc(struct inode *inode);
 int ima_add_template_entry(struct ima_template_entry *entry, int violation,
-                          const char *op, struct inode *inode);
-int ima_calc_file_hash(struct file *file, char *digest);
-int ima_calc_buffer_hash(const void *data, int len, char *digest);
-int ima_calc_boot_aggregate(char *digest);
-void ima_add_violation(struct inode *inode, const unsigned char *filename,
+                          const char *op, struct inode *inode,
+                          const unsigned char *filename);
+int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash);
+int ima_calc_field_array_hash(struct ima_field_data *field_data, int num_fields,
+                             struct ima_digest_data *hash);
+int __init ima_calc_boot_aggregate(struct ima_digest_data *hash);
+void ima_add_violation(struct file *file, const unsigned char *filename,
                       const char *op, const char *cause);
 int ima_init_crypto(void);
+void ima_putc(struct seq_file *m, void *data, int datalen);
+void ima_print_digest(struct seq_file *m, u8 *digest, int size);
+struct ima_template_desc *ima_template_desc_current(void);
+int ima_init_template(void);
 
 /*
  * used to protect h_table and sha_table
@@ -98,14 +131,21 @@ static inline unsigned long ima_hash_key(u8 *digest)
 int ima_get_action(struct inode *inode, int mask, int function);
 int ima_must_measure(struct inode *inode, int mask, int function);
 int ima_collect_measurement(struct integrity_iint_cache *iint,
-                           struct file *file);
+                           struct file *file,
+                           struct evm_ima_xattr_data **xattr_value,
+                           int *xattr_len);
 void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file,
-                          const unsigned char *filename);
+                          const unsigned char *filename,
+                          struct evm_ima_xattr_data *xattr_value,
+                          int xattr_len);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
                           const unsigned char *filename);
+int ima_alloc_init_template(struct integrity_iint_cache *iint,
+                           struct file *file, const unsigned char *filename,
+                           struct evm_ima_xattr_data *xattr_value,
+                           int xattr_len, struct ima_template_entry **entry);
 int ima_store_template(struct ima_template_entry *entry, int violation,
-                      struct inode *inode);
-void ima_template_show(struct seq_file *m, void *e, enum ima_show_type show);
+                      struct inode *inode, const unsigned char *filename);
 const char *ima_d_path(struct path *path, char **pathbuf);
 
 /* rbtree tree calls to lookup, insert, delete
@@ -131,17 +171,25 @@ void ima_delete_rules(void);
 
 #ifdef CONFIG_IMA_APPRAISE
 int ima_appraise_measurement(int func, struct integrity_iint_cache *iint,
-                            struct file *file, const unsigned char *filename);
+                            struct file *file, const unsigned char *filename,
+                            struct evm_ima_xattr_data *xattr_value,
+                            int xattr_len);
 int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
 void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
 enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
                                           int func);
+void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                      struct ima_digest_data *hash);
+int ima_read_xattr(struct dentry *dentry,
+                  struct evm_ima_xattr_data **xattr_value);
 
 #else
 static inline int ima_appraise_measurement(int func,
                                           struct integrity_iint_cache *iint,
                                           struct file *file,
-                                          const unsigned char *filename)
+                                          const unsigned char *filename,
+                                          struct evm_ima_xattr_data *xattr_value,
+                                          int xattr_len)
 {
        return INTEGRITY_UNKNOWN;
 }
@@ -162,6 +210,19 @@ static inline enum integrity_status ima_get_cache_status(struct integrity_iint_c
 {
        return INTEGRITY_UNKNOWN;
 }
+
+static inline void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value,
+                                    int xattr_len,
+                                    struct ima_digest_data *hash)
+{
+}
+
+static inline int ima_read_xattr(struct dentry *dentry,
+                                struct evm_ima_xattr_data **xattr_value)
+{
+       return 0;
+}
+
 #endif
 
 /* LSM based policy rules require audit */
index 1c03e8f..0e75408 100644 (file)
 #include <linux/fs.h>
 #include <linux/xattr.h>
 #include <linux/evm.h>
+#include <crypto/hash_info.h>
 #include "ima.h"
 
-static const char *IMA_TEMPLATE_NAME = "ima";
+/*
+ * ima_alloc_init_template - create and initialize a new template entry
+ */
+int ima_alloc_init_template(struct integrity_iint_cache *iint,
+                           struct file *file, const unsigned char *filename,
+                           struct evm_ima_xattr_data *xattr_value,
+                           int xattr_len, struct ima_template_entry **entry)
+{
+       struct ima_template_desc *template_desc = ima_template_desc_current();
+       int i, result = 0;
+
+       *entry = kzalloc(sizeof(**entry) + template_desc->num_fields *
+                        sizeof(struct ima_field_data), GFP_NOFS);
+       if (!*entry)
+               return -ENOMEM;
+
+       for (i = 0; i < template_desc->num_fields; i++) {
+               struct ima_template_field *field = template_desc->fields[i];
+               u32 len;
+
+               result = field->field_init(iint, file, filename,
+                                          xattr_value, xattr_len,
+                                          &((*entry)->template_data[i]));
+               if (result != 0)
+                       goto out;
+
+               len = (*entry)->template_data[i].len;
+               (*entry)->template_data_len += sizeof(len);
+               (*entry)->template_data_len += len;
+       }
+       (*entry)->template_desc = template_desc;
+       return 0;
+out:
+       kfree(*entry);
+       *entry = NULL;
+       return result;
+}
 
 /*
  * ima_store_template - store ima template measurements
@@ -39,28 +76,34 @@ static const char *IMA_TEMPLATE_NAME = "ima";
  * Returns 0 on success, error code otherwise
  */
 int ima_store_template(struct ima_template_entry *entry,
-                      int violation, struct inode *inode)
+                      int violation, struct inode *inode,
+                      const unsigned char *filename)
 {
        const char *op = "add_template_measure";
        const char *audit_cause = "hashing_error";
+       char *template_name = entry->template_desc->name;
        int result;
-
-       memset(entry->digest, 0, sizeof(entry->digest));
-       entry->template_name = IMA_TEMPLATE_NAME;
-       entry->template_len = sizeof(entry->template);
+       struct {
+               struct ima_digest_data hdr;
+               char digest[TPM_DIGEST_SIZE];
+       } hash;
 
        if (!violation) {
-               result = ima_calc_buffer_hash(&entry->template,
-                                               entry->template_len,
-                                               entry->digest);
+               int num_fields = entry->template_desc->num_fields;
+
+               /* the template hash is always SHA1 so it fits a TPM 1.2 PCR */
+               hash.hdr.algo = HASH_ALGO_SHA1;
+               result = ima_calc_field_array_hash(&entry->template_data[0],
+                                                  num_fields, &hash.hdr);
                if (result < 0) {
                        integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
-                                           entry->template_name, op,
+                                           template_name, op,
                                            audit_cause, result, 0);
                        return result;
                }
+               memcpy(entry->digest, hash.hdr.digest, hash.hdr.length);
        }
-       result = ima_add_template_entry(entry, violation, op, inode);
+       result = ima_add_template_entry(entry, violation, op, inode, filename);
        return result;
 }
 
@@ -71,24 +114,24 @@ int ima_store_template(struct ima_template_entry *entry,
  * By extending the PCR with 0xFF's instead of with zeroes, the PCR
  * value is invalidated.
  */
-void ima_add_violation(struct inode *inode, const unsigned char *filename,
+void ima_add_violation(struct file *file, const unsigned char *filename,
                       const char *op, const char *cause)
 {
        struct ima_template_entry *entry;
+       struct inode *inode = file->f_dentry->d_inode;
        int violation = 1;
        int result;
 
        /* can overflow, only indicator */
        atomic_long_inc(&ima_htable.violations);
 
-       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-       if (!entry) {
+       result = ima_alloc_init_template(NULL, file, filename,
+                                        NULL, 0, &entry);
+       if (result < 0) {
                result = -ENOMEM;
                goto err_out;
        }
-       memset(&entry->template, 0, sizeof(entry->template));
-       strncpy(entry->template.file_name, filename, IMA_EVENT_NAME_LEN_MAX);
-       result = ima_store_template(entry, violation, inode);
+       result = ima_store_template(entry, violation, inode, filename);
        if (result < 0)
                kfree(entry);
 err_out:
@@ -138,20 +181,42 @@ int ima_must_measure(struct inode *inode, int mask, int function)
  * Return 0 on success, error code otherwise
  */
 int ima_collect_measurement(struct integrity_iint_cache *iint,
-                           struct file *file)
+                           struct file *file,
+                           struct evm_ima_xattr_data **xattr_value,
+                           int *xattr_len)
 {
        struct inode *inode = file_inode(file);
        const char *filename = file->f_dentry->d_name.name;
        int result = 0;
+       struct {
+               struct ima_digest_data hdr;
+               char digest[IMA_MAX_DIGEST_SIZE];
+       } hash;
+
+       if (xattr_value)
+               *xattr_len = ima_read_xattr(file->f_dentry, xattr_value);
 
        if (!(iint->flags & IMA_COLLECTED)) {
                u64 i_version = file_inode(file)->i_version;
 
-               iint->ima_xattr.type = IMA_XATTR_DIGEST;
-               result = ima_calc_file_hash(file, iint->ima_xattr.digest);
+               /* use default hash algorithm */
+               hash.hdr.algo = ima_hash_algo;
+
+               if (xattr_value)
+                       ima_get_hash_algo(*xattr_value, *xattr_len, &hash.hdr);
+
+               result = ima_calc_file_hash(file, &hash.hdr);
                if (!result) {
-                       iint->version = i_version;
-                       iint->flags |= IMA_COLLECTED;
+                       int length = sizeof(hash.hdr) + hash.hdr.length;
+                       void *tmpbuf = krealloc(iint->ima_hash, length,
+                                               GFP_NOFS);
+                       if (tmpbuf) {
+                               iint->ima_hash = tmpbuf;
+                               memcpy(iint->ima_hash, &hash, length);
+                               iint->version = i_version;
+                               iint->flags |= IMA_COLLECTED;
+                       } else
+                               result = -ENOMEM;
                }
        }
        if (result)
@@ -177,7 +242,9 @@ int ima_collect_measurement(struct integrity_iint_cache *iint,
  * Must be called with iint->mutex held.
  */
 void ima_store_measurement(struct integrity_iint_cache *iint,
-                          struct file *file, const unsigned char *filename)
+                          struct file *file, const unsigned char *filename,
+                          struct evm_ima_xattr_data *xattr_value,
+                          int xattr_len)
 {
        const char *op = "add_template_measure";
        const char *audit_cause = "ENOMEM";
@@ -189,19 +256,15 @@ void ima_store_measurement(struct integrity_iint_cache *iint,
        if (iint->flags & IMA_MEASURED)
                return;
 
-       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-       if (!entry) {
+       result = ima_alloc_init_template(iint, file, filename,
+                                        xattr_value, xattr_len, &entry);
+       if (result < 0) {
                integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
                                    op, audit_cause, result, 0);
                return;
        }
-       memset(&entry->template, 0, sizeof(entry->template));
-       memcpy(entry->template.digest, iint->ima_xattr.digest, IMA_DIGEST_SIZE);
-       strcpy(entry->template.file_name,
-              (strlen(filename) > IMA_EVENT_NAME_LEN_MAX) ?
-              file->f_dentry->d_name.name : filename);
 
-       result = ima_store_template(entry, violation, inode);
+       result = ima_store_template(entry, violation, inode, filename);
        if (!result || result == -EEXIST)
                iint->flags |= IMA_MEASURED;
        if (result < 0)
@@ -212,14 +275,16 @@ void ima_audit_measurement(struct integrity_iint_cache *iint,
                           const unsigned char *filename)
 {
        struct audit_buffer *ab;
-       char hash[(IMA_DIGEST_SIZE * 2) + 1];
+       char hash[(iint->ima_hash->length * 2) + 1];
+       const char *algo_name = hash_algo_name[iint->ima_hash->algo];
+       char algo_hash[sizeof(hash) + strlen(algo_name) + 2];
        int i;
 
        if (iint->flags & IMA_AUDITED)
                return;
 
-       for (i = 0; i < IMA_DIGEST_SIZE; i++)
-               hex_byte_pack(hash + (i * 2), iint->ima_xattr.digest[i]);
+       for (i = 0; i < iint->ima_hash->length; i++)
+               hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]);
        hash[i * 2] = '\0';
 
        ab = audit_log_start(current->audit_context, GFP_KERNEL,
@@ -230,7 +295,8 @@ void ima_audit_measurement(struct integrity_iint_cache *iint,
        audit_log_format(ab, "file=");
        audit_log_untrustedstring(ab, filename);
        audit_log_format(ab, " hash=");
-       audit_log_untrustedstring(ab, hash);
+       snprintf(algo_hash, sizeof(algo_hash), "%s:%s", algo_name, hash);
+       audit_log_untrustedstring(ab, algo_hash);
 
        audit_log_task_info(ab, current);
        audit_log_end(ab);
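After ima_alloc_init_template(), an entry is a descriptor plus an array of length-prefixed field blobs; for 'ima-ng', field 0 ('d-ng') carries the file digest and field 1 ('n-ng') the pathname (field ids per ima_template.c at the end of this diff). A hedged sketch of walking one entry with the structures from ima.h:

    /* illustrative only: dump the fields of one template entry */
    static void dump_entry(struct ima_template_entry *e)
    {
            int i;

            pr_info("template=%s data_len=%u\n",
                    e->template_desc->name, e->template_data_len);
            for (i = 0; i < e->template_desc->num_fields; i++)
                    pr_info("  field %s: %u bytes\n",
                            e->template_desc->fields[i]->field_id,
                            e->template_data[i].len);
    }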
index 2d4beca..46353ee 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/magic.h>
 #include <linux/ima.h>
 #include <linux/evm.h>
+#include <crypto/hash_info.h>
 
 #include "ima.h"
 
@@ -43,19 +44,31 @@ int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
 }
 
 static int ima_fix_xattr(struct dentry *dentry,
-                         struct integrity_iint_cache *iint)
+                        struct integrity_iint_cache *iint)
 {
-       iint->ima_xattr.type = IMA_XATTR_DIGEST;
-       return __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA,
-                                    (u8 *)&iint->ima_xattr,
-                                     sizeof(iint->ima_xattr), 0);
+       int rc, offset;
+       u8 algo = iint->ima_hash->algo;
+
+       if (algo <= HASH_ALGO_SHA1) {
+               offset = 1;
+               iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST;
+       } else {
+               offset = 0;
+               iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
+               iint->ima_hash->xattr.ng.algo = algo;
+       }
+       rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA,
+                                  &iint->ima_hash->xattr.data[offset],
+                                  (sizeof(iint->ima_hash->xattr) - offset) +
+                                  iint->ima_hash->length, 0);
+       return rc;
 }
 
 /* Return specific func appraised cached result */
 enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
                                           int func)
 {
-       switch(func) {
+       switch (func) {
        case MMAP_CHECK:
                return iint->ima_mmap_status;
        case BPRM_CHECK:
@@ -71,7 +84,7 @@ enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
 static void ima_set_cache_status(struct integrity_iint_cache *iint,
                                 int func, enum integrity_status status)
 {
-       switch(func) {
+       switch (func) {
        case MMAP_CHECK:
                iint->ima_mmap_status = status;
                break;
@@ -90,7 +103,7 @@ static void ima_set_cache_status(struct integrity_iint_cache *iint,
 
 static void ima_cache_flags(struct integrity_iint_cache *iint, int func)
 {
-       switch(func) {
+       switch (func) {
        case MMAP_CHECK:
                iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED);
                break;
@@ -107,6 +120,50 @@ static void ima_cache_flags(struct integrity_iint_cache *iint, int func)
        }
 }
 
+void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                      struct ima_digest_data *hash)
+{
+       struct signature_v2_hdr *sig;
+
+       if (!xattr_value || xattr_len < 2)
+               return;
+
+       switch (xattr_value->type) {
+       case EVM_IMA_XATTR_DIGSIG:
+               sig = (typeof(sig))xattr_value;
+               if (sig->version != 2 || xattr_len <= sizeof(*sig))
+                       return;
+               hash->algo = sig->hash_algo;
+               break;
+       case IMA_XATTR_DIGEST_NG:
+               hash->algo = xattr_value->digest[0];
+               break;
+       case IMA_XATTR_DIGEST:
+               /* this is for backward compatibility */
+               if (xattr_len == 21) {
+                       unsigned int zero = 0;
+                       if (!memcmp(&xattr_value->digest[16], &zero, 4))
+                               hash->algo = HASH_ALGO_MD5;
+                       else
+                               hash->algo = HASH_ALGO_SHA1;
+               } else if (xattr_len == 17)
+                       hash->algo = HASH_ALGO_MD5;
+               break;
+       }
+}
+
+int ima_read_xattr(struct dentry *dentry,
+                  struct evm_ima_xattr_data **xattr_value)
+{
+       struct inode *inode = dentry->d_inode;
+
+       if (!inode->i_op->getxattr)
+               return 0;
+
+       return vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)xattr_value,
+                                 0, GFP_NOFS);
+}
+
 /*
  * ima_appraise_measurement - appraise file measurement
  *
@@ -116,23 +173,22 @@ static void ima_cache_flags(struct integrity_iint_cache *iint, int func)
  * Return 0 on success, error code otherwise
  */
 int ima_appraise_measurement(int func, struct integrity_iint_cache *iint,
-                            struct file *file, const unsigned char *filename)
+                            struct file *file, const unsigned char *filename,
+                            struct evm_ima_xattr_data *xattr_value,
+                            int xattr_len)
 {
        struct dentry *dentry = file->f_dentry;
        struct inode *inode = dentry->d_inode;
-       struct evm_ima_xattr_data *xattr_value = NULL;
        enum integrity_status status = INTEGRITY_UNKNOWN;
        const char *op = "appraise_data";
        char *cause = "unknown";
-       int rc;
+       int rc = xattr_len, hash_start = 0;
 
        if (!ima_appraise)
                return 0;
        if (!inode->i_op->getxattr)
                return INTEGRITY_UNKNOWN;
 
-       rc = vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)&xattr_value,
-                               0, GFP_NOFS);
        if (rc <= 0) {
                if (rc && rc != -ENODATA)
                        goto out;
@@ -153,14 +209,25 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint,
                goto out;
        }
        switch (xattr_value->type) {
+       case IMA_XATTR_DIGEST_NG:
+               /* first byte contains algorithm id */
+               hash_start = 1;
        case IMA_XATTR_DIGEST:
                if (iint->flags & IMA_DIGSIG_REQUIRED) {
                        cause = "IMA signature required";
                        status = INTEGRITY_FAIL;
                        break;
                }
-               rc = memcmp(xattr_value->digest, iint->ima_xattr.digest,
-                           IMA_DIGEST_SIZE);
+               if (xattr_len - sizeof(xattr_value->type) - hash_start >=
+                               iint->ima_hash->length)
+                       /* The xattr may be longer than the hash: previous
+                        * versions stored the md5 hash in 20 bytes of
+                        * xattr space instead of 16.
+                        */
+                       rc = memcmp(&xattr_value->digest[hash_start],
+                                   iint->ima_hash->digest,
+                                   iint->ima_hash->length);
+               else
+                       rc = -EINVAL;
                if (rc) {
                        cause = "invalid-hash";
                        status = INTEGRITY_FAIL;
@@ -171,9 +238,9 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint,
        case EVM_IMA_XATTR_DIGSIG:
                iint->flags |= IMA_DIGSIG;
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
-                                            xattr_value->digest, rc - 1,
-                                            iint->ima_xattr.digest,
-                                            IMA_DIGEST_SIZE);
+                                            (const char *)xattr_value, rc,
+                                            iint->ima_hash->digest,
+                                            iint->ima_hash->length);
                if (rc == -EOPNOTSUPP) {
                        status = INTEGRITY_UNKNOWN;
                } else if (rc) {
@@ -203,7 +270,6 @@ out:
                ima_cache_flags(iint, func);
        }
        ima_set_cache_status(iint, func, status);
-       kfree(xattr_value);
        return status;
 }
 
@@ -219,7 +285,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file)
        if (iint->flags & IMA_DIGSIG)
                return;
 
-       rc = ima_collect_measurement(iint, file);
+       rc = ima_collect_measurement(iint, file, NULL, NULL);
        if (rc < 0)
                return;
 
@@ -315,3 +381,14 @@ int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name)
        }
        return result;
 }
+
+#ifdef CONFIG_IMA_TRUSTED_KEYRING
+static int __init init_ima_keyring(void)
+{
+       return integrity_init_keyring(INTEGRITY_KEYRING_IMA);
+}
+late_initcall(init_ima_keyring);
+#endif
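The appraisal code above distinguishes three security.ima layouts by their first byte. The type values are defined in integrity.h (outside this excerpt); as a hedged reference, with the layout each one implies:

    enum evm_ima_xattr_type {
            IMA_XATTR_DIGEST     = 0x01,    /* [type][raw md5/sha1 digest]     */
            EVM_XATTR_HMAC       = 0x02,    /* EVM HMAC, not handled here      */
            EVM_IMA_XATTR_DIGSIG = 0x03,    /* [type][signature_v2_hdr + sig]  */
            IMA_XATTR_DIGEST_NG  = 0x04,    /* [type][algo id][raw digest]     */
    };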
index a02e079..676e029 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <crypto/hash.h>
+#include <crypto/hash_info.h>
 #include "ima.h"
 
 static struct crypto_shash *ima_shash_tfm;
@@ -28,31 +29,58 @@ int ima_init_crypto(void)
 {
        long rc;
 
-       ima_shash_tfm = crypto_alloc_shash(ima_hash, 0, 0);
+       ima_shash_tfm = crypto_alloc_shash(hash_algo_name[ima_hash_algo], 0, 0);
        if (IS_ERR(ima_shash_tfm)) {
                rc = PTR_ERR(ima_shash_tfm);
-               pr_err("Can not allocate %s (reason: %ld)\n", ima_hash, rc);
+               pr_err("Can not allocate %s (reason: %ld)\n",
+                      hash_algo_name[ima_hash_algo], rc);
                return rc;
        }
        return 0;
 }
 
+static struct crypto_shash *ima_alloc_tfm(enum hash_algo algo)
+{
+       struct crypto_shash *tfm = ima_shash_tfm;
+       int rc;
+
+       if (algo != ima_hash_algo && algo < HASH_ALGO__LAST) {
+               tfm = crypto_alloc_shash(hash_algo_name[algo], 0, 0);
+               if (IS_ERR(tfm)) {
+                       rc = PTR_ERR(tfm);
+                       pr_err("Can not allocate %s (reason: %d)\n",
+                              hash_algo_name[algo], rc);
+               }
+       }
+       return tfm;
+}
+
+static void ima_free_tfm(struct crypto_shash *tfm)
+{
+       if (tfm != ima_shash_tfm)
+               crypto_free_shash(tfm);
+}
+
 /*
  * Calculate the MD5/SHA1 file digest
  */
-int ima_calc_file_hash(struct file *file, char *digest)
+static int ima_calc_file_hash_tfm(struct file *file,
+                                 struct ima_digest_data *hash,
+                                 struct crypto_shash *tfm)
 {
        loff_t i_size, offset = 0;
        char *rbuf;
        int rc, read = 0;
        struct {
                struct shash_desc shash;
-               char ctx[crypto_shash_descsize(ima_shash_tfm)];
+               char ctx[crypto_shash_descsize(tfm)];
        } desc;
 
-       desc.shash.tfm = ima_shash_tfm;
+       desc.shash.tfm = tfm;
        desc.shash.flags = 0;
 
+       hash->length = crypto_shash_digestsize(tfm);
+
        rc = crypto_shash_init(&desc.shash);
        if (rc != 0)
                return rc;
@@ -85,27 +113,83 @@ int ima_calc_file_hash(struct file *file, char *digest)
        }
        kfree(rbuf);
        if (!rc)
-               rc = crypto_shash_final(&desc.shash, digest);
+               rc = crypto_shash_final(&desc.shash, hash->digest);
        if (read)
                file->f_mode &= ~FMODE_READ;
 out:
        return rc;
 }
 
+int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash)
+{
+       struct crypto_shash *tfm;
+       int rc;
+
+       tfm = ima_alloc_tfm(hash->algo);
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       rc = ima_calc_file_hash_tfm(file, hash, tfm);
+
+       ima_free_tfm(tfm);
+
+       return rc;
+}
+
 /*
- * Calculate the hash of a given buffer
+ * Calculate the hash of template data
  */
-int ima_calc_buffer_hash(const void *data, int len, char *digest)
+static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data,
+                                        int num_fields,
+                                        struct ima_digest_data *hash,
+                                        struct crypto_shash *tfm)
 {
        struct {
                struct shash_desc shash;
-               char ctx[crypto_shash_descsize(ima_shash_tfm)];
+               char ctx[crypto_shash_descsize(tfm)];
        } desc;
+       int rc, i;
 
-       desc.shash.tfm = ima_shash_tfm;
+       desc.shash.tfm = tfm;
        desc.shash.flags = 0;
 
-       return crypto_shash_digest(&desc.shash, data, len, digest);
+       hash->length = crypto_shash_digestsize(tfm);
+
+       rc = crypto_shash_init(&desc.shash);
+       if (rc != 0)
+               return rc;
+
+       for (i = 0; i < num_fields; i++) {
+               rc = crypto_shash_update(&desc.shash,
+                                        (const u8 *) &field_data[i].len,
+                                        sizeof(field_data[i].len));
+               rc = crypto_shash_update(&desc.shash, field_data[i].data,
+                                        field_data[i].len);
+               if (rc)
+                       break;
+       }
+
+       if (!rc)
+               rc = crypto_shash_final(&desc.shash, hash->digest);
+
+       return rc;
+}
+
+int ima_calc_field_array_hash(struct ima_field_data *field_data, int num_fields,
+                             struct ima_digest_data *hash)
+{
+       struct crypto_shash *tfm;
+       int rc;
+
+       tfm = ima_alloc_tfm(hash->algo);
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       rc = ima_calc_field_array_hash_tfm(field_data, num_fields, hash, tfm);
+
+       ima_free_tfm(tfm);
+
+       return rc;
 }
 
 static void __init ima_pcrread(int idx, u8 *pcr)
@@ -120,16 +204,17 @@ static void __init ima_pcrread(int idx, u8 *pcr)
 /*
  * Calculate the boot aggregate hash
  */
-int __init ima_calc_boot_aggregate(char *digest)
+static int __init ima_calc_boot_aggregate_tfm(char *digest,
+                                             struct crypto_shash *tfm)
 {
-       u8 pcr_i[IMA_DIGEST_SIZE];
+       u8 pcr_i[TPM_DIGEST_SIZE];
        int rc, i;
        struct {
                struct shash_desc shash;
-               char ctx[crypto_shash_descsize(ima_shash_tfm)];
+               char ctx[crypto_shash_descsize(tfm)];
        } desc;
 
-       desc.shash.tfm = ima_shash_tfm;
+       desc.shash.tfm = tfm;
        desc.shash.flags = 0;
 
        rc = crypto_shash_init(&desc.shash);
@@ -140,9 +225,26 @@ int __init ima_calc_boot_aggregate(char *digest)
        for (i = TPM_PCR0; i < TPM_PCR8; i++) {
                ima_pcrread(i, pcr_i);
                /* now accumulate with current aggregate */
-               rc = crypto_shash_update(&desc.shash, pcr_i, IMA_DIGEST_SIZE);
+               rc = crypto_shash_update(&desc.shash, pcr_i, TPM_DIGEST_SIZE);
        }
        if (!rc)
                crypto_shash_final(&desc.shash, digest);
        return rc;
 }
+
+int __init ima_calc_boot_aggregate(struct ima_digest_data *hash)
+{
+       struct crypto_shash *tfm;
+       int rc;
+
+       tfm = ima_alloc_tfm(hash->algo);
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       hash->length = crypto_shash_digestsize(tfm);
+       rc = ima_calc_boot_aggregate_tfm(hash->digest, tfm);
+
+       ima_free_tfm(tfm);
+
+       return rc;
+}
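ima_calc_field_array_hash_tfm() folds each field's u32 length into the digest ahead of its payload, so the template hash covers the framing as well as the data (note the loop overwrites the return code of the length update with that of the data update). A userspace sketch reproducing the computation with OpenSSL, assuming a little-endian host so the raw u32 bytes match the kernel's in-memory length:

    #include <stdint.h>
    #include <openssl/sha.h>

    struct field { const uint8_t *data; uint32_t len; };

    static void template_digest(const struct field *f, int n,
                                uint8_t out[SHA_DIGEST_LENGTH])
    {
            SHA_CTX ctx;
            int i;

            SHA1_Init(&ctx);
            for (i = 0; i < n; i++) {
                    SHA1_Update(&ctx, &f[i].len, sizeof(f[i].len)); /* length first */
                    SHA1_Update(&ctx, f[i].data, f[i].len);         /* then payload */
            }
            SHA1_Final(out, &ctx);
    }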
index 38477c9..d47a7c8 100644 (file)
@@ -88,8 +88,7 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos)
         * against concurrent list-extension
         */
        rcu_read_lock();
-       qe = list_entry_rcu(qe->later.next,
-                           struct ima_queue_entry, later);
+       qe = list_entry_rcu(qe->later.next, struct ima_queue_entry, later);
        rcu_read_unlock();
        (*pos)++;
 
@@ -100,7 +99,7 @@ static void ima_measurements_stop(struct seq_file *m, void *v)
 {
 }
 
-static void ima_putc(struct seq_file *m, void *data, int datalen)
+void ima_putc(struct seq_file *m, void *data, int datalen)
 {
        while (datalen--)
                seq_putc(m, *(char *)data++);
@@ -111,6 +110,7 @@ static void ima_putc(struct seq_file *m, void *data, int datalen)
  *       char[20]=template digest
  *       32bit-le=template name size
  *       char[n]=template name
+ *       [eventdata length]
  *       eventdata[n]=template specific data
  */
 static int ima_measurements_show(struct seq_file *m, void *v)
@@ -120,6 +120,7 @@ static int ima_measurements_show(struct seq_file *m, void *v)
        struct ima_template_entry *e;
        int namelen;
        u32 pcr = CONFIG_IMA_MEASURE_PCR_IDX;
+       int i;
 
        /* get entry */
        e = qe->entry;
@@ -134,18 +135,25 @@ static int ima_measurements_show(struct seq_file *m, void *v)
        ima_putc(m, &pcr, sizeof pcr);
 
        /* 2nd: template digest */
-       ima_putc(m, e->digest, IMA_DIGEST_SIZE);
+       ima_putc(m, e->digest, TPM_DIGEST_SIZE);
 
        /* 3rd: template name size */
-       namelen = strlen(e->template_name);
+       namelen = strlen(e->template_desc->name);
        ima_putc(m, &namelen, sizeof namelen);
 
        /* 4th:  template name */
-       ima_putc(m, (void *)e->template_name, namelen);
+       ima_putc(m, e->template_desc->name, namelen);
+
+       /* 5th:  template length (except for 'ima' template) */
+       if (strcmp(e->template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0)
+               ima_putc(m, &e->template_data_len,
+                        sizeof(e->template_data_len));
 
-       /* 5th:  template specific data */
-       ima_template_show(m, (struct ima_template_data *)&e->template,
-                         IMA_SHOW_BINARY);
+       /* 6th:  template specific data */
+       for (i = 0; i < e->template_desc->num_fields; i++) {
+               e->template_desc->fields[i]->field_show(m, IMA_SHOW_BINARY,
+                                                       &e->template_data[i]);
+       }
        return 0;
 }
 
@@ -168,41 +176,21 @@ static const struct file_operations ima_measurements_ops = {
        .release = seq_release,
 };
 
-static void ima_print_digest(struct seq_file *m, u8 *digest)
+void ima_print_digest(struct seq_file *m, u8 *digest, int size)
 {
        int i;
 
-       for (i = 0; i < IMA_DIGEST_SIZE; i++)
+       for (i = 0; i < size; i++)
                seq_printf(m, "%02x", *(digest + i));
 }
 
-void ima_template_show(struct seq_file *m, void *e, enum ima_show_type show)
-{
-       struct ima_template_data *entry = e;
-       int namelen;
-
-       switch (show) {
-       case IMA_SHOW_ASCII:
-               ima_print_digest(m, entry->digest);
-               seq_printf(m, " %s\n", entry->file_name);
-               break;
-       case IMA_SHOW_BINARY:
-               ima_putc(m, entry->digest, IMA_DIGEST_SIZE);
-
-               namelen = strlen(entry->file_name);
-               ima_putc(m, &namelen, sizeof namelen);
-               ima_putc(m, entry->file_name, namelen);
-       default:
-               break;
-       }
-}
-
 /* print in ascii */
 static int ima_ascii_measurements_show(struct seq_file *m, void *v)
 {
        /* the list never shrinks, so we don't need a lock here */
        struct ima_queue_entry *qe = v;
        struct ima_template_entry *e;
+       int i;
 
        /* get entry */
        e = qe->entry;
@@ -213,14 +201,21 @@ static int ima_ascii_measurements_show(struct seq_file *m, void *v)
        seq_printf(m, "%2d ", CONFIG_IMA_MEASURE_PCR_IDX);
 
        /* 2nd: SHA1 template hash */
-       ima_print_digest(m, e->digest);
+       ima_print_digest(m, e->digest, TPM_DIGEST_SIZE);
 
        /* 3rd:  template name */
-       seq_printf(m, " %s ", e->template_name);
+       seq_printf(m, " %s", e->template_desc->name);
 
        /* 4th:  template specific data */
-       ima_template_show(m, (struct ima_template_data *)&e->template,
-                         IMA_SHOW_ASCII);
+       for (i = 0; i < e->template_desc->num_fields; i++) {
+               seq_puts(m, " ");
+               if (e->template_data[i].len == 0)
+                       continue;
+
+               e->template_desc->fields[i]->field_show(m, IMA_SHOW_ASCII,
+                                                       &e->template_data[i]);
+       }
+       seq_puts(m, "\n");
        return 0;
 }
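ima_measurements_show() serializes each record with native-endian u32 framing, and templates other than 'ima' additionally carry a total data length so parsers can skip fields they do not understand. The fixed part of one binary_runtime_measurements record, as a hypothetical userspace header:

    struct ima_binary_record_hdr {
            uint32_t pcr;           /* PCR the entry was extended into */
            uint8_t  digest[20];    /* SHA1 template digest (TPM_DIGEST_SIZE) */
            uint32_t name_len;      /* length of the template name */
            /* char name[name_len]; then, unless name is "ima",
             * uint32_t template_data_len; then the field data follow */
    } __attribute__((packed));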
 
index 162ea72..15f34bd 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <crypto/hash_info.h>
 #include "ima.h"
 
 /* name for boot aggregate entry */
@@ -42,28 +43,38 @@ int ima_used_chip;
 static void __init ima_add_boot_aggregate(void)
 {
        struct ima_template_entry *entry;
+       struct integrity_iint_cache tmp_iint, *iint = &tmp_iint;
        const char *op = "add_boot_aggregate";
        const char *audit_cause = "ENOMEM";
        int result = -ENOMEM;
-       int violation = 1;
+       int violation = 0;
+       struct {
+               struct ima_digest_data hdr;
+               char digest[TPM_DIGEST_SIZE];
+       } hash;
 
-       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-       if (!entry)
-               goto err_out;
+       memset(iint, 0, sizeof(*iint));
+       memset(&hash, 0, sizeof(hash));
+       iint->ima_hash = &hash.hdr;
+       iint->ima_hash->algo = HASH_ALGO_SHA1;
+       iint->ima_hash->length = SHA1_DIGEST_SIZE;
 
-       memset(&entry->template, 0, sizeof(entry->template));
-       strncpy(entry->template.file_name, boot_aggregate_name,
-               IMA_EVENT_NAME_LEN_MAX);
        if (ima_used_chip) {
-               violation = 0;
-               result = ima_calc_boot_aggregate(entry->template.digest);
+               result = ima_calc_boot_aggregate(&hash.hdr);
                if (result < 0) {
                        audit_cause = "hashing_error";
-                       kfree(entry);
                        goto err_out;
                }
        }
-       result = ima_store_template(entry, violation, NULL);
+
+       result = ima_alloc_init_template(iint, NULL, boot_aggregate_name,
+                                        NULL, 0, &entry);
+       if (result < 0)
+               return;
+
+       result = ima_store_template(entry, violation, NULL,
+                                   boot_aggregate_name);
        if (result < 0)
                kfree(entry);
        return;
@@ -74,7 +85,7 @@ err_out:
 
 int __init ima_init(void)
 {
-       u8 pcr_i[IMA_DIGEST_SIZE];
+       u8 pcr_i[TPM_DIGEST_SIZE];
        int rc;
 
        ima_used_chip = 0;
@@ -88,6 +99,10 @@ int __init ima_init(void)
        rc = ima_init_crypto();
        if (rc)
                return rc;
+       rc = ima_init_template();
+       if (rc != 0)
+               return rc;
+
        ima_add_boot_aggregate();       /* boot aggregate must be first entry */
        ima_init_policy();
 
index e9508d5..149ee11 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/ima.h>
+#include <crypto/hash_info.h>
 
 #include "ima.h"
 
@@ -35,11 +36,33 @@ int ima_appraise = IMA_APPRAISE_ENFORCE;
 int ima_appraise;
 #endif
 
-char *ima_hash = "sha1";
+int ima_hash_algo = HASH_ALGO_SHA1;
+static int hash_setup_done;
+
 static int __init hash_setup(char *str)
 {
-       if (strncmp(str, "md5", 3) == 0)
-               ima_hash = "md5";
+       struct ima_template_desc *template_desc = ima_template_desc_current();
+       int i;
+
+       if (hash_setup_done)
+               return 1;
+
+       if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) {
+               if (strncmp(str, "sha1", 4) == 0)
+                       ima_hash_algo = HASH_ALGO_SHA1;
+               else if (strncmp(str, "md5", 3) == 0)
+                       ima_hash_algo = HASH_ALGO_MD5;
+               goto out;
+       }
+
+       for (i = 0; i < HASH_ALGO__LAST; i++) {
+               if (strcmp(str, hash_algo_name[i]) == 0) {
+                       ima_hash_algo = i;
+                       break;
+               }
+       }
+out:
+       hash_setup_done = 1;
        return 1;
 }
 __setup("ima_hash=", hash_setup);
@@ -92,10 +115,9 @@ out:
                pathname = dentry->d_name.name;
 
        if (send_tomtou)
-               ima_add_violation(inode, pathname,
-                                 "invalid_pcr", "ToMToU");
+               ima_add_violation(file, pathname, "invalid_pcr", "ToMToU");
        if (send_writers)
-               ima_add_violation(inode, pathname,
+               ima_add_violation(file, pathname,
                                  "invalid_pcr", "open_writers");
        kfree(pathbuf);
 }
@@ -144,9 +166,12 @@ static int process_measurement(struct file *file, const char *filename,
 {
        struct inode *inode = file_inode(file);
        struct integrity_iint_cache *iint;
+       struct ima_template_desc *template_desc = ima_template_desc_current();
        char *pathbuf = NULL;
        const char *pathname = NULL;
        int rc = -ENOMEM, action, must_appraise, _func;
+       struct evm_ima_xattr_data *xattr_value = NULL, **xattr_ptr = NULL;
+       int xattr_len = 0;
 
        if (!ima_initialized || !S_ISREG(inode->i_mode))
                return 0;
@@ -185,7 +210,13 @@ static int process_measurement(struct file *file, const char *filename,
                goto out_digsig;
        }
 
-       rc = ima_collect_measurement(iint, file);
+       if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) {
+               if (action & IMA_APPRAISE_SUBMASK)
+                       xattr_ptr = &xattr_value;
+       } else
+               xattr_ptr = &xattr_value;
+
+       rc = ima_collect_measurement(iint, file, xattr_ptr, &xattr_len);
        if (rc != 0)
                goto out_digsig;
 
@@ -194,9 +225,11 @@ static int process_measurement(struct file *file, const char *filename,
                pathname = (const char *)file->f_dentry->d_name.name;
 
        if (action & IMA_MEASURE)
-               ima_store_measurement(iint, file, pathname);
+               ima_store_measurement(iint, file, pathname,
+                                     xattr_value, xattr_len);
        if (action & IMA_APPRAISE_SUBMASK)
-               rc = ima_appraise_measurement(_func, iint, file, pathname);
+               rc = ima_appraise_measurement(_func, iint, file, pathname,
+                                             xattr_value, xattr_len);
        if (action & IMA_AUDIT)
                ima_audit_measurement(iint, pathname);
        kfree(pathbuf);
@@ -205,6 +238,7 @@ out_digsig:
                rc = -EACCES;
 out:
        mutex_unlock(&inode->i_mutex);
+       kfree(xattr_value);
        if ((rc && must_appraise) && (ima_appraise & IMA_APPRAISE_ENFORCE))
                return -EACCES;
        return 0;
@@ -244,9 +278,9 @@ int ima_file_mmap(struct file *file, unsigned long prot)
 int ima_bprm_check(struct linux_binprm *bprm)
 {
        return process_measurement(bprm->file,
-                                (strcmp(bprm->filename, bprm->interp) == 0) ?
-                                bprm->filename : bprm->interp,
-                                MAY_EXEC, BPRM_CHECK);
+                                  (strcmp(bprm->filename, bprm->interp) == 0) ?
+                                  bprm->filename : bprm->interp,
+                                  MAY_EXEC, BPRM_CHECK);
 }
 
 /**
@@ -263,8 +297,8 @@ int ima_file_check(struct file *file, int mask)
 {
        ima_rdwr_violation_check(file);
        return process_measurement(file, NULL,
-                                mask & (MAY_READ | MAY_WRITE | MAY_EXEC),
-                                FILE_CHECK);
+                                  mask & (MAY_READ | MAY_WRITE | MAY_EXEC),
+                                  FILE_CHECK);
 }
 EXPORT_SYMBOL_GPL(ima_file_check);
 
@@ -294,6 +328,7 @@ static int __init init_ima(void)
 {
        int error;
 
+       hash_setup(CONFIG_IMA_DEFAULT_HASH);
        error = ima_init();
        if (!error)
                ima_initialized = 1;
index 399433a..a9c3d3c 100644 (file)
@@ -73,7 +73,6 @@ static struct ima_rule_entry default_rules[] = {
        {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC},
-       {.action = DONT_MEASURE,.fsmagic = RAMFS_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = DEVPTS_SUPER_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = BINFMTFS_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC},
index ff63fe0..d85e997 100644 (file)
@@ -50,7 +50,7 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value)
        key = ima_hash_key(digest_value);
        rcu_read_lock();
        hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) {
-               rc = memcmp(qe->entry->digest, digest_value, IMA_DIGEST_SIZE);
+               rc = memcmp(qe->entry->digest, digest_value, TPM_DIGEST_SIZE);
                if (rc == 0) {
                        ret = qe;
                        break;
@@ -104,9 +104,10 @@ static int ima_pcr_extend(const u8 *hash)
  * and extend the pcr.
  */
 int ima_add_template_entry(struct ima_template_entry *entry, int violation,
-                          const char *op, struct inode *inode)
+                          const char *op, struct inode *inode,
+                          const unsigned char *filename)
 {
-       u8 digest[IMA_DIGEST_SIZE];
+       u8 digest[TPM_DIGEST_SIZE];
        const char *audit_cause = "hash_added";
        char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX];
        int audit_info = 1;
@@ -141,8 +142,7 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation,
        }
 out:
        mutex_unlock(&ima_extend_list_mutex);
-       integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
-                           entry->template.file_name,
+       integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
                            op, audit_cause, result, audit_info);
        return result;
 }
diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c
new file mode 100644 (file)
index 0000000..4e5da99
--- /dev/null
+++ b/security/integrity/ima/ima_template.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2013 Politecnico di Torino, Italy
+ *                    TORSEC group -- http://security.polito.it
+ *
+ * Author: Roberto Sassu <roberto.sassu@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * File: ima_template.c
+ *      Helpers to manage template descriptors.
+ */
+#include <crypto/hash_info.h>
+
+#include "ima.h"
+#include "ima_template_lib.h"
+
+static struct ima_template_desc defined_templates[] = {
+       {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT},
+       {.name = "ima-ng",.fmt = "d-ng|n-ng"},
+       {.name = "ima-sig",.fmt = "d-ng|n-ng|sig"},
+};
+
+static struct ima_template_field supported_fields[] = {
+       {.field_id = "d",.field_init = ima_eventdigest_init,
+        .field_show = ima_show_template_digest},
+       {.field_id = "n",.field_init = ima_eventname_init,
+        .field_show = ima_show_template_string},
+       {.field_id = "d-ng",.field_init = ima_eventdigest_ng_init,
+        .field_show = ima_show_template_digest_ng},
+       {.field_id = "n-ng",.field_init = ima_eventname_ng_init,
+        .field_show = ima_show_template_string},
+       {.field_id = "sig",.field_init = ima_eventsig_init,
+        .field_show = ima_show_template_sig},
+};
+
+static struct ima_template_desc *ima_template;
+static struct ima_template_desc *lookup_template_desc(const char *name);
+
+static int __init ima_template_setup(char *str)
+{
+       struct ima_template_desc *template_desc;
+       int template_len = strlen(str);
+
+       /*
+        * Verify that a template with the supplied name exists.
+        * If not, use CONFIG_IMA_DEFAULT_TEMPLATE.
+        */
+       template_desc = lookup_template_desc(str);
+       if (!template_desc)
+               return 1;
+
+       /*
+        * Verify whether the current hash algorithm is supported
+        * by the 'ima' template.
+        */
+       if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 &&
+           ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) {
+               pr_err("IMA: template does not support hash alg\n");
+               return 1;
+       }
+
+       ima_template = template_desc;
+       return 1;
+}
+__setup("ima_template=", ima_template_setup);
+
+static struct ima_template_desc *lookup_template_desc(const char *name)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(defined_templates); i++) {
+               if (strcmp(defined_templates[i].name, name) == 0)
+                       return defined_templates + i;
+       }
+
+       return NULL;
+}
+
+static struct ima_template_field *lookup_template_field(const char *field_id)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(supported_fields); i++)
+               if (strncmp(supported_fields[i].field_id, field_id,
+                           IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0)
+                       return &supported_fields[i];
+       return NULL;
+}
+
+static int template_fmt_size(char *template_fmt)
+{
+       char c;
+       int template_fmt_len = strlen(template_fmt);
+       int i = 0, j = 0;
+
+       while (i < template_fmt_len) {
+               c = template_fmt[i];
+               if (c == '|')
+                       j++;
+               i++;
+       }
+
+       return j + 1;
+}
+
+static int template_desc_init_fields(char *template_fmt,
+                                    struct ima_template_field ***fields,
+                                    int *num_fields)
+{
+       char *c, *template_fmt_ptr = template_fmt;
+       int template_num_fields = template_fmt_size(template_fmt);
+       int i, result = 0;
+
+       if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX)
+               return -EINVAL;
+
+       *fields = kzalloc(template_num_fields * sizeof(*fields), GFP_KERNEL);
+       if (*fields == NULL) {
+               result = -ENOMEM;
+               goto out;
+       }
+       for (i = 0; (c = strsep(&template_fmt_ptr, "|")) != NULL &&
+            i < template_num_fields; i++) {
+               struct ima_template_field *f = lookup_template_field(c);
+
+               if (!f) {
+                       result = -ENOENT;
+                       goto out;
+               }
+               (*fields)[i] = f;
+       }
+       *num_fields = i;
+       return 0;
+out:
+       kfree(*fields);
+       *fields = NULL;
+       return result;
+}
+
+static int init_defined_templates(void)
+{
+       int i = 0;
+       int result = 0;
+
+       /* Init defined templates. */
+       for (i = 0; i < ARRAY_SIZE(defined_templates); i++) {
+               struct ima_template_desc *template = &defined_templates[i];
+
+               result = template_desc_init_fields(template->fmt,
+                                                  &(template->fields),
+                                                  &(template->num_fields));
+               if (result < 0)
+                       return result;
+       }
+       return result;
+}
+
+struct ima_template_desc *ima_template_desc_current(void)
+{
+       if (!ima_template)
+               ima_template =
+                   lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE);
+       return ima_template;
+}
+
+int ima_init_template(void)
+{
+       int result;
+
+       result = init_defined_templates();
+       if (result < 0)
+               return result;
+
+       return 0;
+}
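
To see the descriptor parsing above in action outside the kernel, the sketch below (not part of the patch) performs the same strsep()-on-'|' walk as template_desc_init_fields(), against a stub table standing in for supported_fields[]:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

/* Stub for the field ids registered in supported_fields[]. */
static const char *const known_fields[] = { "d", "n", "d-ng", "n-ng", "sig" };

static int lookup_field(const char *id)
{
	size_t i;

	for (i = 0; i < sizeof(known_fields) / sizeof(known_fields[0]); i++)
		if (strcmp(known_fields[i], id) == 0)
			return 0;
	return -1;
}

int main(void)
{
	char fmt[] = "d-ng|n-ng|sig";	/* mutable copy: strsep() rewrites it */
	char *p = fmt, *id;

	while ((id = strsep(&p, "|")) != NULL) {
		if (lookup_field(id) < 0) {
			fprintf(stderr, "unknown template field '%s'\n", id);
			return 1;
		}
		printf("field: %s\n", id);
	}
	return 0;
}
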
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
new file mode 100644
index 0000000..6d66ad6
--- /dev/null
@@ -0,0 +1,347 @@
+/*
+ * Copyright (C) 2013 Politecnico di Torino, Italy
+ *                    TORSEC group -- http://security.polito.it
+ *
+ * Author: Roberto Sassu <roberto.sassu@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * File: ima_template_lib.c
+ *      Library of supported template fields.
+ */
+#include <crypto/hash_info.h>
+
+#include "ima_template_lib.h"
+
+static bool ima_template_hash_algo_allowed(u8 algo)
+{
+       if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5)
+               return true;
+
+       return false;
+}
+
+enum data_formats {
+       DATA_FMT_DIGEST = 0,
+       DATA_FMT_DIGEST_WITH_ALGO,
+       DATA_FMT_EVENT_NAME,
+       DATA_FMT_STRING,
+       DATA_FMT_HEX
+};
+
+static int ima_write_template_field_data(const void *data, const u32 datalen,
+                                        enum data_formats datafmt,
+                                        struct ima_field_data *field_data)
+{
+       u8 *buf, *buf_ptr;
+       u32 buflen;
+
+       switch (datafmt) {
+       case DATA_FMT_EVENT_NAME:
+               buflen = IMA_EVENT_NAME_LEN_MAX + 1;
+               break;
+       case DATA_FMT_STRING:
+               buflen = datalen + 1;
+               break;
+       default:
+               buflen = datalen;
+       }
+
+       buf = kzalloc(buflen, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       memcpy(buf, data, datalen);
+
+       /*
+        * Replace all space characters with underscore for event names and
+        * strings. This prevents filenames that contain spaces, or that end
+        * with the suffix ' (deleted)', from being split into multiple
+        * template fields when a measurements list is parsed (the space is
+        * the delimiter character for measurements lists in ASCII format).
+        */
+       if (datafmt == DATA_FMT_EVENT_NAME || datafmt == DATA_FMT_STRING) {
+               for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++)
+                       if (*buf_ptr == ' ')
+                               *buf_ptr = '_';
+       }
+
+       field_data->data = buf;
+       field_data->len = buflen;
+       return 0;
+}
+
+static void ima_show_template_data_ascii(struct seq_file *m,
+                                        enum ima_show_type show,
+                                        enum data_formats datafmt,
+                                        struct ima_field_data *field_data)
+{
+       u8 *buf_ptr = field_data->data;
+       u32 buflen = field_data->len;
+
+       switch (datafmt) {
+       case DATA_FMT_DIGEST_WITH_ALGO:
+               buf_ptr = strnchr(field_data->data, buflen, ':');
+               if (buf_ptr != field_data->data)
+                       seq_printf(m, "%s", field_data->data);
+
+               /* skip ':' and '\0' */
+               buf_ptr += 2;
+               buflen -= buf_ptr - field_data->data;
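+               /* fall through to print the digest */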
+       case DATA_FMT_DIGEST:
+       case DATA_FMT_HEX:
+               if (!buflen)
+                       break;
+               ima_print_digest(m, buf_ptr, buflen);
+               break;
+       case DATA_FMT_STRING:
+               seq_printf(m, "%s", buf_ptr);
+               break;
+       default:
+               break;
+       }
+}
+
+static void ima_show_template_data_binary(struct seq_file *m,
+                                         enum ima_show_type show,
+                                         enum data_formats datafmt,
+                                         struct ima_field_data *field_data)
+{
+       ima_putc(m, &field_data->len, sizeof(u32));
+       if (!field_data->len)
+               return;
+       ima_putc(m, field_data->data, field_data->len);
+}
+
+static void ima_show_template_field_data(struct seq_file *m,
+                                        enum ima_show_type show,
+                                        enum data_formats datafmt,
+                                        struct ima_field_data *field_data)
+{
+       switch (show) {
+       case IMA_SHOW_ASCII:
+               ima_show_template_data_ascii(m, show, datafmt, field_data);
+               break;
+       case IMA_SHOW_BINARY:
+               ima_show_template_data_binary(m, show, datafmt, field_data);
+               break;
+       default:
+               break;
+       }
+}
+
+void ima_show_template_digest(struct seq_file *m, enum ima_show_type show,
+                             struct ima_field_data *field_data)
+{
+       ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data);
+}
+
+void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show,
+                                struct ima_field_data *field_data)
+{
+       ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO,
+                                    field_data);
+}
+
+void ima_show_template_string(struct seq_file *m, enum ima_show_type show,
+                             struct ima_field_data *field_data)
+{
+       ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data);
+}
+
+void ima_show_template_sig(struct seq_file *m, enum ima_show_type show,
+                          struct ima_field_data *field_data)
+{
+       ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data);
+}
+
+static int ima_eventdigest_init_common(u8 *digest, u32 digestsize, u8 hash_algo,
+                                      struct ima_field_data *field_data,
+                                      bool size_limit)
+{
+       /*
+        * digest formats:
+        *  - DATA_FMT_DIGEST: digest
+        *  - DATA_FMT_DIGEST_WITH_ALGO: [<hash algo>] + ':' + '\0' + digest,
+        *    where <hash algo> is provided if the hash algorithm is not
+        *    SHA1 or MD5
+        */
+       u8 buffer[CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 };
+       enum data_formats fmt = DATA_FMT_DIGEST;
+       u32 offset = 0;
+
+       if (!size_limit) {
+               fmt = DATA_FMT_DIGEST_WITH_ALGO;
+               if (hash_algo < HASH_ALGO__LAST)
+                       offset += snprintf(buffer, CRYPTO_MAX_ALG_NAME + 1,
+                                          "%s", hash_algo_name[hash_algo]);
+               buffer[offset] = ':';
+               offset += 2;
+       }
+
+       if (digest)
+               memcpy(buffer + offset, digest, digestsize);
+       else
+               /*
+                * If digest is NULL, the event being recorded is a violation.
+                * Make room for the digest by increasing the offset by
+                * IMA_DIGEST_SIZE.
+                */
+               offset += IMA_DIGEST_SIZE;
+
+       return ima_write_template_field_data(buffer, offset + digestsize,
+                                            fmt, field_data);
+}
+
+/*
+ * This function writes the digest of an event (with size limit).
+ */
+int ima_eventdigest_init(struct integrity_iint_cache *iint, struct file *file,
+                        const unsigned char *filename,
+                        struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                        struct ima_field_data *field_data)
+{
+       struct {
+               struct ima_digest_data hdr;
+               char digest[IMA_MAX_DIGEST_SIZE];
+       } hash;
+       u8 *cur_digest = NULL;
+       u32 cur_digestsize = 0;
+       struct inode *inode;
+       int result;
+
+       memset(&hash, 0, sizeof(hash));
+
+       if (!iint)              /* recording a violation. */
+               goto out;
+
+       if (ima_template_hash_algo_allowed(iint->ima_hash->algo)) {
+               cur_digest = iint->ima_hash->digest;
+               cur_digestsize = iint->ima_hash->length;
+               goto out;
+       }
+
+       if (!file)              /* missing info to re-calculate the digest */
+               return -EINVAL;
+
+       inode = file_inode(file);
+       hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ?
+           ima_hash_algo : HASH_ALGO_SHA1;
+       result = ima_calc_file_hash(file, &hash.hdr);
+       if (result) {
+               integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode,
+                                   filename, "collect_data",
+                                   "failed", result, 0);
+               return result;
+       }
+       cur_digest = hash.hdr.digest;
+       cur_digestsize = hash.hdr.length;
+out:
+       return ima_eventdigest_init_common(cur_digest, cur_digestsize, -1,
+                                          field_data, true);
+}
+
+/*
+ * This function writes the digest of an event (without size limit).
+ */
+int ima_eventdigest_ng_init(struct integrity_iint_cache *iint,
+                           struct file *file, const unsigned char *filename,
+                           struct evm_ima_xattr_data *xattr_value,
+                           int xattr_len, struct ima_field_data *field_data)
+{
+       u8 *cur_digest = NULL, hash_algo = HASH_ALGO__LAST;
+       u32 cur_digestsize = 0;
+
+       /* If iint is NULL, we are recording a violation. */
+       if (!iint)
+               goto out;
+
+       cur_digest = iint->ima_hash->digest;
+       cur_digestsize = iint->ima_hash->length;
+
+       hash_algo = iint->ima_hash->algo;
+out:
+       return ima_eventdigest_init_common(cur_digest, cur_digestsize,
+                                          hash_algo, field_data, false);
+}
+
+static int ima_eventname_init_common(struct integrity_iint_cache *iint,
+                                    struct file *file,
+                                    const unsigned char *filename,
+                                    struct ima_field_data *field_data,
+                                    bool size_limit)
+{
+       const char *cur_filename = NULL;
+       u32 cur_filename_len = 0;
+       enum data_formats fmt = size_limit ?
+           DATA_FMT_EVENT_NAME : DATA_FMT_STRING;
+
+       BUG_ON(filename == NULL && file == NULL);
+
+       if (filename) {
+               cur_filename = filename;
+               cur_filename_len = strlen(filename);
+
+               if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX)
+                       goto out;
+       }
+
+       if (file) {
+               cur_filename = file->f_dentry->d_name.name;
+               cur_filename_len = strlen(cur_filename);
+       } else
+               /*
+                * Truncate the filename if it is too long and no
+                * file descriptor is available.
+                */
+               cur_filename_len = IMA_EVENT_NAME_LEN_MAX;
+out:
+       return ima_write_template_field_data(cur_filename, cur_filename_len,
+                                            fmt, field_data);
+}
+
+/*
+ * This function writes the name of an event (with size limit).
+ */
+int ima_eventname_init(struct integrity_iint_cache *iint, struct file *file,
+                      const unsigned char *filename,
+                      struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                      struct ima_field_data *field_data)
+{
+       return ima_eventname_init_common(iint, file, filename,
+                                        field_data, true);
+}
+
+/*
+ * This function writes the name of an event (without size limit).
+ */
+int ima_eventname_ng_init(struct integrity_iint_cache *iint, struct file *file,
+                         const unsigned char *filename,
+                         struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                         struct ima_field_data *field_data)
+{
+       return ima_eventname_init_common(iint, file, filename,
+                                        field_data, false);
+}
+
+/*
+ *  ima_eventsig_init - include the file signature as part of the template data
+ */
+int ima_eventsig_init(struct integrity_iint_cache *iint, struct file *file,
+                     const unsigned char *filename,
+                     struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                     struct ima_field_data *field_data)
+{
+       enum data_formats fmt = DATA_FMT_HEX;
+       int rc = 0;
+
+       if ((!xattr_value) || (xattr_value->type != EVM_IMA_XATTR_DIGSIG))
+               goto out;
+
+       rc = ima_write_template_field_data(xattr_value, xattr_len, fmt,
+                                          field_data);
+out:
+       return rc;
+}
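
The space-to-underscore escaping above is what keeps the ASCII measurements list parseable by space-delimited tools. A userspace rendition of just that loop (illustrative only):

#include <stdio.h>
#include <string.h>

/* Mirror of the escaping loop: spaces would otherwise collide with the
 * field delimiter of the ASCII measurements list. */
static void escape_spaces(char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (buf[i] == ' ')
			buf[i] = '_';
}

int main(void)
{
	char name[] = "my file (deleted)";

	escape_spaces(name, strlen(name));
	printf("%s\n", name);	/* prints: my_file_(deleted) */
	return 0;
}
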
diff --git a/security/integrity/ima/ima_template_lib.h b/security/integrity/ima/ima_template_lib.h
new file mode 100644
index 0000000..63f6b52
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2013 Politecnico di Torino, Italy
+ *                    TORSEC group -- http://security.polito.it
+ *
+ * Author: Roberto Sassu <roberto.sassu@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * File: ima_template_lib.h
+ *      Header for the library of supported template fields.
+ */
+#ifndef __LINUX_IMA_TEMPLATE_LIB_H
+#define __LINUX_IMA_TEMPLATE_LIB_H
+
+#include <linux/seq_file.h>
+#include "ima.h"
+
+void ima_show_template_digest(struct seq_file *m, enum ima_show_type show,
+                             struct ima_field_data *field_data);
+void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show,
+                                struct ima_field_data *field_data);
+void ima_show_template_string(struct seq_file *m, enum ima_show_type show,
+                             struct ima_field_data *field_data);
+void ima_show_template_sig(struct seq_file *m, enum ima_show_type show,
+                          struct ima_field_data *field_data);
+int ima_eventdigest_init(struct integrity_iint_cache *iint, struct file *file,
+                        const unsigned char *filename,
+                        struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                        struct ima_field_data *field_data);
+int ima_eventname_init(struct integrity_iint_cache *iint, struct file *file,
+                      const unsigned char *filename,
+                      struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                      struct ima_field_data *field_data);
+int ima_eventdigest_ng_init(struct integrity_iint_cache *iint,
+                           struct file *file, const unsigned char *filename,
+                           struct evm_ima_xattr_data *xattr_value,
+                           int xattr_len, struct ima_field_data *field_data);
+int ima_eventname_ng_init(struct integrity_iint_cache *iint, struct file *file,
+                         const unsigned char *filename,
+                         struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                         struct ima_field_data *field_data);
+int ima_eventsig_init(struct integrity_iint_cache *iint, struct file *file,
+                     const unsigned char *filename,
+                     struct evm_ima_xattr_data *xattr_value, int xattr_len,
+                     struct ima_field_data *field_data);
+#endif /* __LINUX_IMA_TEMPLATE_LIB_H */
index c42fb7a..b9e7c13 100644
@@ -54,25 +54,57 @@ enum evm_ima_xattr_type {
        IMA_XATTR_DIGEST = 0x01,
        EVM_XATTR_HMAC,
        EVM_IMA_XATTR_DIGSIG,
+       IMA_XATTR_DIGEST_NG,
 };
 
 struct evm_ima_xattr_data {
        u8 type;
        u8 digest[SHA1_DIGEST_SIZE];
-}  __attribute__((packed));
+} __packed;
+
+#define IMA_MAX_DIGEST_SIZE    64
+
+struct ima_digest_data {
+       u8 algo;
+       u8 length;
+       union {
+               struct {
+                       u8 unused;
+                       u8 type;
+               } sha1;
+               struct {
+                       u8 type;
+                       u8 algo;
+               } ng;
+               u8 data[2];
+       } xattr;
+       u8 digest[0];
+} __packed;
+
+/*
+ * signature format v2 - for using with asymmetric keys
+ */
+struct signature_v2_hdr {
+       uint8_t type;           /* xattr type */
+       uint8_t version;        /* signature format version */
+       uint8_t hash_algo;      /* Digest algorithm [enum pkey_hash_algo] */
+       uint32_t keyid;         /* IMA key identifier - not X509/PGP specific */
+       uint16_t sig_size;      /* signature size */
+       uint8_t sig[0];         /* signature payload */
+} __packed;
 
 /* integrity data associated with an inode */
 struct integrity_iint_cache {
-       struct rb_node rb_node; /* rooted in integrity_iint_tree */
+       struct rb_node rb_node; /* rooted in integrity_iint_tree */
        struct inode *inode;    /* back pointer to inode in question */
        u64 version;            /* track inode changes */
        unsigned long flags;
-       struct evm_ima_xattr_data ima_xattr;
        enum integrity_status ima_file_status:4;
        enum integrity_status ima_mmap_status:4;
        enum integrity_status ima_bprm_status:4;
        enum integrity_status ima_module_status:4;
        enum integrity_status evm_status:4;
+       struct ima_digest_data *ima_hash;
 };
 
 /* rbtree tree calls to lookup, insert, delete
@@ -89,7 +121,7 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode);
 #ifdef CONFIG_INTEGRITY_SIGNATURE
 
 int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
-                                       const char *digest, int digestlen);
+                           const char *digest, int digestlen);
 
 #else
 
@@ -105,12 +137,19 @@ static inline int integrity_digsig_verify(const unsigned int id,
 #ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
 int asymmetric_verify(struct key *keyring, const char *sig,
                      int siglen, const char *data, int datalen);
+
+int integrity_init_keyring(const unsigned int id);
 #else
 static inline int asymmetric_verify(struct key *keyring, const char *sig,
                                    int siglen, const char *data, int datalen)
 {
        return -EOPNOTSUPP;
 }
+
+static inline int integrity_init_keyring(const unsigned int id)
+{
+       return 0;
+}
 #endif
 
 #ifdef CONFIG_INTEGRITY_AUDIT
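
For reference, a userspace consumer could walk the signature_v2_hdr at the head of a security.ima xattr blob as sketched below. The big-endian layout of keyid and sig_size, the EVM_IMA_XATTR_DIGSIG value of 3, and the hash_algo value are assumptions about how signing tools write the xattr:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

struct sig_v2_hdr {		/* mirrors signature_v2_hdr above */
	uint8_t type;
	uint8_t version;
	uint8_t hash_algo;
	uint32_t keyid;		/* assumed big-endian on disk */
	uint16_t sig_size;	/* assumed big-endian on disk */
	uint8_t sig[];
} __attribute__((packed));

static int dump_sig(const uint8_t *xattr, size_t len)
{
	const struct sig_v2_hdr *hdr = (const void *)xattr;

	if (len < sizeof(*hdr) || hdr->type != 3)	/* EVM_IMA_XATTR_DIGSIG */
		return -1;
	printf("v%u signature, hash algo %u, keyid %08x, %u bytes\n",
	       hdr->version, hdr->hash_algo,
	       ntohl(hdr->keyid), ntohs(hdr->sig_size));
	return 0;
}

int main(void)
{
	/* Synthetic blob: type 3 (DIGSIG), version 2, hash_algo 4 (assumed
	 * to be sha256 in enum pkey_hash_algo), 4 bytes of fake signature. */
	uint8_t blob[sizeof(struct sig_v2_hdr) + 4] = { 3, 2, 4 };
	struct sig_v2_hdr *hdr = (struct sig_v2_hdr *)blob;

	hdr->keyid = htonl(0xdeadbeef);
	hdr->sig_size = htons(4);
	return dump_sig(blob, sizeof(blob));
}
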
index a90d6d3..a4f3f8c 100644
@@ -4,6 +4,7 @@
 
 config KEYS
        bool "Enable access key retention support"
+       select ASSOCIATIVE_ARRAY
        help
          This option provides support for retaining authentication tokens and
          access keys in the kernel.
@@ -19,6 +20,34 @@ config KEYS
 
          If you are unsure as to whether this is required, answer N.
 
+config PERSISTENT_KEYRINGS
+       bool "Enable register of persistent per-UID keyrings"
+       depends on KEYS
+       help
+         This option provides a register of persistent per-UID keyrings,
+         primarily aimed at Kerberos key storage.  The keyrings are persistent
+         in the sense that they stay around after all processes of that UID
+         have exited, not that they survive the machine being rebooted.
+
+         A particular keyring may be accessed either by the user whose keyring
+         it is or by a process with administrative privileges.  The active
+         LSMs get to rule on which admin-level processes may access the
+         cache.
+
+         Keyrings are created and added into the register upon demand and get
+         removed if they expire (a default timeout is set upon creation).
+
+config BIG_KEYS
+       bool "Large payload keys"
+       depends on KEYS
+       depends on TMPFS
+       help
+         This option provides support for holding large keys within the kernel
+         (for example Kerberos ticket caches).  The data may be stored out to
+         swapspace by tmpfs.
+
+         If you are unsure as to whether this is required, answer N.
+
 config TRUSTED_KEYS
        tristate "TRUSTED KEYS"
        depends on KEYS && TCG_TPM
index 504aaa0..dfb3a7b 100644
@@ -18,9 +18,11 @@ obj-y := \
 obj-$(CONFIG_KEYS_COMPAT) += compat.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_SYSCTL) += sysctl.o
+obj-$(CONFIG_PERSISTENT_KEYRINGS) += persistent.o
 
 #
 # Key types
 #
+obj-$(CONFIG_BIG_KEYS) += big_key.o
 obj-$(CONFIG_TRUSTED_KEYS) += trusted.o
 obj-$(CONFIG_ENCRYPTED_KEYS) += encrypted-keys/
diff --git a/security/keys/big_key.c b/security/keys/big_key.c
new file mode 100644
index 0000000..7f44c32
--- /dev/null
@@ -0,0 +1,207 @@
+/* Large capacity key type
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/shmem_fs.h>
+#include <linux/err.h>
+#include <keys/user-type.h>
+#include <keys/big_key-type.h>
+
+MODULE_LICENSE("GPL");
+
+/*
+ * If the data is under this limit, there's no point creating a shm file to
+ * hold it as the permanently resident metadata for the shmem fs will be at
+ * least as large as the data.
+ */
+#define BIG_KEY_FILE_THRESHOLD (sizeof(struct inode) + sizeof(struct dentry))
+
+/*
+ * big_key defined keys take an arbitrary string as the description and an
+ * arbitrary blob of data as the payload
+ */
+struct key_type key_type_big_key = {
+       .name                   = "big_key",
+       .def_lookup_type        = KEYRING_SEARCH_LOOKUP_DIRECT,
+       .instantiate            = big_key_instantiate,
+       .match                  = user_match,
+       .revoke                 = big_key_revoke,
+       .destroy                = big_key_destroy,
+       .describe               = big_key_describe,
+       .read                   = big_key_read,
+};
+
+/*
+ * Instantiate a big key
+ */
+int big_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
+{
+       struct path *path = (struct path *)&key->payload.data2;
+       struct file *file;
+       ssize_t written;
+       size_t datalen = prep->datalen;
+       int ret;
+
+       ret = -EINVAL;
+       if (datalen <= 0 || datalen > 1024 * 1024 || !prep->data)
+               goto error;
+
+       /* Set an arbitrary quota */
+       ret = key_payload_reserve(key, 16);
+       if (ret < 0)
+               goto error;
+
+       key->type_data.x[1] = datalen;
+
+       if (datalen > BIG_KEY_FILE_THRESHOLD) {
+               /* Create a shmem file to store the data in.  This will permit the data
+                * to be swapped out if needed.
+                *
+                * TODO: Encrypt the stored data with a temporary key.
+                */
+               file = shmem_file_setup("", datalen, 0);
+               if (IS_ERR(file)) {
+                       ret = PTR_ERR(file);
+                       goto err_quota;
+               }
+
+               written = kernel_write(file, prep->data, prep->datalen, 0);
+               if (written != datalen) {
+                       ret = written;
+                       if (written >= 0)
+                               ret = -ENOMEM;
+                       goto err_fput;
+               }
+
+               /* Pin the mount and dentry to the key so that we can open it again
+                * later
+                */
+               *path = file->f_path;
+               path_get(path);
+               fput(file);
+       } else {
+               /* Just store the data in a buffer */
+               void *data = kmalloc(datalen, GFP_KERNEL);
+               if (!data) {
+                       ret = -ENOMEM;
+                       goto err_quota;
+               }
+
+               key->payload.data = memcpy(data, prep->data, prep->datalen);
+       }
+       return 0;
+
+err_fput:
+       fput(file);
+err_quota:
+       key_payload_reserve(key, 0);
+error:
+       return ret;
+}
+
+/*
+ * dispose of the links from a revoked keyring
+ * - called with the key sem write-locked
+ */
+void big_key_revoke(struct key *key)
+{
+       struct path *path = (struct path *)&key->payload.data2;
+
+       /* clear the quota */
+       key_payload_reserve(key, 0);
+       if (key_is_instantiated(key) && key->type_data.x[1] > BIG_KEY_FILE_THRESHOLD)
+               vfs_truncate(path, 0);
+}
+
+/*
+ * dispose of the data dangling from the corpse of a big_key key
+ */
+void big_key_destroy(struct key *key)
+{
+       if (key->type_data.x[1] > BIG_KEY_FILE_THRESHOLD) {
+               struct path *path = (struct path *)&key->payload.data2;
+               path_put(path);
+               path->mnt = NULL;
+               path->dentry = NULL;
+       } else {
+               kfree(key->payload.data);
+               key->payload.data = NULL;
+       }
+}
+
+/*
+ * describe the big_key key
+ */
+void big_key_describe(const struct key *key, struct seq_file *m)
+{
+       unsigned long datalen = key->type_data.x[1];
+
+       seq_puts(m, key->description);
+
+       if (key_is_instantiated(key))
+               seq_printf(m, ": %lu [%s]",
+                          datalen,
+                          datalen > BIG_KEY_FILE_THRESHOLD ? "file" : "buff");
+}
+
+/*
+ * read the key data
+ * - the key's semaphore is read-locked
+ */
+long big_key_read(const struct key *key, char __user *buffer, size_t buflen)
+{
+       unsigned long datalen = key->type_data.x[1];
+       long ret;
+
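+       /* A NULL or undersized buffer just queries the payload size. */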
+       if (!buffer || buflen < datalen)
+               return datalen;
+
+       if (datalen > BIG_KEY_FILE_THRESHOLD) {
+               struct path *path = (struct path *)&key->payload.data2;
+               struct file *file;
+               loff_t pos;
+
+               file = dentry_open(path, O_RDONLY, current_cred());
+               if (IS_ERR(file))
+                       return PTR_ERR(file);
+
+               pos = 0;
+               ret = vfs_read(file, buffer, datalen, &pos);
+               fput(file);
+               if (ret >= 0 && ret != datalen)
+                       ret = -EIO;
+       } else {
+               ret = datalen;
+               if (copy_to_user(buffer, key->payload.data, datalen) != 0)
+                       ret = -EFAULT;
+       }
+
+       return ret;
+}
+
+/*
+ * Module stuff
+ */
+static int __init big_key_init(void)
+{
+       return register_key_type(&key_type_big_key);
+}
+
+static void __exit big_key_cleanup(void)
+{
+       unregister_key_type(&key_type_big_key);
+}
+
+module_init(big_key_init);
+module_exit(big_key_cleanup);
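
Usage sketch (not part of the patch): with CONFIG_BIG_KEYS enabled the type is reachable through the ordinary add_key(2) path, here via libkeyutils (link with -lkeyutils). Payloads above BIG_KEY_FILE_THRESHOLD land in the swappable shmem file, smaller ones in a kmalloc'd buffer; the 1 MiB ceiling comes from big_key_instantiate() above:

#include <stdio.h>
#include <stdlib.h>
#include <keyutils.h>

int main(void)
{
	/* 64 KiB payload: well above BIG_KEY_FILE_THRESHOLD, so the kernel
	 * backs it with a swappable shmem file rather than kmalloc memory. */
	size_t len = 64 * 1024;
	char *payload = calloc(1, len);
	key_serial_t key;

	if (!payload)
		return 1;
	key = add_key("big_key", "demo:blob", payload, len,
		      KEY_SPEC_SESSION_KEYRING);
	free(payload);
	if (key < 0) {
		perror("add_key");
		return 1;
	}
	printf("big_key serial: %d\n", key);
	return 0;
}
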
index d65fa7f..bbd32c7 100644
@@ -138,6 +138,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
        case KEYCTL_INVALIDATE:
                return keyctl_invalidate_key(arg2);
 
+       case KEYCTL_GET_PERSISTENT:
+               return keyctl_get_persistent(arg2, arg3);
+
        default:
                return -EOPNOTSUPP;
        }
index d67c97b..d3222b6 100644
@@ -131,50 +131,6 @@ void key_gc_keytype(struct key_type *ktype)
 }
 
 /*
- * Garbage collect pointers from a keyring.
- *
- * Not called with any locks held.  The keyring's key struct will not be
- * deallocated under us as only our caller may deallocate it.
- */
-static void key_gc_keyring(struct key *keyring, time_t limit)
-{
-       struct keyring_list *klist;
-       int loop;
-
-       kenter("%x", key_serial(keyring));
-
-       if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
-                             (1 << KEY_FLAG_REVOKED)))
-               goto dont_gc;
-
-       /* scan the keyring looking for dead keys */
-       rcu_read_lock();
-       klist = rcu_dereference(keyring->payload.subscriptions);
-       if (!klist)
-               goto unlock_dont_gc;
-
-       loop = klist->nkeys;
-       smp_rmb();
-       for (loop--; loop >= 0; loop--) {
-               struct key *key = rcu_dereference(klist->keys[loop]);
-               if (key_is_dead(key, limit))
-                       goto do_gc;
-       }
-
-unlock_dont_gc:
-       rcu_read_unlock();
-dont_gc:
-       kleave(" [no gc]");
-       return;
-
-do_gc:
-       rcu_read_unlock();
-
-       keyring_gc(keyring, limit);
-       kleave(" [gc]");
-}
-
-/*
  * Garbage collect a list of unreferenced, detached keys
  */
 static noinline void key_gc_unused_keys(struct list_head *keys)
@@ -392,8 +348,7 @@ found_unreferenced_key:
         */
 found_keyring:
        spin_unlock(&key_serial_lock);
-       kdebug("scan keyring %d", key->serial);
-       key_gc_keyring(key, limit);
+       keyring_gc(key, limit);
        goto maybe_resched;
 
        /* We found a dead key that is still referenced.  Reset its type and
index d4f1468..80b2aac 100644
@@ -89,42 +89,53 @@ extern struct key_type *key_type_lookup(const char *type);
 extern void key_type_put(struct key_type *ktype);
 
 extern int __key_link_begin(struct key *keyring,
-                           const struct key_type *type,
-                           const char *description,
-                           unsigned long *_prealloc);
+                           const struct keyring_index_key *index_key,
+                           struct assoc_array_edit **_edit);
 extern int __key_link_check_live_key(struct key *keyring, struct key *key);
-extern void __key_link(struct key *keyring, struct key *key,
-                      unsigned long *_prealloc);
+extern void __key_link(struct key *key, struct assoc_array_edit **_edit);
 extern void __key_link_end(struct key *keyring,
-                          struct key_type *type,
-                          unsigned long prealloc);
+                          const struct keyring_index_key *index_key,
+                          struct assoc_array_edit *edit);
 
-extern key_ref_t __keyring_search_one(key_ref_t keyring_ref,
-                                     const struct key_type *type,
-                                     const char *description,
-                                     key_perm_t perm);
+extern key_ref_t find_key_to_update(key_ref_t keyring_ref,
+                                   const struct keyring_index_key *index_key);
 
 extern struct key *keyring_search_instkey(struct key *keyring,
                                          key_serial_t target_id);
 
+extern int iterate_over_keyring(const struct key *keyring,
+                               int (*func)(const struct key *key, void *data),
+                               void *data);
+
 typedef int (*key_match_func_t)(const struct key *, const void *);
 
+struct keyring_search_context {
+       struct keyring_index_key index_key;
+       const struct cred       *cred;
+       key_match_func_t        match;
+       const void              *match_data;
+       unsigned                flags;
+#define KEYRING_SEARCH_LOOKUP_TYPE     0x0001  /* [as type->def_lookup_type] */
+#define KEYRING_SEARCH_NO_STATE_CHECK  0x0002  /* Skip state checks */
+#define KEYRING_SEARCH_DO_STATE_CHECK  0x0004  /* Override NO_STATE_CHECK */
+#define KEYRING_SEARCH_NO_UPDATE_TIME  0x0008  /* Don't update times */
+#define KEYRING_SEARCH_NO_CHECK_PERM   0x0010  /* Don't check permissions */
+#define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0020  /* Give an error on excessive depth */
+
+       int (*iterator)(const void *object, void *iterator_data);
+
+       /* Internal stuff */
+       int                     skipped_ret;
+       bool                    possessed;
+       key_ref_t               result;
+       struct timespec         now;
+};
+
 extern key_ref_t keyring_search_aux(key_ref_t keyring_ref,
-                                   const struct cred *cred,
-                                   struct key_type *type,
-                                   const void *description,
-                                   key_match_func_t match,
-                                   bool no_state_check);
-
-extern key_ref_t search_my_process_keyrings(struct key_type *type,
-                                           const void *description,
-                                           key_match_func_t match,
-                                           bool no_state_check,
-                                           const struct cred *cred);
-extern key_ref_t search_process_keyrings(struct key_type *type,
-                                        const void *description,
-                                        key_match_func_t match,
-                                        const struct cred *cred);
+                                   struct keyring_search_context *ctx);
+
+extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx);
+extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx);
 
 extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check);
 
@@ -202,7 +213,7 @@ extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
 /*
  * Determine whether a key is dead.
  */
-static inline bool key_is_dead(struct key *key, time_t limit)
+static inline bool key_is_dead(const struct key *key, time_t limit)
 {
        return
                key->flags & ((1 << KEY_FLAG_DEAD) |
@@ -244,6 +255,15 @@ extern long keyctl_invalidate_key(key_serial_t);
 extern long keyctl_instantiate_key_common(key_serial_t,
                                          const struct iovec *,
                                          unsigned, size_t, key_serial_t);
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+extern long keyctl_get_persistent(uid_t, key_serial_t);
+extern unsigned persistent_keyring_expiry;
+#else
+static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring)
+{
+       return -EOPNOTSUPP;
+}
+#endif
 
 /*
  * Debugging key validation
index 8fb7c7b..55d110f 100644
@@ -242,8 +242,8 @@ struct key *key_alloc(struct key_type *type, const char *desc,
                }
        }
 
-       desclen = strlen(desc) + 1;
-       quotalen = desclen + type->def_datalen;
+       desclen = strlen(desc);
+       quotalen = desclen + 1 + type->def_datalen;
 
        /* get hold of the key tracking for this user */
        user = key_user_lookup(uid);
@@ -277,7 +277,8 @@ struct key *key_alloc(struct key_type *type, const char *desc,
                goto no_memory_2;
 
        if (desc) {
-               key->description = kmemdup(desc, desclen, GFP_KERNEL);
+               key->index_key.desc_len = desclen;
+               key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL);
                if (!key->description)
                        goto no_memory_3;
        }
@@ -285,7 +286,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
        atomic_set(&key->usage, 1);
        init_rwsem(&key->sem);
        lockdep_set_class(&key->sem, &type->lock_class);
-       key->type = type;
+       key->index_key.type = type;
        key->user = user;
        key->quotalen = quotalen;
        key->datalen = type->def_datalen;
@@ -299,6 +300,8 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 
        if (!(flags & KEY_ALLOC_NOT_IN_QUOTA))
                key->flags |= 1 << KEY_FLAG_IN_QUOTA;
+       if (flags & KEY_ALLOC_TRUSTED)
+               key->flags |= 1 << KEY_FLAG_TRUSTED;
 
        memset(&key->type_data, 0, sizeof(key->type_data));
 
@@ -408,7 +411,7 @@ static int __key_instantiate_and_link(struct key *key,
                                      struct key_preparsed_payload *prep,
                                      struct key *keyring,
                                      struct key *authkey,
-                                     unsigned long *_prealloc)
+                                     struct assoc_array_edit **_edit)
 {
        int ret, awaken;
 
@@ -435,7 +438,7 @@ static int __key_instantiate_and_link(struct key *key,
 
                        /* and link it into the destination keyring */
                        if (keyring)
-                               __key_link(keyring, key, _prealloc);
+                               __key_link(key, _edit);
 
                        /* disable the authorisation key */
                        if (authkey)
@@ -475,7 +478,7 @@ int key_instantiate_and_link(struct key *key,
                             struct key *authkey)
 {
        struct key_preparsed_payload prep;
-       unsigned long prealloc;
+       struct assoc_array_edit *edit;
        int ret;
 
        memset(&prep, 0, sizeof(prep));
@@ -489,17 +492,15 @@ int key_instantiate_and_link(struct key *key,
        }
 
        if (keyring) {
-               ret = __key_link_begin(keyring, key->type, key->description,
-                                      &prealloc);
+               ret = __key_link_begin(keyring, &key->index_key, &edit);
                if (ret < 0)
                        goto error_free_preparse;
        }
 
-       ret = __key_instantiate_and_link(key, &prep, keyring, authkey,
-                                        &prealloc);
+       ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit);
 
        if (keyring)
-               __key_link_end(keyring, key->type, prealloc);
+               __key_link_end(keyring, &key->index_key, edit);
 
 error_free_preparse:
        if (key->type->preparse)
@@ -537,7 +538,7 @@ int key_reject_and_link(struct key *key,
                        struct key *keyring,
                        struct key *authkey)
 {
-       unsigned long prealloc;
+       struct assoc_array_edit *edit;
        struct timespec now;
        int ret, awaken, link_ret = 0;
 
@@ -548,8 +549,7 @@ int key_reject_and_link(struct key *key,
        ret = -EBUSY;
 
        if (keyring)
-               link_ret = __key_link_begin(keyring, key->type,
-                                           key->description, &prealloc);
+               link_ret = __key_link_begin(keyring, &key->index_key, &edit);
 
        mutex_lock(&key_construction_mutex);
 
@@ -557,9 +557,10 @@ int key_reject_and_link(struct key *key,
        if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) {
                /* mark the key as being negatively instantiated */
                atomic_inc(&key->user->nikeys);
+               key->type_data.reject_error = -error;
+               smp_wmb();
                set_bit(KEY_FLAG_NEGATIVE, &key->flags);
                set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
-               key->type_data.reject_error = -error;
                now = current_kernel_time();
                key->expiry = now.tv_sec + timeout;
                key_schedule_gc(key->expiry + key_gc_delay);
@@ -571,7 +572,7 @@ int key_reject_and_link(struct key *key,
 
                /* and link it into the destination keyring */
                if (keyring && link_ret == 0)
-                       __key_link(keyring, key, &prealloc);
+                       __key_link(key, &edit);
 
                /* disable the authorisation key */
                if (authkey)
@@ -581,7 +582,7 @@ int key_reject_and_link(struct key *key,
        mutex_unlock(&key_construction_mutex);
 
        if (keyring)
-               __key_link_end(keyring, key->type, prealloc);
+               __key_link_end(keyring, &key->index_key, edit);
 
        /* wake up anyone waiting for a key to be constructed */
        if (awaken)
@@ -645,7 +646,7 @@ found:
        /* this races with key_put(), but that doesn't matter since key_put()
         * doesn't actually change the key
         */
-       atomic_inc(&key->usage);
+       __key_get(key);
 
 error:
        spin_unlock(&key_serial_lock);
@@ -780,25 +781,27 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
                               key_perm_t perm,
                               unsigned long flags)
 {
-       unsigned long prealloc;
+       struct keyring_index_key index_key = {
+               .description    = description,
+       };
        struct key_preparsed_payload prep;
+       struct assoc_array_edit *edit;
        const struct cred *cred = current_cred();
-       struct key_type *ktype;
        struct key *keyring, *key = NULL;
        key_ref_t key_ref;
        int ret;
 
        /* look up the key type to see if it's one of the registered kernel
         * types */
-       ktype = key_type_lookup(type);
-       if (IS_ERR(ktype)) {
+       index_key.type = key_type_lookup(type);
+       if (IS_ERR(index_key.type)) {
                key_ref = ERR_PTR(-ENODEV);
                goto error;
        }
 
        key_ref = ERR_PTR(-EINVAL);
-       if (!ktype->match || !ktype->instantiate ||
-           (!description && !ktype->preparse))
+       if (!index_key.type->match || !index_key.type->instantiate ||
+           (!index_key.description && !index_key.type->preparse))
                goto error_put_type;
 
        keyring = key_ref_to_ptr(keyring_ref);
@@ -812,21 +815,28 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
        memset(&prep, 0, sizeof(prep));
        prep.data = payload;
        prep.datalen = plen;
-       prep.quotalen = ktype->def_datalen;
-       if (ktype->preparse) {
-               ret = ktype->preparse(&prep);
+       prep.quotalen = index_key.type->def_datalen;
+       prep.trusted = flags & KEY_ALLOC_TRUSTED;
+       if (index_key.type->preparse) {
+               ret = index_key.type->preparse(&prep);
                if (ret < 0) {
                        key_ref = ERR_PTR(ret);
                        goto error_put_type;
                }
-               if (!description)
-                       description = prep.description;
+               if (!index_key.description)
+                       index_key.description = prep.description;
                key_ref = ERR_PTR(-EINVAL);
-               if (!description)
+               if (!index_key.description)
                        goto error_free_prep;
        }
+       index_key.desc_len = strlen(index_key.description);
+
+       key_ref = ERR_PTR(-EPERM);
+       if (!prep.trusted && test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags))
+               goto error_free_prep;
+       flags |= prep.trusted ? KEY_ALLOC_TRUSTED : 0;
 
-       ret = __key_link_begin(keyring, ktype, description, &prealloc);
+       ret = __key_link_begin(keyring, &index_key, &edit);
        if (ret < 0) {
                key_ref = ERR_PTR(ret);
                goto error_free_prep;
@@ -844,10 +854,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
         * key of the same type and description in the destination keyring and
         * update that instead if possible
         */
-       if (ktype->update) {
-               key_ref = __keyring_search_one(keyring_ref, ktype, description,
-                                              0);
-               if (!IS_ERR(key_ref))
+       if (index_key.type->update) {
+               key_ref = find_key_to_update(keyring_ref, &index_key);
+               if (key_ref)
                        goto found_matching_key;
        }
 
@@ -856,23 +865,24 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
                perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
                perm |= KEY_USR_VIEW;
 
-               if (ktype->read)
+               if (index_key.type->read)
                        perm |= KEY_POS_READ;
 
-               if (ktype == &key_type_keyring || ktype->update)
+               if (index_key.type == &key_type_keyring ||
+                   index_key.type->update)
                        perm |= KEY_POS_WRITE;
        }
 
        /* allocate a new key */
-       key = key_alloc(ktype, description, cred->fsuid, cred->fsgid, cred,
-                       perm, flags);
+       key = key_alloc(index_key.type, index_key.description,
+                       cred->fsuid, cred->fsgid, cred, perm, flags);
        if (IS_ERR(key)) {
                key_ref = ERR_CAST(key);
                goto error_link_end;
        }
 
        /* instantiate it and link it into the target keyring */
-       ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &prealloc);
+       ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit);
        if (ret < 0) {
                key_put(key);
                key_ref = ERR_PTR(ret);
@@ -882,12 +892,12 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
        key_ref = make_key_ref(key, is_key_possessed(keyring_ref));
 
 error_link_end:
-       __key_link_end(keyring, ktype, prealloc);
+       __key_link_end(keyring, &index_key, edit);
 error_free_prep:
-       if (ktype->preparse)
-               ktype->free_preparse(&prep);
+       if (index_key.type->preparse)
+               index_key.type->free_preparse(&prep);
 error_put_type:
-       key_type_put(ktype);
+       key_type_put(index_key.type);
 error:
        return key_ref;
 
@@ -895,7 +905,7 @@ error:
        /* we found a matching key, so we're going to try to update it
         * - we can drop the locks first as we have the key pinned
         */
-       __key_link_end(keyring, ktype, prealloc);
+       __key_link_end(keyring, &index_key, edit);
 
        key_ref = __key_update(key_ref, &prep);
        goto error_free_prep;
index 33cfd27..cee72ce 100644
@@ -1667,6 +1667,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3,
        case KEYCTL_INVALIDATE:
                return keyctl_invalidate_key((key_serial_t) arg2);
 
+       case KEYCTL_GET_PERSISTENT:
+               return keyctl_get_persistent((uid_t)arg2, (key_serial_t)arg3);
+
        default:
                return -EOPNOTSUPP;
        }
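
Usage sketch (not part of the patch): the new command is driven like any other keyctl. The keyctl_get_persistent() wrapper only appeared in later keyutils releases, so the raw command number is used here; the value 22 is an assumption taken from the uapi header this series adds, and a uid of -1 requests the caller's own persistent keyring:

#include <stdio.h>
#include <sys/types.h>
#include <keyutils.h>

#ifndef KEYCTL_GET_PERSISTENT
#define KEYCTL_GET_PERSISTENT 22	/* assumed: value from the new uapi header */
#endif

int main(void)
{
	/* Look up (creating on demand) the caller's persistent keyring and
	 * link it into the session keyring. */
	long id = keyctl(KEYCTL_GET_PERSISTENT, (uid_t)-1,
			 KEY_SPEC_SESSION_KEYRING);

	if (id < 0) {
		perror("keyctl");
		return 1;
	}
	printf("persistent keyring: %ld\n", id);
	return 0;
}
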
index 6ece7f2..69f0cb7 100644
@@ -1,6 +1,6 @@
 /* Keyring handling
  *
- * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <keys/keyring-type.h>
+#include <keys/user-type.h>
+#include <linux/assoc_array_priv.h>
 #include <linux/uaccess.h>
 #include "internal.h"
 
-#define rcu_dereference_locked_keyring(keyring)                                \
-       (rcu_dereference_protected(                                     \
-               (keyring)->payload.subscriptions,                       \
-               rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem)))
-
-#define rcu_deref_link_locked(klist, index, keyring)                   \
-       (rcu_dereference_protected(                                     \
-               (klist)->keys[index],                                   \
-               rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem)))
-
-#define MAX_KEYRING_LINKS                                              \
-       min_t(size_t, USHRT_MAX - 1,                                    \
-             ((PAGE_SIZE - sizeof(struct keyring_list)) / sizeof(struct key *)))
-
-#define KEY_LINK_FIXQUOTA 1UL
-
 /*
  * When plumbing the depths of the key tree, this sets a hard limit
  * on how deep we're willing to go.
  */
 #define KEYRING_NAME_HASH_SIZE (1 << 5)
 
+/*
+ * We mark pointers we pass to the associative array with bit 1 set if
+ * they're keyrings and clear otherwise.
+ */
+#define KEYRING_PTR_SUBTYPE    0x2UL
+
+static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x)
+{
+       return (unsigned long)x & KEYRING_PTR_SUBTYPE;
+}
+static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x)
+{
+       void *object = assoc_array_ptr_to_leaf(x);
+       return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE);
+}
+static inline void *keyring_key_to_ptr(struct key *key)
+{
+       if (key->type == &key_type_keyring)
+               return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE);
+       return key;
+}
+
 static struct list_head        keyring_name_hash[KEYRING_NAME_HASH_SIZE];
 static DEFINE_RWLOCK(keyring_name_lock);
 
@@ -67,7 +75,6 @@ static inline unsigned keyring_hash(const char *desc)
  */
 static int keyring_instantiate(struct key *keyring,
                               struct key_preparsed_payload *prep);
-static int keyring_match(const struct key *keyring, const void *criterion);
 static void keyring_revoke(struct key *keyring);
 static void keyring_destroy(struct key *keyring);
 static void keyring_describe(const struct key *keyring, struct seq_file *m);
@@ -76,9 +83,9 @@ static long keyring_read(const struct key *keyring,
 
 struct key_type key_type_keyring = {
        .name           = "keyring",
-       .def_datalen    = sizeof(struct keyring_list),
+       .def_datalen    = 0,
        .instantiate    = keyring_instantiate,
-       .match          = keyring_match,
+       .match          = user_match,
        .revoke         = keyring_revoke,
        .destroy        = keyring_destroy,
        .describe       = keyring_describe,
@@ -127,6 +134,7 @@ static int keyring_instantiate(struct key *keyring,
 
        ret = -EINVAL;
        if (prep->datalen == 0) {
+               assoc_array_init(&keyring->keys);
                /* make the keyring available by name if it has one */
                keyring_publish_name(keyring);
                ret = 0;
@@ -136,15 +144,226 @@ static int keyring_instantiate(struct key *keyring,
 }
 
 /*
- * Match keyrings on their name
+ * Multiply 64 bits by 32 bits to 96 bits and fold back to 64 bits.  Ideally we'd
+ * fold the carry back too, but that requires inline asm.
+ */
+static u64 mult_64x32_and_fold(u64 x, u32 y)
+{
+       u64 hi = (u64)(u32)(x >> 32) * y;
+       u64 lo = (u64)(u32)(x) * y;
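+       /* x * y == lo + (hi << 32) as a 96-bit value; the top 32 bits of hi,
+        * which would overflow 64 bits, are added back in at the bottom
+        * instead of being lost. */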
+       return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32);
+}
+
+/*
+ * Hash a key type and description.
+ */
+static unsigned long hash_key_type_and_desc(const struct keyring_index_key *index_key)
+{
+       const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP;
+       const unsigned long level_mask = ASSOC_ARRAY_LEVEL_STEP_MASK;
+       const char *description = index_key->description;
+       unsigned long hash, type;
+       u32 piece;
+       u64 acc;
+       int n, desc_len = index_key->desc_len;
+
+       type = (unsigned long)index_key->type;
+
+       acc = mult_64x32_and_fold(type, desc_len + 13);
+       acc = mult_64x32_and_fold(acc, 9207);
+       for (;;) {
+               n = desc_len;
+               if (n <= 0)
+                       break;
+               if (n > 4)
+                       n = 4;
+               piece = 0;
+               memcpy(&piece, description, n);
+               description += n;
+               desc_len -= n;
+               acc = mult_64x32_and_fold(acc, piece);
+               acc = mult_64x32_and_fold(acc, 9207);
+       }
+
+       /* Fold the hash down to 32 bits if need be. */
+       hash = acc;
+       if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32)
+               hash ^= acc >> 32;
+
+       /* Squidge all the keyrings into a part of the tree separate from
+        * ordinary keys by making sure the lowest level segment in the hash is
+        * zero for keyrings and non-zero otherwise.
+        */
+       if (index_key->type != &key_type_keyring && (hash & level_mask) == 0)
+               return hash | (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1;
+       if (index_key->type == &key_type_keyring && (hash & level_mask) != 0)
+               return (hash + (hash << level_shift)) & ~level_mask;
+       return hash;
+}
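
The tail of this function enforces the keyring/non-keyring partition described
in the comment above.  A user-space sketch of just that conditioning step,
assuming the in-tree constants (16-way fan-out, hence a 4-bit level step and a
0xf mask - values taken from the assoc_array design, not restated in this
hunk):

#include <stdbool.h>
#include <stdio.h>

#define LEVEL_SHIFT	4		/* assumed ASSOC_ARRAY_LEVEL_STEP */
#define LEVEL_MASK	0xfUL		/* assumed ASSOC_ARRAY_LEVEL_STEP_MASK */
#define CHUNK_BITS	(8 * sizeof(unsigned long))

static unsigned long condition_hash(unsigned long hash, bool is_keyring)
{
	/* non-keyrings must not hash to segment 0 ... */
	if (!is_keyring && (hash & LEVEL_MASK) == 0)
		return hash | (hash >> (CHUNK_BITS - LEVEL_SHIFT)) | 1;
	/* ... and keyrings must */
	if (is_keyring && (hash & LEVEL_MASK) != 0)
		return (hash + (hash << LEVEL_SHIFT)) & ~LEVEL_MASK;
	return hash;
}

int main(void)
{
	printf("%lx\n", condition_hash(0x1230, false));	/* low segment becomes non-zero */
	printf("%lx\n", condition_hash(0x1234, true));	/* low segment cleared */
	return 0;
}
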
+
+/*
+ * Build the next index key chunk.
+ *
+ * On 32-bit systems the index key is laid out as:
+ *
+ *     0       4       5       9...
+ *     hash    desclen typeptr desc[]
+ *
+ * On 64-bit systems:
+ *
+ *     0       8       9       17...
+ *     hash    desclen typeptr desc[]
+ *
+ * We return it one word-sized chunk at a time.
  */
-static int keyring_match(const struct key *keyring, const void *description)
+static unsigned long keyring_get_key_chunk(const void *data, int level)
+{
+       const struct keyring_index_key *index_key = data;
+       unsigned long chunk = 0;
+       long offset = 0;
+       int desc_len = index_key->desc_len, n = sizeof(chunk);
+
+       level /= ASSOC_ARRAY_KEY_CHUNK_SIZE;
+       switch (level) {
+       case 0:
+               return hash_key_type_and_desc(index_key);
+       case 1:
+               return ((unsigned long)index_key->type << 8) | desc_len;
+       case 2:
+               if (desc_len == 0)
+                       return (u8)((unsigned long)index_key->type >>
+                                   (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8));
+               n--;
+               offset = 1;
+       default:
+               offset += sizeof(chunk) - 1;
+               offset += (level - 3) * sizeof(chunk);
+               if (offset >= desc_len)
+                       return 0;
+               desc_len -= offset;
+               if (desc_len > n)
+                       desc_len = n;
+               offset += desc_len;
+               do {
+                       chunk <<= 8;
+                       chunk |= ((u8*)index_key->description)[--offset];
+               } while (--desc_len > 0);
+
+               if (level == 2) {
+                       chunk <<= 8;
+                       chunk |= (u8)((unsigned long)index_key->type >>
+                                     (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8));
+               }
+               return chunk;
+       }
+}
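
To make the level numbering concrete, here is a user-space sketch (64-bit,
little-endian host assumed) of which description bytes the default case above
selects for levels 3 and up.  Level 2 is special: it carries desc[0..6] plus
the top byte of the type pointer in its lowest byte, which the sketch leaves
out:

#include <stdio.h>
#include <string.h>

static unsigned long desc_chunk(const char *desc, long desc_len, int level)
{
	unsigned long chunk = 0;
	long offset = 7 + (level - 3) * 8;	/* desc[0..6] went into level 2 */
	long n = desc_len - offset;

	if (n <= 0)
		return 0;
	if (n > 8)
		n = 8;
	/* on little-endian, memcpy puts desc[offset] in the low byte,
	 * matching the shift-and-or loop above */
	memcpy(&chunk, desc + offset, n);
	return chunk;
}

int main(void)
{
	const char *d = "a_fairly_long_description";

	printf("level 3: %016lx\n", desc_chunk(d, strlen(d), 3));
	printf("level 4: %016lx\n", desc_chunk(d, strlen(d), 4));
	return 0;
}
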
+
+static unsigned long keyring_get_object_key_chunk(const void *object, int level)
+{
+       const struct key *key = keyring_ptr_to_key(object);
+       return keyring_get_key_chunk(&key->index_key, level);
+}
+
+static bool keyring_compare_object(const void *object, const void *data)
 {
-       return keyring->description &&
-               strcmp(keyring->description, description) == 0;
+       const struct keyring_index_key *index_key = data;
+       const struct key *key = keyring_ptr_to_key(object);
+
+       return key->index_key.type == index_key->type &&
+               key->index_key.desc_len == index_key->desc_len &&
+               memcmp(key->index_key.description, index_key->description,
+                      index_key->desc_len) == 0;
 }
 
 /*
+ * Compare the index keys of a pair of objects and determine the bit position
+ * at which they differ - if they differ.
+ */
+static int keyring_diff_objects(const void *_a, const void *_b)
+{
+       const struct key *key_a = keyring_ptr_to_key(_a);
+       const struct key *key_b = keyring_ptr_to_key(_b);
+       const struct keyring_index_key *a = &key_a->index_key;
+       const struct keyring_index_key *b = &key_b->index_key;
+       unsigned long seg_a, seg_b;
+       int level, i;
+
+       level = 0;
+       seg_a = hash_key_type_and_desc(a);
+       seg_b = hash_key_type_and_desc(b);
+       if ((seg_a ^ seg_b) != 0)
+               goto differ;
+
+       /* The number of bits contributed by the hash is controlled by a
+        * constant in the assoc_array headers.  Everything else thereafter we
+        * can deal with as being machine word-size dependent.
+        */
+       level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8;
+       seg_a = a->desc_len;
+       seg_b = b->desc_len;
+       if ((seg_a ^ seg_b) != 0)
+               goto differ;
+
+       /* The next bit may not work on big endian */
+       level++;
+       seg_a = (unsigned long)a->type;
+       seg_b = (unsigned long)b->type;
+       if ((seg_a ^ seg_b) != 0)
+               goto differ;
+
+       level += sizeof(unsigned long);
+       if (a->desc_len == 0)
+               goto same;
+
+       i = 0;
+       if (((unsigned long)a->description | (unsigned long)b->description) &
+           (sizeof(unsigned long) - 1)) {
+               do {
+                       seg_a = *(unsigned long *)(a->description + i);
+                       seg_b = *(unsigned long *)(b->description + i);
+                       if ((seg_a ^ seg_b) != 0)
+                               goto differ_plus_i;
+                       i += sizeof(unsigned long);
+               } while (i < (a->desc_len & (sizeof(unsigned long) - 1)));
+       }
+
+       for (; i < a->desc_len; i++) {
+               seg_a = *(unsigned char *)(a->description + i);
+               seg_b = *(unsigned char *)(b->description + i);
+               if ((seg_a ^ seg_b) != 0)
+                       goto differ_plus_i;
+       }
+
+same:
+       return -1;
+
+differ_plus_i:
+       level += i;
+differ:
+       i = level * 8 + __ffs(seg_a ^ seg_b);
+       return i;
+}
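
The return value is an absolute bit index into the serialised index key: the
byte offset accumulated in level, times eight, plus the lowest set bit of the
differing segments.  A small sketch of that final computation, using the GCC
builtin __builtin_ctzl() as a user-space stand-in for the kernel's __ffs():

#include <stdio.h>

static int diff_bit(int level, unsigned long seg_a, unsigned long seg_b)
{
	/* undefined if the segments are equal, as in the code above */
	return level * 8 + __builtin_ctzl(seg_a ^ seg_b);
}

int main(void)
{
	/* segments three bytes into the key, differing at bit 4: absolute bit 28 */
	printf("%d\n", diff_bit(3, 0x10, 0x00));
	return 0;
}
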
+
+/*
+ * Free an object after stripping the keyring flag off the pointer.
+ */
+static void keyring_free_object(void *object)
+{
+       key_put(keyring_ptr_to_key(object));
+}
+
+/*
+ * Operations for keyring management by the index-tree routines.
+ */
+static const struct assoc_array_ops keyring_assoc_array_ops = {
+       .get_key_chunk          = keyring_get_key_chunk,
+       .get_object_key_chunk   = keyring_get_object_key_chunk,
+       .compare_object         = keyring_compare_object,
+       .diff_objects           = keyring_diff_objects,
+       .free_object            = keyring_free_object,
+};
+
+/*
  * Clean up a keyring when it is destroyed.  Unpublish its name if it had one
  * and dispose of its data.
  *
@@ -155,9 +374,6 @@ static int keyring_match(const struct key *keyring, const void *description)
  */
 static void keyring_destroy(struct key *keyring)
 {
-       struct keyring_list *klist;
-       int loop;
-
        if (keyring->description) {
                write_lock(&keyring_name_lock);
 
@@ -168,12 +384,7 @@ static void keyring_destroy(struct key *keyring)
                write_unlock(&keyring_name_lock);
        }
 
-       klist = rcu_access_pointer(keyring->payload.subscriptions);
-       if (klist) {
-               for (loop = klist->nkeys - 1; loop >= 0; loop--)
-                       key_put(rcu_access_pointer(klist->keys[loop]));
-               kfree(klist);
-       }
+       assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops);
 }
 
 /*
@@ -181,76 +392,88 @@ static void keyring_destroy(struct key *keyring)
  */
 static void keyring_describe(const struct key *keyring, struct seq_file *m)
 {
-       struct keyring_list *klist;
-
        if (keyring->description)
                seq_puts(m, keyring->description);
        else
                seq_puts(m, "[anon]");
 
        if (key_is_instantiated(keyring)) {
-               rcu_read_lock();
-               klist = rcu_dereference(keyring->payload.subscriptions);
-               if (klist)
-                       seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys);
+               if (keyring->keys.nr_leaves_on_tree != 0)
+                       seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree);
                else
                        seq_puts(m, ": empty");
-               rcu_read_unlock();
        }
 }
 
+struct keyring_read_iterator_context {
+       size_t                  qty;
+       size_t                  count;
+       key_serial_t __user     *buffer;
+};
+
+static int keyring_read_iterator(const void *object, void *data)
+{
+       struct keyring_read_iterator_context *ctx = data;
+       const struct key *key = keyring_ptr_to_key(object);
+       int ret;
+
+       kenter("{%s,%d},,{%zu/%zu}",
+              key->type->name, key->serial, ctx->count, ctx->qty);
+
+       if (ctx->count >= ctx->qty)
+               return 1;
+
+       ret = put_user(key->serial, ctx->buffer);
+       if (ret < 0)
+               return ret;
+       ctx->buffer++;
+       ctx->count += sizeof(key->serial);
+       return 0;
+}
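
The iterator threads all of its state through a small context structure rather
than globals, which is the shape the assoc_array iteration API expects.  A
self-contained user-space sketch of the same pattern (the flat container and
all names here are illustrative, not the kernel API):

#include <stddef.h>
#include <stdio.h>

struct read_ctx {
	size_t qty;		/* capacity of the buffer in bytes */
	size_t count;		/* bytes written so far */
	int *buffer;
};

/* returning non-zero stops the iteration early */
static int read_iterator(const void *object, void *data)
{
	struct read_ctx *ctx = data;

	if (ctx->count >= ctx->qty)
		return 1;
	*ctx->buffer++ = *(const int *)object;
	ctx->count += sizeof(int);
	return 0;
}

static void iterate(const int *objects, size_t n,
		    int (*fn)(const void *, void *), void *data)
{
	for (size_t i = 0; i < n; i++)
		if (fn(&objects[i], data))
			break;
}

int main(void)
{
	int ids[] = { 101, 102, 103 }, out[2];
	struct read_ctx ctx = { .qty = sizeof(out), .buffer = out };

	iterate(ids, 3, read_iterator, &ctx);
	printf("copied %zu bytes, first id %d\n", ctx.count, out[0]);
	return 0;
}
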
+
 /*
  * Read a list of key IDs from the keyring's contents in binary form
  *
- * The keyring's semaphore is read-locked by the caller.
+ * The keyring's semaphore is read-locked by the caller.  This prevents someone
+ * from modifying it under us - which could cause us to read key IDs multiple
+ * times.
  */
 static long keyring_read(const struct key *keyring,
                         char __user *buffer, size_t buflen)
 {
-       struct keyring_list *klist;
-       struct key *key;
-       size_t qty, tmp;
-       int loop, ret;
+       struct keyring_read_iterator_context ctx;
+       unsigned long nr_keys;
+       int ret;
 
-       ret = 0;
-       klist = rcu_dereference_locked_keyring(keyring);
-       if (klist) {
-               /* calculate how much data we could return */
-               qty = klist->nkeys * sizeof(key_serial_t);
-
-               if (buffer && buflen > 0) {
-                       if (buflen > qty)
-                               buflen = qty;
-
-                       /* copy the IDs of the subscribed keys into the
-                        * buffer */
-                       ret = -EFAULT;
-
-                       for (loop = 0; loop < klist->nkeys; loop++) {
-                               key = rcu_deref_link_locked(klist, loop,
-                                                           keyring);
-
-                               tmp = sizeof(key_serial_t);
-                               if (tmp > buflen)
-                                       tmp = buflen;
-
-                               if (copy_to_user(buffer,
-                                                &key->serial,
-                                                tmp) != 0)
-                                       goto error;
-
-                               buflen -= tmp;
-                               if (buflen == 0)
-                                       break;
-                               buffer += tmp;
-                       }
-               }
+       kenter("{%d},,%zu", key_serial(keyring), buflen);
+
+       if (buflen & (sizeof(key_serial_t) - 1))
+               return -EINVAL;
+
+       nr_keys = keyring->keys.nr_leaves_on_tree;
+       if (nr_keys == 0)
+               return 0;
 
-               ret = qty;
+       /* Calculate how much data we could return */
+       ctx.qty = nr_keys * sizeof(key_serial_t);
+
+       if (!buffer || !buflen)
+               return ctx.qty;
+
+       /* Clamp to the buffer size so we never write past the end of it */
+       if (buflen < ctx.qty)
+               ctx.qty = buflen;
+
+       /* Copy the IDs of the subscribed keys into the buffer */
+       ctx.buffer = (key_serial_t __user *)buffer;
+       ctx.count = 0;
+       ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx);
+       if (ret < 0) {
+               kleave(" = %d [iterate]", ret);
+               return ret;
        }
 
-error:
-       return ret;
+       kleave(" = %zu [ok]", ctx.count);
+       return ctx.count;
 }
 
 /*
@@ -277,227 +500,361 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
 }
 EXPORT_SYMBOL(keyring_alloc);
 
-/**
- * keyring_search_aux - Search a keyring tree for a key matching some criteria
- * @keyring_ref: A pointer to the keyring with possession indicator.
- * @cred: The credentials to use for permissions checks.
- * @type: The type of key to search for.
- * @description: Parameter for @match.
- * @match: Function to rule on whether or not a key is the one required.
- * @no_state_check: Don't check if a matching key is bad
- *
- * Search the supplied keyring tree for a key that matches the criteria given.
- * The root keyring and any linked keyrings must grant Search permission to the
- * caller to be searchable and keys can only be found if they too grant Search
- * to the caller. The possession flag on the root keyring pointer controls use
- * of the possessor bits in permissions checking of the entire tree.  In
- * addition, the LSM gets to forbid keyring searches and key matches.
- *
- * The search is performed as a breadth-then-depth search up to the prescribed
- * limit (KEYRING_SEARCH_MAX_DEPTH).
- *
- * Keys are matched to the type provided and are then filtered by the match
- * function, which is given the description to use in any way it sees fit.  The
- * match function may use any attributes of a key that it wishes to to
- * determine the match.  Normally the match function from the key type would be
- * used.
- *
- * RCU is used to prevent the keyring key lists from disappearing without the
- * need to take lots of locks.
- *
- * Returns a pointer to the found key and increments the key usage count if
- * successful; -EAGAIN if no matching keys were found, or if expired or revoked
- * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the
- * specified keyring wasn't a keyring.
- *
- * In the case of a successful return, the possession attribute from
- * @keyring_ref is propagated to the returned key reference.
+/*
+ * Iteration function to consider each key found.
  */
-key_ref_t keyring_search_aux(key_ref_t keyring_ref,
-                            const struct cred *cred,
-                            struct key_type *type,
-                            const void *description,
-                            key_match_func_t match,
-                            bool no_state_check)
+static int keyring_search_iterator(const void *object, void *iterator_data)
 {
-       struct {
-               /* Need a separate keylist pointer for RCU purposes */
-               struct key *keyring;
-               struct keyring_list *keylist;
-               int kix;
-       } stack[KEYRING_SEARCH_MAX_DEPTH];
-
-       struct keyring_list *keylist;
-       struct timespec now;
-       unsigned long possessed, kflags;
-       struct key *keyring, *key;
-       key_ref_t key_ref;
-       long err;
-       int sp, nkeys, kix;
+       struct keyring_search_context *ctx = iterator_data;
+       const struct key *key = keyring_ptr_to_key(object);
+       unsigned long kflags = key->flags;
 
-       keyring = key_ref_to_ptr(keyring_ref);
-       possessed = is_key_possessed(keyring_ref);
-       key_check(keyring);
+       kenter("{%d}", key->serial);
 
-       /* top keyring must have search permission to begin the search */
-       err = key_task_permission(keyring_ref, cred, KEY_SEARCH);
-       if (err < 0) {
-               key_ref = ERR_PTR(err);
-               goto error;
+       /* ignore keys not of this type */
+       if (key->type != ctx->index_key.type) {
+               kleave(" = 0 [!type]");
+               return 0;
        }
 
-       key_ref = ERR_PTR(-ENOTDIR);
-       if (keyring->type != &key_type_keyring)
-               goto error;
+       /* skip invalidated, revoked and expired keys */
+       if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) {
+               if (kflags & ((1 << KEY_FLAG_INVALIDATED) |
+                             (1 << KEY_FLAG_REVOKED))) {
+                       ctx->result = ERR_PTR(-EKEYREVOKED);
+                       kleave(" = %d [invrev]", ctx->skipped_ret);
+                       goto skipped;
+               }
 
-       rcu_read_lock();
+               if (key->expiry && ctx->now.tv_sec >= key->expiry) {
+                       ctx->result = ERR_PTR(-EKEYEXPIRED);
+                       kleave(" = %d [expire]", ctx->skipped_ret);
+                       goto skipped;
+               }
+       }
 
-       now = current_kernel_time();
-       err = -EAGAIN;
-       sp = 0;
-
-       /* firstly we should check to see if this top-level keyring is what we
-        * are looking for */
-       key_ref = ERR_PTR(-EAGAIN);
-       kflags = keyring->flags;
-       if (keyring->type == type && match(keyring, description)) {
-               key = keyring;
-               if (no_state_check)
-                       goto found;
+       /* keys that don't match */
+       if (!ctx->match(key, ctx->match_data)) {
+               kleave(" = 0 [!match]");
+               return 0;
+       }
 
-               /* check it isn't negative and hasn't expired or been
-                * revoked */
-               if (kflags & (1 << KEY_FLAG_REVOKED))
-                       goto error_2;
-               if (key->expiry && now.tv_sec >= key->expiry)
-                       goto error_2;
-               key_ref = ERR_PTR(key->type_data.reject_error);
-               if (kflags & (1 << KEY_FLAG_NEGATIVE))
-                       goto error_2;
-               goto found;
+       /* key must have search permissions */
+       if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) &&
+           key_task_permission(make_key_ref(key, ctx->possessed),
+                               ctx->cred, KEY_SEARCH) < 0) {
+               ctx->result = ERR_PTR(-EACCES);
+               kleave(" = %d [!perm]", ctx->skipped_ret);
+               goto skipped;
        }
 
-       /* otherwise, the top keyring must not be revoked, expired, or
-        * negatively instantiated if we are to search it */
-       key_ref = ERR_PTR(-EAGAIN);
-       if (kflags & ((1 << KEY_FLAG_INVALIDATED) |
-                     (1 << KEY_FLAG_REVOKED) |
-                     (1 << KEY_FLAG_NEGATIVE)) ||
-           (keyring->expiry && now.tv_sec >= keyring->expiry))
-               goto error_2;
-
-       /* start processing a new keyring */
-descend:
-       kflags = keyring->flags;
-       if (kflags & ((1 << KEY_FLAG_INVALIDATED) |
-                     (1 << KEY_FLAG_REVOKED)))
-               goto not_this_keyring;
+       if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) {
+               /* we set a different error code if we pass a negative key */
+               if (kflags & (1 << KEY_FLAG_NEGATIVE)) {
+                       smp_rmb();
+                       ctx->result = ERR_PTR(key->type_data.reject_error);
+                       kleave(" = %d [neg]", ctx->skipped_ret);
+                       goto skipped;
+               }
+       }
 
-       keylist = rcu_dereference(keyring->payload.subscriptions);
-       if (!keylist)
-               goto not_this_keyring;
+       /* Found */
+       ctx->result = make_key_ref(key, ctx->possessed);
+       kleave(" = 1 [found]");
+       return 1;
 
-       /* iterate through the keys in this keyring first */
-       nkeys = keylist->nkeys;
-       smp_rmb();
-       for (kix = 0; kix < nkeys; kix++) {
-               key = rcu_dereference(keylist->keys[kix]);
-               kflags = key->flags;
+skipped:
+       return ctx->skipped_ret;
+}
 
-               /* ignore keys not of this type */
-               if (key->type != type)
-                       continue;
+/*
+ * Search inside a keyring for a key.  We can search by walking to it
+ * directly based on its index-key or we can iterate over the entire
+ * tree looking for it, based on the match function.
+ */
+static int search_keyring(struct key *keyring, struct keyring_search_context *ctx)
+{
+       if ((ctx->flags & KEYRING_SEARCH_LOOKUP_TYPE) ==
+           KEYRING_SEARCH_LOOKUP_DIRECT) {
+               const void *object;
+
+               object = assoc_array_find(&keyring->keys,
+                                         &keyring_assoc_array_ops,
+                                         &ctx->index_key);
+               return object ? ctx->iterator(object, ctx) : 0;
+       }
+       return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx);
+}
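
Two strategies are dispatched here: an exact walk keyed on the index key when
the caller can supply one, and a full scan filtered by the match function
otherwise.  A toy user-space analogue of the same split (a flat table stands
in for the index tree; all names are illustrative):

#include <stdio.h>
#include <string.h>

struct entry { const char *key; int value; };

/* direct lookup: the caller knows the exact key */
static const struct entry *find_direct(const struct entry *tab, size_t n,
				       const char *key)
{
	for (size_t i = 0; i < n; i++)
		if (strcmp(tab[i].key, key) == 0)
			return &tab[i];
	return NULL;
}

/* iterative lookup: every entry is offered to a match callback */
static const struct entry *find_iterate(const struct entry *tab, size_t n,
					int (*match)(const struct entry *))
{
	for (size_t i = 0; i < n; i++)
		if (match(&tab[i]))
			return &tab[i];
	return NULL;
}

static int value_is_even(const struct entry *e)
{
	return (e->value & 1) == 0;
}

int main(void)
{
	struct entry tab[] = { { "a", 1 }, { "b", 2 } };

	printf("%d\n", find_direct(tab, 2, "b")->value);		/* 2 */
	printf("%d\n", find_iterate(tab, 2, value_is_even)->value);	/* 2 */
	return 0;
}
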
 
-               /* skip invalidated, revoked and expired keys */
-               if (!no_state_check) {
-                       if (kflags & ((1 << KEY_FLAG_INVALIDATED) |
-                                     (1 << KEY_FLAG_REVOKED)))
-                               continue;
+/*
+ * Search a tree of keyrings that point to other keyrings up to the maximum
+ * depth.
+ */
+static bool search_nested_keyrings(struct key *keyring,
+                                  struct keyring_search_context *ctx)
+{
+       struct {
+               struct key *keyring;
+               struct assoc_array_node *node;
+               int slot;
+       } stack[KEYRING_SEARCH_MAX_DEPTH];
 
-                       if (key->expiry && now.tv_sec >= key->expiry)
-                               continue;
-               }
+       struct assoc_array_shortcut *shortcut;
+       struct assoc_array_node *node;
+       struct assoc_array_ptr *ptr;
+       struct key *key;
+       int sp = 0, slot;
 
-               /* keys that don't match */
-               if (!match(key, description))
-                       continue;
+       kenter("{%d},{%s,%s}",
+              keyring->serial,
+              ctx->index_key.type->name,
+              ctx->index_key.description);
 
-               /* key must have search permissions */
-               if (key_task_permission(make_key_ref(key, possessed),
-                                       cred, KEY_SEARCH) < 0)
-                       continue;
+       if (ctx->index_key.description)
+               ctx->index_key.desc_len = strlen(ctx->index_key.description);
 
-               if (no_state_check)
+       /* Check to see if this top-level keyring is what we are looking for
+        * and whether it is valid or not.
+        */
+       if (ctx->flags & KEYRING_SEARCH_LOOKUP_ITERATE ||
+           keyring_compare_object(keyring, &ctx->index_key)) {
+               ctx->skipped_ret = 2;
+               ctx->flags |= KEYRING_SEARCH_DO_STATE_CHECK;
+               switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) {
+               case 1:
                        goto found;
-
-               /* we set a different error code if we pass a negative key */
-               if (kflags & (1 << KEY_FLAG_NEGATIVE)) {
-                       err = key->type_data.reject_error;
-                       continue;
+               case 2:
+                       return false;
+               default:
+                       break;
                }
+       }
+
+       ctx->skipped_ret = 0;
+       if (ctx->flags & KEYRING_SEARCH_NO_STATE_CHECK)
+               ctx->flags &= ~KEYRING_SEARCH_DO_STATE_CHECK;
 
+       /* Start processing a new keyring */
+descend_to_keyring:
+       kdebug("descend to %d", keyring->serial);
+       if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
+                             (1 << KEY_FLAG_REVOKED)))
+               goto not_this_keyring;
+
+       /* Search through the keys in this keyring before searching any of its
+        * subtrees.
+        */
+       if (search_keyring(keyring, ctx))
                goto found;
-       }
 
-       /* search through the keyrings nested in this one */
-       kix = 0;
-ascend:
-       nkeys = keylist->nkeys;
-       smp_rmb();
-       for (; kix < nkeys; kix++) {
-               key = rcu_dereference(keylist->keys[kix]);
-               if (key->type != &key_type_keyring)
-                       continue;
+       /* Then manually iterate through the keyrings nested in this one.
+        *
+        * Start from the root node of the index tree.  Because of the way the
+        * hash function has been set up, keyrings cluster on the leftmost
+        * branch of the root node (root slot 0) or in the root node itself.
+        * Non-keyrings avoid the leftmost branch of the root entirely (root
+        * slots 1-15).
+        */
+       ptr = ACCESS_ONCE(keyring->keys.root);
+       if (!ptr)
+               goto not_this_keyring;
 
-               /* recursively search nested keyrings
-                * - only search keyrings for which we have search permission
+       if (assoc_array_ptr_is_shortcut(ptr)) {
+               /* If the root is a shortcut, either the keyring only contains
+                * keyring pointers (everything clusters behind root slot 0) or
+                * doesn't contain any keyring pointers.
                 */
-               if (sp >= KEYRING_SEARCH_MAX_DEPTH)
+               shortcut = assoc_array_ptr_to_shortcut(ptr);
+               smp_read_barrier_depends();
+               if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0)
+                       goto not_this_keyring;
+
+               ptr = ACCESS_ONCE(shortcut->next_node);
+               node = assoc_array_ptr_to_node(ptr);
+               goto begin_node;
+       }
+
+       node = assoc_array_ptr_to_node(ptr);
+       smp_read_barrier_depends();
+
+       ptr = node->slots[0];
+       if (!assoc_array_ptr_is_meta(ptr))
+               goto begin_node;
+
+descend_to_node:
+       /* Descend to a more distal node in this keyring's content tree and go
+        * through that.
+        */
+       kdebug("descend");
+       if (assoc_array_ptr_is_shortcut(ptr)) {
+               shortcut = assoc_array_ptr_to_shortcut(ptr);
+               smp_read_barrier_depends();
+               ptr = ACCESS_ONCE(shortcut->next_node);
+               BUG_ON(!assoc_array_ptr_is_node(ptr));
+               node = assoc_array_ptr_to_node(ptr);
+       }
+
+begin_node:
+       kdebug("begin_node");
+       smp_read_barrier_depends();
+       slot = 0;
+ascend_to_node:
+       /* Go through the slots in a node */
+       for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
+               ptr = ACCESS_ONCE(node->slots[slot]);
+
+               if (assoc_array_ptr_is_meta(ptr) && node->back_pointer)
+                       goto descend_to_node;
+
+               if (!keyring_ptr_is_keyring(ptr))
                        continue;
 
-               if (key_task_permission(make_key_ref(key, possessed),
-                                       cred, KEY_SEARCH) < 0)
+               key = keyring_ptr_to_key(ptr);
+
+               if (sp >= KEYRING_SEARCH_MAX_DEPTH) {
+                       if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) {
+                               ctx->result = ERR_PTR(-ELOOP);
+                               return false;
+                       }
+                       goto not_this_keyring;
+               }
+
+               /* Search a nested keyring */
+               if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) &&
+                   key_task_permission(make_key_ref(key, ctx->possessed),
+                                       ctx->cred, KEY_SEARCH) < 0)
                        continue;
 
                /* stack the current position */
                stack[sp].keyring = keyring;
-               stack[sp].keylist = keylist;
-               stack[sp].kix = kix;
+               stack[sp].node = node;
+               stack[sp].slot = slot;
                sp++;
 
                /* begin again with the new keyring */
                keyring = key;
-               goto descend;
+               goto descend_to_keyring;
        }
 
-       /* the keyring we're looking at was disqualified or didn't contain a
-        * matching key */
+       /* We've dealt with all the slots in the current node, so now we need
+        * to ascend to the parent and continue processing there.
+        */
+       ptr = ACCESS_ONCE(node->back_pointer);
+       slot = node->parent_slot;
+
+       if (ptr && assoc_array_ptr_is_shortcut(ptr)) {
+               shortcut = assoc_array_ptr_to_shortcut(ptr);
+               smp_read_barrier_depends();
+               ptr = ACCESS_ONCE(shortcut->back_pointer);
+               slot = shortcut->parent_slot;
+       }
+       if (!ptr)
+               goto not_this_keyring;
+       node = assoc_array_ptr_to_node(ptr);
+       smp_read_barrier_depends();
+       slot++;
+
+       /* If we've ascended to the root (zero backpointer), we must have just
+        * finished processing the leftmost branch rather than the root slots -
+        * so there can't be any more keyrings for us to find.
+        */
+       if (node->back_pointer) {
+               kdebug("ascend %d", slot);
+               goto ascend_to_node;
+       }
+
+       /* The keyring we're looking at was disqualified or didn't contain a
+        * matching key.
+        */
 not_this_keyring:
-       if (sp > 0) {
-               /* resume the processing of a keyring higher up in the tree */
-               sp--;
-               keyring = stack[sp].keyring;
-               keylist = stack[sp].keylist;
-               kix = stack[sp].kix + 1;
-               goto ascend;
+       kdebug("not_this_keyring %d", sp);
+       if (sp <= 0) {
+               kleave(" = false");
+               return false;
        }
 
-       key_ref = ERR_PTR(err);
-       goto error_2;
+       /* Resume the processing of a keyring higher up in the tree */
+       sp--;
+       keyring = stack[sp].keyring;
+       node = stack[sp].node;
+       slot = stack[sp].slot + 1;
+       kdebug("ascend to %d [%d]", keyring->serial, slot);
+       goto ascend_to_node;
 
-       /* we found a viable match */
+       /* We found a viable match */
 found:
-       atomic_inc(&key->usage);
-       key->last_used_at = now.tv_sec;
-       keyring->last_used_at = now.tv_sec;
-       while (sp > 0)
-               stack[--sp].keyring->last_used_at = now.tv_sec;
+       key = key_ref_to_ptr(ctx->result);
        key_check(key);
-       key_ref = make_key_ref(key, possessed);
-error_2:
+       if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) {
+               key->last_used_at = ctx->now.tv_sec;
+               keyring->last_used_at = ctx->now.tv_sec;
+               while (sp > 0)
+                       stack[--sp].keyring->last_used_at = ctx->now.tv_sec;
+       }
+       kleave(" = true");
+       return true;
+}
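
The traversal above is a depth-first walk driven by an explicit, bounded stack
of (node, slot) pairs rather than recursion, since kernel stacks are small and
the depth is capped anyway.  A stripped-down user-space sketch of the same
control flow (the node layout and fan-out of 4 are illustrative):

#include <stdio.h>

#define FAN_OUT		4
#define MAX_DEPTH	6

struct node {
	int serial;
	struct node *children[FAN_OUT];
};

static void walk(struct node *root)
{
	struct { struct node *n; int slot; } stack[MAX_DEPTH];
	struct node *n = root;
	int sp = 0, slot = 0;

	for (;;) {
		if (slot < FAN_OUT) {
			struct node *child = n->children[slot];

			if (child && sp < MAX_DEPTH) {
				/* stack the current position and descend */
				stack[sp].n = n;
				stack[sp].slot = slot;
				sp++;
				n = child;
				slot = 0;
			} else {
				slot++;	/* empty slot or too deep: skip it */
			}
			continue;
		}
		printf("visited %d\n", n->serial);
		if (sp == 0)
			return;
		/* resume the parent at the slot after the one we left */
		sp--;
		n = stack[sp].n;
		slot = stack[sp].slot + 1;
	}
}

int main(void)
{
	struct node leaf = { 3, { 0 } };
	struct node mid = { 2, { &leaf } };
	struct node root = { 1, { &mid } };

	walk(&root);	/* prints 3, 2, 1 */
	return 0;
}
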
+
+/**
+ * keyring_search_aux - Search a keyring tree for a key matching some criteria
+ * @keyring_ref: A pointer to the keyring with possession indicator.
+ * @ctx: The keyring search context.
+ *
+ * Search the supplied keyring tree for a key that matches the criteria given.
+ * The root keyring and any linked keyrings must grant Search permission to the
+ * caller to be searchable and keys can only be found if they too grant Search
+ * to the caller. The possession flag on the root keyring pointer controls use
+ * of the possessor bits in permissions checking of the entire tree.  In
+ * addition, the LSM gets to forbid keyring searches and key matches.
+ *
+ * The search is performed as a breadth-then-depth search up to the prescribed
+ * limit (KEYRING_SEARCH_MAX_DEPTH).
+ *
+ * Keys are matched to the type provided and are then filtered by the match
+ * function, which is given the description to use in any way it sees fit.  The
+ * match function may use any attributes of a key that it wishes to
+ * determine the match.  Normally the match function from the key type would be
+ * used.
+ *
+ * RCU can be used to prevent the keyring key lists from disappearing without
+ * the need to take lots of locks.
+ *
+ * Returns a pointer to the found key and increments the key usage count if
+ * successful; -EAGAIN if no matching keys were found, or if expired or revoked
+ * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the
+ * specified keyring wasn't a keyring.
+ *
+ * In the case of a successful return, the possession attribute from
+ * @keyring_ref is propagated to the returned key reference.
+ */
+key_ref_t keyring_search_aux(key_ref_t keyring_ref,
+                            struct keyring_search_context *ctx)
+{
+       struct key *keyring;
+       long err;
+
+       ctx->iterator = keyring_search_iterator;
+       ctx->possessed = is_key_possessed(keyring_ref);
+       ctx->result = ERR_PTR(-EAGAIN);
+
+       keyring = key_ref_to_ptr(keyring_ref);
+       key_check(keyring);
+
+       if (keyring->type != &key_type_keyring)
+               return ERR_PTR(-ENOTDIR);
+
+       if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) {
+               err = key_task_permission(keyring_ref, ctx->cred, KEY_SEARCH);
+               if (err < 0)
+                       return ERR_PTR(err);
+       }
+
+       rcu_read_lock();
+       ctx->now = current_kernel_time();
+       if (search_nested_keyrings(keyring, ctx))
+               __key_get(key_ref_to_ptr(ctx->result));
        rcu_read_unlock();
-error:
-       return key_ref;
+       return ctx->result;
 }
 
 /**
@@ -507,77 +864,73 @@ error:
  * @description: The name of the keyring we want to find.
  *
  * As keyring_search_aux() above, but using the current task's credentials and
- * type's default matching function.
+ * type's default matching function and preferred search method.
  */
 key_ref_t keyring_search(key_ref_t keyring,
                         struct key_type *type,
                         const char *description)
 {
-       if (!type->match)
+       struct keyring_search_context ctx = {
+               .index_key.type         = type,
+               .index_key.description  = description,
+               .cred                   = current_cred(),
+               .match                  = type->match,
+               .match_data             = description,
+               .flags                  = (type->def_lookup_type |
+                                          KEYRING_SEARCH_DO_STATE_CHECK),
+       };
+
+       if (!ctx.match)
                return ERR_PTR(-ENOKEY);
 
-       return keyring_search_aux(keyring, current->cred,
-                                 type, description, type->match, false);
+       return keyring_search_aux(keyring, &ctx);
 }
 EXPORT_SYMBOL(keyring_search);
 
 /*
- * Search the given keyring only (no recursion).
+ * Search the given keyring for a key that might be updated.
  *
  * The caller must guarantee that the keyring is a keyring and that the
- * permission is granted to search the keyring as no check is made here.
- *
- * RCU is used to make it unnecessary to lock the keyring key list here.
+ * permission is granted to modify the keyring as no check is made here.  The
+ * caller must also hold a lock on the keyring semaphore.
  *
  * Returns a pointer to the found key with usage count incremented if
- * successful and returns -ENOKEY if not found.  Revoked keys and keys not
- * providing the requested permission are skipped over.
+ * successful and returns NULL if not found.  Revoked and invalidated keys are
+ * skipped over.
  *
  * If successful, the possession indicator is propagated from the keyring ref
  * to the returned key reference.
  */
-key_ref_t __keyring_search_one(key_ref_t keyring_ref,
-                              const struct key_type *ktype,
-                              const char *description,
-                              key_perm_t perm)
+key_ref_t find_key_to_update(key_ref_t keyring_ref,
+                            const struct keyring_index_key *index_key)
 {
-       struct keyring_list *klist;
-       unsigned long possessed;
        struct key *keyring, *key;
-       int nkeys, loop;
+       const void *object;
 
        keyring = key_ref_to_ptr(keyring_ref);
-       possessed = is_key_possessed(keyring_ref);
 
-       rcu_read_lock();
+       kenter("{%d},{%s,%s}",
+              keyring->serial, index_key->type->name, index_key->description);
 
-       klist = rcu_dereference(keyring->payload.subscriptions);
-       if (klist) {
-               nkeys = klist->nkeys;
-               smp_rmb();
-               for (loop = 0; loop < nkeys ; loop++) {
-                       key = rcu_dereference(klist->keys[loop]);
-                       if (key->type == ktype &&
-                           (!key->type->match ||
-                            key->type->match(key, description)) &&
-                           key_permission(make_key_ref(key, possessed),
-                                          perm) == 0 &&
-                           !(key->flags & ((1 << KEY_FLAG_INVALIDATED) |
-                                           (1 << KEY_FLAG_REVOKED)))
-                           )
-                               goto found;
-               }
-       }
+       object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops,
+                                 index_key);
 
-       rcu_read_unlock();
-       return ERR_PTR(-ENOKEY);
+       if (object)
+               goto found;
+
+       kleave(" = NULL");
+       return NULL;
 
 found:
-       atomic_inc(&key->usage);
-       keyring->last_used_at = key->last_used_at =
-               current_kernel_time().tv_sec;
-       rcu_read_unlock();
-       return make_key_ref(key, possessed);
+       key = keyring_ptr_to_key(object);
+       if (key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+                         (1 << KEY_FLAG_REVOKED))) {
+               kleave(" = NULL [x]");
+               return NULL;
+       }
+       __key_get(key);
+       kleave(" = {%d}", key->serial);
+       return make_key_ref(key, is_key_possessed(keyring_ref));
 }
 
 /*
@@ -640,6 +993,19 @@ out:
        return keyring;
 }
 
+static int keyring_detect_cycle_iterator(const void *object,
+                                        void *iterator_data)
+{
+       struct keyring_search_context *ctx = iterator_data;
+       const struct key *key = keyring_ptr_to_key(object);
+
+       kenter("{%d}", key->serial);
+
+       BUG_ON(key != ctx->match_data);
+       ctx->result = ERR_PTR(-EDEADLK);
+       return 1;
+}
+
 /*
 * See if a cycle will be created by inserting acyclic tree B in acyclic
  * tree A at the topmost level (ie: as a direct child of A).
@@ -649,116 +1015,39 @@ out:
  */
 static int keyring_detect_cycle(struct key *A, struct key *B)
 {
-       struct {
-               struct keyring_list *keylist;
-               int kix;
-       } stack[KEYRING_SEARCH_MAX_DEPTH];
-
-       struct keyring_list *keylist;
-       struct key *subtree, *key;
-       int sp, nkeys, kix, ret;
+       struct keyring_search_context ctx = {
+               .index_key      = A->index_key,
+               .match_data     = A,
+               .iterator       = keyring_detect_cycle_iterator,
+               .flags          = (KEYRING_SEARCH_LOOKUP_DIRECT |
+                                  KEYRING_SEARCH_NO_STATE_CHECK |
+                                  KEYRING_SEARCH_NO_UPDATE_TIME |
+                                  KEYRING_SEARCH_NO_CHECK_PERM |
+                                  KEYRING_SEARCH_DETECT_TOO_DEEP),
+       };
 
        rcu_read_lock();
-
-       ret = -EDEADLK;
-       if (A == B)
-               goto cycle_detected;
-
-       subtree = B;
-       sp = 0;
-
-       /* start processing a new keyring */
-descend:
-       if (test_bit(KEY_FLAG_REVOKED, &subtree->flags))
-               goto not_this_keyring;
-
-       keylist = rcu_dereference(subtree->payload.subscriptions);
-       if (!keylist)
-               goto not_this_keyring;
-       kix = 0;
-
-ascend:
-       /* iterate through the remaining keys in this keyring */
-       nkeys = keylist->nkeys;
-       smp_rmb();
-       for (; kix < nkeys; kix++) {
-               key = rcu_dereference(keylist->keys[kix]);
-
-               if (key == A)
-                       goto cycle_detected;
-
-               /* recursively check nested keyrings */
-               if (key->type == &key_type_keyring) {
-                       if (sp >= KEYRING_SEARCH_MAX_DEPTH)
-                               goto too_deep;
-
-                       /* stack the current position */
-                       stack[sp].keylist = keylist;
-                       stack[sp].kix = kix;
-                       sp++;
-
-                       /* begin again with the new keyring */
-                       subtree = key;
-                       goto descend;
-               }
-       }
-
-       /* the keyring we're looking at was disqualified or didn't contain a
-        * matching key */
-not_this_keyring:
-       if (sp > 0) {
-               /* resume the checking of a keyring higher up in the tree */
-               sp--;
-               keylist = stack[sp].keylist;
-               kix = stack[sp].kix + 1;
-               goto ascend;
-       }
-
-       ret = 0; /* no cycles detected */
-
-error:
+       search_nested_keyrings(B, &ctx);
        rcu_read_unlock();
-       return ret;
-
-too_deep:
-       ret = -ELOOP;
-       goto error;
-
-cycle_detected:
-       ret = -EDEADLK;
-       goto error;
-}
-
-/*
- * Dispose of a keyring list after the RCU grace period, freeing the unlinked
- * key
- */
-static void keyring_unlink_rcu_disposal(struct rcu_head *rcu)
-{
-       struct keyring_list *klist =
-               container_of(rcu, struct keyring_list, rcu);
-
-       if (klist->delkey != USHRT_MAX)
-               key_put(rcu_access_pointer(klist->keys[klist->delkey]));
-       kfree(klist);
+       return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result);
 }
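
The return statement above leans on the kernel's ERR_PTR convention: a single
pointer-sized slot carries either a valid object or a small negative errno
value.  A user-space sketch of the encoding (MAX_ERRNO and the helpers mirror
the kernel's err.h, simplified):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* the top 4095 addresses are reserved for errno values */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	void *r = ERR_PTR(-EDEADLK);

	if (IS_ERR(r))
		printf("error %ld\n", PTR_ERR(r));	/* prints the negative errno */
	return 0;
}
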
 
 /*
 * Preallocate memory so that a key can be linked into a keyring.
  */
-int __key_link_begin(struct key *keyring, const struct key_type *type,
-                    const char *description, unsigned long *_prealloc)
+int __key_link_begin(struct key *keyring,
+                    const struct keyring_index_key *index_key,
+                    struct assoc_array_edit **_edit)
        __acquires(&keyring->sem)
        __acquires(&keyring_serialise_link_sem)
 {
-       struct keyring_list *klist, *nklist;
-       unsigned long prealloc;
-       unsigned max;
-       time_t lowest_lru;
-       size_t size;
-       int loop, lru, ret;
+       struct assoc_array_edit *edit;
+       int ret;
+
+       kenter("%d,%s,%s,",
+              keyring->serial, index_key->type->name, index_key->description);
 
-       kenter("%d,%s,%s,", key_serial(keyring), type->name, description);
+       BUG_ON(index_key->desc_len == 0);
 
        if (keyring->type != &key_type_keyring)
                return -ENOTDIR;
@@ -771,100 +1060,39 @@ int __key_link_begin(struct key *keyring, const struct key_type *type,
 
        /* serialise link/link calls to prevent parallel calls causing a cycle
         * when linking two keyrings in opposite orders */
-       if (type == &key_type_keyring)
+       if (index_key->type == &key_type_keyring)
                down_write(&keyring_serialise_link_sem);
 
-       klist = rcu_dereference_locked_keyring(keyring);
-
-       /* see if there's a matching key we can displace */
-       lru = -1;
-       if (klist && klist->nkeys > 0) {
-               lowest_lru = TIME_T_MAX;
-               for (loop = klist->nkeys - 1; loop >= 0; loop--) {
-                       struct key *key = rcu_deref_link_locked(klist, loop,
-                                                               keyring);
-                       if (key->type == type &&
-                           strcmp(key->description, description) == 0) {
-                               /* Found a match - we'll replace the link with
-                                * one to the new key.  We record the slot
-                                * position.
-                                */
-                               klist->delkey = loop;
-                               prealloc = 0;
-                               goto done;
-                       }
-                       if (key->last_used_at < lowest_lru) {
-                               lowest_lru = key->last_used_at;
-                               lru = loop;
-                       }
-               }
-       }
-
-       /* If the keyring is full then do an LRU discard */
-       if (klist &&
-           klist->nkeys == klist->maxkeys &&
-           klist->maxkeys >= MAX_KEYRING_LINKS) {
-               kdebug("LRU discard %d\n", lru);
-               klist->delkey = lru;
-               prealloc = 0;
-               goto done;
-       }
-
-       /* check that we aren't going to overrun the user's quota */
-       ret = key_payload_reserve(keyring,
-                                 keyring->datalen + KEYQUOTA_LINK_BYTES);
-       if (ret < 0)
+       /* Create an edit script that will insert/replace the key in the
+        * keyring tree.
+        */
+       edit = assoc_array_insert(&keyring->keys,
+                                 &keyring_assoc_array_ops,
+                                 index_key,
+                                 NULL);
+       if (IS_ERR(edit)) {
+               ret = PTR_ERR(edit);
                goto error_sem;
+       }
 
-       if (klist && klist->nkeys < klist->maxkeys) {
-               /* there's sufficient slack space to append directly */
-               klist->delkey = klist->nkeys;
-               prealloc = KEY_LINK_FIXQUOTA;
-       } else {
-               /* grow the key list */
-               max = 4;
-               if (klist) {
-                       max += klist->maxkeys;
-                       if (max > MAX_KEYRING_LINKS)
-                               max = MAX_KEYRING_LINKS;
-                       BUG_ON(max <= klist->maxkeys);
-               }
-
-               size = sizeof(*klist) + sizeof(struct key *) * max;
-
-               ret = -ENOMEM;
-               nklist = kmalloc(size, GFP_KERNEL);
-               if (!nklist)
-                       goto error_quota;
-
-               nklist->maxkeys = max;
-               if (klist) {
-                       memcpy(nklist->keys, klist->keys,
-                              sizeof(struct key *) * klist->nkeys);
-                       nklist->delkey = klist->nkeys;
-                       nklist->nkeys = klist->nkeys + 1;
-                       klist->delkey = USHRT_MAX;
-               } else {
-                       nklist->nkeys = 1;
-                       nklist->delkey = 0;
-               }
-
-               /* add the key into the new space */
-               RCU_INIT_POINTER(nklist->keys[nklist->delkey], NULL);
-               prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA;
+       /* If we're not replacing a link in-place then we're going to need some
+        * extra quota.
+        */
+       if (!edit->dead_leaf) {
+               ret = key_payload_reserve(keyring,
+                                         keyring->datalen + KEYQUOTA_LINK_BYTES);
+               if (ret < 0)
+                       goto error_cancel;
        }
 
-done:
-       *_prealloc = prealloc;
+       *_edit = edit;
        kleave(" = 0");
        return 0;
 
-error_quota:
-       /* undo the quota changes */
-       key_payload_reserve(keyring,
-                           keyring->datalen - KEYQUOTA_LINK_BYTES);
+error_cancel:
+       assoc_array_cancel_edit(edit);
 error_sem:
-       if (type == &key_type_keyring)
+       if (index_key->type == &key_type_keyring)
                up_write(&keyring_serialise_link_sem);
 error_krsem:
        up_write(&keyring->sem);
@@ -895,60 +1123,12 @@ int __key_link_check_live_key(struct key *keyring, struct key *key)
  * holds at most one link to any given key of a particular type+description
  * combination.
  */
-void __key_link(struct key *keyring, struct key *key,
-               unsigned long *_prealloc)
+void __key_link(struct key *key, struct assoc_array_edit **_edit)
 {
-       struct keyring_list *klist, *nklist;
-       struct key *discard;
-
-       nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA);
-       *_prealloc = 0;
-
-       kenter("%d,%d,%p", keyring->serial, key->serial, nklist);
-
-       klist = rcu_dereference_locked_keyring(keyring);
-
-       atomic_inc(&key->usage);
-       keyring->last_used_at = key->last_used_at =
-               current_kernel_time().tv_sec;
-
-       /* there's a matching key we can displace or an empty slot in a newly
-        * allocated list we can fill */
-       if (nklist) {
-               kdebug("reissue %hu/%hu/%hu",
-                      nklist->delkey, nklist->nkeys, nklist->maxkeys);
-
-               RCU_INIT_POINTER(nklist->keys[nklist->delkey], key);
-
-               rcu_assign_pointer(keyring->payload.subscriptions, nklist);
-
-               /* dispose of the old keyring list and, if there was one, the
-                * displaced key */
-               if (klist) {
-                       kdebug("dispose %hu/%hu/%hu",
-                              klist->delkey, klist->nkeys, klist->maxkeys);
-                       call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
-               }
-       } else if (klist->delkey < klist->nkeys) {
-               kdebug("replace %hu/%hu/%hu",
-                      klist->delkey, klist->nkeys, klist->maxkeys);
-
-               discard = rcu_dereference_protected(
-                       klist->keys[klist->delkey],
-                       rwsem_is_locked(&keyring->sem));
-               rcu_assign_pointer(klist->keys[klist->delkey], key);
-               /* The garbage collector will take care of RCU
-                * synchronisation */
-               key_put(discard);
-       } else {
-               /* there's sufficient slack space to append directly */
-               kdebug("append %hu/%hu/%hu",
-                      klist->delkey, klist->nkeys, klist->maxkeys);
-
-               RCU_INIT_POINTER(klist->keys[klist->delkey], key);
-               smp_wmb();
-               klist->nkeys++;
-       }
+       __key_get(key);
+       assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key));
+       assoc_array_apply_edit(*_edit);
+       *_edit = NULL;
 }
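
__key_link_begin(), __key_link() and __key_link_end() split the update into a
fallible preparation phase and an apply phase that cannot fail, so the caller
can take locks, run checks and only then commit.  A minimal user-space sketch
of that reserve-then-commit shape (all names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct edit {
	int *slot;
	int value;
};

/* begin: everything that can fail happens here */
static struct edit *edit_begin(int *slot, int value)
{
	struct edit *edit = malloc(sizeof(*edit));

	if (!edit)
		return NULL;
	edit->slot = slot;
	edit->value = value;
	return edit;
}

/* apply: guaranteed to succeed once begin has */
static void edit_apply(struct edit *edit)
{
	*edit->slot = edit->value;
	free(edit);
}

/* cancel: back out without touching the target */
static void edit_cancel(struct edit *edit)
{
	free(edit);
}

int main(void)
{
	int target = 0;
	struct edit *edit = edit_begin(&target, 42);

	if (!edit)
		return 1;
	if (target == 0)		/* stand-in for __key_link_check_live_key() */
		edit_apply(edit);
	else
		edit_cancel(edit);
	printf("%d\n", target);		/* 42 */
	return 0;
}
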
 
 /*
@@ -956,24 +1136,22 @@ void __key_link(struct key *keyring, struct key *key,
  *
  * Must be called with __key_link_begin() having being called.
  */
-void __key_link_end(struct key *keyring, struct key_type *type,
-                   unsigned long prealloc)
+void __key_link_end(struct key *keyring,
+                   const struct keyring_index_key *index_key,
+                   struct assoc_array_edit *edit)
        __releases(&keyring->sem)
        __releases(&keyring_serialise_link_sem)
 {
-       BUG_ON(type == NULL);
-       BUG_ON(type->name == NULL);
-       kenter("%d,%s,%lx", keyring->serial, type->name, prealloc);
+       BUG_ON(index_key->type == NULL);
+       kenter("%d,%s,", keyring->serial, index_key->type->name);
 
-       if (type == &key_type_keyring)
+       if (index_key->type == &key_type_keyring)
                up_write(&keyring_serialise_link_sem);
 
-       if (prealloc) {
-               if (prealloc & KEY_LINK_FIXQUOTA)
-                       key_payload_reserve(keyring,
-                                           keyring->datalen -
-                                           KEYQUOTA_LINK_BYTES);
-               kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA));
+       if (edit && !edit->dead_leaf) {
+               key_payload_reserve(keyring,
+                                   keyring->datalen - KEYQUOTA_LINK_BYTES);
+               assoc_array_cancel_edit(edit);
        }
        up_write(&keyring->sem);
 }
@@ -1000,20 +1178,28 @@ void __key_link_end(struct key *keyring, struct key_type *type,
  */
 int key_link(struct key *keyring, struct key *key)
 {
-       unsigned long prealloc;
+       struct assoc_array_edit *edit;
        int ret;
 
+       kenter("{%d,%d}", keyring->serial, atomic_read(&keyring->usage));
+
        key_check(keyring);
        key_check(key);
 
-       ret = __key_link_begin(keyring, key->type, key->description, &prealloc);
+       if (test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags) &&
+           !test_bit(KEY_FLAG_TRUSTED, &key->flags))
+               return -EPERM;
+
+       ret = __key_link_begin(keyring, &key->index_key, &edit);
        if (ret == 0) {
+               kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage));
                ret = __key_link_check_live_key(keyring, key);
                if (ret == 0)
-                       __key_link(keyring, key, &prealloc);
-               __key_link_end(keyring, key->type, prealloc);
+                       __key_link(key, &edit);
+               __key_link_end(keyring, &key->index_key, edit);
        }
 
+       kleave(" = %d {%d,%d}", ret, keyring->serial, atomic_read(&keyring->usage));
        return ret;
 }
 EXPORT_SYMBOL(key_link);
@@ -1037,90 +1223,37 @@ EXPORT_SYMBOL(key_link);
  */
 int key_unlink(struct key *keyring, struct key *key)
 {
-       struct keyring_list *klist, *nklist;
-       int loop, ret;
+       struct assoc_array_edit *edit;
+       int ret;
 
        key_check(keyring);
        key_check(key);
 
-       ret = -ENOTDIR;
        if (keyring->type != &key_type_keyring)
-               goto error;
+               return -ENOTDIR;
 
        down_write(&keyring->sem);
 
-       klist = rcu_dereference_locked_keyring(keyring);
-       if (klist) {
-               /* search the keyring for the key */
-               for (loop = 0; loop < klist->nkeys; loop++)
-                       if (rcu_access_pointer(klist->keys[loop]) == key)
-                               goto key_is_present;
+       edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops,
+                                 &key->index_key);
+       if (IS_ERR(edit)) {
+               ret = PTR_ERR(edit);
+               goto error;
        }
-
-       up_write(&keyring->sem);
        ret = -ENOENT;
-       goto error;
-
-key_is_present:
-       /* we need to copy the key list for RCU purposes */
-       nklist = kmalloc(sizeof(*klist) +
-                        sizeof(struct key *) * klist->maxkeys,
-                        GFP_KERNEL);
-       if (!nklist)
-               goto nomem;
-       nklist->maxkeys = klist->maxkeys;
-       nklist->nkeys = klist->nkeys - 1;
-
-       if (loop > 0)
-               memcpy(&nklist->keys[0],
-                      &klist->keys[0],
-                      loop * sizeof(struct key *));
-
-       if (loop < nklist->nkeys)
-               memcpy(&nklist->keys[loop],
-                      &klist->keys[loop + 1],
-                      (nklist->nkeys - loop) * sizeof(struct key *));
-
-       /* adjust the user's quota */
-       key_payload_reserve(keyring,
-                           keyring->datalen - KEYQUOTA_LINK_BYTES);
-
-       rcu_assign_pointer(keyring->payload.subscriptions, nklist);
-
-       up_write(&keyring->sem);
-
-       /* schedule for later cleanup */
-       klist->delkey = loop;
-       call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
+       if (edit == NULL)
+               goto error;
 
+       assoc_array_apply_edit(edit);
+       key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES);
        ret = 0;
 
 error:
-       return ret;
-nomem:
-       ret = -ENOMEM;
        up_write(&keyring->sem);
-       goto error;
+       return ret;
 }
 EXPORT_SYMBOL(key_unlink);
 
-/*
- * Dispose of a keyring list after the RCU grace period, releasing the keys it
- * links to.
- */
-static void keyring_clear_rcu_disposal(struct rcu_head *rcu)
-{
-       struct keyring_list *klist;
-       int loop;
-
-       klist = container_of(rcu, struct keyring_list, rcu);
-
-       for (loop = klist->nkeys - 1; loop >= 0; loop--)
-               key_put(rcu_access_pointer(klist->keys[loop]));
-
-       kfree(klist);
-}
-
 /**
  * keyring_clear - Clear a keyring
  * @keyring: The keyring to clear.
@@ -1131,33 +1264,25 @@ static void keyring_clear_rcu_disposal(struct rcu_head *rcu)
  */
 int keyring_clear(struct key *keyring)
 {
-       struct keyring_list *klist;
+       struct assoc_array_edit *edit;
        int ret;
 
-       ret = -ENOTDIR;
-       if (keyring->type == &key_type_keyring) {
-               /* detach the pointer block with the locks held */
-               down_write(&keyring->sem);
-
-               klist = rcu_dereference_locked_keyring(keyring);
-               if (klist) {
-                       /* adjust the quota */
-                       key_payload_reserve(keyring,
-                                           sizeof(struct keyring_list));
-
-                       rcu_assign_pointer(keyring->payload.subscriptions,
-                                          NULL);
-               }
-
-               up_write(&keyring->sem);
+       if (keyring->type != &key_type_keyring)
+               return -ENOTDIR;
 
-               /* free the keys after the locks have been dropped */
-               if (klist)
-                       call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
+       down_write(&keyring->sem);
 
+       edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops);
+       if (IS_ERR(edit)) {
+               ret = PTR_ERR(edit);
+       } else {
+               if (edit)
+                       assoc_array_apply_edit(edit);
+               key_payload_reserve(keyring, 0);
                ret = 0;
        }
 
+       up_write(&keyring->sem);
        return ret;
 }
 EXPORT_SYMBOL(keyring_clear);
@@ -1169,111 +1294,68 @@ EXPORT_SYMBOL(keyring_clear);
  */
 static void keyring_revoke(struct key *keyring)
 {
-       struct keyring_list *klist;
+       struct assoc_array_edit *edit;
+
+       edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops);
+       if (!IS_ERR(edit)) {
+               if (edit)
+                       assoc_array_apply_edit(edit);
+               key_payload_reserve(keyring, 0);
+       }
+}
+
+static bool keyring_gc_select_iterator(void *object, void *iterator_data)
+{
+       struct key *key = keyring_ptr_to_key(object);
+       time_t *limit = iterator_data;
 
-       klist = rcu_dereference_locked_keyring(keyring);
+       if (key_is_dead(key, *limit))
+               return false;
+       key_get(key);
+       return true;
+}
 
-       /* adjust the quota */
-       key_payload_reserve(keyring, 0);
+static int keyring_gc_check_iterator(const void *object, void *iterator_data)
+{
+       const struct key *key = keyring_ptr_to_key(object);
+       time_t *limit = iterator_data;
 
-       if (klist) {
-               rcu_assign_pointer(keyring->payload.subscriptions, NULL);
-               call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
-       }
+       key_check(key);
+       return key_is_dead(key, *limit);
 }
 
 /*
- * Collect garbage from the contents of a keyring, replacing the old list with
- * a new one with the pointers all shuffled down.
+ * Garbage collect pointers from a keyring.
  *
- * Dead keys are classed as ones that are flagged as being dead or are revoked,
- * expired or negative keys that were revoked or expired before the specified
- * limit.
+ * Not called with any locks held.  The keyring's key struct will not be
+ * deallocated under us as only our caller may deallocate it.
  */
 void keyring_gc(struct key *keyring, time_t limit)
 {
-       struct keyring_list *klist, *new;
-       struct key *key;
-       int loop, keep, max;
-
-       kenter("{%x,%s}", key_serial(keyring), keyring->description);
-
-       down_write(&keyring->sem);
-
-       klist = rcu_dereference_locked_keyring(keyring);
-       if (!klist)
-               goto no_klist;
-
-       /* work out how many subscriptions we're keeping */
-       keep = 0;
-       for (loop = klist->nkeys - 1; loop >= 0; loop--)
-               if (!key_is_dead(rcu_deref_link_locked(klist, loop, keyring),
-                                limit))
-                       keep++;
-
-       if (keep == klist->nkeys)
-               goto just_return;
-
-       /* allocate a new keyring payload */
-       max = roundup(keep, 4);
-       new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *),
-                     GFP_KERNEL);
-       if (!new)
-               goto nomem;
-       new->maxkeys = max;
-       new->nkeys = 0;
-       new->delkey = 0;
-
-       /* install the live keys
-        * - must take care as expired keys may be updated back to life
-        */
-       keep = 0;
-       for (loop = klist->nkeys - 1; loop >= 0; loop--) {
-               key = rcu_deref_link_locked(klist, loop, keyring);
-               if (!key_is_dead(key, limit)) {
-                       if (keep >= max)
-                               goto discard_new;
-                       RCU_INIT_POINTER(new->keys[keep++], key_get(key));
-               }
-       }
-       new->nkeys = keep;
-
-       /* adjust the quota */
-       key_payload_reserve(keyring,
-                           sizeof(struct keyring_list) +
-                           KEYQUOTA_LINK_BYTES * keep);
+       int result;
 
-       if (keep == 0) {
-               rcu_assign_pointer(keyring->payload.subscriptions, NULL);
-               kfree(new);
-       } else {
-               rcu_assign_pointer(keyring->payload.subscriptions, new);
-       }
+       kenter("%x{%s}", keyring->serial, keyring->description ?: "");
 
-       up_write(&keyring->sem);
+       if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
+                             (1 << KEY_FLAG_REVOKED)))
+               goto dont_gc;
 
-       call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
-       kleave(" [yes]");
-       return;
-
-discard_new:
-       new->nkeys = keep;
-       keyring_clear_rcu_disposal(&new->rcu);
-       up_write(&keyring->sem);
-       kleave(" [discard]");
-       return;
-
-just_return:
-       up_write(&keyring->sem);
-       kleave(" [no dead]");
-       return;
+       /* scan the keyring looking for dead keys */
+       rcu_read_lock();
+       result = assoc_array_iterate(&keyring->keys,
+                                    keyring_gc_check_iterator, &limit);
+       rcu_read_unlock();
+       if (result == true)
+               goto do_gc;
 
-no_klist:
-       up_write(&keyring->sem);
-       kleave(" [no_klist]");
+dont_gc:
+       kleave(" [no gc]");
        return;
 
-nomem:
+do_gc:
+       down_write(&keyring->sem);
+       assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops,
+                      keyring_gc_select_iterator, &limit);
        up_write(&keyring->sem);
-       kleave(" [oom]");
+       kleave(" [gc]");
 }
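
The rewritten keyring_gc() above runs in two phases: a read-side scan under RCU (assoc_array_iterate() with keyring_gc_check_iterator()) that only decides whether any key is dead, then a write-locked pass (assoc_array_gc() with keyring_gc_select_iterator()) that actually rebuilds the array. Below is a minimal userspace sketch of the same two-phase shape, with a flat array and a rwlock standing in for the kernel structures; none of the names in it are kernel APIs.

    /* Two-phase GC sketch: a cheap shared-lock scan decides whether an
     * exclusive compaction pass is needed at all.
     * Build: cc -std=c99 gc.c -lpthread */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct entry { int serial; long expiry; };

    static pthread_rwlock_t ring_lock = PTHREAD_RWLOCK_INITIALIZER;
    static struct entry ring[4] = { {1, 100}, {2, 40}, {3, 90}, {4, 10} };
    static int nr = 4;

    static bool entry_is_dead(const struct entry *e, long limit)
    {
            return e->expiry <= limit;
    }

    /* phase 1: shared lock only - just ask whether GC is needed */
    static bool gc_check(long limit)
    {
            bool dead = false;

            pthread_rwlock_rdlock(&ring_lock);
            for (int i = 0; i < nr && !dead; i++)
                    dead = entry_is_dead(&ring[i], limit);
            pthread_rwlock_unlock(&ring_lock);
            return dead;
    }

    /* phase 2: exclusive lock - compact the live entries in place */
    static void gc_collect(long limit)
    {
            int keep = 0;

            pthread_rwlock_wrlock(&ring_lock);
            for (int i = 0; i < nr; i++)
                    if (!entry_is_dead(&ring[i], limit))
                            ring[keep++] = ring[i];
            nr = keep;
            pthread_rwlock_unlock(&ring_lock);
    }

    int main(void)
    {
            if (gc_check(50))       /* common case: nothing dead, no write lock */
                    gc_collect(50);
            printf("%d live entries\n", nr);
            return 0;
    }

The payoff mirrored here is that the common case (no dead keys) never takes the keyring semaphore exclusively.
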
diff --git a/security/keys/persistent.c b/security/keys/persistent.c
new file mode 100644
index 0000000..0ad3ee2
--- /dev/null
@@ -0,0 +1,167 @@
+/* General persistent per-UID keyrings register
+ *
+ * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/user_namespace.h>
+#include "internal.h"
+
+unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */
+
+/*
+ * Create the persistent keyring register for the current user namespace.
+ *
+ * Called with the namespace's sem locked for writing.
+ */
+static int key_create_persistent_register(struct user_namespace *ns)
+{
+       struct key *reg = keyring_alloc(".persistent_register",
+                                       KUIDT_INIT(0), KGIDT_INIT(0),
+                                       current_cred(),
+                                       ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                                        KEY_USR_VIEW | KEY_USR_READ),
+                                       KEY_ALLOC_NOT_IN_QUOTA, NULL);
+       if (IS_ERR(reg))
+               return PTR_ERR(reg);
+
+       ns->persistent_keyring_register = reg;
+       return 0;
+}
+
+/*
+ * Create the persistent keyring for the specified user.
+ *
+ * Called with the namespace's sem locked for writing.
+ */
+static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid,
+                                      struct keyring_index_key *index_key)
+{
+       struct key *persistent;
+       key_ref_t reg_ref, persistent_ref;
+
+       if (!ns->persistent_keyring_register) {
+               long err = key_create_persistent_register(ns);
+               if (err < 0)
+                       return ERR_PTR(err);
+       } else {
+               reg_ref = make_key_ref(ns->persistent_keyring_register, true);
+               persistent_ref = find_key_to_update(reg_ref, index_key);
+               if (persistent_ref)
+                       return persistent_ref;
+       }
+
+       persistent = keyring_alloc(index_key->description,
+                                  uid, INVALID_GID, current_cred(),
+                                  ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                                   KEY_USR_VIEW | KEY_USR_READ),
+                                  KEY_ALLOC_NOT_IN_QUOTA,
+                                  ns->persistent_keyring_register);
+       if (IS_ERR(persistent))
+               return ERR_CAST(persistent);
+
+       return make_key_ref(persistent, true);
+}
+
+/*
+ * Get the persistent keyring for a specific UID and link it to the nominated
+ * keyring.
+ */
+static long key_get_persistent(struct user_namespace *ns, kuid_t uid,
+                              key_ref_t dest_ref)
+{
+       struct keyring_index_key index_key;
+       struct key *persistent;
+       key_ref_t reg_ref, persistent_ref;
+       char buf[32];
+       long ret;
+
+       /* Look in the register if it exists */
+       index_key.type = &key_type_keyring;
+       index_key.description = buf;
+       index_key.desc_len = sprintf(buf, "_persistent.%u", from_kuid(ns, uid));
+
+       if (ns->persistent_keyring_register) {
+               reg_ref = make_key_ref(ns->persistent_keyring_register, true);
+               down_read(&ns->persistent_keyring_register_sem);
+               persistent_ref = find_key_to_update(reg_ref, &index_key);
+               up_read(&ns->persistent_keyring_register_sem);
+
+               if (persistent_ref)
+                       goto found;
+       }
+
+       /* It wasn't in the register, so we'll need to create it.  We might
+        * also need to create the register.
+        */
+       down_write(&ns->persistent_keyring_register_sem);
+       persistent_ref = key_create_persistent(ns, uid, &index_key);
+       up_write(&ns->persistent_keyring_register_sem);
+       if (!IS_ERR(persistent_ref))
+               goto found;
+
+       return PTR_ERR(persistent_ref);
+
+found:
+       ret = key_task_permission(persistent_ref, current_cred(), KEY_LINK);
+       if (ret == 0) {
+               persistent = key_ref_to_ptr(persistent_ref);
+               ret = key_link(key_ref_to_ptr(dest_ref), persistent);
+               if (ret == 0) {
+                       key_set_timeout(persistent, persistent_keyring_expiry);
+                       ret = persistent->serial;               
+               }
+       }
+
+       key_ref_put(persistent_ref);
+       return ret;
+}
+
+/*
+ * Get the persistent keyring for a specific UID and link it to the nominated
+ * keyring.
+ */
+long keyctl_get_persistent(uid_t _uid, key_serial_t destid)
+{
+       struct user_namespace *ns = current_user_ns();
+       key_ref_t dest_ref;
+       kuid_t uid;
+       long ret;
+
+       /* -1 indicates the current user */
+       if (_uid == (uid_t)-1) {
+               uid = current_uid();
+       } else {
+               uid = make_kuid(ns, _uid);
+               if (!uid_valid(uid))
+                       return -EINVAL;
+
+               /* You can only see your own persistent cache if you're not
+                * sufficiently privileged.
+                */
+               if (!uid_eq(uid, current_uid()) &&
+                   !uid_eq(uid, current_euid()) &&
+                   !ns_capable(ns, CAP_SETUID))
+                       return -EPERM;
+       }
+
+       /* There must be a destination keyring */
+       dest_ref = lookup_user_key(destid, KEY_LOOKUP_CREATE, KEY_WRITE);
+       if (IS_ERR(dest_ref))
+               return PTR_ERR(dest_ref);
+       if (key_ref_to_ptr(dest_ref)->type != &key_type_keyring) {
+               ret = -ENOTDIR;
+               goto out_put_dest;
+       }
+
+       ret = key_get_persistent(ns, uid, dest_ref);
+
+out_put_dest:
+       key_ref_put(dest_ref);
+       return ret;
+}
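
keyctl_get_persistent() above is the handler the new KEYCTL_GET_PERSISTENT operation lands on. A hypothetical userspace caller might look like the sketch below; it assumes a libkeyutils recent enough to wrap the new operation (the wrapper's name and availability are an assumption, not part of this diff).

    /* Sketch: fetch (creating on demand) the caller's persistent keyring
     * and link it into the session keyring.
     * Build: cc get_persistent.c -lkeyutils */
    #include <sys/types.h>
    #include <keyutils.h>
    #include <stdio.h>

    int main(void)
    {
            /* (uid_t)-1 selects the calling user, per keyctl_get_persistent()
             * above; the destination keyring must grant us write */
            long serial = keyctl_get_persistent((uid_t)-1,
                                                KEY_SPEC_SESSION_KEYRING);

            if (serial < 0) {
                    perror("keyctl_get_persistent");
                    return 1;
            }
            printf("persistent keyring: %ld\n", serial);
            return 0;
    }
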
diff --git a/security/keys/proc.c b/security/keys/proc.c
index 217b685..88e9a46 100644
@@ -182,7 +182,6 @@ static void proc_keys_stop(struct seq_file *p, void *v)
 
 static int proc_keys_show(struct seq_file *m, void *v)
 {
-       const struct cred *cred = current_cred();
        struct rb_node *_p = v;
        struct key *key = rb_entry(_p, struct key, serial_node);
        struct timespec now;
@@ -191,15 +190,23 @@ static int proc_keys_show(struct seq_file *m, void *v)
        char xbuf[12];
        int rc;
 
+       struct keyring_search_context ctx = {
+               .index_key.type         = key->type,
+               .index_key.description  = key->description,
+               .cred                   = current_cred(),
+               .match                  = lookup_user_key_possessed,
+               .match_data             = key,
+               .flags                  = (KEYRING_SEARCH_NO_STATE_CHECK |
+                                          KEYRING_SEARCH_LOOKUP_DIRECT),
+       };
+
        key_ref = make_key_ref(key, 0);
 
        /* determine if the key is possessed by this process (a test we can
         * skip if the key does not indicate the possessor can view it)
         */
        if (key->perm & KEY_POS_VIEW) {
-               skey_ref = search_my_process_keyrings(key->type, key,
-                                                     lookup_user_key_possessed,
-                                                     true, cred);
+               skey_ref = search_my_process_keyrings(&ctx);
                if (!IS_ERR(skey_ref)) {
                        key_ref_put(skey_ref);
                        key_ref = make_key_ref(key, 1);
@@ -211,7 +218,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
         * - the caller holds a spinlock, and thus the RCU read lock, making our
         *   access to __current_cred() safe
         */
-       rc = key_task_permission(key_ref, cred, KEY_VIEW);
+       rc = key_task_permission(key_ref, ctx.cred, KEY_VIEW);
        if (rc < 0)
                return 0;
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 42defae..0cf8a13 100644
@@ -235,7 +235,7 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
                if (IS_ERR(keyring))
                        return PTR_ERR(keyring);
        } else {
-               atomic_inc(&keyring->usage);
+               __key_get(keyring);
        }
 
        /* install the keyring */
@@ -319,11 +319,7 @@ void key_fsgid_changed(struct task_struct *tsk)
  * In the case of a successful return, the possession attribute is set on the
  * returned key reference.
  */
-key_ref_t search_my_process_keyrings(struct key_type *type,
-                                    const void *description,
-                                    key_match_func_t match,
-                                    bool no_state_check,
-                                    const struct cred *cred)
+key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx)
 {
        key_ref_t key_ref, ret, err;
 
@@ -339,10 +335,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
        err = ERR_PTR(-EAGAIN);
 
        /* search the thread keyring first */
-       if (cred->thread_keyring) {
+       if (ctx->cred->thread_keyring) {
                key_ref = keyring_search_aux(
-                       make_key_ref(cred->thread_keyring, 1),
-                       cred, type, description, match, no_state_check);
+                       make_key_ref(ctx->cred->thread_keyring, 1), ctx);
                if (!IS_ERR(key_ref))
                        goto found;
 
@@ -358,10 +353,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
        }
 
        /* search the process keyring second */
-       if (cred->process_keyring) {
+       if (ctx->cred->process_keyring) {
                key_ref = keyring_search_aux(
-                       make_key_ref(cred->process_keyring, 1),
-                       cred, type, description, match, no_state_check);
+                       make_key_ref(ctx->cred->process_keyring, 1), ctx);
                if (!IS_ERR(key_ref))
                        goto found;
 
@@ -379,11 +373,11 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
        }
 
        /* search the session keyring */
-       if (cred->session_keyring) {
+       if (ctx->cred->session_keyring) {
                rcu_read_lock();
                key_ref = keyring_search_aux(
-                       make_key_ref(rcu_dereference(cred->session_keyring), 1),
-                       cred, type, description, match, no_state_check);
+                       make_key_ref(rcu_dereference(ctx->cred->session_keyring), 1),
+                       ctx);
                rcu_read_unlock();
 
                if (!IS_ERR(key_ref))
@@ -402,10 +396,10 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
                }
        }
        /* or search the user-session keyring */
-       else if (cred->user->session_keyring) {
+       else if (ctx->cred->user->session_keyring) {
                key_ref = keyring_search_aux(
-                       make_key_ref(cred->user->session_keyring, 1),
-                       cred, type, description, match, no_state_check);
+                       make_key_ref(ctx->cred->user->session_keyring, 1),
+                       ctx);
                if (!IS_ERR(key_ref))
                        goto found;
 
@@ -437,18 +431,14 @@ found:
  *
  * Return same as search_my_process_keyrings().
  */
-key_ref_t search_process_keyrings(struct key_type *type,
-                                 const void *description,
-                                 key_match_func_t match,
-                                 const struct cred *cred)
+key_ref_t search_process_keyrings(struct keyring_search_context *ctx)
 {
        struct request_key_auth *rka;
        key_ref_t key_ref, ret = ERR_PTR(-EACCES), err;
 
        might_sleep();
 
-       key_ref = search_my_process_keyrings(type, description, match,
-                                            false, cred);
+       key_ref = search_my_process_keyrings(ctx);
        if (!IS_ERR(key_ref))
                goto found;
        err = key_ref;
@@ -457,18 +447,21 @@ key_ref_t search_process_keyrings(struct key_type *type,
         * search the keyrings of the process mentioned there
         * - we don't permit access to request_key auth keys via this method
         */
-       if (cred->request_key_auth &&
-           cred == current_cred() &&
-           type != &key_type_request_key_auth
+       if (ctx->cred->request_key_auth &&
+           ctx->cred == current_cred() &&
+           ctx->index_key.type != &key_type_request_key_auth
            ) {
+               const struct cred *cred = ctx->cred;
+
                /* defend against the auth key being revoked */
                down_read(&cred->request_key_auth->sem);
 
-               if (key_validate(cred->request_key_auth) == 0) {
-                       rka = cred->request_key_auth->payload.data;
+               if (key_validate(ctx->cred->request_key_auth) == 0) {
+                       rka = ctx->cred->request_key_auth->payload.data;
 
-                       key_ref = search_process_keyrings(type, description,
-                                                         match, rka->cred);
+                       ctx->cred = rka->cred;
+                       key_ref = search_process_keyrings(ctx);
+                       ctx->cred = cred;
 
                        up_read(&cred->request_key_auth->sem);
 
@@ -522,19 +515,23 @@ int lookup_user_key_possessed(const struct key *key, const void *target)
 key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags,
                          key_perm_t perm)
 {
+       struct keyring_search_context ctx = {
+               .match  = lookup_user_key_possessed,
+               .flags  = (KEYRING_SEARCH_NO_STATE_CHECK |
+                          KEYRING_SEARCH_LOOKUP_DIRECT),
+       };
        struct request_key_auth *rka;
-       const struct cred *cred;
        struct key *key;
        key_ref_t key_ref, skey_ref;
        int ret;
 
 try_again:
-       cred = get_current_cred();
+       ctx.cred = get_current_cred();
        key_ref = ERR_PTR(-ENOKEY);
 
        switch (id) {
        case KEY_SPEC_THREAD_KEYRING:
-               if (!cred->thread_keyring) {
+               if (!ctx.cred->thread_keyring) {
                        if (!(lflags & KEY_LOOKUP_CREATE))
                                goto error;
 
@@ -546,13 +543,13 @@ try_again:
                        goto reget_creds;
                }
 
-               key = cred->thread_keyring;
-               atomic_inc(&key->usage);
+               key = ctx.cred->thread_keyring;
+               __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;
 
        case KEY_SPEC_PROCESS_KEYRING:
-               if (!cred->process_keyring) {
+               if (!ctx.cred->process_keyring) {
                        if (!(lflags & KEY_LOOKUP_CREATE))
                                goto error;
 
@@ -564,13 +561,13 @@ try_again:
                        goto reget_creds;
                }
 
-               key = cred->process_keyring;
-               atomic_inc(&key->usage);
+               key = ctx.cred->process_keyring;
+               __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;
 
        case KEY_SPEC_SESSION_KEYRING:
-               if (!cred->session_keyring) {
+               if (!ctx.cred->session_keyring) {
                        /* always install a session keyring upon access if one
                         * doesn't exist yet */
                        ret = install_user_keyrings();
@@ -580,13 +577,13 @@ try_again:
                                ret = join_session_keyring(NULL);
                        else
                                ret = install_session_keyring(
-                                       cred->user->session_keyring);
+                                       ctx.cred->user->session_keyring);
 
                        if (ret < 0)
                                goto error;
                        goto reget_creds;
-               } else if (cred->session_keyring ==
-                          cred->user->session_keyring &&
+               } else if (ctx.cred->session_keyring ==
+                          ctx.cred->user->session_keyring &&
                           lflags & KEY_LOOKUP_CREATE) {
                        ret = join_session_keyring(NULL);
                        if (ret < 0)
@@ -595,33 +592,33 @@ try_again:
                }
 
                rcu_read_lock();
-               key = rcu_dereference(cred->session_keyring);
-               atomic_inc(&key->usage);
+               key = rcu_dereference(ctx.cred->session_keyring);
+               __key_get(key);
                rcu_read_unlock();
                key_ref = make_key_ref(key, 1);
                break;
 
        case KEY_SPEC_USER_KEYRING:
-               if (!cred->user->uid_keyring) {
+               if (!ctx.cred->user->uid_keyring) {
                        ret = install_user_keyrings();
                        if (ret < 0)
                                goto error;
                }
 
-               key = cred->user->uid_keyring;
-               atomic_inc(&key->usage);
+               key = ctx.cred->user->uid_keyring;
+               __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;
 
        case KEY_SPEC_USER_SESSION_KEYRING:
-               if (!cred->user->session_keyring) {
+               if (!ctx.cred->user->session_keyring) {
                        ret = install_user_keyrings();
                        if (ret < 0)
                                goto error;
                }
 
-               key = cred->user->session_keyring;
-               atomic_inc(&key->usage);
+               key = ctx.cred->user->session_keyring;
+               __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;
 
@@ -631,29 +628,29 @@ try_again:
                goto error;
 
        case KEY_SPEC_REQKEY_AUTH_KEY:
-               key = cred->request_key_auth;
+               key = ctx.cred->request_key_auth;
                if (!key)
                        goto error;
 
-               atomic_inc(&key->usage);
+               __key_get(key);
                key_ref = make_key_ref(key, 1);
                break;
 
        case KEY_SPEC_REQUESTOR_KEYRING:
-               if (!cred->request_key_auth)
+               if (!ctx.cred->request_key_auth)
                        goto error;
 
-               down_read(&cred->request_key_auth->sem);
+               down_read(&ctx.cred->request_key_auth->sem);
                if (test_bit(KEY_FLAG_REVOKED,
-                            &cred->request_key_auth->flags)) {
+                            &ctx.cred->request_key_auth->flags)) {
                        key_ref = ERR_PTR(-EKEYREVOKED);
                        key = NULL;
                } else {
-                       rka = cred->request_key_auth->payload.data;
+                       rka = ctx.cred->request_key_auth->payload.data;
                        key = rka->dest_keyring;
-                       atomic_inc(&key->usage);
+                       __key_get(key);
                }
-               up_read(&cred->request_key_auth->sem);
+               up_read(&ctx.cred->request_key_auth->sem);
                if (!key)
                        goto error;
                key_ref = make_key_ref(key, 1);
@@ -673,9 +670,13 @@ try_again:
                key_ref = make_key_ref(key, 0);
 
                /* check to see if we possess the key */
-               skey_ref = search_process_keyrings(key->type, key,
-                                                  lookup_user_key_possessed,
-                                                  cred);
+               ctx.index_key.type              = key->type;
+               ctx.index_key.description       = key->description;
+               ctx.index_key.desc_len          = strlen(key->description);
+               ctx.match_data                  = key;
+               kdebug("check possessed");
+               skey_ref = search_process_keyrings(&ctx);
+               kdebug("possessed=%p", skey_ref);
 
                if (!IS_ERR(skey_ref)) {
                        key_put(key);
@@ -715,14 +716,14 @@ try_again:
                goto invalid_key;
 
        /* check the permissions */
-       ret = key_task_permission(key_ref, cred, perm);
+       ret = key_task_permission(key_ref, ctx.cred, perm);
        if (ret < 0)
                goto invalid_key;
 
        key->last_used_at = current_kernel_time().tv_sec;
 
 error:
-       put_cred(cred);
+       put_cred(ctx.cred);
        return key_ref;
 
 invalid_key:
@@ -733,7 +734,7 @@ invalid_key:
        /* if we attempted to install a keyring, then it may have caused new
         * creds to be installed */
 reget_creds:
-       put_cred(cred);
+       put_cred(ctx.cred);
        goto try_again;
 }
 
@@ -856,3 +857,13 @@ void key_change_session_keyring(struct callback_head *twork)
 
        commit_creds(new);
 }
+
+/*
+ * Make sure that root's user and user-session keyrings exist.
+ */
+static int __init init_root_keyring(void)
+{
+       return install_user_keyrings();
+}
+
+late_initcall(init_root_keyring);
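
The recurring theme in this file is the keyring_search_context refactor: the old positional tail (type, description, match, no_state_check, cred) becomes one struct passed by pointer, so new search options can be threaded through without touching every call site. A toy sketch of the pattern, with all names illustrative:

    /* Sketch of the context-struct pattern: bundle what used to be
     * separate positional arguments so call sites survive new options. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <string.h>

    struct search_ctx {
            const char *type;
            const char *description;
            bool (*match)(const void *object, const void *data);
            const void *match_data;
            unsigned int flags;     /* room to grow without churn */
    };

    static bool desc_match(const void *object, const void *data)
    {
            return strcmp(object, data) == 0;
    }

    static const char *search(const struct search_ctx *ctx,
                              const char **objects, size_t n)
    {
            for (size_t i = 0; i < n; i++)
                    if (ctx->match(objects[i], ctx->match_data))
                            return objects[i];
            return NULL;
    }

    int main(void)
    {
            const char *ring[] = { "user;foo", "user;bar" };
            struct search_ctx ctx = {
                    .type           = "user",
                    .description    = "bar",
                    .match          = desc_match,
                    .match_data     = "user;bar",
            };

            return search(&ctx, ring, 2) ? 0 : 1;
    }
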
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index c411f9b..3814119 100644
@@ -345,33 +345,34 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
  * May return a key that's already under construction instead if there was a
  * race between two threads calling request_key().
  */
-static int construct_alloc_key(struct key_type *type,
-                              const char *description,
+static int construct_alloc_key(struct keyring_search_context *ctx,
                               struct key *dest_keyring,
                               unsigned long flags,
                               struct key_user *user,
                               struct key **_key)
 {
-       const struct cred *cred = current_cred();
-       unsigned long prealloc;
+       struct assoc_array_edit *edit;
        struct key *key;
        key_perm_t perm;
        key_ref_t key_ref;
        int ret;
 
-       kenter("%s,%s,,,", type->name, description);
+       kenter("%s,%s,,,",
+              ctx->index_key.type->name, ctx->index_key.description);
 
        *_key = NULL;
        mutex_lock(&user->cons_lock);
 
        perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
        perm |= KEY_USR_VIEW;
-       if (type->read)
+       if (ctx->index_key.type->read)
                perm |= KEY_POS_READ;
-       if (type == &key_type_keyring || type->update)
+       if (ctx->index_key.type == &key_type_keyring ||
+           ctx->index_key.type->update)
                perm |= KEY_POS_WRITE;
 
-       key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred,
+       key = key_alloc(ctx->index_key.type, ctx->index_key.description,
+                       ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred,
                        perm, flags);
        if (IS_ERR(key))
                goto alloc_failed;
@@ -379,8 +380,7 @@ static int construct_alloc_key(struct key_type *type,
        set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags);
 
        if (dest_keyring) {
-               ret = __key_link_begin(dest_keyring, type, description,
-                                      &prealloc);
+               ret = __key_link_begin(dest_keyring, &ctx->index_key, &edit);
                if (ret < 0)
                        goto link_prealloc_failed;
        }
@@ -390,16 +390,16 @@ static int construct_alloc_key(struct key_type *type,
         * waited for locks */
        mutex_lock(&key_construction_mutex);
 
-       key_ref = search_process_keyrings(type, description, type->match, cred);
+       key_ref = search_process_keyrings(ctx);
        if (!IS_ERR(key_ref))
                goto key_already_present;
 
        if (dest_keyring)
-               __key_link(dest_keyring, key, &prealloc);
+               __key_link(key, &edit);
 
        mutex_unlock(&key_construction_mutex);
        if (dest_keyring)
-               __key_link_end(dest_keyring, type, prealloc);
+               __key_link_end(dest_keyring, &ctx->index_key, edit);
        mutex_unlock(&user->cons_lock);
        *_key = key;
        kleave(" = 0 [%d]", key_serial(key));
@@ -414,8 +414,8 @@ key_already_present:
        if (dest_keyring) {
                ret = __key_link_check_live_key(dest_keyring, key);
                if (ret == 0)
-                       __key_link(dest_keyring, key, &prealloc);
-               __key_link_end(dest_keyring, type, prealloc);
+                       __key_link(key, &edit);
+               __key_link_end(dest_keyring, &ctx->index_key, edit);
                if (ret < 0)
                        goto link_check_failed;
        }
@@ -444,8 +444,7 @@ alloc_failed:
 /*
  * Commence key construction.
  */
-static struct key *construct_key_and_link(struct key_type *type,
-                                         const char *description,
+static struct key *construct_key_and_link(struct keyring_search_context *ctx,
                                          const char *callout_info,
                                          size_t callout_len,
                                          void *aux,
@@ -464,8 +463,7 @@ static struct key *construct_key_and_link(struct key_type *type,
 
        construct_get_dest_keyring(&dest_keyring);
 
-       ret = construct_alloc_key(type, description, dest_keyring, flags, user,
-                                 &key);
+       ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key);
        key_user_put(user);
 
        if (ret == 0) {
@@ -529,17 +527,24 @@ struct key *request_key_and_link(struct key_type *type,
                                 struct key *dest_keyring,
                                 unsigned long flags)
 {
-       const struct cred *cred = current_cred();
+       struct keyring_search_context ctx = {
+               .index_key.type         = type,
+               .index_key.description  = description,
+               .cred                   = current_cred(),
+               .match                  = type->match,
+               .match_data             = description,
+               .flags                  = KEYRING_SEARCH_LOOKUP_DIRECT,
+       };
        struct key *key;
        key_ref_t key_ref;
        int ret;
 
        kenter("%s,%s,%p,%zu,%p,%p,%lx",
-              type->name, description, callout_info, callout_len, aux,
-              dest_keyring, flags);
+              ctx.index_key.type->name, ctx.index_key.description,
+              callout_info, callout_len, aux, dest_keyring, flags);
 
        /* search all the process keyrings for a key */
-       key_ref = search_process_keyrings(type, description, type->match, cred);
+       key_ref = search_process_keyrings(&ctx);
 
        if (!IS_ERR(key_ref)) {
                key = key_ref_to_ptr(key_ref);
@@ -562,9 +567,8 @@ struct key *request_key_and_link(struct key_type *type,
                if (!callout_info)
                        goto error;
 
-               key = construct_key_and_link(type, description, callout_info,
-                                            callout_len, aux, dest_keyring,
-                                            flags);
+               key = construct_key_and_link(&ctx, callout_info, callout_len,
+                                            aux, dest_keyring, flags);
        }
 
 error:
@@ -592,8 +596,10 @@ int wait_for_key_construction(struct key *key, bool intr)
                          intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
        if (ret < 0)
                return ret;
-       if (test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+       if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
+               smp_rmb();
                return key->type_data.reject_error;
+       }
        return key_validate(key);
 }
 EXPORT_SYMBOL(wait_for_key_construction);
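
The smp_rmb() added to wait_for_key_construction() orders the read of key->type_data.reject_error after the read of the KEY_FLAG_NEGATIVE bit, pairing with a write barrier on the instantiation side that publishes the error before the bit. A userspace analogue using C11 acquire/release atomics (illustrative only; the kernel uses smp_wmb()/smp_rmb(), not these primitives):

    /* Sketch of the barrier pairing: the writer stores the payload, then
     * sets the flag with release semantics; the reader observes the flag
     * with acquire semantics before trusting the payload.
     * Build: cc -std=c11 neg.c -lpthread */
    #include <errno.h>
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int reject_error;        /* plain payload */
    static atomic_bool negative;    /* publication flag */

    static void *instantiate_negative(void *arg)
    {
            (void)arg;
            reject_error = -ENOKEY;                 /* payload first... */
            atomic_store_explicit(&negative, true,  /* ...then the flag */
                                  memory_order_release);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, instantiate_negative, NULL);
            while (!atomic_load_explicit(&negative, memory_order_acquire))
                    ;       /* spin until the flag becomes visible */
            /* the acquire load guarantees reject_error is valid here */
            printf("rejected: %d\n", reject_error);
            pthread_join(t, NULL);
            return 0;
    }
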
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 85730d5..7495a93 100644
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
+#include <keys/user-type.h>
 
 static int request_key_auth_instantiate(struct key *,
                                        struct key_preparsed_payload *);
@@ -222,32 +223,26 @@ error_alloc:
 }
 
 /*
- * See if an authorisation key is associated with a particular key.
- */
-static int key_get_instantiation_authkey_match(const struct key *key,
-                                              const void *_id)
-{
-       struct request_key_auth *rka = key->payload.data;
-       key_serial_t id = (key_serial_t)(unsigned long) _id;
-
-       return rka->target_key->serial == id;
-}
-
-/*
  * Search the current process's keyrings for the authorisation key for
  * instantiation of a key.
  */
 struct key *key_get_instantiation_authkey(key_serial_t target_id)
 {
-       const struct cred *cred = current_cred();
+       char description[16];
+       struct keyring_search_context ctx = {
+               .index_key.type         = &key_type_request_key_auth,
+               .index_key.description  = description,
+               .cred                   = current_cred(),
+               .match                  = user_match,
+               .match_data             = description,
+               .flags                  = KEYRING_SEARCH_LOOKUP_DIRECT,
+       };
        struct key *authkey;
        key_ref_t authkey_ref;
 
-       authkey_ref = search_process_keyrings(
-               &key_type_request_key_auth,
-               (void *) (unsigned long) target_id,
-               key_get_instantiation_authkey_match,
-               cred);
+       sprintf(description, "%x", target_id);
+
+       authkey_ref = search_process_keyrings(&ctx);
 
        if (IS_ERR(authkey_ref)) {
                authkey = ERR_CAST(authkey_ref);
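
The bespoke match function is gone: the auth key is now found by description alone, with the target serial printed as hex. The 16-byte buffer is comfortably sized, since a 32-bit key_serial_t renders to at most 8 hex digits plus a NUL. A quick standalone check of that arithmetic:

    /* Quick check: "%x" of a 32-bit serial needs at most 8 digits, so
     * the 16-byte description buffer above has ample headroom. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            char buf[16];
            int n = snprintf(buf, sizeof(buf), "%x", (uint32_t)INT32_MAX);

            printf("\"%s\" uses %d of %zu bytes\n", buf, n + 1, sizeof(buf));
            return 0;
    }
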
diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c
index ee32d18..8c0af08 100644
@@ -61,5 +61,16 @@ ctl_table key_sysctls[] = {
                .extra1 = (void *) &zero,
                .extra2 = (void *) &max,
        },
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+       {
+               .procname = "persistent_keyring_expiry",
+               .data = &persistent_keyring_expiry,
+               .maxlen = sizeof(unsigned),
+               .mode = 0644,
+               .proc_handler = proc_dointvec_minmax,
+               .extra1 = (void *) &zero,
+               .extra2 = (void *) &max,
+       },
+#endif
        { }
 };
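
This exposes the expiry set in persistent.c (3 days of non-use by default) as a runtime knob. Assuming the key sysctls surface under /proc/sys/kernel/keys/ (the path is inferred from where this table is registered, not stated in the diff), it can be read and tuned like any procfs integer:

    /* Sketch: tune the persistent-keyring idle expiry at runtime.  The
     * /proc path is an assumption; run as root to write. */
    #include <stdio.h>

    #define EXPIRY "/proc/sys/kernel/keys/persistent_keyring_expiry"

    int main(void)
    {
            unsigned int cur;
            FILE *f = fopen(EXPIRY, "r+");

            if (!f) {
                    perror(EXPIRY);
                    return 1;
            }
            if (fscanf(f, "%u", &cur) == 1)
                    printf("current expiry: %u seconds\n", cur);
            rewind(f);
            fprintf(f, "%u\n", 7 * 24 * 3600);      /* raise to 7 days */
            fclose(f);
            return 0;
    }
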
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index 55dc889..faa2cae 100644
@@ -25,14 +25,15 @@ static int logon_vet_description(const char *desc);
  * arbitrary blob of data as the payload
  */
 struct key_type key_type_user = {
-       .name           = "user",
-       .instantiate    = user_instantiate,
-       .update         = user_update,
-       .match          = user_match,
-       .revoke         = user_revoke,
-       .destroy        = user_destroy,
-       .describe       = user_describe,
-       .read           = user_read,
+       .name                   = "user",
+       .def_lookup_type        = KEYRING_SEARCH_LOOKUP_DIRECT,
+       .instantiate            = user_instantiate,
+       .update                 = user_update,
+       .match                  = user_match,
+       .revoke                 = user_revoke,
+       .destroy                = user_destroy,
+       .describe               = user_describe,
+       .read                   = user_read,
 };
 
 EXPORT_SYMBOL_GPL(key_type_user);
@@ -45,6 +46,7 @@ EXPORT_SYMBOL_GPL(key_type_user);
  */
 struct key_type key_type_logon = {
        .name                   = "logon",
+       .def_lookup_type        = KEYRING_SEARCH_LOOKUP_DIRECT,
        .instantiate            = user_instantiate,
        .update                 = user_update,
        .match                  = user_match,
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 234bc2a..9a62045 100644
@@ -397,7 +397,8 @@ void common_lsm_audit(struct common_audit_data *a,
        if (a == NULL)
                return;
        /* we use GFP_ATOMIC so we won't sleep */
-       ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC);
+       ab = audit_log_start(current->audit_context, GFP_ATOMIC | __GFP_NOWARN,
+                            AUDIT_AVC);
 
        if (ab == NULL)
                return;
diff --git a/security/security.c b/security/security.c
index 4dc31f4..15b6928 100644
@@ -1340,22 +1340,17 @@ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
        return security_ops->xfrm_policy_delete_security(ctx);
 }
 
-int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+int security_xfrm_state_alloc(struct xfrm_state *x,
+                             struct xfrm_user_sec_ctx *sec_ctx)
 {
-       return security_ops->xfrm_state_alloc_security(x, sec_ctx, 0);
+       return security_ops->xfrm_state_alloc(x, sec_ctx);
 }
 EXPORT_SYMBOL(security_xfrm_state_alloc);
 
 int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
 {
-       if (!polsec)
-               return 0;
-       /*
-        * We want the context to be taken from secid which is usually
-        * from the sock.
-        */
-       return security_ops->xfrm_state_alloc_security(x, NULL, secid);
+       return security_ops->xfrm_state_alloc_acquire(x, polsec, secid);
 }
 
 int security_xfrm_state_delete(struct xfrm_state *x)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c540795..794c3ca 100644
@@ -95,7 +95,9 @@
 #include "audit.h"
 #include "avc_ss.h"
 
-#define NUM_SEL_MNT_OPTS 5
+#define SB_TYPE_FMT "%s%s%s"
+#define SB_SUBTYPE(sb) (sb->s_subtype && sb->s_subtype[0])
+#define SB_TYPE_ARGS(sb) sb->s_type->name, SB_SUBTYPE(sb) ? "." : "", SB_SUBTYPE(sb) ? sb->s_subtype : ""
 
 extern struct security_operations *security_ops;
 
@@ -139,12 +141,28 @@ static struct kmem_cache *sel_inode_cache;
  * This function checks the SECMARK reference counter to see if any SECMARK
  * targets are currently configured, if the reference counter is greater than
  * zero SECMARK is considered to be enabled.  Returns true (1) if SECMARK is
- * enabled, false (0) if SECMARK is disabled.
+ * enabled, false (0) if SECMARK is disabled.  If the always_check_network
+ * policy capability is enabled, SECMARK is always considered enabled.
  *
  */
 static int selinux_secmark_enabled(void)
 {
-       return (atomic_read(&selinux_secmark_refcount) > 0);
+       return (selinux_policycap_alwaysnetwork || atomic_read(&selinux_secmark_refcount));
+}
+
+/**
+ * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled
+ *
+ * Description:
+ * This function checks if NetLabel or labeled IPSEC is enabled.  Returns true
+ * (1) if either is enabled or false (0) if neither is enabled.  If the
+ * always_check_network policy capability is enabled, peer labeling
+ * is always considered enabled.
+ *
+ */
+static int selinux_peerlbl_enabled(void)
+{
+       return (selinux_policycap_alwaysnetwork || netlbl_enabled() || selinux_xfrm_enabled());
 }
 
 /*
@@ -309,8 +327,11 @@ enum {
        Opt_defcontext = 3,
        Opt_rootcontext = 4,
        Opt_labelsupport = 5,
+       Opt_nextmntopt = 6,
 };
 
+#define NUM_SEL_MNT_OPTS       (Opt_nextmntopt - 1)
+
 static const match_table_t tokens = {
        {Opt_context, CONTEXT_STR "%s"},
        {Opt_fscontext, FSCONTEXT_STR "%s"},
@@ -355,6 +376,29 @@ static int may_context_mount_inode_relabel(u32 sid,
        return rc;
 }
 
+static int selinux_is_sblabel_mnt(struct super_block *sb)
+{
+       struct superblock_security_struct *sbsec = sb->s_security;
+
+       if (sbsec->behavior == SECURITY_FS_USE_XATTR ||
+           sbsec->behavior == SECURITY_FS_USE_TRANS ||
+           sbsec->behavior == SECURITY_FS_USE_TASK)
+               return 1;
+
+       /* Special handling for sysfs. Is genfs but also has setxattr handler */
+       if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0)
+               return 1;
+
+       /*
+        * Special handling for rootfs. Is genfs but supports
+        * setting SELinux context on in-core inodes.
+        */
+       if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0)
+               return 1;
+
+       return 0;
+}
+
 static int sb_finish_set_opts(struct super_block *sb)
 {
        struct superblock_security_struct *sbsec = sb->s_security;
@@ -369,8 +413,8 @@ static int sb_finish_set_opts(struct super_block *sb)
                   the first boot of the SELinux kernel before we have
                   assigned xattr values to the filesystem. */
                if (!root_inode->i_op->getxattr) {
-                       printk(KERN_WARNING "SELinux: (dev %s, type %s) has no "
-                              "xattr support\n", sb->s_id, sb->s_type->name);
+                       printk(KERN_WARNING "SELinux: (dev %s, type "SB_TYPE_FMT") has no "
+                              "xattr support\n", sb->s_id, SB_TYPE_ARGS(sb));
                        rc = -EOPNOTSUPP;
                        goto out;
                }
@@ -378,35 +422,27 @@ static int sb_finish_set_opts(struct super_block *sb)
                if (rc < 0 && rc != -ENODATA) {
                        if (rc == -EOPNOTSUPP)
                                printk(KERN_WARNING "SELinux: (dev %s, type "
-                                      "%s) has no security xattr handler\n",
-                                      sb->s_id, sb->s_type->name);
+                                      SB_TYPE_FMT") has no security xattr handler\n",
+                                      sb->s_id, SB_TYPE_ARGS(sb));
                        else
                                printk(KERN_WARNING "SELinux: (dev %s, type "
-                                      "%s) getxattr errno %d\n", sb->s_id,
-                                      sb->s_type->name, -rc);
+                                      SB_TYPE_FMT") getxattr errno %d\n", sb->s_id,
+                                      SB_TYPE_ARGS(sb), -rc);
                        goto out;
                }
        }
 
-       sbsec->flags |= (SE_SBINITIALIZED | SE_SBLABELSUPP);
-
        if (sbsec->behavior > ARRAY_SIZE(labeling_behaviors))
-               printk(KERN_ERR "SELinux: initialized (dev %s, type %s), unknown behavior\n",
-                      sb->s_id, sb->s_type->name);
+               printk(KERN_ERR "SELinux: initialized (dev %s, type "SB_TYPE_FMT"), unknown behavior\n",
+                      sb->s_id, SB_TYPE_ARGS(sb));
        else
-               printk(KERN_DEBUG "SELinux: initialized (dev %s, type %s), %s\n",
-                      sb->s_id, sb->s_type->name,
+               printk(KERN_DEBUG "SELinux: initialized (dev %s, type "SB_TYPE_FMT"), %s\n",
+                      sb->s_id, SB_TYPE_ARGS(sb),
                       labeling_behaviors[sbsec->behavior-1]);
 
-       if (sbsec->behavior == SECURITY_FS_USE_GENFS ||
-           sbsec->behavior == SECURITY_FS_USE_MNTPOINT ||
-           sbsec->behavior == SECURITY_FS_USE_NONE ||
-           sbsec->behavior > ARRAY_SIZE(labeling_behaviors))
-               sbsec->flags &= ~SE_SBLABELSUPP;
-
-       /* Special handling for sysfs. Is genfs but also has setxattr handler*/
-       if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0)
-               sbsec->flags |= SE_SBLABELSUPP;
+       sbsec->flags |= SE_SBINITIALIZED;
+       if (selinux_is_sblabel_mnt(sb))
+               sbsec->flags |= SBLABEL_MNT;
 
        /* Initialize the root inode. */
        rc = inode_doinit_with_dentry(root_inode, root);
@@ -460,15 +496,18 @@ static int selinux_get_mnt_opts(const struct super_block *sb,
        if (!ss_initialized)
                return -EINVAL;
 
+       /* make sure we always check enough bits to cover the mask */
+       BUILD_BUG_ON(SE_MNTMASK >= (1 << NUM_SEL_MNT_OPTS));
+
        tmp = sbsec->flags & SE_MNTMASK;
        /* count the number of mount options for this sb */
-       for (i = 0; i < 8; i++) {
+       for (i = 0; i < NUM_SEL_MNT_OPTS; i++) {
                if (tmp & 0x01)
                        opts->num_mnt_opts++;
                tmp >>= 1;
        }
        /* Check if the Label support flag is set */
-       if (sbsec->flags & SE_SBLABELSUPP)
+       if (sbsec->flags & SBLABEL_MNT)
                opts->num_mnt_opts++;
 
        opts->mnt_opts = kcalloc(opts->num_mnt_opts, sizeof(char *), GFP_ATOMIC);
@@ -515,9 +554,9 @@ static int selinux_get_mnt_opts(const struct super_block *sb,
                opts->mnt_opts[i] = context;
                opts->mnt_opts_flags[i++] = ROOTCONTEXT_MNT;
        }
-       if (sbsec->flags & SE_SBLABELSUPP) {
+       if (sbsec->flags & SBLABEL_MNT) {
                opts->mnt_opts[i] = NULL;
-               opts->mnt_opts_flags[i++] = SE_SBLABELSUPP;
+               opts->mnt_opts_flags[i++] = SBLABEL_MNT;
        }
 
        BUG_ON(i != opts->num_mnt_opts);
@@ -561,7 +600,6 @@ static int selinux_set_mnt_opts(struct super_block *sb,
        const struct cred *cred = current_cred();
        int rc = 0, i;
        struct superblock_security_struct *sbsec = sb->s_security;
-       const char *name = sb->s_type->name;
        struct inode *inode = sbsec->sb->s_root->d_inode;
        struct inode_security_struct *root_isec = inode->i_security;
        u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
@@ -614,14 +652,14 @@ static int selinux_set_mnt_opts(struct super_block *sb,
        for (i = 0; i < num_opts; i++) {
                u32 sid;
 
-               if (flags[i] == SE_SBLABELSUPP)
+               if (flags[i] == SBLABEL_MNT)
                        continue;
                rc = security_context_to_sid(mount_options[i],
                                             strlen(mount_options[i]), &sid);
                if (rc) {
                        printk(KERN_WARNING "SELinux: security_context_to_sid"
-                              "(%s) failed for (dev %s, type %s) errno=%d\n",
-                              mount_options[i], sb->s_id, name, rc);
+                              "(%s) failed for (dev %s, type "SB_TYPE_FMT") errno=%d\n",
+                              mount_options[i], sb->s_id, SB_TYPE_ARGS(sb), rc);
                        goto out;
                }
                switch (flags[i]) {
@@ -685,9 +723,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
                 * Determine the labeling behavior to use for this
                 * filesystem type.
                 */
-               rc = security_fs_use((sbsec->flags & SE_SBPROC) ?
-                                       "proc" : sb->s_type->name,
-                                       &sbsec->behavior, &sbsec->sid);
+               rc = security_fs_use(sb);
                if (rc) {
                        printk(KERN_WARNING
                                "%s: security_fs_use(%s) returned %d\n",
@@ -770,7 +806,8 @@ out:
 out_double_mount:
        rc = -EINVAL;
        printk(KERN_WARNING "SELinux: mount invalid.  Same superblock, different "
-              "security settings for (dev %s, type %s)\n", sb->s_id, name);
+              "security settings for (dev %s, type "SB_TYPE_FMT")\n", sb->s_id,
+              SB_TYPE_ARGS(sb));
        goto out;
 }
 
@@ -1037,7 +1074,7 @@ static void selinux_write_opts(struct seq_file *m,
                case DEFCONTEXT_MNT:
                        prefix = DEFCONTEXT_STR;
                        break;
-               case SE_SBLABELSUPP:
+               case SBLABEL_MNT:
                        seq_putc(m, ',');
                        seq_puts(m, LABELSUPP_STR);
                        continue;
@@ -1649,7 +1686,7 @@ static int may_create(struct inode *dir,
        if (rc)
                return rc;
 
-       if (!newsid || !(sbsec->flags & SE_SBLABELSUPP)) {
+       if (!newsid || !(sbsec->flags & SBLABEL_MNT)) {
                rc = security_transition_sid(sid, dsec->sid, tclass,
                                             &dentry->d_name, &newsid);
                if (rc)
@@ -2437,14 +2474,14 @@ static int selinux_sb_remount(struct super_block *sb, void *data)
                u32 sid;
                size_t len;
 
-               if (flags[i] == SE_SBLABELSUPP)
+               if (flags[i] == SBLABEL_MNT)
                        continue;
                len = strlen(mount_options[i]);
                rc = security_context_to_sid(mount_options[i], len, &sid);
                if (rc) {
                        printk(KERN_WARNING "SELinux: security_context_to_sid"
-                              "(%s) failed for (dev %s, type %s) errno=%d\n",
-                              mount_options[i], sb->s_id, sb->s_type->name, rc);
+                              "(%s) failed for (dev %s, type "SB_TYPE_FMT") errno=%d\n",
+                              mount_options[i], sb->s_id, SB_TYPE_ARGS(sb), rc);
                        goto out_free_opts;
                }
                rc = -EINVAL;
@@ -2482,8 +2519,8 @@ out_free_secdata:
        return rc;
 out_bad_option:
        printk(KERN_WARNING "SELinux: unable to change security options "
-              "during remount (dev %s, type=%s)\n", sb->s_id,
-              sb->s_type->name);
+              "during remount (dev %s, type "SB_TYPE_FMT")\n", sb->s_id,
+              SB_TYPE_ARGS(sb));
        goto out_free_opts;
 }
 
@@ -2606,7 +2643,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
        if ((sbsec->flags & SE_SBINITIALIZED) &&
            (sbsec->behavior == SECURITY_FS_USE_MNTPOINT))
                newsid = sbsec->mntpoint_sid;
-       else if (!newsid || !(sbsec->flags & SE_SBLABELSUPP)) {
+       else if (!newsid || !(sbsec->flags & SBLABEL_MNT)) {
                rc = security_transition_sid(sid, dsec->sid,
                                             inode_mode_to_security_class(inode->i_mode),
                                             qstr, &newsid);
@@ -2628,7 +2665,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                isec->initialized = 1;
        }
 
-       if (!ss_initialized || !(sbsec->flags & SE_SBLABELSUPP))
+       if (!ss_initialized || !(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;
 
        if (name)
@@ -2830,7 +2867,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
                return selinux_inode_setotherxattr(dentry, name);
 
        sbsec = inode->i_sb->s_security;
-       if (!(sbsec->flags & SE_SBLABELSUPP))
+       if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;
 
        if (!inode_owner_or_capable(inode))
@@ -3791,8 +3828,12 @@ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
        u32 nlbl_sid;
        u32 nlbl_type;
 
-       selinux_skb_xfrm_sid(skb, &xfrm_sid);
-       selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid);
+       err = selinux_skb_xfrm_sid(skb, &xfrm_sid);
+       if (unlikely(err))
+               return -EACCES;
+       err = selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid);
+       if (unlikely(err))
+               return -EACCES;
 
        err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid);
        if (unlikely(err)) {
@@ -4246,7 +4287,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
                return selinux_sock_rcv_skb_compat(sk, skb, family);
 
        secmark_active = selinux_secmark_enabled();
-       peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+       peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return 0;
 
@@ -4628,7 +4669,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
 
        secmark_active = selinux_secmark_enabled();
        netlbl_active = netlbl_enabled();
-       peerlbl_active = netlbl_active || selinux_xfrm_enabled();
+       peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;
 
@@ -4780,7 +4821,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
                return NF_ACCEPT;
 #endif
        secmark_active = selinux_secmark_enabled();
-       peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
+       peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;
 
@@ -5784,7 +5825,8 @@ static struct security_operations selinux_ops = {
        .xfrm_policy_clone_security =   selinux_xfrm_policy_clone,
        .xfrm_policy_free_security =    selinux_xfrm_policy_free,
        .xfrm_policy_delete_security =  selinux_xfrm_policy_delete,
-       .xfrm_state_alloc_security =    selinux_xfrm_state_alloc,
+       .xfrm_state_alloc =             selinux_xfrm_state_alloc,
+       .xfrm_state_alloc_acquire =     selinux_xfrm_state_alloc_acquire,
        .xfrm_state_free_security =     selinux_xfrm_state_free,
        .xfrm_state_delete_security =   selinux_xfrm_state_delete,
        .xfrm_policy_lookup =           selinux_xfrm_policy_lookup,
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index aa47bca..b1dfe10 100644
@@ -58,8 +58,8 @@ struct superblock_security_struct {
        u32 sid;                        /* SID of file system superblock */
        u32 def_sid;                    /* default SID for labeling */
        u32 mntpoint_sid;               /* SECURITY_FS_USE_MNTPOINT context for files */
-       unsigned int behavior;          /* labeling behavior */
-       unsigned char flags;            /* which mount options were specified */
+       unsigned short behavior;        /* labeling behavior */
+       unsigned short flags;           /* which mount options were specified */
        struct mutex lock;
        struct list_head isec_head;
        spinlock_t isec_lock;
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 8fd8e18..fe341ae 100644
 /* Mask for just the mount related flags */
 #define SE_MNTMASK     0x0f
 /* Super block security struct flags for mount options */
+/* BE CAREFUL, these need to be the low order bits for selinux_get_mnt_opts */
 #define CONTEXT_MNT    0x01
 #define FSCONTEXT_MNT  0x02
 #define ROOTCONTEXT_MNT        0x04
 #define DEFCONTEXT_MNT 0x08
+#define SBLABEL_MNT    0x10
 /* Non-mount related flags */
-#define SE_SBINITIALIZED       0x10
-#define SE_SBPROC              0x20
-#define SE_SBLABELSUPP 0x40
+#define SE_SBINITIALIZED       0x0100
+#define SE_SBPROC              0x0200
 
 #define CONTEXT_STR    "context="
 #define FSCONTEXT_STR  "fscontext="
@@ -68,12 +69,15 @@ extern int selinux_enabled;
 enum {
        POLICYDB_CAPABILITY_NETPEER,
        POLICYDB_CAPABILITY_OPENPERM,
+       POLICYDB_CAPABILITY_REDHAT1,
+       POLICYDB_CAPABILITY_ALWAYSNETWORK,
        __POLICYDB_CAPABILITY_MAX
 };
 #define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
 
 extern int selinux_policycap_netpeer;
 extern int selinux_policycap_openperm;
+extern int selinux_policycap_alwaysnetwork;
 
 /*
  * type_datum properties
@@ -172,8 +176,7 @@ int security_get_allow_unknown(void);
 #define SECURITY_FS_USE_NATIVE         7 /* use native label support */
 #define SECURITY_FS_USE_MAX            7 /* Highest SECURITY_FS_USE_XXX */
 
-int security_fs_use(const char *fstype, unsigned int *behavior,
-       u32 *sid);
+int security_fs_use(struct super_block *sb);
 
 int security_genfs_sid(const char *fstype, char *name, u16 sclass,
        u32 *sid);
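
The flag renumbering above keeps the mount-option bits in the low positions, which is exactly what the BUILD_BUG_ON() added to selinux_get_mnt_opts() polices: SE_MNTMASK must fit inside NUM_SEL_MNT_OPTS bits or the kernel fails to build. The same compile-time idiom in portable C11, with values mirroring the diff:

    /* The compile-time invariant behind the BUILD_BUG_ON() added in
     * selinux_get_mnt_opts(), expressed with C11 static_assert. */
    #include <assert.h>

    #define CONTEXT_MNT      0x01
    #define FSCONTEXT_MNT    0x02
    #define ROOTCONTEXT_MNT  0x04
    #define DEFCONTEXT_MNT   0x08
    #define SBLABEL_MNT      0x10
    #define SE_MNTMASK       0x0f
    #define NUM_SEL_MNT_OPTS 5

    /* refuses to compile if a mount flag ever escapes the checked range */
    static_assert(SE_MNTMASK < (1 << NUM_SEL_MNT_OPTS),
                  "mount mask wider than the option-counting loop");

    int main(void) { return 0; }
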
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 6713f04..0dec76c 100644
 #include <net/flow.h>
 
 int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
-                             struct xfrm_user_sec_ctx *sec_ctx);
+                             struct xfrm_user_sec_ctx *uctx);
 int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                              struct xfrm_sec_ctx **new_ctxp);
 void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx);
 int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx);
 int selinux_xfrm_state_alloc(struct xfrm_state *x,
-       struct xfrm_user_sec_ctx *sec_ctx, u32 secid);
+                            struct xfrm_user_sec_ctx *uctx);
+int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x,
+                                    struct xfrm_sec_ctx *polsec, u32 secid);
 void selinux_xfrm_state_free(struct xfrm_state *x);
 int selinux_xfrm_state_delete(struct xfrm_state *x);
 int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir);
 int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x,
-                       struct xfrm_policy *xp, const struct flowi *fl);
-
-/*
- * Extract the security blob from the sock (it's actually on the socket)
- */
-static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
-{
-       if (!sk->sk_socket)
-               return NULL;
-
-       return SOCK_INODE(sk->sk_socket)->i_security;
-}
+                                     struct xfrm_policy *xp,
+                                     const struct flowi *fl);
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 extern atomic_t selinux_xfrm_refcount;
@@ -42,10 +34,10 @@ static inline int selinux_xfrm_enabled(void)
        return (atomic_read(&selinux_xfrm_refcount) > 0);
 }
 
-int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
-                       struct common_audit_data *ad);
-int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
-                       struct common_audit_data *ad, u8 proto);
+int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb,
+                             struct common_audit_data *ad);
+int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb,
+                               struct common_audit_data *ad, u8 proto);
 int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall);
 
 static inline void selinux_xfrm_notify_policyload(void)
@@ -64,19 +56,21 @@ static inline int selinux_xfrm_enabled(void)
        return 0;
 }
 
-static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
-                       struct common_audit_data *ad)
+static inline int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb,
+                                           struct common_audit_data *ad)
 {
        return 0;
 }
 
-static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
-                       struct common_audit_data *ad, u8 proto)
+static inline int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb,
+                                             struct common_audit_data *ad,
+                                             u8 proto)
 {
        return 0;
 }
 
-static inline int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall)
+static inline int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid,
+                                             int ckall)
 {
        *sid = SECSID_NULL;
        return 0;
@@ -87,10 +81,9 @@ static inline void selinux_xfrm_notify_policyload(void)
 }
 #endif
 
-static inline void selinux_skb_xfrm_sid(struct sk_buff *skb, u32 *sid)
+static inline int selinux_skb_xfrm_sid(struct sk_buff *skb, u32 *sid)
 {
-       int err = selinux_xfrm_decode_session(skb, sid, 0);
-       BUG_ON(err);
+       return selinux_xfrm_decode_session(skb, sid, 0);
 }
 
 #endif /* _SELINUX_XFRM_H_ */
index da4b8b2..6235d05 100644
@@ -442,8 +442,7 @@ int selinux_netlbl_socket_connect(struct sock *sk, struct sockaddr *addr)
            sksec->nlbl_state != NLBL_CONNLABELED)
                return 0;
 
-       local_bh_disable();
-       bh_lock_sock_nested(sk);
+       lock_sock(sk);
 
        /* connected sockets are allowed to disconnect when the address family
         * is set to AF_UNSPEC, if that is what is happening we want to reset
@@ -464,7 +463,6 @@ int selinux_netlbl_socket_connect(struct sock *sk, struct sockaddr *addr)
                sksec->nlbl_state = NLBL_CONNLABELED;
 
 socket_connect_return:
-       bh_unlock_sock(sk);
-       local_bh_enable();
+       release_sock(sk);
        return rc;
 }
index c5454c0..03a72c3 100644
@@ -166,6 +166,7 @@ static void sel_netnode_insert(struct sel_netnode *node)
                break;
        default:
                BUG();
+               return;
        }
 
        /* we need to impose a limit on the growth of the hash table so check
@@ -225,6 +226,7 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid)
                break;
        default:
                BUG();
+               ret = -EINVAL;
        }
        if (ret != 0)
                goto out;
index 855e464..332ac8a 100644
@@ -116,6 +116,8 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
        { AUDIT_MAKE_EQUIV,     NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_TTY_GET,        NETLINK_AUDIT_SOCKET__NLMSG_READ     },
        { AUDIT_TTY_SET,        NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT   },
+       { AUDIT_GET_FEATURE,    NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+       { AUDIT_SET_FEATURE,    NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 };
 
 
index ff42773..5122aff 100644
@@ -44,7 +44,9 @@
 /* Policy capability filenames */
 static char *policycap_names[] = {
        "network_peer_controls",
-       "open_perms"
+       "open_perms",
+       "redhat1",
+       "always_check_network"
 };
 
 unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;
index 30f119b..820313a 100644
@@ -213,7 +213,12 @@ netlbl_import_failure:
 }
 #endif /* CONFIG_NETLABEL */
 
-int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2)
+/*
+ * Check to see if all the bits set in e2 are also set in e1. Optionally,
+ * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed
+ * last_e2bit.
+ */
+int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit)
 {
        struct ebitmap_node *n1, *n2;
        int i;
@@ -223,14 +228,25 @@ int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2)
 
        n1 = e1->node;
        n2 = e2->node;
+
        while (n1 && n2 && (n1->startbit <= n2->startbit)) {
                if (n1->startbit < n2->startbit) {
                        n1 = n1->next;
                        continue;
                }
-               for (i = 0; i < EBITMAP_UNIT_NUMS; i++) {
+               for (i = EBITMAP_UNIT_NUMS - 1; (i >= 0) && !n2->maps[i]; )
+                       i--;    /* Skip trailing NULL map entries */
+               if (last_e2bit && (i >= 0)) {
+                       u32 lastsetbit = n2->startbit + i * EBITMAP_UNIT_SIZE +
+                                        __fls(n2->maps[i]);
+                       if (lastsetbit > last_e2bit)
+                               return 0;
+               }
+
+               while (i >= 0) {
                        if ((n1->maps[i] & n2->maps[i]) != n2->maps[i])
                                return 0;
+                       i--;
                }
 
                n1 = n1->next;
index 922f8af..712c8a7 100644
 
 #include <net/netlabel.h>
 
-#define EBITMAP_UNIT_NUMS      ((32 - sizeof(void *) - sizeof(u32))    \
+#ifdef CONFIG_64BIT
+#define        EBITMAP_NODE_SIZE       64
+#else
+#define        EBITMAP_NODE_SIZE       32
+#endif
+
+#define EBITMAP_UNIT_NUMS      ((EBITMAP_NODE_SIZE-sizeof(void *)-sizeof(u32))\
                                        / sizeof(unsigned long))
 #define EBITMAP_UNIT_SIZE      BITS_PER_LONG
 #define EBITMAP_SIZE           (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE)
@@ -117,7 +123,7 @@ static inline void ebitmap_node_clr_bit(struct ebitmap_node *n,
 
 int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2);
 int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src);
-int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2);
+int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit);
 int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
 int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
 void ebitmap_destroy(struct ebitmap *e);
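
For concreteness, the EBITMAP_UNIT_NUMS arithmetic works out as follows on a typical LP64 build, shown as a standalone sketch (u32 replaced by unsigned int):

#include <stdio.h>

#define EBITMAP_NODE_SIZE 64		/* the CONFIG_64BIT value */

int main(void)
{
	/* LP64: (64 - 8 - 4) / 8 = 6 maps per node, 6 * 64 = 384 bits */
	size_t units = (EBITMAP_NODE_SIZE - sizeof(void *) - sizeof(unsigned int))
		       / sizeof(unsigned long);

	printf("units=%zu, bits per node=%zu\n",
	       units, units * 8 * sizeof(unsigned long));
	return 0;
}
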
index 40de8d3..c85bc1e 100644
@@ -160,8 +160,6 @@ void mls_sid_to_context(struct context *context,
 int mls_level_isvalid(struct policydb *p, struct mls_level *l)
 {
        struct level_datum *levdatum;
-       struct ebitmap_node *node;
-       int i;
 
        if (!l->sens || l->sens > p->p_levels.nprim)
                return 0;
@@ -170,19 +168,13 @@ int mls_level_isvalid(struct policydb *p, struct mls_level *l)
        if (!levdatum)
                return 0;
 
-       ebitmap_for_each_positive_bit(&l->cat, node, i) {
-               if (i > p->p_cats.nprim)
-                       return 0;
-               if (!ebitmap_get_bit(&levdatum->level->cat, i)) {
-                       /*
-                        * Category may not be associated with
-                        * sensitivity.
-                        */
-                       return 0;
-               }
-       }
-
-       return 1;
+       /*
+        * Return 1 iff all the bits set in l->cat are also set in
+        * levdatum->level->cat and no bit in l->cat is larger than
+        * p->p_cats.nprim.
+        */
+       return ebitmap_contains(&levdatum->level->cat, &l->cat,
+                               p->p_cats.nprim);
 }
 
 int mls_range_isvalid(struct policydb *p, struct mls_range *r)
index 03bed52..e936487 100644
@@ -35,7 +35,7 @@ static inline int mls_level_eq(struct mls_level *l1, struct mls_level *l2)
 static inline int mls_level_dom(struct mls_level *l1, struct mls_level *l2)
 {
        return ((l1->sens >= l2->sens) &&
-               ebitmap_contains(&l1->cat, &l2->cat));
+               ebitmap_contains(&l1->cat, &l2->cat, 0));
 }
 
 #define mls_level_incomp(l1, l2) \
index c8adde3..f6195eb 100644
@@ -3203,9 +3203,8 @@ static int range_write_helper(void *key, void *data, void *ptr)
 
 static int range_write(struct policydb *p, void *fp)
 {
-       size_t nel;
        __le32 buf[1];
-       int rc;
+       int rc, nel;
        struct policy_data pd;
 
        pd.p = p;
index b4feecc..ee470a0 100644
@@ -72,6 +72,7 @@
 
 int selinux_policycap_netpeer;
 int selinux_policycap_openperm;
+int selinux_policycap_alwaysnetwork;
 
 static DEFINE_RWLOCK(policy_rwlock);
 
@@ -1812,6 +1813,8 @@ static void security_load_policycaps(void)
                                                  POLICYDB_CAPABILITY_NETPEER);
        selinux_policycap_openperm = ebitmap_get_bit(&policydb.policycaps,
                                                  POLICYDB_CAPABILITY_OPENPERM);
+       selinux_policycap_alwaysnetwork = ebitmap_get_bit(&policydb.policycaps,
+                                                 POLICYDB_CAPABILITY_ALWAYSNETWORK);
 }
 
 static int security_preserve_bools(struct policydb *p);
@@ -2323,43 +2326,74 @@ out:
 
 /**
  * security_fs_use - Determine how to handle labeling for a filesystem.
- * @fstype: filesystem type
- * @behavior: labeling behavior
- * @sid: SID for filesystem (superblock)
+ * @sb: superblock in question
  */
-int security_fs_use(
-       const char *fstype,
-       unsigned int *behavior,
-       u32 *sid)
+int security_fs_use(struct super_block *sb)
 {
        int rc = 0;
        struct ocontext *c;
+       struct superblock_security_struct *sbsec = sb->s_security;
+       const char *fstype = sb->s_type->name;
+       const char *subtype = (sb->s_subtype && sb->s_subtype[0]) ? sb->s_subtype : NULL;
+       struct ocontext *base = NULL;
 
        read_lock(&policy_rwlock);
 
-       c = policydb.ocontexts[OCON_FSUSE];
-       while (c) {
-               if (strcmp(fstype, c->u.name) == 0)
+       for (c = policydb.ocontexts[OCON_FSUSE]; c; c = c->next) {
+               char *sub;
+               int baselen;
+
+               baselen = strlen(fstype);
+
+               /* if base does not match, this is not the one */
+               if (strncmp(fstype, c->u.name, baselen))
+                       continue;
+
+               /* if there is no subtype, this is the one! */
+               if (!subtype)
+                       break;
+
+               /* skip past the base in this entry */
+               sub = c->u.name + baselen;
+
+               /* entry is only a base. save it. keep looking for subtype */
+               if (sub[0] == '\0') {
+                       base = c;
+                       continue;
+               }
+
+               /* entry is not followed by a subtype, so it is not a match */
+               if (sub[0] != '.')
+                       continue;
+
+               /* whew, we found a subtype of this fstype */
+               sub++; /* move past '.' */
+
+               /* exact match of fstype AND subtype */
+               if (!strcmp(subtype, sub))
                        break;
-               c = c->next;
        }
 
+       /* in case we found an fstype match but no subtype match */
+       if (!c)
+               c = base;
+
        if (c) {
-               *behavior = c->v.behavior;
+               sbsec->behavior = c->v.behavior;
                if (!c->sid[0]) {
                        rc = sidtab_context_to_sid(&sidtab, &c->context[0],
                                                   &c->sid[0]);
                        if (rc)
                                goto out;
                }
-               *sid = c->sid[0];
+               sbsec->sid = c->sid[0];
        } else {
-               rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, sid);
+               rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, &sbsec->sid);
                if (rc) {
-                       *behavior = SECURITY_FS_USE_NONE;
+                       sbsec->behavior = SECURITY_FS_USE_NONE;
                        rc = 0;
                } else {
-                       *behavior = SECURITY_FS_USE_GENFS;
+                       sbsec->behavior = SECURITY_FS_USE_GENFS;
                }
        }
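
A standalone sketch of the fstype[.subtype] matching loop above: prefer an exact base-plus-subtype entry, and fall back to a bare base entry when only the base matches. The table and names here are illustrative, not drawn from any real policy:

#include <stdio.h>
#include <string.h>

static const char *entries[] = { "ext4", "fuse", "fuse.glusterfs" };

static const char *fs_use_lookup(const char *fstype, const char *subtype)
{
	const char *base = NULL;
	size_t i, baselen = strlen(fstype);

	for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
		const char *sub = entries[i];

		if (strncmp(fstype, sub, baselen))
			continue;			/* base mismatch */
		sub += baselen;
		if (!subtype)
			return entries[i];		/* no subtype wanted */
		if (sub[0] == '\0') {
			base = entries[i];		/* bare base: save it */
			continue;
		}
		if (sub[0] == '.' && !strcmp(subtype, sub + 1))
			return entries[i];		/* exact base.subtype */
	}
	return base;	/* fstype-only fallback, or NULL for no match */
}

int main(void)
{
	printf("%s\n", fs_use_lookup("fuse", "glusterfs"));	/* fuse.glusterfs */
	printf("%s\n", fs_use_lookup("fuse", "sshfs"));		/* fuse */
	return 0;
}
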
 
index d030818..a91d205 100644
@@ -56,7 +56,7 @@
 atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0);
 
 /*
- * Returns true if an LSM/SELinux context
+ * Returns true if the context is an LSM/SELinux context.
  */
 static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx)
 {
@@ -66,7 +66,7 @@ static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx)
 }
 
 /*
- * Returns true if the xfrm contains a security blob for SELinux
+ * Returns true if the xfrm contains a security blob for SELinux.
  */
 static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
 {
@@ -74,48 +74,111 @@ static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
 }
 
 /*
- * LSM hook implementation that authorizes that a flow can use
- * a xfrm policy rule.
+ * Allocates an xfrm_sec_ctx and populates it using the supplied
+ * xfrm_user_sec_ctx security context.
  */
-int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
+static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp,
+                                  struct xfrm_user_sec_ctx *uctx)
 {
        int rc;
-       u32 sel_sid;
+       const struct task_security_struct *tsec = current_security();
+       struct xfrm_sec_ctx *ctx = NULL;
+       u32 str_len;
 
-       /* Context sid is either set to label or ANY_ASSOC */
-       if (ctx) {
-               if (!selinux_authorizable_ctx(ctx))
-                       return -EINVAL;
-
-               sel_sid = ctx->ctx_sid;
-       } else
-               /*
-                * All flows should be treated as polmatch'ing an
-                * otherwise applicable "non-labeled" policy. This
-                * would prevent inadvertent "leaks".
-                */
-               return 0;
+       if (ctxp == NULL || uctx == NULL ||
+           uctx->ctx_doi != XFRM_SC_DOI_LSM ||
+           uctx->ctx_alg != XFRM_SC_ALG_SELINUX)
+               return -EINVAL;
 
-       rc = avc_has_perm(fl_secid, sel_sid, SECCLASS_ASSOCIATION,
-                         ASSOCIATION__POLMATCH,
-                         NULL);
+       str_len = uctx->ctx_len;
+       if (str_len >= PAGE_SIZE)
+               return -ENOMEM;
 
-       if (rc == -EACCES)
-               return -ESRCH;
+       ctx = kmalloc(sizeof(*ctx) + str_len + 1, GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
 
+       ctx->ctx_doi = XFRM_SC_DOI_LSM;
+       ctx->ctx_alg = XFRM_SC_ALG_SELINUX;
+       ctx->ctx_len = str_len;
+       memcpy(ctx->ctx_str, &uctx[1], str_len);
+       ctx->ctx_str[str_len] = '\0';
+       rc = security_context_to_sid(ctx->ctx_str, str_len, &ctx->ctx_sid);
+       if (rc)
+               goto err;
+
+       rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
+                         SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL);
+       if (rc)
+               goto err;
+
+       *ctxp = ctx;
+       atomic_inc(&selinux_xfrm_refcount);
+       return 0;
+
+err:
+       kfree(ctx);
        return rc;
 }
 
 /*
+ * Free the xfrm_sec_ctx structure.
+ */
+static void selinux_xfrm_free(struct xfrm_sec_ctx *ctx)
+{
+       if (!ctx)
+               return;
+
+       atomic_dec(&selinux_xfrm_refcount);
+       kfree(ctx);
+}
+
+/*
+ * Authorize the deletion of a labeled SA or policy rule.
+ */
+static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx)
+{
+       const struct task_security_struct *tsec = current_security();
+
+       if (!ctx)
+               return 0;
+
+       return avc_has_perm(tsec->sid, ctx->ctx_sid,
+                           SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT,
+                           NULL);
+}
+
+/*
+ * LSM hook implementation that authorizes that a flow can use a xfrm policy
+ * rule.
+ */
+int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir)
+{
+       int rc;
+
+       /* All flows should be treated as polmatch'ing an otherwise applicable
+        * "non-labeled" policy. This would prevent inadvertent "leaks". */
+       if (!ctx)
+               return 0;
+
+       /* Context sid is either set to label or ANY_ASSOC */
+       if (!selinux_authorizable_ctx(ctx))
+               return -EINVAL;
+
+       rc = avc_has_perm(fl_secid, ctx->ctx_sid,
+                         SECCLASS_ASSOCIATION, ASSOCIATION__POLMATCH, NULL);
+       return (rc == -EACCES ? -ESRCH : rc);
+}
+
+/*
  * LSM hook implementation that authorizes that a state matches
  * the given policy, flow combo.
  */
-
-int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp,
-                       const struct flowi *fl)
+int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x,
+                                     struct xfrm_policy *xp,
+                                     const struct flowi *fl)
 {
        u32 state_sid;
-       int rc;
 
        if (!xp->security)
                if (x->security)
@@ -138,187 +201,80 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *
        if (fl->flowi_secid != state_sid)
                return 0;
 
-       rc = avc_has_perm(fl->flowi_secid, state_sid, SECCLASS_ASSOCIATION,
-                         ASSOCIATION__SENDTO,
-                         NULL)? 0:1;
-
-       /*
-        * We don't need a separate SA Vs. policy polmatch check
-        * since the SA is now of the same label as the flow and
-        * a flow Vs. policy polmatch check had already happened
-        * in selinux_xfrm_policy_lookup() above.
-        */
-
-       return rc;
+       /* We don't need a separate SA Vs. policy polmatch check since the SA
+        * is now of the same label as the flow and a flow Vs. policy polmatch
+        * check had already happened in selinux_xfrm_policy_lookup() above. */
+       return (avc_has_perm(fl->flowi_secid, state_sid,
+                           SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO,
+                           NULL) ? 0 : 1);
 }
 
 /*
  * LSM hook implementation that checks and/or returns the xfrm sid for the
  * incoming packet.
  */
-
 int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall)
 {
+       u32 sid_session = SECSID_NULL;
        struct sec_path *sp;
 
-       *sid = SECSID_NULL;
-
        if (skb == NULL)
-               return 0;
+               goto out;
 
        sp = skb->sp;
        if (sp) {
-               int i, sid_set = 0;
+               int i;
 
-               for (i = sp->len-1; i >= 0; i--) {
+               for (i = sp->len - 1; i >= 0; i--) {
                        struct xfrm_state *x = sp->xvec[i];
                        if (selinux_authorizable_xfrm(x)) {
                                struct xfrm_sec_ctx *ctx = x->security;
 
-                               if (!sid_set) {
-                                       *sid = ctx->ctx_sid;
-                                       sid_set = 1;
-
+                               if (sid_session == SECSID_NULL) {
+                                       sid_session = ctx->ctx_sid;
                                        if (!ckall)
-                                               break;
-                               } else if (*sid != ctx->ctx_sid)
+                                               goto out;
+                               } else if (sid_session != ctx->ctx_sid) {
+                                       *sid = SECSID_NULL;
                                        return -EINVAL;
+                               }
                        }
                }
        }
 
-       return 0;
-}
-
-/*
- * Security blob allocation for xfrm_policy and xfrm_state
- * CTX does not have a meaningful value on input
- */
-static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp,
-       struct xfrm_user_sec_ctx *uctx, u32 sid)
-{
-       int rc = 0;
-       const struct task_security_struct *tsec = current_security();
-       struct xfrm_sec_ctx *ctx = NULL;
-       char *ctx_str = NULL;
-       u32 str_len;
-
-       BUG_ON(uctx && sid);
-
-       if (!uctx)
-               goto not_from_user;
-
-       if (uctx->ctx_alg != XFRM_SC_ALG_SELINUX)
-               return -EINVAL;
-
-       str_len = uctx->ctx_len;
-       if (str_len >= PAGE_SIZE)
-               return -ENOMEM;
-
-       *ctxp = ctx = kmalloc(sizeof(*ctx) +
-                             str_len + 1,
-                             GFP_KERNEL);
-
-       if (!ctx)
-               return -ENOMEM;
-
-       ctx->ctx_doi = uctx->ctx_doi;
-       ctx->ctx_len = str_len;
-       ctx->ctx_alg = uctx->ctx_alg;
-
-       memcpy(ctx->ctx_str,
-              uctx+1,
-              str_len);
-       ctx->ctx_str[str_len] = 0;
-       rc = security_context_to_sid(ctx->ctx_str,
-                                    str_len,
-                                    &ctx->ctx_sid);
-
-       if (rc)
-               goto out;
-
-       /*
-        * Does the subject have permission to set security context?
-        */
-       rc = avc_has_perm(tsec->sid, ctx->ctx_sid,
-                         SECCLASS_ASSOCIATION,
-                         ASSOCIATION__SETCONTEXT, NULL);
-       if (rc)
-               goto out;
-
-       return rc;
-
-not_from_user:
-       rc = security_sid_to_context(sid, &ctx_str, &str_len);
-       if (rc)
-               goto out;
-
-       *ctxp = ctx = kmalloc(sizeof(*ctx) +
-                             str_len,
-                             GFP_ATOMIC);
-
-       if (!ctx) {
-               rc = -ENOMEM;
-               goto out;
-       }
-
-       ctx->ctx_doi = XFRM_SC_DOI_LSM;
-       ctx->ctx_alg = XFRM_SC_ALG_SELINUX;
-       ctx->ctx_sid = sid;
-       ctx->ctx_len = str_len;
-       memcpy(ctx->ctx_str,
-              ctx_str,
-              str_len);
-
-       goto out2;
-
 out:
-       *ctxp = NULL;
-       kfree(ctx);
-out2:
-       kfree(ctx_str);
-       return rc;
+       *sid = sid_session;
+       return 0;
 }
 
 /*
- * LSM hook implementation that allocs and transfers uctx spec to
- * xfrm_policy.
+ * LSM hook implementation that allocs and transfers uctx spec to xfrm_policy.
  */
 int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                              struct xfrm_user_sec_ctx *uctx)
 {
-       int err;
-
-       BUG_ON(!uctx);
-
-       err = selinux_xfrm_sec_ctx_alloc(ctxp, uctx, 0);
-       if (err == 0)
-               atomic_inc(&selinux_xfrm_refcount);
-
-       return err;
+       return selinux_xfrm_alloc_user(ctxp, uctx);
 }
 
-
 /*
- * LSM hook implementation that copies security data structure from old to
- * new for policy cloning.
+ * LSM hook implementation that copies security data structure from old to new
+ * for policy cloning.
  */
 int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                              struct xfrm_sec_ctx **new_ctxp)
 {
        struct xfrm_sec_ctx *new_ctx;
 
-       if (old_ctx) {
-               new_ctx = kmalloc(sizeof(*old_ctx) + old_ctx->ctx_len,
-                                 GFP_ATOMIC);
-               if (!new_ctx)
-                       return -ENOMEM;
+       if (!old_ctx)
+               return 0;
+
+       new_ctx = kmemdup(old_ctx, sizeof(*old_ctx) + old_ctx->ctx_len,
+                         GFP_ATOMIC);
+       if (!new_ctx)
+               return -ENOMEM;
+       atomic_inc(&selinux_xfrm_refcount);
+       *new_ctxp = new_ctx;
 
-               memcpy(new_ctx, old_ctx, sizeof(*new_ctx));
-               memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len);
-               atomic_inc(&selinux_xfrm_refcount);
-               *new_ctxp = new_ctx;
-       }
        return 0;
 }
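
A userspace sketch of the reworked selinux_xfrm_decode_session() walk above: scanning from the end of the security path, take the SID of the first labeled state, and when checking all entries (ckall), fail if any labeled state disagrees. An array of SIDs stands in for skb->sp->xvec[]:

#include <stdio.h>

#define SECSID_NULL 0U

/* sids[] stands in for the labels on skb->sp->xvec[]; 0 means unlabeled */
static int decode_session(const unsigned int *sids, int len,
			  unsigned int *sid, int ckall)
{
	unsigned int session = SECSID_NULL;
	int i;

	for (i = len - 1; i >= 0; i--) {
		if (sids[i] == SECSID_NULL)
			continue;		/* not an authorizable xfrm */
		if (session == SECSID_NULL) {
			session = sids[i];
			if (!ckall)
				break;
		} else if (session != sids[i]) {
			*sid = SECSID_NULL;
			return -1;		/* -EINVAL in the kernel */
		}
	}
	*sid = session;
	return 0;
}

int main(void)
{
	unsigned int sids[] = { SECSID_NULL, 42, 42 }, sid;

	printf("rc=%d sid=%u\n", decode_session(sids, 3, &sid, 1), sid);
	return 0;
}
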
 
@@ -327,8 +283,7 @@ int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
  */
 void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
 {
-       atomic_dec(&selinux_xfrm_refcount);
-       kfree(ctx);
+       selinux_xfrm_free(ctx);
 }
 
 /*
@@ -336,31 +291,55 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
  */
 int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
 {
-       const struct task_security_struct *tsec = current_security();
-
-       if (!ctx)
-               return 0;
+       return selinux_xfrm_delete(ctx);
+}
 
-       return avc_has_perm(tsec->sid, ctx->ctx_sid,
-                           SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT,
-                           NULL);
+/*
+ * LSM hook implementation that allocates an xfrm_sec_ctx, populates it using
+ * the supplied security context, and assigns it to the xfrm_state.
+ */
+int selinux_xfrm_state_alloc(struct xfrm_state *x,
+                            struct xfrm_user_sec_ctx *uctx)
+{
+       return selinux_xfrm_alloc_user(&x->security, uctx);
 }
 
 /*
- * LSM hook implementation that allocs and transfers sec_ctx spec to
- * xfrm_state.
+ * LSM hook implementation that allocates an xfrm_sec_ctx and populates it
+ * based on a secid.
  */
-int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx,
-               u32 secid)
+int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x,
+                                    struct xfrm_sec_ctx *polsec, u32 secid)
 {
-       int err;
+       int rc;
+       struct xfrm_sec_ctx *ctx;
+       char *ctx_str = NULL;
+       int str_len;
+
+       if (!polsec)
+               return 0;
 
-       BUG_ON(!x);
+       if (secid == 0)
+               return -EINVAL;
 
-       err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid);
-       if (err == 0)
-               atomic_inc(&selinux_xfrm_refcount);
-       return err;
+       rc = security_sid_to_context(secid, &ctx_str, &str_len);
+       if (rc)
+               return rc;
+
+       ctx = kmalloc(sizeof(*ctx) + str_len, GFP_ATOMIC);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->ctx_doi = XFRM_SC_DOI_LSM;
+       ctx->ctx_alg = XFRM_SC_ALG_SELINUX;
+       ctx->ctx_sid = secid;
+       ctx->ctx_len = str_len;
+       memcpy(ctx->ctx_str, ctx_str, str_len);
+       kfree(ctx_str);
+
+       x->security = ctx;
+       atomic_inc(&selinux_xfrm_refcount);
+       return 0;
 }
 
 /*
@@ -368,24 +347,15 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct
  */
 void selinux_xfrm_state_free(struct xfrm_state *x)
 {
-       atomic_dec(&selinux_xfrm_refcount);
-       kfree(x->security);
+       selinux_xfrm_free(x->security);
 }
 
- /*
- * LSM hook implementation that authorizes deletion of labeled SAs.
- */
+/*
+ * LSM hook implementation that authorizes deletion of labeled SAs.
+ */
 int selinux_xfrm_state_delete(struct xfrm_state *x)
 {
-       const struct task_security_struct *tsec = current_security();
-       struct xfrm_sec_ctx *ctx = x->security;
-
-       if (!ctx)
-               return 0;
-
-       return avc_has_perm(tsec->sid, ctx->ctx_sid,
-                           SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT,
-                           NULL);
+       return selinux_xfrm_delete(x->security);
 }
 
 /*
@@ -395,14 +365,12 @@ int selinux_xfrm_state_delete(struct xfrm_state *x)
  * we need to check for unlabelled access since this may not have
  * gone thru the IPSec process.
  */
-int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
-                               struct common_audit_data *ad)
+int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb,
+                             struct common_audit_data *ad)
 {
-       int i, rc = 0;
-       struct sec_path *sp;
-       u32 sel_sid = SECINITSID_UNLABELED;
-
-       sp = skb->sp;
+       int i;
+       struct sec_path *sp = skb->sp;
+       u32 peer_sid = SECINITSID_UNLABELED;
 
        if (sp) {
                for (i = 0; i < sp->len; i++) {
@@ -410,23 +378,17 @@ int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
 
                        if (x && selinux_authorizable_xfrm(x)) {
                                struct xfrm_sec_ctx *ctx = x->security;
-                               sel_sid = ctx->ctx_sid;
+                               peer_sid = ctx->ctx_sid;
                                break;
                        }
                }
        }
 
-       /*
-        * This check even when there's no association involved is
-        * intended, according to Trent Jaeger, to make sure a
-        * process can't engage in non-ipsec communication unless
-        * explicitly allowed by policy.
-        */
-
-       rc = avc_has_perm(isec_sid, sel_sid, SECCLASS_ASSOCIATION,
-                         ASSOCIATION__RECVFROM, ad);
-
-       return rc;
+       /* This check even when there's no association involved is intended,
+        * according to Trent Jaeger, to make sure a process can't engage in
+        * non-IPsec communication unless explicitly allowed by policy. */
+       return avc_has_perm(sk_sid, peer_sid,
+                           SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, ad);
 }
 
 /*
@@ -436,49 +398,38 @@ int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
 * If we do have an authorizable security association, then it has already been
  * checked in the selinux_xfrm_state_pol_flow_match hook above.
  */
-int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
-                                       struct common_audit_data *ad, u8 proto)
+int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb,
+                               struct common_audit_data *ad, u8 proto)
 {
        struct dst_entry *dst;
-       int rc = 0;
-
-       dst = skb_dst(skb);
-
-       if (dst) {
-               struct dst_entry *dst_test;
-
-               for (dst_test = dst; dst_test != NULL;
-                    dst_test = dst_test->child) {
-                       struct xfrm_state *x = dst_test->xfrm;
-
-                       if (x && selinux_authorizable_xfrm(x))
-                               goto out;
-               }
-       }
 
        switch (proto) {
        case IPPROTO_AH:
        case IPPROTO_ESP:
        case IPPROTO_COMP:
-               /*
-                * We should have already seen this packet once before
-                * it underwent xfrm(s). No need to subject it to the
-                * unlabeled check.
-                */
-               goto out;
+               /* We should have already seen this packet once before it
+                * underwent xfrm(s). No need to subject it to the unlabeled
+                * check. */
+               return 0;
        default:
                break;
        }
 
-       /*
-        * This check even when there's no association involved is
-        * intended, according to Trent Jaeger, to make sure a
-        * process can't engage in non-ipsec communication unless
-        * explicitly allowed by policy.
-        */
+       dst = skb_dst(skb);
+       if (dst) {
+               struct dst_entry *iter;
 
-       rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
-                         ASSOCIATION__SENDTO, ad);
-out:
-       return rc;
+               for (iter = dst; iter != NULL; iter = iter->child) {
+                       struct xfrm_state *x = iter->xfrm;
+
+                       if (x && selinux_authorizable_xfrm(x))
+                               return 0;
+               }
+       }
+
+       /* This check even when there's no association involved is intended,
+        * according to Trent Jaeger, to make sure a process can't engage in
+        * non-IPsec communication unless explicitly allowed by policy. */
+       return avc_has_perm(sk_sid, SECINITSID_UNLABELED,
+                           SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, ad);
 }
index 076b8e8..364cc64 100644
@@ -177,9 +177,13 @@ struct smk_port_label {
 #define SMACK_CIPSO_MAXCATNUM           184     /* 23 * 8 */
 
 /*
- * Flag for transmute access
+ * Flags for untraditional access modes.
+ * It shouldn't be necessary to avoid conflicts with definitions
+ * in fs.h, but do so anyway.
  */
-#define MAY_TRANSMUTE  64
+#define MAY_TRANSMUTE  0x00001000      /* Controls directory labeling */
+#define MAY_LOCK       0x00002000      /* Locks should be writes, but ... */
+
 /*
  * Just to make the common cases easier to deal with
  */
@@ -188,9 +192,9 @@ struct smk_port_label {
 #define MAY_NOT                0
 
 /*
- * Number of access types used by Smack (rwxat)
+ * Number of access types used by Smack (rwxatl)
  */
-#define SMK_NUM_ACCESS_TYPE 5
+#define SMK_NUM_ACCESS_TYPE 6
 
 /* SMACK data */
 struct smack_audit_data {
index b3b59b1..14293cd 100644
@@ -84,6 +84,8 @@ int log_policy = SMACK_AUDIT_DENIED;
  *
  * Do the object check first because that is more
  * likely to differ.
+ *
+ * Allowing write access implies allowing locking.
  */
 int smk_access_entry(char *subject_label, char *object_label,
                        struct list_head *rule_list)
@@ -99,6 +101,11 @@ int smk_access_entry(char *subject_label, char *object_label,
                }
        }
 
+       /*
+        * MAY_WRITE implies MAY_LOCK.
+        */
+       if ((may & MAY_WRITE) == MAY_WRITE)
+               may |= MAY_LOCK;
        return may;
 }
 
@@ -245,6 +252,7 @@ out_audit:
 static inline void smack_str_from_perm(char *string, int access)
 {
        int i = 0;
+
        if (access & MAY_READ)
                string[i++] = 'r';
        if (access & MAY_WRITE)
@@ -255,6 +263,8 @@ static inline void smack_str_from_perm(char *string, int access)
                string[i++] = 'a';
        if (access & MAY_TRANSMUTE)
                string[i++] = 't';
+       if (access & MAY_LOCK)
+               string[i++] = 'l';
        string[i] = '\0';
 }
 /**
index 8825375..b0be893 100644
@@ -185,7 +185,7 @@ static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode)
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
        smk_ad_setfield_u_tsk(&ad, ctp);
 
-       rc = smk_curacc(skp->smk_known, MAY_READWRITE, &ad);
+       rc = smk_curacc(skp->smk_known, mode, &ad);
        return rc;
 }
 
@@ -1146,7 +1146,7 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
  * @file: the object
  * @cmd: unused
  *
- * Returns 0 if current has write access, error code otherwise
+ * Returns 0 if current has lock access, error code otherwise
  */
 static int smack_file_lock(struct file *file, unsigned int cmd)
 {
@@ -1154,7 +1154,7 @@ static int smack_file_lock(struct file *file, unsigned int cmd)
 
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);
-       return smk_curacc(file->f_security, MAY_WRITE, &ad);
+       return smk_curacc(file->f_security, MAY_LOCK, &ad);
 }
 
 /**
@@ -1178,8 +1178,13 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
 
        switch (cmd) {
        case F_GETLK:
+               break;
        case F_SETLK:
        case F_SETLKW:
+               smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
+               smk_ad_setfield_u_fs_path(&ad, file->f_path);
+               rc = smk_curacc(file->f_security, MAY_LOCK, &ad);
+               break;
        case F_SETOWN:
        case F_SETSIG:
                smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
index 80f4b4a..160aa08 100644
@@ -139,7 +139,7 @@ const char *smack_cipso_option = SMACK_CIPSO_OPTION;
  * SMK_LOADLEN: Smack rule length
  */
 #define SMK_OACCESS    "rwxa"
-#define SMK_ACCESS     "rwxat"
+#define SMK_ACCESS     "rwxatl"
 #define SMK_OACCESSLEN (sizeof(SMK_OACCESS) - 1)
 #define SMK_ACCESSLEN  (sizeof(SMK_ACCESS) - 1)
 #define SMK_OLOADLEN   (SMK_LABELLEN + SMK_LABELLEN + SMK_OACCESSLEN)
@@ -282,6 +282,10 @@ static int smk_perm_from_str(const char *string)
                case 'T':
                        perm |= MAY_TRANSMUTE;
                        break;
+               case 'l':
+               case 'L':
+                       perm |= MAY_LOCK;
+                       break;
                default:
                        return perm;
                }
@@ -452,7 +456,7 @@ static ssize_t smk_write_rules_list(struct file *file, const char __user *buf,
                /*
                 * Minor hack for backward compatibility
                 */
-               if (count != SMK_OLOADLEN && count != SMK_LOADLEN)
+               if (count < SMK_OLOADLEN || count > SMK_LOADLEN)
                        return -EINVAL;
        } else {
                if (count >= PAGE_SIZE) {
@@ -592,6 +596,8 @@ static void smk_rule_show(struct seq_file *s, struct smack_rule *srp, int max)
                seq_putc(s, 'a');
        if (srp->smk_access & MAY_TRANSMUTE)
                seq_putc(s, 't');
+       if (srp->smk_access & MAY_LOCK)
+               seq_putc(s, 'l');
 
        seq_putc(s, '\n');
 }
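
A round-trip sketch of the extended "rwxatl" access strings: the MAY_* values mirror fs.h and the smack.h hunk above, the helper is a simplified stand-in for smack_str_from_perm(), and the write-implies-lock step mirrors the new logic in smk_access_entry():

#include <stdio.h>

#define MAY_EXEC	0x00000001	/* as in fs.h */
#define MAY_WRITE	0x00000002
#define MAY_READ	0x00000004
#define MAY_APPEND	0x00000008
#define MAY_TRANSMUTE	0x00001000	/* as in smack.h above */
#define MAY_LOCK	0x00002000

static void str_from_perm(char *s, int access)
{
	int i = 0;

	if (access & MAY_READ)      s[i++] = 'r';
	if (access & MAY_WRITE)     s[i++] = 'w';
	if (access & MAY_EXEC)      s[i++] = 'x';
	if (access & MAY_APPEND)    s[i++] = 'a';
	if (access & MAY_TRANSMUTE) s[i++] = 't';
	if (access & MAY_LOCK)      s[i++] = 'l';
	s[i] = '\0';
}

int main(void)
{
	int may = MAY_READ | MAY_WRITE;
	char buf[8];

	/* the new rule: granting write implicitly grants lock */
	if ((may & MAY_WRITE) == MAY_WRITE)
		may |= MAY_LOCK;
	str_from_perm(buf, may);
	printf("%s\n", buf);		/* prints "rwl" */
	return 0;
}
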
index fa64cd8..fb5d107 100644
@@ -238,7 +238,7 @@ static void davinci_pcm_dma_irq(unsigned link, u16 ch_status, void *data)
        print_buf_info(prtd->ram_channel, "i ram_channel");
        pr_debug("davinci_pcm: link=%d, status=0x%x\n", link, ch_status);
 
-       if (unlikely(ch_status != DMA_COMPLETE))
+       if (unlikely(ch_status != EDMA_DMA_COMPLETE))
                return;
 
        if (snd_pcm_running(substream)) {
index fe70207..9d77f13 100644
@@ -2,7 +2,7 @@
  * turbostat -- show CPU frequency and C-state residency
  * on modern Intel turbo-capable processors.
  *
- * Copyright (c) 2012 Intel Corporation.
+ * Copyright (c) 2013 Intel Corporation.
  * Len Brown <len.brown@intel.com>
  *
  * This program is free software; you can redistribute it and/or modify it
@@ -47,6 +47,8 @@ unsigned int skip_c1;
 unsigned int do_nhm_cstates;
 unsigned int do_snb_cstates;
 unsigned int do_c8_c9_c10;
+unsigned int do_slm_cstates;
+unsigned int use_c1_residency_msr;
 unsigned int has_aperf;
 unsigned int has_epb;
 unsigned int units = 1000000000;       /* Ghz etc */
@@ -81,6 +83,8 @@ double rapl_joule_counter_range;
 #define RAPL_DRAM      (1 << 3)
 #define RAPL_PKG_PERF_STATUS   (1 << 4)
 #define RAPL_DRAM_PERF_STATUS  (1 << 5)
+#define RAPL_PKG_POWER_INFO    (1 << 6)
+#define RAPL_CORE_POLICY       (1 << 7)
 #define        TJMAX_DEFAULT   100
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -96,7 +100,7 @@ struct thread_data {
        unsigned long long tsc;
        unsigned long long aperf;
        unsigned long long mperf;
-       unsigned long long c1;  /* derived */
+       unsigned long long c1;
        unsigned long long extra_msr64;
        unsigned long long extra_delta64;
        unsigned long long extra_msr32;
@@ -266,7 +270,7 @@ void print_header(void)
                outp += sprintf(outp, "           MSR 0x%03X", extra_msr_offset64);
        if (do_nhm_cstates)
                outp += sprintf(outp, "    %%c1");
-       if (do_nhm_cstates)
+       if (do_nhm_cstates && !do_slm_cstates)
                outp += sprintf(outp, "    %%c3");
        if (do_nhm_cstates)
                outp += sprintf(outp, "    %%c6");
@@ -280,9 +284,9 @@ void print_header(void)
 
        if (do_snb_cstates)
                outp += sprintf(outp, "   %%pc2");
-       if (do_nhm_cstates)
+       if (do_nhm_cstates && !do_slm_cstates)
                outp += sprintf(outp, "   %%pc3");
-       if (do_nhm_cstates)
+       if (do_nhm_cstates && !do_slm_cstates)
                outp += sprintf(outp, "   %%pc6");
        if (do_snb_cstates)
                outp += sprintf(outp, "   %%pc7");
@@ -480,7 +484,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
        if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
                goto done;
 
-       if (do_nhm_cstates)
+       if (do_nhm_cstates && !do_slm_cstates)
                outp += sprintf(outp, " %6.2f", 100.0 * c->c3/t->tsc);
        if (do_nhm_cstates)
                outp += sprintf(outp, " %6.2f", 100.0 * c->c6/t->tsc);
@@ -499,9 +503,9 @@ int format_counters(struct thread_data *t, struct core_data *c,
 
        if (do_snb_cstates)
                outp += sprintf(outp, " %6.2f", 100.0 * p->pc2/t->tsc);
-       if (do_nhm_cstates)
+       if (do_nhm_cstates && !do_slm_cstates)
                outp += sprintf(outp, " %6.2f", 100.0 * p->pc3/t->tsc);
-       if (do_nhm_cstates)
+       if (do_nhm_cstates && !do_slm_cstates)
                outp += sprintf(outp, " %6.2f", 100.0 * p->pc6/t->tsc);
        if (do_snb_cstates)
                outp += sprintf(outp, " %6.2f", 100.0 * p->pc7/t->tsc);
@@ -648,17 +652,24 @@ delta_thread(struct thread_data *new, struct thread_data *old,
        }
 
 
-       /*
-        * As counter collection is not atomic,
-        * it is possible for mperf's non-halted cycles + idle states
-        * to exceed TSC's all cycles: show c1 = 0% in that case.
-        */
-       if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > old->tsc)
-               old->c1 = 0;
-       else {
-               /* normal case, derive c1 */
-               old->c1 = old->tsc - old->mperf - core_delta->c3
+       if (use_c1_residency_msr) {
+               /*
+                * Some models have a dedicated C1 residency MSR,
+                * which should be more accurate than the derivation below.
+                */
+       } else {
+               /*
+                * As counter collection is not atomic,
+                * it is possible for mperf's non-halted cycles + idle states
+                * to exceed TSC's all cycles: show c1 = 0% in that case.
+                */
+               if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > old->tsc)
+                       old->c1 = 0;
+               else {
+                       /* normal case, derive c1 */
+                       old->c1 = old->tsc - old->mperf - core_delta->c3
                                - core_delta->c6 - core_delta->c7;
+               }
        }
 
        if (old->mperf == 0) {
@@ -872,13 +883,21 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                if (get_msr(cpu, extra_msr_offset64, &t->extra_msr64))
                        return -5;
 
+       if (use_c1_residency_msr) {
+               if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1))
+                       return -6;
+       }
+
        /* collect core counters only for 1st thread in core */
        if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
                return 0;
 
-       if (do_nhm_cstates) {
+       if (do_nhm_cstates && !do_slm_cstates) {
                if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3))
                        return -6;
+       }
+
+       if (do_nhm_cstates) {
                if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6))
                        return -7;
        }
@@ -898,7 +917,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
                return 0;
 
-       if (do_nhm_cstates) {
+       if (do_nhm_cstates && !do_slm_cstates) {
                if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
                        return -9;
                if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6))
@@ -977,7 +996,7 @@ void print_verbose_header(void)
                ratio, bclk, ratio * bclk);
 
        get_msr(0, MSR_IA32_POWER_CTL, &msr);
-       fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E: %sabled)\n",
+       fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
                msr, msr & 0x2 ? "EN" : "DIS");
 
        if (!do_ivt_turbo_ratio_limit)
@@ -1046,25 +1065,28 @@ print_nhm_turbo_ratio_limits:
 
        switch(msr & 0x7) {
        case 0:
-               fprintf(stderr, "pc0");
+               fprintf(stderr, do_slm_cstates ? "no pkg states" : "pc0");
                break;
        case 1:
-               fprintf(stderr, do_snb_cstates ? "pc2" : "pc0");
+               fprintf(stderr, do_slm_cstates ? "no pkg states" : do_snb_cstates ? "pc2" : "pc0");
                break;
        case 2:
-               fprintf(stderr, do_snb_cstates ? "pc6-noret" : "pc3");
+               fprintf(stderr, do_slm_cstates ? "invalid" : do_snb_cstates ? "pc6-noret" : "pc3");
                break;
        case 3:
-               fprintf(stderr, "pc6");
+               fprintf(stderr, do_slm_cstates ? "invalid" : "pc6");
                break;
        case 4:
-               fprintf(stderr, "pc7");
+               fprintf(stderr, do_slm_cstates ? "pc4" : "pc7");
                break;
        case 5:
-               fprintf(stderr, do_snb_cstates ? "pc7s" : "invalid");
+               fprintf(stderr, do_slm_cstates ? "invalid" : do_snb_cstates ? "pc7s" : "invalid");
+               break;
+       case 6:
+               fprintf(stderr, do_slm_cstates ? "pc6" : "invalid");
                break;
        case 7:
-               fprintf(stderr, "unlimited");
+               fprintf(stderr, do_slm_cstates ? "pc7" : "unlimited");
                break;
        default:
                fprintf(stderr, "invalid");
@@ -1460,6 +1482,8 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model)
        case 0x3F:      /* HSW */
        case 0x45:      /* HSW */
        case 0x46:      /* HSW */
+       case 0x37:      /* BYT */
+       case 0x4D:      /* AVN */
                return 1;
        case 0x2E:      /* Nehalem-EX Xeon - Beckton */
        case 0x2F:      /* Westmere-EX Xeon - Eagleton */
@@ -1532,14 +1556,33 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 #define        RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
 #define        RAPL_TIME_GRANULARITY   0x3F /* 6 bit time granularity */
 
+double get_tdp(unsigned int model)
+{
+       unsigned long long msr;
+
+       if (do_rapl & RAPL_PKG_POWER_INFO)
+               if (!get_msr(0, MSR_PKG_POWER_INFO, &msr))
+                       return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
+
+       switch (model) {
+       case 0x37:
+       case 0x4D:
+               return 30.0;
+       default:
+               return 135.0;
+       }
+}
+
+
 /*
  * rapl_probe()
  *
- * sets do_rapl
+ * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units
  */
 void rapl_probe(unsigned int family, unsigned int model)
 {
        unsigned long long msr;
+       unsigned int time_unit;
        double tdp;
 
        if (!genuine_intel)
@@ -1555,11 +1598,15 @@ void rapl_probe(unsigned int family, unsigned int model)
        case 0x3F:      /* HSW */
        case 0x45:      /* HSW */
        case 0x46:      /* HSW */
-               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX;
+               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO;
                break;
        case 0x2D:
        case 0x3E:
-               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS;
+               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
+               break;
+       case 0x37:      /* BYT */
+       case 0x4D:      /* AVN */
+               do_rapl = RAPL_PKG | RAPL_CORES;
                break;
        default:
                return;
@@ -1570,19 +1617,22 @@ void rapl_probe(unsigned int family, unsigned int model)
                return;
 
        rapl_power_units = 1.0 / (1 << (msr & 0xF));
-       rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
-       rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF));
+       if (model == 0x37)
+               rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
+       else
+               rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
 
-       /* get TDP to determine energy counter range */
-       if (get_msr(0, MSR_PKG_POWER_INFO, &msr))
-               return;
+       time_unit = msr >> 16 & 0xF;
+       if (time_unit == 0)
+               time_unit = 0xA;
 
-       tdp = ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
+       rapl_time_units = 1.0 / (1 << (time_unit));
 
-       rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
+       tdp = get_tdp(model);
 
+       rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
        if (verbose)
-               fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range\n", rapl_joule_counter_range);
+               fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
 
        return;
 }
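
A standalone sketch of the MSR_RAPL_POWER_UNIT decoding above: the low nibble, bits 12:8, and bits 19:16 hold negative powers of two for the power, energy, and time units, with the SLM fallback treating a zero time field as 2^-10. The raw MSR value here is hypothetical:

#include <stdio.h>

int main(void)
{
	unsigned long long msr = 0x000a0e03;	/* hypothetical raw value */
	unsigned int time_unit;

	double power_units = 1.0 / (1 << (msr & 0xF));		/* 0.125 W */
	double energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));	/* ~61 uJ */

	time_unit = msr >> 16 & 0xF;
	if (time_unit == 0)	/* the SLM fallback above: treat 0 as 2^-10 */
		time_unit = 0xA;

	printf("%g W, %g J, %g s\n", power_units, energy_units,
	       1.0 / (1 << time_unit));			/* ~977 us */
	return 0;
}
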
@@ -1668,7 +1718,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
        unsigned long long msr;
        int cpu;
-       double local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units;
 
        if (!do_rapl)
                return 0;
@@ -1686,23 +1735,13 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
                return -1;
 
-       local_rapl_power_units = 1.0 / (1 << (msr & 0xF));
-       local_rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
-       local_rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF));
-
-       if (local_rapl_power_units != rapl_power_units)
-               fprintf(stderr, "cpu%d, ERROR: Power units mis-match\n", cpu);
-       if (local_rapl_energy_units != rapl_energy_units)
-               fprintf(stderr, "cpu%d, ERROR: Energy units mis-match\n", cpu);
-       if (local_rapl_time_units != rapl_time_units)
-               fprintf(stderr, "cpu%d, ERROR: Time units mis-match\n", cpu);
-
        if (verbose) {
                fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx "
                        "(%f Watts, %f Joules, %f sec.)\n", cpu, msr,
-                       local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units);
+                       rapl_power_units, rapl_energy_units, rapl_time_units);
        }
-       if (do_rapl & RAPL_PKG) {
+       if (do_rapl & RAPL_PKG_POWER_INFO) {
+
                if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
                        return -5;
 
@@ -1714,6 +1753,9 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
 
+       }
+       if (do_rapl & RAPL_PKG) {
+
                if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
                        return -9;
 
@@ -1749,12 +1791,16 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
                print_power_limit_msr(cpu, msr, "DRAM Limit");
        }
-       if (do_rapl & RAPL_CORES) {
+       if (do_rapl & RAPL_CORE_POLICY) {
                if (verbose) {
                        if (get_msr(cpu, MSR_PP0_POLICY, &msr))
                                return -7;
 
                        fprintf(stderr, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
+               }
+       }
+       if (do_rapl & RAPL_CORES) {
+               if (verbose) {
 
                        if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
                                return -9;
@@ -1813,10 +1859,48 @@ int has_c8_c9_c10(unsigned int family, unsigned int model)
 }
 
 
+int is_slm(unsigned int family, unsigned int model)
+{
+       if (!genuine_intel)
+               return 0;
+       switch (model) {
+       case 0x37:      /* BYT */
+       case 0x4D:      /* AVN */
+               return 1;
+       }
+       return 0;
+}
+
+#define SLM_BCLK_FREQS 5
+double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0};
+
+double slm_bclk(void)
+{
+       unsigned long long msr = 3;
+       unsigned int i;
+       double freq;
+
+       if (get_msr(0, MSR_FSB_FREQ, &msr))
+               fprintf(stderr, "SLM BCLK: unknown\n");
+
+       i = msr & 0xf;
+       if (i >= SLM_BCLK_FREQS) {
+               fprintf(stderr, "SLM BCLK[%d] invalid\n", i);
+               i = 3;  /* fall back to a valid table index */
+       }
+       freq = slm_freq_table[i];
+
+       fprintf(stderr, "SLM BCLK: %.1f Mhz\n", freq);
+
+       return freq;
+}
+
 double discover_bclk(unsigned int family, unsigned int model)
 {
        if (is_snb(family, model))
                return 100.00;
+       else if (is_slm(family, model))
+               return slm_bclk();
        else
                return 133.33;
 }
@@ -1873,7 +1957,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
                fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n",
                        cpu, msr, target_c_local);
 
-       if (target_c_local < 85 || target_c_local > 120)
+       if (target_c_local < 85 || target_c_local > 127)
                goto guess;
 
        tcc_activation_temp = target_c_local;
@@ -1970,6 +2054,7 @@ void check_cpuid()
        do_smi = do_nhm_cstates;
        do_snb_cstates = is_snb(family, model);
        do_c8_c9_c10 = has_c8_c9_c10(family, model);
+       do_slm_cstates = is_slm(family, model);
        bclk = discover_bclk(family, model);
 
        do_nehalem_turbo_ratio_limit = has_nehalem_turbo_ratio_limit(family, model);
@@ -2331,7 +2416,7 @@ int main(int argc, char **argv)
        cmdline(argc, argv);
 
        if (verbose)
-               fprintf(stderr, "turbostat v3.4 April 17, 2013"
+               fprintf(stderr, "turbostat v3.5 April 26, 2013"
                        " - Len Brown <lenb@kernel.org>\n");
 
        turbostat_init();
index 662f34c..a0aa84b 100644
@@ -1615,8 +1615,9 @@ EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
 
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
-       return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
-                                   offset, len);
+       const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+
+       return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);