
Merge tag 'v4.4.207' into 10
author  0ranko0P <ranko0p@outlook.com>
Fri, 3 Jan 2020 14:15:27 +0000 (22:15 +0800)
committer  0ranko0P <ranko0p@outlook.com>
Fri, 3 Jan 2020 14:15:27 +0000 (22:15 +0800)
42 files changed:
Makefile
arch/arm/include/asm/uaccess.h
arch/mips/Kconfig
drivers/net/wireless/ath/ar5523/ar5523.c
drivers/net/wireless/iwlwifi/mvm/mac80211.c
drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
drivers/thermal/thermal_core.c
drivers/tty/serial/msm_serial.c
drivers/tty/serial/serial_core.c
drivers/usb/core/hub.c
drivers/usb/gadget/configfs.c
drivers/usb/gadget/function/u_serial.c
drivers/usb/host/xhci-hub.c
drivers/usb/host/xhci-mem.c
drivers/usb/host/xhci-ring.c
drivers/usb/host/xhci.c
drivers/usb/host/xhci.h
drivers/video/hdmi.c
drivers/virtio/virtio_balloon.c
fs/cifs/file.c
fs/fuse/dir.c
fs/fuse/fuse_i.h
fs/proc/array.c
include/linux/dma-mapping.h
include/linux/netdevice.h
include/linux/regulator/consumer.h
include/linux/serial_core.h
include/linux/thread_info.h
include/net/ip.h
include/net/tcp.h
kernel/module.c
kernel/sched/fair.c
kernel/workqueue.c
mm/shmem.c
net/bridge/br_device.c
net/core/dev.c
net/ipv4/devinet.c
net/ipv4/ip_output.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c
scripts/mod/modpost.c
sound/core/pcm_lib.c

diff --combined Makefile
+++ b/Makefile
@@@ -1,7 -1,7 +1,7 @@@
  VERSION = 4
  PATCHLEVEL = 4
- SUBLEVEL = 206
- EXTRAVERSION = -rc1
+ SUBLEVEL = 207
+ EXTRAVERSION =
  NAME = Blurry Fish Butt
  
  # *DOCUMENTATION*
@@@ -30,7 -30,7 +30,7 @@@ unexport GREP_OPTION
  # Most importantly: sub-Makefiles should only ever modify files in
  # their own directory. If in some directory we have a dependency on
  # a file in another dir (which doesn't happen often, but it's often
 -# unavoidable when linking the built-in.o targets which finally
 +# unavoidable when linking the built-in.a targets which finally
  # turn into vmlinux), we will call a sub make in that other dir, and
  # after that we are sure that everything which is in that other dir
  # is now up to date.
@@@ -148,7 -148,7 +148,7 @@@ PHONY += $(MAKECMDGOALS) sub-mak
  $(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
        @:
  
 -sub-make: FORCE
 +sub-make:
        $(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \
        -f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS))
  
@@@ -303,7 -303,7 +303,7 @@@ CONFIG_SHELL := $(shell if [ -x "$$BASH
  
  HOSTCC       = gcc
  HOSTCXX      = g++
 -HOSTCFLAGS   := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89
 +HOSTCFLAGS   := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -pipe
  HOSTCXXFLAGS = -O2
  
  # Decide whether to build built-in, modular, or both.
@@@ -343,7 -343,6 +343,7 @@@ include scripts/Kbuild.includ
  # Make variables (CC, etc...)
  AS            = $(CROSS_COMPILE)as
  LD            = $(CROSS_COMPILE)ld
 +LDLLD         = ld.lld
  CC            = $(CROSS_COMPILE)gcc
  CPP           = $(CC) -E
  AR            = $(CROSS_COMPILE)ar
@@@ -367,7 -366,6 +367,7 @@@ LDFLAGS_MODULE  
  CFLAGS_KERNEL =
  AFLAGS_KERNEL =
  CFLAGS_GCOV   = -fprofile-arcs -ftest-coverage -fno-tree-loop-im
 +CFLAGS_KCOV   = -fsanitize-coverage=trace-pc
  
  
  # Use USERINCLUDE when you must reference the UAPI directories only.
@@@ -390,15 -388,13 +390,15 @@@ LINUXINCLUDE    := 
  
  KBUILD_CPPFLAGS := -D__KERNEL__
  
 -KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 +KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs -pipe \
                   -fno-strict-aliasing -fno-common \
                   -Werror-implicit-function-declaration \
                   -Wno-format-security \
                   -std=gnu89 $(call cc-option,-fno-PIE)
  
 -
 +ifeq ($(TARGET_BOARD_TYPE),auto)
 +KBUILD_CFLAGS    += -DCONFIG_PLATFORM_AUTO
 +endif
  KBUILD_AFLAGS_KERNEL :=
  KBUILD_CFLAGS_KERNEL :=
  KBUILD_AFLAGS   := -D__ASSEMBLY__ $(call cc-option,-fno-PIE)
@@@ -418,8 -414,7 +418,8 @@@ export HOSTCXX HOSTCXXFLAGS LDFLAGS_MOD
  
  export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
  export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
 -export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE
 +export CFLAGS_KASAN CFLAGS_UBSAN CFLAGS_KASAN_NOSANITIZE
 +export CFLAGS_KCOV
  export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
  export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
  export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@@ -614,11 -609,7 +614,11 @@@ all: vmlinu
  
  ifeq ($(cc-name),clang)
  ifneq ($(CROSS_COMPILE),)
 -CLANG_TARGET  := --target=$(notdir $(CROSS_COMPILE:%-=%))
 +CLANG_TRIPLE  ?= $(CROSS_COMPILE)
 +CLANG_TARGET  := --target=$(notdir $(CLANG_TRIPLE:%-=%))
 +ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_TARGET)), y)
 +$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
 +endif
  GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
  CLANG_PREFIX  := --prefix=$(GCC_TOOLCHAIN_DIR)
  GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
@@@ -632,26 -623,6 +632,26 @@@ KBUILD_CFLAGS += $(call cc-option, -no-
  KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
  endif
  
 +# Make toolchain changes before including arch/$(SRCARCH)/Makefile to ensure
 +# ar/cc/ld-* macros return correct values.
 +ifdef CONFIG_LTO_CLANG
 +# use LLVM linker LLD for LTO linking and vmlinux_link
 +LD            := $(LDLLD)
 +# use llvm-ar for building symbol tables from IR files, and llvm-nm instead
 +# of objdump for processing symbol versions and exports
 +LLVM_AR               := llvm-ar
 +LLVM_NM               := llvm-nm
 +export LLVM_AR LLVM_NM
 +endif
 +
 +ifeq ($(cc-name),clang)
 +ifeq ($(ld-name),lld)
 +KBUILD_CFLAGS += -fuse-ld=lld
 +LDFLAGS               += -O2
 +endif
 +KBUILD_CPPFLAGS       += -Qunused-arguments
 +endif
 +
  # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default
  # values of the respective KBUILD_* variables
  ARCH_CPPFLAGS :=
@@@ -659,29 -630,6 +659,29 @@@ ARCH_AFLAGS :
  ARCH_CFLAGS :=
  include arch/$(SRCARCH)/Makefile
  
 +ifeq ($(cc-name),clang)
 +KBUILD_CFLAGS += -O3
 +KBUILD_CFLAGS += $(call cc-option, -mllvm -polly) \
 +                 $(call cc-option, -mllvm -polly-run-dce) \
 +                 $(call cc-option, -mllvm -polly-run-inliner) \
 +                 $(call cc-option, -mllvm -polly-opt-fusion=max) \
 +                 $(call cc-option, -mllvm -polly-ast-use-context) \
 +                 $(call cc-option, -mllvm -polly-detect-keep-going) \
 +                 $(call cc-option, -mllvm -polly-vectorizer=stripmine) \
 +                 $(call cc-option, -mllvm -polly-invariant-load-hoisting)
 +else
 +KBUILD_CFLAGS += -O2
 +endif
 +
 +ifeq ($(cc-name),gcc)
 +KBUILD_CFLAGS += -mcpu=cortex-a73.cortex-a53
 +KBUILD_AFLAGS += -mcpu=cortex-a73.cortex-a53
 +endif
 +ifeq ($(cc-name),clang)
 +KBUILD_CFLAGS += -mcpu=cortex-a53
 +KBUILD_AFLAGS += -mcpu=cortex-a53
 +endif
 +
  KBUILD_CFLAGS += $(call cc-option,-fno-delete-null-pointer-checks,)
  KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,)
  KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,)
@@@ -691,8 -639,14 +691,8 @@@ KBUILD_CFLAGS     += $(call cc-disable-warn
  KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
  KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias)
  
 -ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 -KBUILD_CFLAGS += -Os
 -else
 -ifdef CONFIG_PROFILE_ALL_BRANCHES
 -KBUILD_CFLAGS += -O2
 -else
 -KBUILD_CFLAGS   += -O2
 -endif
 +ifdef CONFIG_CC_WERROR
 +KBUILD_CFLAGS += -Werror
  endif
  
  # Tell gcc to never replace conditional load with a non-conditional one
@@@ -755,24 -709,17 +755,24 @@@ endi
  endif
  KBUILD_CFLAGS += $(stackp-flag)
  
 +ifdef CONFIG_KCOV
 +  ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
 +    $(warning Cannot use CONFIG_KCOV: \
 +             -fsanitize-coverage=trace-pc is not supported by compiler)
 +    CFLAGS_KCOV =
 +  endif
 +endif
 +
  ifeq ($(cc-name),clang)
 -KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
  KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
  KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
 +KBUILD_CFLAGS += $(call cc-disable-warning, duplicate-decl-specifier)
  # Quiet clang warning: comparison of unsigned expression < 0 is always false
  KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
  # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
  # source of a reference will be _MergedGlobals and not on of the whitelisted names.
  # See modpost pattern 2
  KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
 -KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
  else
  
  # These warnings generated too much noise in a regular build.
@@@ -794,11 -741,6 +794,11 @@@ KBUILD_CFLAGS    += -fomit-frame-pointe
  endif
  endif
  
 +# Initialize all stack variables with a pattern, if desired.
 +ifdef CONFIG_INIT_STACK_ALL
 +KBUILD_CFLAGS += $(call cc-option, -ftrivial-auto-var-init=pattern)
 +endif
 +
  KBUILD_CFLAGS   += $(call cc-option, -fno-var-tracking-assignments)
  
  ifdef CONFIG_DEBUG_INFO
@@@ -841,33 -783,6 +841,33 @@@ ifdef CONFIG_DEBUG_SECTION_MISMATC
  KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
  endif
  
 +ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
 +KBUILD_CFLAGS_KERNEL  += $(call cc-option,-ffunction-sections,)
 +KBUILD_CFLAGS_KERNEL  += $(call cc-option,-fdata-sections,)
 +endif
 +
 +ifdef CONFIG_LTO_CLANG
 +ifdef CONFIG_THINLTO
 +lto-clang-flags := -flto=thin
 +LDFLAGS += --thinlto-cache-dir=.thinlto-cache
 +else
 +lto-clang-flags       := -flto
 +endif
 +lto-clang-flags += -fvisibility=hidden
 +
 +# allow disabling only clang LTO where needed
 +DISABLE_LTO_CLANG := -fno-lto -fvisibility=default
 +export DISABLE_LTO_CLANG
 +endif
 +
 +ifdef CONFIG_LTO
 +LTO_CFLAGS    := $(lto-clang-flags)
 +KBUILD_CFLAGS += $(LTO_CFLAGS)
 +
 +DISABLE_LTO   := $(DISABLE_LTO_CLANG)
 +export LTO_CFLAGS DISABLE_LTO
 +endif
 +
  # arch Makefile may override CC so keep this after arch Makefile is included
  NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
  CHECKFLAGS     += $(NOSTDINC_FLAGS)
@@@ -919,7 -834,6 +919,7 @@@ KBUILD_ARFLAGS := $(call ar-option,D
  
  include scripts/Makefile.kasan
  include scripts/Makefile.extrawarn
 +include scripts/Makefile.ubsan
  
  # Add any arch overrides and user supplied CPPFLAGS, AFLAGS and CFLAGS as the
  # last assignments
@@@ -933,10 -847,6 +933,10 @@@ LDFLAGS_BUILD_ID = $(patsubst -Wl$(comm
  KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
  LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
  
 +ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
 +LDFLAGS_vmlinux       += $(call ld-option, --gc-sections,)
 +endif
 +
  ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
  LDFLAGS_vmlinux       += $(call ld-option, -X,)
  endif
@@@ -1034,24 -944,24 +1034,24 @@@ vmlinux-dirs  := $(patsubst %/,%,$(filte
  vmlinux-alldirs       := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
                     $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
  
 -init-y                := $(patsubst %/, %/built-in.o, $(init-y))
 -core-y                := $(patsubst %/, %/built-in.o, $(core-y))
 -drivers-y     := $(patsubst %/, %/built-in.o, $(drivers-y))
 -net-y         := $(patsubst %/, %/built-in.o, $(net-y))
 +init-y                := $(patsubst %/, %/built-in.a, $(init-y))
 +core-y                := $(patsubst %/, %/built-in.a, $(core-y))
 +drivers-y     := $(patsubst %/, %/built-in.a, $(drivers-y))
 +net-y         := $(patsubst %/, %/built-in.a, $(net-y))
  libs-y1               := $(patsubst %/, %/lib.a, $(libs-y))
 -libs-y2               := $(patsubst %/, %/built-in.o, $(libs-y))
 -libs-y                := $(libs-y1) $(libs-y2)
 -virt-y                := $(patsubst %/, %/built-in.o, $(virt-y))
 +libs-y2               := $(patsubst %/, %/built-in.a, $(filter-out %.a, $(libs-y)))
 +virt-y                := $(patsubst %/, %/built-in.a, $(virt-y))
  
  # Externally visible symbols (used by link-vmlinux.sh)
  export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
 -export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
 +export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y2) $(drivers-y) $(net-y) $(virt-y)
 +export KBUILD_VMLINUX_LIBS := $(libs-y1)
  export KBUILD_LDS          := arch/$(SRCARCH)/kernel/vmlinux.lds
  export LDFLAGS_vmlinux
  # used by scripts/pacmage/Makefile
  export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
  
 -vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
 +vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) $(KBUILD_VMLINUX_LIBS)
  
  # Final link of vmlinux
        cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux)
@@@ -1086,7 -996,7 +1086,7 @@@ $(sort $(vmlinux-deps)): $(vmlinux-dirs
  
  PHONY += $(vmlinux-dirs)
  $(vmlinux-dirs): prepare scripts
 -      $(Q)$(MAKE) $(build)=$@
 +      $(Q)$(MAKE) $(build)=$@ need-builtin=1
  
  define filechk_kernel.release
        echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))"
@@@ -1128,29 -1038,12 +1128,29 @@@ prepare1: prepare2 $(version_h) include
  
  archprepare: archheaders archscripts prepare1 scripts_basic
  
 -prepare0: archprepare FORCE
 +prepare0: archprepare
        $(Q)$(MAKE) $(build)=.
  
  # All the preparing..
  prepare: prepare0
  
 +# Make sure we're using a supported toolchain with LTO_CLANG
 +ifdef CONFIG_LTO_CLANG
 +  ifneq ($(call clang-ifversion, -ge, 0800, y), y)
 +      @echo Cannot use CONFIG_LTO_CLANG: requires clang 8.0 or later >&2 && exit 1
 +  endif
 +  ifneq ($(ld-name),lld)
 +      @echo Cannot use CONFIG_LTO_CLANG: requires LLD >&2 && exit 1
 +  endif
 +endif
 +# Make sure compiler supports LTO flags
 +ifdef lto-flags
 +  ifeq ($(call cc-option, $(lto-flags)),)
 +      @echo Cannot use CONFIG_LTO: $(lto-flags) not supported by compiler \
 +              >&2 && exit 1
 +  endif
 +endif
 +
  # Generate some files
  # ---------------------------------------------------------------------------
  
@@@ -1190,7 -1083,7 +1190,7 @@@ INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib
  export INSTALL_FW_PATH
  
  PHONY += firmware_install
 -firmware_install: FORCE
 +firmware_install:
        @mkdir -p $(objtree)/firmware
        $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
  
@@@ -1210,7 -1103,7 +1210,7 @@@ PHONY += archscript
  archscripts:
  
  PHONY += __headers
 -__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE
 +__headers: $(version_h) scripts_basic asm-generic archheaders archscripts
        $(Q)$(MAKE) $(build)=scripts build_unifdef
  
  PHONY += headers_install_all
@@@ -1531,9 -1424,6 +1531,6 @@@ else # KBUILD_EXTMO
  
  # We are always building modules
  KBUILD_MODULES := 1
- PHONY += crmodverdir
- crmodverdir:
-       $(cmd_crmodverdir)
  
  PHONY += $(objtree)/Module.symvers
  $(objtree)/Module.symvers:
  
  module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
  PHONY += $(module-dirs) modules
- $(module-dirs): crmodverdir $(objtree)/Module.symvers
+ $(module-dirs): prepare $(objtree)/Module.symvers
        $(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
  
  modules: $(module-dirs)
@@@ -1585,7 -1475,8 +1582,8 @@@ help
  
  # Dummies...
  PHONY += prepare scripts
- prepare: ;
+ prepare:
+       $(cmd_crmodverdir)
  scripts: ;
  endif # KBUILD_EXTMOD
  
@@@ -1601,8 -1492,7 +1599,8 @@@ clean: $(clean-dirs
                -o -name '*.symtypes' -o -name 'modules.order' \
                -o -name modules.builtin -o -name '.tmp_*.o.*' \
                -o -name '*.ll' \
 -              -o -name '*.gcno' \) -type f -print | xargs rm -f
 +              -o -name '*.gcno' \
 +              -o -name '*.*.symversions' \) -type f -print | xargs rm -f
  
  # Generate tags for editors
  # ---------------------------------------------------------------------------
@@@ -1710,17 -1600,14 +1708,14 @@@ endi
  
  # Modules
  /: prepare scripts FORCE
-       $(cmd_crmodverdir)
        $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
        $(build)=$(build-dir)
  # Make sure the latest headers are built for Documentation
  Documentation/: headers_install
  %/: prepare scripts FORCE
-       $(cmd_crmodverdir)
        $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
        $(build)=$(build-dir)
  %.ko: prepare scripts FORCE
-       $(cmd_crmodverdir)
        $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1)   \
        $(build)=$(build-dir) $(@:.ko=.o)
        $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
diff --combined arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@@ -387,6 -387,13 +387,13 @@@ do {
  #define __get_user_asm_byte(x, addr, err)                     \
        __get_user_asm(x, addr, err, ldrb)
  
+ #if __LINUX_ARM_ARCH__ >= 6
+ #define __get_user_asm_half(x, addr, err)                     \
+       __get_user_asm(x, addr, err, ldrh)
+ #else
  #ifndef __ARMEB__
  #define __get_user_asm_half(x, __gu_addr, err)                        \
  ({                                                            \
  })
  #endif
  
+ #endif /* __LINUX_ARM_ARCH__ >= 6 */
  #define __get_user_asm_word(x, addr, err)                     \
        __get_user_asm(x, addr, err, ldr)
  #endif
  #define __put_user_asm_byte(x, __pu_addr, err)                        \
        __put_user_asm(x, __pu_addr, err, strb)
  
+ #if __LINUX_ARM_ARCH__ >= 6
+ #define __put_user_asm_half(x, __pu_addr, err)                        \
+       __put_user_asm(x, __pu_addr, err, strh)
+ #else
  #ifndef __ARMEB__
  #define __put_user_asm_half(x, __pu_addr, err)                        \
  ({                                                            \
  })
  #endif
  
+ #endif /* __LINUX_ARM_ARCH__ >= 6 */
  #define __put_user_asm_word(x, __pu_addr, err)                        \
        __put_user_asm(x, __pu_addr, err, str)
  
@@@ -537,10 -555,7 +555,10 @@@ arm_copy_from_user(void *to, const voi
  static inline unsigned long __must_check
  __copy_from_user(void *to, const void __user *from, unsigned long n)
  {
 -      unsigned int __ua_flags = uaccess_save_and_enable();
 +      unsigned int __ua_flags;
 +
 +      check_object_size(to, n, false);
 +      __ua_flags = uaccess_save_and_enable();
        n = arm_copy_from_user(to, from, n);
        uaccess_restore(__ua_flags);
        return n;
@@@ -555,15 -570,11 +573,15 @@@ static inline unsigned long __must_chec
  __copy_to_user(void __user *to, const void *from, unsigned long n)
  {
  #ifndef CONFIG_UACCESS_WITH_MEMCPY
 -      unsigned int __ua_flags = uaccess_save_and_enable();
 +      unsigned int __ua_flags;
 +
 +      check_object_size(from, n, true);
 +      __ua_flags = uaccess_save_and_enable();
        n = arm_copy_to_user(to, from, n);
        uaccess_restore(__ua_flags);
        return n;
  #else
 +      check_object_size(from, n, true);
        return arm_copy_to_user(to, from, n);
  #endif
  }
diff --combined arch/mips/Kconfig
@@@ -65,8 -65,6 +65,8 @@@ config MIP
        select HAVE_IRQ_TIME_ACCOUNTING
        select GENERIC_TIME_VSYSCALL
        select ARCH_CLOCKSOURCE_DATA
 +      select HANDLE_DOMAIN_IRQ
 +      select HAVE_EXIT_THREAD
  
  menu "Machine selection"
  
@@@ -74,57 -72,6 +74,57 @@@ choic
        prompt "System type"
        default SGI_IP22
  
 +config MIPS_GENERIC
 +      bool "Generic board-agnostic MIPS kernel"
 +      select BOOT_RAW
 +      select BUILTIN_DTB
 +      select CEVT_R4K
 +      select CLKSRC_MIPS_GIC
 +      select COMMON_CLK
 +      select CPU_MIPSR2_IRQ_VI
 +      select CPU_MIPSR2_IRQ_EI
 +      select CSRC_R4K
 +      select DMA_PERDEV_COHERENT
 +      select HW_HAS_PCI
 +      select IRQ_MIPS_CPU
 +      select LIBFDT
 +      select MIPS_CPU_SCACHE
 +      select MIPS_GIC
 +      select MIPS_L1_CACHE_SHIFT_7
 +      select NO_EXCEPT_FILL
 +      select PCI_DRIVERS_GENERIC
 +      select PINCTRL
 +      select SMP_UP if SMP
 +      select SYS_HAS_CPU_MIPS32_R1
 +      select SYS_HAS_CPU_MIPS32_R2
 +      select SYS_HAS_CPU_MIPS32_R6
 +      select SYS_HAS_CPU_MIPS64_R1
 +      select SYS_HAS_CPU_MIPS64_R2
 +      select SYS_HAS_CPU_MIPS64_R6
 +      select SYS_SUPPORTS_32BIT_KERNEL
 +      select SYS_SUPPORTS_64BIT_KERNEL
 +      select SYS_SUPPORTS_BIG_ENDIAN
 +      select SYS_SUPPORTS_HIGHMEM
 +      select SYS_SUPPORTS_LITTLE_ENDIAN
 +      select SYS_SUPPORTS_MICROMIPS
 +      select SYS_SUPPORTS_MIPS_CPS
 +      select SYS_SUPPORTS_MIPS16
 +      select SYS_SUPPORTS_MULTITHREADING
 +      select SYS_SUPPORTS_RELOCATABLE
 +      select SYS_SUPPORTS_SMARTMIPS
 +      select USB_EHCI_BIG_ENDIAN_DESC if BIG_ENDIAN
 +      select USB_EHCI_BIG_ENDIAN_MMIO if BIG_ENDIAN
 +      select USB_OHCI_BIG_ENDIAN_DESC if BIG_ENDIAN
 +      select USB_OHCI_BIG_ENDIAN_MMIO if BIG_ENDIAN
 +      select USB_UHCI_BIG_ENDIAN_DESC if BIG_ENDIAN
 +      select USB_UHCI_BIG_ENDIAN_MMIO if BIG_ENDIAN
 +      select USE_OF
 +      help
 +        Select this to build a kernel which aims to support multiple boards,
 +        generally using a flattened device tree passed from the bootloader
 +        using the boot protocol defined in the UHI (Unified Hosting
 +        Interface) specification.
 +
  config MIPS_ALCHEMY
        bool "Alchemy processor based machines"
        select ARCH_PHYS_ADDR_T_64BIT
@@@ -826,6 -773,7 +826,7 @@@ config SIBYTE_LITTLESU
        select SYS_SUPPORTS_BIG_ENDIAN
        select SYS_SUPPORTS_HIGHMEM
        select SYS_SUPPORTS_LITTLE_ENDIAN
+       select ZONE_DMA32 if 64BIT
  
  config SIBYTE_SENTOSA
        bool "Sibyte BCM91250E-Sentosa"
@@@ -1031,7 -979,6 +1032,7 @@@ source "arch/mips/ath79/Kconfig
  source "arch/mips/bcm47xx/Kconfig"
  source "arch/mips/bcm63xx/Kconfig"
  source "arch/mips/bmips/Kconfig"
 +source "arch/mips/generic/Kconfig"
  source "arch/mips/jazz/Kconfig"
  source "arch/mips/jz4740/Kconfig"
  source "arch/mips/lantiq/Kconfig"
@@@ -1141,10 -1088,6 +1142,10 @@@ config DMA_MAYBE_COHEREN
        select DMA_NONCOHERENT
        bool
  
 +config DMA_PERDEV_COHERENT
 +      bool
 +      select DMA_MAYBE_COHERENT
 +
  config DMA_COHERENT
        bool
  
@@@ -2040,7 -1983,7 +2041,7 @@@ config CPU_SUPPORTS_UNCACHED_ACCELERATE
        bool
  config MIPS_PGD_C0_CONTEXT
        bool
 -      default y if 64BIT && CPU_MIPSR2 && !CPU_XLP
 +      default y if 64BIT && (CPU_MIPSR2 || CPU_MIPSR6) && !CPU_XLP
  
  #
  # Set to y for ptrace access to watch registers.
@@@ -2329,7 -2272,7 +2330,7 @@@ config MIPS_CM
  
  config MIPS_CPS
        bool "MIPS Coherent Processing System support"
 -      depends on SYS_SUPPORTS_MIPS_CPS && !CPU_MIPSR6
 +      depends on SYS_SUPPORTS_MIPS_CPS
        select MIPS_CM
        select MIPS_CPC
        select MIPS_CPS_PM if HOTPLUG_CPU
        select SMP
        select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
        select SYS_SUPPORTS_HOTPLUG_CPU
 +      select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6
        select SYS_SUPPORTS_SMP
        select WEAK_ORDERING
        help
diff --combined drivers/net/wireless/ath/ar5523/ar5523.c
+++ b/drivers/net/wireless/ath/ar5523/ar5523.c
@@@ -255,7 -255,8 +255,8 @@@ static int ar5523_cmd(struct ar5523 *ar
  
        if (flags & AR5523_CMD_FLAG_MAGIC)
                hdr->magic = cpu_to_be32(1 << 24);
-       memcpy(hdr + 1, idata, ilen);
+       if (ilen)
+               memcpy(hdr + 1, idata, ilen);
  
        cmd->odata = odata;
        cmd->olen = olen;
@@@ -1471,12 -1472,12 +1472,12 @@@ static int ar5523_init_modes(struct ar5
        memcpy(ar->channels, ar5523_channels, sizeof(ar5523_channels));
        memcpy(ar->rates, ar5523_rates, sizeof(ar5523_rates));
  
 -      ar->band.band = IEEE80211_BAND_2GHZ;
 +      ar->band.band = NL80211_BAND_2GHZ;
        ar->band.channels = ar->channels;
        ar->band.n_channels = ARRAY_SIZE(ar5523_channels);
        ar->band.bitrates = ar->rates;
        ar->band.n_bitrates = ARRAY_SIZE(ar5523_rates);
 -      ar->hw->wiphy->bands[IEEE80211_BAND_2GHZ] = &ar->band;
 +      ar->hw->wiphy->bands[NL80211_BAND_2GHZ] = &ar->band;
        return 0;
  }
  
diff --combined drivers/net/wireless/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c
@@@ -548,18 -548,18 +548,18 @@@ int iwl_mvm_mac_setup_register(struct i
        else
                mvm->max_scans = IWL_MVM_MAX_LMAC_SCANS;
  
 -      if (mvm->nvm_data->bands[IEEE80211_BAND_2GHZ].n_channels)
 -              hw->wiphy->bands[IEEE80211_BAND_2GHZ] =
 -                      &mvm->nvm_data->bands[IEEE80211_BAND_2GHZ];
 -      if (mvm->nvm_data->bands[IEEE80211_BAND_5GHZ].n_channels) {
 -              hw->wiphy->bands[IEEE80211_BAND_5GHZ] =
 -                      &mvm->nvm_data->bands[IEEE80211_BAND_5GHZ];
 +      if (mvm->nvm_data->bands[NL80211_BAND_2GHZ].n_channels)
 +              hw->wiphy->bands[NL80211_BAND_2GHZ] =
 +                      &mvm->nvm_data->bands[NL80211_BAND_2GHZ];
 +      if (mvm->nvm_data->bands[NL80211_BAND_5GHZ].n_channels) {
 +              hw->wiphy->bands[NL80211_BAND_5GHZ] =
 +                      &mvm->nvm_data->bands[NL80211_BAND_5GHZ];
  
                if (fw_has_capa(&mvm->fw->ucode_capa,
                                IWL_UCODE_TLV_CAPA_BEAMFORMER) &&
                    fw_has_api(&mvm->fw->ucode_capa,
                               IWL_UCODE_TLV_API_LQ_SS_PARAMS))
 -                      hw->wiphy->bands[IEEE80211_BAND_5GHZ]->vht_cap.cap |=
 +                      hw->wiphy->bands[NL80211_BAND_5GHZ]->vht_cap.cap |=
                                IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE;
        }
  
@@@ -741,6 -741,21 +741,21 @@@ static void iwl_mvm_mac_tx(struct ieee8
                     !ieee80211_is_action(hdr->frame_control)))
                sta = NULL;
  
+       /* If there is no sta, and it's not offchannel - send through AP */
+       if (info->control.vif->type == NL80211_IFTYPE_STATION &&
+           info->hw_queue != IWL_MVM_OFFCHANNEL_QUEUE && !sta) {
+               struct iwl_mvm_vif *mvmvif =
+                       iwl_mvm_vif_from_mac80211(info->control.vif);
+               u8 ap_sta_id = READ_ONCE(mvmvif->ap_sta_id);
+               if (ap_sta_id < IWL_MVM_STATION_COUNT) {
+                       /* mac80211 holds rcu read lock */
+                       sta = rcu_dereference(mvm->fw_id_to_mac_id[ap_sta_id]);
+                       if (IS_ERR_OR_NULL(sta))
+                               goto drop;
+               }
+       }
        if (sta) {
                if (iwl_mvm_defer_tx(mvm, sta, skb))
                        return;
@@@ -3126,7 -3141,7 +3141,7 @@@ static int iwl_mvm_send_aux_roc_cmd(str
                        cpu_to_le32(FW_CMD_ID_AND_COLOR(MAC_INDEX_AUX, 0)),
                .sta_id_and_color = cpu_to_le32(mvm->aux_sta.sta_id),
                /* Set the channel info data */
 -              .channel_info.band = (channel->band == IEEE80211_BAND_2GHZ) ?
 +              .channel_info.band = (channel->band == NL80211_BAND_2GHZ) ?
                        PHY_BAND_24 : PHY_BAND_5,
                .channel_info.channel = channel->hw_value,
                .channel_info.width = PHY_VHT_CHANNEL_MODE20,
diff --combined drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
+++ b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
@@@ -1,6 -1,6 +1,6 @@@
  /*
   * Copyright (c) 2015, Sony Mobile Communications AB.
 - * Copyright (c) 2013, The Linux Foundation. All rights reserved.
 + * Copyright (c) 2013, 2018 The Linux Foundation. All rights reserved.
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License version 2 and
@@@ -23,7 -23,6 +23,7 @@@
  #include <linux/gpio.h>
  #include <linux/interrupt.h>
  #include <linux/of_device.h>
 +#include <linux/of_irq.h>
  
  #include <dt-bindings/pinctrl/qcom,pmic-gpio.h>
  
@@@ -379,7 -378,7 +379,7 @@@ static int pm8xxx_pin_config_set(struc
                        banks |= BIT(0);
                        break;
                case PM8XXX_QCOM_DRIVE_STRENGH:
 -                      if (arg > PMIC_GPIO_STRENGTH_LOW) {
 +                      if (arg > PM8921_GPIO_STRENGTH_LOW) {
                                dev_err(pctrl->dev, "invalid drive strength\n");
                                return -EINVAL;
                        }
@@@ -665,12 -664,11 +665,12 @@@ static int pm8xxx_pin_populate(struct p
  }
  
  static const struct of_device_id pm8xxx_gpio_of_match[] = {
 -      { .compatible = "qcom,pm8018-gpio", .data = (void *)6 },
 -      { .compatible = "qcom,pm8038-gpio", .data = (void *)12 },
 -      { .compatible = "qcom,pm8058-gpio", .data = (void *)40 },
 -      { .compatible = "qcom,pm8917-gpio", .data = (void *)38 },
 -      { .compatible = "qcom,pm8921-gpio", .data = (void *)44 },
 +      { .compatible = "qcom,pm8018-gpio" },
 +      { .compatible = "qcom,pm8038-gpio" },
 +      { .compatible = "qcom,pm8058-gpio" },
 +      { .compatible = "qcom,pm8917-gpio" },
 +      { .compatible = "qcom,pm8921-gpio" },
 +      { .compatible = "qcom,ssbi-gpio" },
        { },
  };
  MODULE_DEVICE_TABLE(of, pm8xxx_gpio_of_match);
@@@ -681,19 -679,14 +681,19 @@@ static int pm8xxx_gpio_probe(struct pla
        struct pinctrl_pin_desc *pins;
        struct pm8xxx_gpio *pctrl;
        int ret;
 -      int i;
 +      int i, npins;
  
        pctrl = devm_kzalloc(&pdev->dev, sizeof(*pctrl), GFP_KERNEL);
        if (!pctrl)
                return -ENOMEM;
  
        pctrl->dev = &pdev->dev;
 -      pctrl->npins = (unsigned long)of_device_get_match_data(&pdev->dev);
 +      npins = platform_irq_count(pdev);
 +      if (!npins)
 +              return -EINVAL;
 +      if (npins < 0)
 +              return npins;
 +      pctrl->npins = npins;
  
        pctrl->regmap = dev_get_regmap(pdev->dev.parent, NULL);
        if (!pctrl->regmap) {
                goto unregister_pinctrl;
        }
  
-       ret = gpiochip_add_pin_range(&pctrl->chip,
-                                    dev_name(pctrl->dev),
-                                    0, 0, pctrl->chip.ngpio);
-       if (ret) {
-               dev_err(pctrl->dev, "failed to add pin range\n");
-               goto unregister_gpiochip;
+       /*
+        * For DeviceTree-supported systems, the gpio core checks the
+        * pinctrl's device node for the "gpio-ranges" property.
+        * If it is present, it takes care of adding the pin ranges
+        * for the driver. In this case the driver can skip ahead.
+        *
+        * In order to remain compatible with older, existing DeviceTree
+        * files which don't set the "gpio-ranges" property or systems that
+        * utilize ACPI the driver has to call gpiochip_add_pin_range().
+        */
+       if (!of_property_read_bool(pctrl->dev->of_node, "gpio-ranges")) {
+               ret = gpiochip_add_pin_range(&pctrl->chip, dev_name(pctrl->dev),
+                                            0, 0, pctrl->chip.ngpio);
+               if (ret) {
+                       dev_err(pctrl->dev, "failed to add pin range\n");
+                       goto unregister_gpiochip;
+               }
        }
  
        platform_set_drvdata(pdev, pctrl);
diff --combined drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@@ -4,7 -4,6 +4,7 @@@
   *  Copyright (C) 2008 Intel Corp
   *  Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com>
   *  Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com>
 + *  Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
   *
   *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   *
@@@ -36,9 -35,9 +36,9 @@@
  #include <linux/reboot.h>
  #include <linux/string.h>
  #include <linux/of.h>
 +#include <linux/kthread.h>
  #include <net/netlink.h>
  #include <net/genetlink.h>
 -#include <linux/suspend.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/thermal.h>
  #include "thermal_core.h"
  #include "thermal_hwmon.h"
  
 +#define THERMAL_UEVENT_DATA "type"
 +
  MODULE_AUTHOR("Zhang Rui");
  MODULE_DESCRIPTION("Generic thermal management sysfs support");
  MODULE_LICENSE("GPL v2");
  
 +#define THERMAL_MAX_ACTIVE    16
 +
  static DEFINE_IDR(thermal_tz_idr);
  static DEFINE_IDR(thermal_cdev_idr);
  static DEFINE_MUTEX(thermal_idr_lock);
@@@ -65,10 -60,10 +65,10 @@@ static LIST_HEAD(thermal_governor_list)
  static DEFINE_MUTEX(thermal_list_lock);
  static DEFINE_MUTEX(thermal_governor_lock);
  
 -static atomic_t in_suspend;
 -
  static struct thermal_governor *def_governor;
  
 +static struct workqueue_struct *thermal_passive_wq;
 +
  static struct thermal_governor *__find_governor(const char *name)
  {
        struct thermal_governor *pos;
@@@ -211,407 -206,6 +211,407 @@@ exit
        return;
  }
  
 +static LIST_HEAD(sensor_info_list);
 +static DEFINE_MUTEX(sensor_list_lock);
 +
 +static struct sensor_info *get_sensor(uint32_t sensor_id)
 +{
 +      struct sensor_info *pos = NULL, *matching_sensor = NULL;
 +
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(pos, &sensor_info_list, sensor_list) {
 +              if (pos->sensor_id == sensor_id) {
 +                      matching_sensor = pos;
 +                      break;
 +              }
 +      }
 +      rcu_read_unlock();
 +
 +      return matching_sensor;
 +}
 +
 +int sensor_get_id(char *name)
 +{
 +      struct sensor_info *pos = NULL;
 +      int matching_id = -ENODEV;
 +
 +      if (!name)
 +              return matching_id;
 +
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(pos, &sensor_info_list, sensor_list) {
 +              if (!strcmp(pos->tz->type, name)) {
 +                      matching_id = pos->sensor_id;
 +                      break;
 +              }
 +      }
 +      rcu_read_unlock();
 +
 +      return matching_id;
 +}
 +EXPORT_SYMBOL(sensor_get_id);
 +
 +static void init_sensor_trip(struct sensor_info *sensor)
 +{
 +      int ret = 0, i = 0;
 +      enum thermal_trip_type type;
 +
 +      for (i = 0; ((sensor->max_idx == -1) ||
 +              (sensor->min_idx == -1)) &&
 +              (sensor->tz->ops->get_trip_type) &&
 +              (i < sensor->tz->trips); i++) {
 +
 +              sensor->tz->ops->get_trip_type(sensor->tz, i, &type);
 +              if (type == THERMAL_TRIP_CONFIGURABLE_HI)
 +                      sensor->max_idx = i;
 +              if (type == THERMAL_TRIP_CONFIGURABLE_LOW)
 +                      sensor->min_idx = i;
 +              type = 0;
 +      }
 +
 +      ret = sensor->tz->ops->get_trip_temp(sensor->tz,
 +              sensor->min_idx, &sensor->threshold_min);
 +      if (ret)
 +              pr_err("Unable to get MIN trip temp. sensor:%d err:%d\n",
 +                              sensor->sensor_id, ret);
 +
 +      ret = sensor->tz->ops->get_trip_temp(sensor->tz,
 +              sensor->max_idx, &sensor->threshold_max);
 +      if (ret)
 +              pr_err("Unable to get MAX trip temp. sensor:%d err:%d\n",
 +                              sensor->sensor_id, ret);
 +}
 +
 +static int __update_sensor_thresholds(struct sensor_info *sensor)
 +{
 +      long max_of_low_thresh = LONG_MIN;
 +      long min_of_high_thresh = LONG_MAX;
 +      struct sensor_threshold *pos = NULL;
 +      int ret = 0;
 +
 +      if (!sensor->tz->ops->set_trip_temp ||
 +              !sensor->tz->ops->activate_trip_type ||
 +              !sensor->tz->ops->get_trip_type ||
 +              !sensor->tz->ops->get_trip_temp) {
 +              ret = -ENODEV;
 +              goto update_done;
 +      }
 +
 +      if ((sensor->max_idx == -1) || (sensor->min_idx == -1))
 +              init_sensor_trip(sensor);
 +
 +      list_for_each_entry(pos, &sensor->threshold_list, list) {
 +              if (!pos->active)
 +                      continue;
 +              if (pos->trip == THERMAL_TRIP_CONFIGURABLE_LOW) {
 +                      if (pos->temp > max_of_low_thresh)
 +                              max_of_low_thresh = pos->temp;
 +              }
 +              if (pos->trip == THERMAL_TRIP_CONFIGURABLE_HI) {
 +                      if (pos->temp < min_of_high_thresh)
 +                              min_of_high_thresh = pos->temp;
 +              }
 +      }
 +
 +      pr_debug("sensor %d: Thresholds: max of low: %ld min of high: %ld\n",
 +                      sensor->sensor_id, max_of_low_thresh,
 +                      min_of_high_thresh);
 +
 +      if (min_of_high_thresh != LONG_MAX) {
 +              ret = sensor->tz->ops->set_trip_temp(sensor->tz,
 +                      sensor->max_idx, min_of_high_thresh);
 +              if (ret) {
 +                      pr_err("sensor %d: Unable to set high threshold %d",
 +                                      sensor->sensor_id, ret);
 +                      goto update_done;
 +              }
 +              sensor->threshold_max = min_of_high_thresh;
 +      }
 +      ret = sensor->tz->ops->activate_trip_type(sensor->tz,
 +              sensor->max_idx,
 +              (min_of_high_thresh == LONG_MAX) ?
 +              THERMAL_TRIP_ACTIVATION_DISABLED :
 +              THERMAL_TRIP_ACTIVATION_ENABLED);
 +      if (ret) {
 +              pr_err("sensor %d: Unable to activate high threshold %d",
 +                      sensor->sensor_id, ret);
 +              goto update_done;
 +      }
 +
 +      if (max_of_low_thresh != LONG_MIN) {
 +              ret = sensor->tz->ops->set_trip_temp(sensor->tz,
 +                      sensor->min_idx, max_of_low_thresh);
 +              if (ret) {
 +                      pr_err("sensor %d: Unable to set low threshold %d",
 +                              sensor->sensor_id, ret);
 +                      goto update_done;
 +              }
 +              sensor->threshold_min = max_of_low_thresh;
 +      }
 +      ret = sensor->tz->ops->activate_trip_type(sensor->tz,
 +              sensor->min_idx,
 +              (max_of_low_thresh == LONG_MIN) ?
 +              THERMAL_TRIP_ACTIVATION_DISABLED :
 +              THERMAL_TRIP_ACTIVATION_ENABLED);
 +      if (ret) {
 +              pr_err("sensor %d: Unable to activate low threshold %d",
 +                      sensor->sensor_id, ret);
 +              goto update_done;
 +      }
 +
 +      pr_debug("sensor %d: low: %d high: %d\n",
 +              sensor->sensor_id,
 +              sensor->threshold_min, sensor->threshold_max);
 +
 +update_done:
 +      return ret;
 +}
 +
 +static void sensor_update_work(struct work_struct *work)
 +{
 +      struct sensor_info *sensor = container_of(work, struct sensor_info,
 +                                              work);
 +      int ret = 0;
 +      mutex_lock(&sensor->lock);
 +      ret = __update_sensor_thresholds(sensor);
 +      if (ret)
 +              pr_err("sensor %d: Error %d setting threshold\n",
 +                      sensor->sensor_id, ret);
 +      mutex_unlock(&sensor->lock);
 +}
 +
 +static __ref int sensor_sysfs_notify(void *data)
 +{
 +      int ret = 0;
 +      struct sensor_info *sensor = (struct sensor_info *)data;
 +
 +      while (!kthread_should_stop()) {
 +              if (wait_for_completion_interruptible(
 +                      &sensor->sysfs_notify_complete) != 0)
 +                      continue;
 +              if (sensor->deregister_active)
 +                      return ret;
 +              reinit_completion(&sensor->sysfs_notify_complete);
 +              sysfs_notify(&sensor->tz->device.kobj, NULL,
 +                                      THERMAL_UEVENT_DATA);
 +      }
 +      return ret;
 +}
 +
 +/* May be called in an interrupt context.
 + * Do NOT call sensor_set_trip from this function
 + */
 +int thermal_sensor_trip(struct thermal_zone_device *tz,
 +              enum thermal_trip_type trip, long temp)
 +{
 +      struct sensor_threshold *pos = NULL;
 +      int ret = -ENODEV;
 +
 +      if (trip != THERMAL_TRIP_CONFIGURABLE_HI &&
 +                      trip != THERMAL_TRIP_CONFIGURABLE_LOW)
 +              return 0;
 +
 +      if (list_empty(&tz->sensor.threshold_list))
 +              return 0;
 +
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(pos, &tz->sensor.threshold_list, list) {
 +              if ((pos->trip != trip) || (!pos->active))
 +                      continue;
 +              if (((trip == THERMAL_TRIP_CONFIGURABLE_LOW) &&
 +                      (pos->temp <= tz->sensor.threshold_min) &&
 +                      (pos->temp >= temp)) ||
 +                      ((trip == THERMAL_TRIP_CONFIGURABLE_HI) &&
 +                              (pos->temp >= tz->sensor.threshold_max) &&
 +                              (pos->temp <= temp))) {
 +                      if ((pos == &tz->tz_threshold[0])
 +                              || (pos == &tz->tz_threshold[1]))
 +                              complete(&tz->sensor.sysfs_notify_complete);
 +                      pos->active = 0;
 +                      pos->notify(trip, temp, pos->data);
 +              }
 +      }
 +      rcu_read_unlock();
 +
 +      schedule_work(&tz->sensor.work);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL(thermal_sensor_trip);
 +
 +int sensor_get_temp(uint32_t sensor_id, int *temp)
 +{
 +      struct sensor_info *sensor = get_sensor(sensor_id);
 +      int ret = 0;
 +
 +      if (!sensor)
 +              return -ENODEV;
 +
 +      ret = sensor->tz->ops->get_temp(sensor->tz, temp);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL(sensor_get_temp);
 +
 +int sensor_activate_trip(uint32_t sensor_id,
 +      struct sensor_threshold *threshold, bool enable)
 +{
 +      struct sensor_info *sensor = get_sensor(sensor_id);
 +      int ret = 0;
 +
 +      if (!sensor || !threshold) {
 +              pr_err("%s: uninitialized data\n",
 +                      KBUILD_MODNAME);
 +              ret = -ENODEV;
 +              goto activate_trip_exit;
 +      }
 +
 +      mutex_lock(&sensor->lock);
 +      threshold->active = (enable) ? 1 : 0;
 +      ret = __update_sensor_thresholds(sensor);
 +      mutex_unlock(&sensor->lock);
 +
 +activate_trip_exit:
 +      return ret;
 +}
 +EXPORT_SYMBOL(sensor_activate_trip);
 +
 +int sensor_set_trip(uint32_t sensor_id, struct sensor_threshold *threshold)
 +{
 +      struct sensor_threshold *pos = NULL;
 +      struct sensor_info *sensor = get_sensor(sensor_id);
 +
 +      if (!sensor)
 +              return -ENODEV;
 +
 +      if (!threshold || !threshold->notify)
 +              return -EFAULT;
 +
 +      mutex_lock(&sensor->lock);
 +      list_for_each_entry(pos, &sensor->threshold_list, list) {
 +              if (pos == threshold)
 +                      break;
 +      }
 +
 +      if (pos != threshold) {
 +              INIT_LIST_HEAD(&threshold->list);
 +              list_add_rcu(&threshold->list, &sensor->threshold_list);
 +      }
 +      threshold->active = 0; /* Do not allow active threshold right away */
 +
 +      mutex_unlock(&sensor->lock);
 +
 +      return 0;
 +
 +}
 +EXPORT_SYMBOL(sensor_set_trip);
 +
 +int sensor_cancel_trip(uint32_t sensor_id, struct sensor_threshold *threshold)
 +{
 +      struct sensor_threshold *pos = NULL, *var = NULL;
 +      struct sensor_info *sensor = get_sensor(sensor_id);
 +      int ret = 0;
 +
 +      if (!sensor)
 +              return -ENODEV;
 +
 +      mutex_lock(&sensor->lock);
 +      list_for_each_entry_safe(pos, var, &sensor->threshold_list, list) {
 +              if (pos == threshold) {
 +                      pos->active = 0;
 +                      list_del_rcu(&pos->list);
 +                      break;
 +              }
 +      }
 +
 +      ret = __update_sensor_thresholds(sensor);
 +      mutex_unlock(&sensor->lock);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL(sensor_cancel_trip);
 +
 +static int tz_notify_trip(enum thermal_trip_type type, int temp, void *data)
 +{
 +      struct thermal_zone_device *tz = (struct thermal_zone_device *)data;
 +
 +      pr_debug("sensor %d tripped: type %d temp %d\n",
 +                      tz->sensor.sensor_id, type, temp);
 +
 +      return 0;
 +}
 +
 +static void get_trip_threshold(struct thermal_zone_device *tz, int trip,
 +      struct sensor_threshold **threshold)
 +{
 +      enum thermal_trip_type type;
 +
 +      tz->ops->get_trip_type(tz, trip, &type);
 +
 +      if (type == THERMAL_TRIP_CONFIGURABLE_HI)
 +              *threshold = &tz->tz_threshold[0];
 +      else if (type == THERMAL_TRIP_CONFIGURABLE_LOW)
 +              *threshold = &tz->tz_threshold[1];
 +      else
 +              *threshold = NULL;
 +}
 +
 +int sensor_set_trip_temp(struct thermal_zone_device *tz,
 +              int trip, long temp)
 +{
 +      int ret = 0;
 +      struct sensor_threshold *threshold = NULL;
 +
 +      if (!tz->ops->get_trip_type)
 +              return -EPERM;
 +
 +      get_trip_threshold(tz, trip, &threshold);
 +      if (threshold) {
 +              threshold->temp = temp;
 +              ret = sensor_set_trip(tz->sensor.sensor_id, threshold);
 +      } else {
 +              ret = tz->ops->set_trip_temp(tz, trip, temp);
 +      }
 +
 +      return ret;
 +}
 +
 +int sensor_init(struct thermal_zone_device *tz)
 +{
 +      struct sensor_info *sensor = &tz->sensor;
 +
 +      sensor->sensor_id = tz->id;
 +      sensor->tz = tz;
 +      sensor->threshold_min = INT_MIN;
 +      sensor->threshold_max = INT_MAX;
 +      sensor->max_idx = -1;
 +      sensor->min_idx = -1;
 +      sensor->deregister_active = false;
 +      mutex_init(&sensor->lock);
 +      INIT_LIST_HEAD_RCU(&sensor->sensor_list);
 +      INIT_LIST_HEAD_RCU(&sensor->threshold_list);
 +      INIT_LIST_HEAD(&tz->tz_threshold[0].list);
 +      INIT_LIST_HEAD(&tz->tz_threshold[1].list);
 +      tz->tz_threshold[0].notify = tz_notify_trip;
 +      tz->tz_threshold[0].data = tz;
 +      tz->tz_threshold[0].trip = THERMAL_TRIP_CONFIGURABLE_HI;
 +      tz->tz_threshold[1].notify = tz_notify_trip;
 +      tz->tz_threshold[1].data = tz;
 +      tz->tz_threshold[1].trip = THERMAL_TRIP_CONFIGURABLE_LOW;
 +      list_add_rcu(&sensor->sensor_list, &sensor_info_list);
 +      INIT_WORK(&sensor->work, sensor_update_work);
 +      init_completion(&sensor->sysfs_notify_complete);
 +      sensor->sysfs_notify_thread = kthread_run(sensor_sysfs_notify,
 +                                                &tz->sensor,
 +                                                "therm_core:notify%d",
 +                                                tz->id);
 +      if (IS_ERR(sensor->sysfs_notify_thread))
 +              pr_err("Failed to create notify thread %d", tz->id);
 +
 +
 +      return 0;
 +}
 +
  static int get_idr(struct idr *idr, struct mutex *lock, int *id)
  {
        int ret;
@@@ -798,18 -392,17 +798,18 @@@ exit
        mutex_unlock(&thermal_list_lock);
  }
  
 -static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
 +static void thermal_zone_device_set_polling(struct workqueue_struct *queue,
 +                                          struct thermal_zone_device *tz,
                                            int delay)
  {
        if (delay > 1000)
 -              mod_delayed_work(system_freezable_wq, &tz->poll_queue,
 +              mod_delayed_work(queue, &tz->poll_queue,
                                 round_jiffies(msecs_to_jiffies(delay)));
        else if (delay)
 -              mod_delayed_work(system_freezable_wq, &tz->poll_queue,
 +              mod_delayed_work(queue, &tz->poll_queue,
                                 msecs_to_jiffies(delay));
        else
-               cancel_delayed_work_sync(&tz->poll_queue);
+               cancel_delayed_work(&tz->poll_queue);
  }
  
  static void monitor_thermal_zone(struct thermal_zone_device *tz)
        mutex_lock(&tz->lock);
  
        if (tz->passive)
 -              thermal_zone_device_set_polling(tz, tz->passive_delay);
 +              thermal_zone_device_set_polling(thermal_passive_wq,
 +                                              tz, tz->passive_delay);
        else if (tz->polling_delay)
 -              thermal_zone_device_set_polling(tz, tz->polling_delay);
 +              thermal_zone_device_set_polling(
 +                              system_freezable_power_efficient_wq,
 +                              tz, tz->polling_delay);
        else
 -              thermal_zone_device_set_polling(tz, 0);
 +              thermal_zone_device_set_polling(NULL, tz, 0);
  
        mutex_unlock(&tz->lock);
  }
@@@ -844,19 -434,15 +844,19 @@@ static void handle_critical_trips(struc
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
  
        /* If we have not crossed the trip_temp, we do not care. */
 -      if (trip_temp <= 0 || tz->temperature < trip_temp)
 -              return;
 -
 -      trace_thermal_zone_trip(tz, trip, trip_type);
 +      if (trip_type != THERMAL_TRIP_CRITICAL_LOW &&
 +          trip_type != THERMAL_TRIP_CONFIGURABLE_LOW) {
 +              if (tz->temperature < trip_temp)
 +                      return;
 +      } else
 +              if (tz->temperature >= trip_temp)
 +                      return;
  
        if (tz->ops->notify)
                tz->ops->notify(tz, trip, trip_type);
  
 -      if (trip_type == THERMAL_TRIP_CRITICAL) {
 +      if (trip_type == THERMAL_TRIP_CRITICAL ||
 +          trip_type == THERMAL_TRIP_CRITICAL_LOW) {
                dev_emerg(&tz->device,
                          "critical temperature reached(%d C),shutting down\n",
                          tz->temperature / 1000);
@@@ -874,10 -460,7 +874,10 @@@ static void handle_thermal_trip(struct 
  
        tz->ops->get_trip_type(tz, trip, &type);
  
 -      if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
 +      if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT ||
 +          type == THERMAL_TRIP_CONFIGURABLE_HI ||
 +          type == THERMAL_TRIP_CONFIGURABLE_LOW ||
 +          type == THERMAL_TRIP_CRITICAL_LOW)
                handle_critical_trips(tz, trip, type);
        else
                handle_non_critical_trips(tz, trip, type);
@@@ -978,6 -561,9 +978,6 @@@ void thermal_zone_device_update(struct 
  {
        int count;
  
 -      if (atomic_read(&in_suspend))
 -              return;
 -
        if (!tz->ops->get_temp)
                return;
  
@@@ -1087,12 -673,6 +1087,12 @@@ trip_point_type_show(struct device *dev
                return sprintf(buf, "critical\n");
        case THERMAL_TRIP_HOT:
                return sprintf(buf, "hot\n");
 +      case THERMAL_TRIP_CONFIGURABLE_HI:
 +              return sprintf(buf, "configurable_hi\n");
 +      case THERMAL_TRIP_CONFIGURABLE_LOW:
 +              return sprintf(buf, "configurable_low\n");
 +      case THERMAL_TRIP_CRITICAL_LOW:
 +              return sprintf(buf, "critical_low\n");
        case THERMAL_TRIP_PASSIVE:
                return sprintf(buf, "passive\n");
        case THERMAL_TRIP_ACTIVE:
  }
  
  static ssize_t
 +trip_point_type_activate(struct device *dev, struct device_attribute *attr,
 +              const char *buf, size_t count)
 +{
 +      struct thermal_zone_device *tz = to_thermal_zone(dev);
 +      int trip, result = 0;
 +      bool activate;
 +      struct sensor_threshold *threshold = NULL;
 +
 +      if (!tz->ops->get_trip_type ||
 +              !tz->ops->activate_trip_type) {
 +              result = -EPERM;
 +              goto trip_activate_exit;
 +      }
 +
 +      if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip)) {
 +              result = -EINVAL;
 +              goto trip_activate_exit;
 +      }
 +
 +      if (!strcmp(buf, "enabled")) {
 +              activate = true;
 +      } else if (!strcmp(buf, "disabled")) {
 +              activate = false;
 +      } else {
 +              result = -EINVAL;
 +              goto trip_activate_exit;
 +      }
 +
 +      get_trip_threshold(tz, trip, &threshold);
 +      if (threshold)
 +              result = sensor_activate_trip(tz->sensor.sensor_id,
 +                      threshold, activate);
 +      else
 +              result = tz->ops->activate_trip_type(tz, trip,
 +                      activate ? THERMAL_TRIP_ACTIVATION_ENABLED :
 +                      THERMAL_TRIP_ACTIVATION_DISABLED);
 +
 +trip_activate_exit:
 +      if (result)
 +              return result;
 +
 +      return count;
 +}
 +
 +static ssize_t
  trip_point_temp_store(struct device *dev, struct device_attribute *attr,
                     const char *buf, size_t count)
  {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
 -      unsigned long temperature;
 +      long temperature;
  
        if (!tz->ops->set_trip_temp)
                return -EPERM;
        if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip))
                return -EINVAL;
  
 -      if (kstrtoul(buf, 10, &temperature))
 +      if (kstrtol(buf, 10, &temperature))
                return -EINVAL;
  
 -      ret = tz->ops->set_trip_temp(tz, trip, temperature);
 +      ret = sensor_set_trip_temp(tz, trip, temperature);
  
        return ret ? ret : count;
  }
@@@ -1184,6 -719,7 +1184,6 @@@ trip_point_temp_show(struct device *dev
                return -EINVAL;
  
        ret = tz->ops->get_trip_temp(tz, trip, &temperature);
 -
        if (ret)
                return ret;
  
@@@ -2166,9 -1702,8 +2166,9 @@@ static int create_trip_attrs(struct the
                sysfs_attr_init(&tz->trip_type_attrs[indx].attr.attr);
                tz->trip_type_attrs[indx].attr.attr.name =
                                                tz->trip_type_attrs[indx].name;
 -              tz->trip_type_attrs[indx].attr.attr.mode = S_IRUGO;
 +              tz->trip_type_attrs[indx].attr.attr.mode = S_IRUGO | S_IWUSR;
                tz->trip_type_attrs[indx].attr.show = trip_point_type_show;
 +              tz->trip_type_attrs[indx].attr.store = trip_point_type_activate;
  
                device_create_file(&tz->device,
                                   &tz->trip_type_attrs[indx].attr);
@@@ -2398,14 -1933,13 +2398,14 @@@ struct thermal_zone_device *thermal_zon
        }
  
        mutex_lock(&thermal_list_lock);
 -      list_add_tail(&tz->node, &thermal_tz_list);
 +      list_add_tail_rcu(&tz->node, &thermal_tz_list);
 +      sensor_init(tz);
        mutex_unlock(&thermal_list_lock);
  
        /* Bind cooling devices for this zone */
        bind_tz(tz);
  
 -      INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
 +      INIT_DEFERRABLE_WORK(&(tz->poll_queue), thermal_zone_device_check);
  
        thermal_zone_device_reset(tz);
        /* Update the new thermal zone and mark it as already updated. */
@@@ -2446,7 -1980,7 +2446,7 @@@ void thermal_zone_device_unregister(str
                mutex_unlock(&thermal_list_lock);
                return;
        }
 -      list_del(&tz->node);
 +      list_del_rcu(&tz->node);
  
        /* Unbind all cdevs associated with 'this' thermal zone */
        list_for_each_entry(cdev, &thermal_cdev_list, node) {
  
        mutex_unlock(&thermal_list_lock);
  
-       thermal_zone_device_set_polling(NULL, tz, 0);
+       cancel_delayed_work_sync(&tz->poll_queue);
  
        if (tz->type[0])
                device_remove_file(&tz->device, &dev_attr_type);
        thermal_set_governor(tz, NULL);
  
        thermal_remove_hwmon_sysfs(tz);
 +      flush_work(&tz->sensor.work);
 +      tz->sensor.deregister_active = true;
 +      complete(&tz->sensor.sysfs_notify_complete);
 +      kthread_stop(tz->sensor.sysfs_notify_thread);
 +      mutex_lock(&thermal_list_lock);
 +      list_del_rcu(&tz->sensor.sensor_list);
 +      mutex_unlock(&thermal_list_lock);
        release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
        idr_destroy(&tz->idr);
        mutex_destroy(&tz->lock);
@@@ -2514,13 -2041,13 +2514,13 @@@ struct thermal_zone_device *thermal_zon
        if (!name)
                goto exit;
  
 -      mutex_lock(&thermal_list_lock);
 -      list_for_each_entry(pos, &thermal_tz_list, node)
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(pos, &thermal_tz_list, node)
                if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) {
                        found++;
                        ref = pos;
                }
 -      mutex_unlock(&thermal_list_lock);
 +      rcu_read_unlock();
  
        /* nothing has been found, thus an error code for it */
        if (found == 0)
@@@ -2657,22 -2184,43 +2657,22 @@@ static void thermal_unregister_governor
        thermal_gov_power_allocator_unregister();
  }
  
 -static int thermal_pm_notify(struct notifier_block *nb,
 -                              unsigned long mode, void *_unused)
 -{
 -      struct thermal_zone_device *tz;
 -
 -      switch (mode) {
 -      case PM_HIBERNATION_PREPARE:
 -      case PM_RESTORE_PREPARE:
 -      case PM_SUSPEND_PREPARE:
 -              atomic_set(&in_suspend, 1);
 -              break;
 -      case PM_POST_HIBERNATION:
 -      case PM_POST_RESTORE:
 -      case PM_POST_SUSPEND:
 -              atomic_set(&in_suspend, 0);
 -              list_for_each_entry(tz, &thermal_tz_list, node) {
 -                      thermal_zone_device_reset(tz);
 -                      thermal_zone_device_update(tz);
 -              }
 -              break;
 -      default:
 -              break;
 -      }
 -      return 0;
 -}
 -
 -static struct notifier_block thermal_pm_nb = {
 -      .notifier_call = thermal_pm_notify,
 -};
 -
  static int __init thermal_init(void)
  {
        int result;
  
 +      thermal_passive_wq = alloc_workqueue("thermal_passive_wq",
 +                                              WQ_HIGHPRI | WQ_UNBOUND
 +                                              | WQ_FREEZABLE,
 +                                              THERMAL_MAX_ACTIVE);
 +      if (!thermal_passive_wq) {
 +              result = -ENOMEM;
 +              goto error;
 +      }
 +
        result = thermal_register_governors();
        if (result)
 -              goto error;
 +              goto destroy_wq;
  
        result = class_register(&thermal_class);
        if (result)
        if (result)
                goto exit_netlink;
  
 -      result = register_pm_notifier(&thermal_pm_nb);
 -      if (result)
 -              pr_warn("Thermal: Can not register suspend notifier, return %d\n",
 -                      result);
 -
        return 0;
  
  exit_netlink:
@@@ -2694,8 -2247,6 +2694,8 @@@ unregister_class
        class_unregister(&thermal_class);
  unregister_governors:
        thermal_unregister_governors();
 +destroy_wq:
 +      destroy_workqueue(thermal_passive_wq);
  error:
        idr_destroy(&thermal_tz_idr);
        idr_destroy(&thermal_cdev_idr);
  
  static void __exit thermal_exit(void)
  {
 -      unregister_pm_notifier(&thermal_pm_nb);
        of_thermal_destroy_zones();
 +      destroy_workqueue(thermal_passive_wq);
        genetlink_exit();
        class_unregister(&thermal_class);
        thermal_unregister_governors();
diff --combined drivers/tty/serial/msm_serial.c
  # define SUPPORT_SYSRQ
  #endif
  
 +#include <linux/kernel.h>
  #include <linux/atomic.h>
  #include <linux/dma-mapping.h>
  #include <linux/dmaengine.h>
 -#include <linux/hrtimer.h>
  #include <linux/module.h>
  #include <linux/io.h>
  #include <linux/ioport.h>
 -#include <linux/irq.h>
 +#include <linux/interrupt.h>
  #include <linux/init.h>
  #include <linux/console.h>
  #include <linux/tty.h>
  #include <linux/tty_flip.h>
  #include <linux/serial_core.h>
 -#include <linux/serial.h>
  #include <linux/slab.h>
  #include <linux/clk.h>
  #include <linux/platform_device.h>
  #include <linux/delay.h>
  #include <linux/of.h>
  #include <linux/of_device.h>
 -
 -#include "msm_serial.h"
 -
 -#define UARTDM_BURST_SIZE     16   /* in bytes */
 -#define UARTDM_TX_AIGN(x)     ((x) & ~0x3) /* valid for > 1p3 */
 -#define UARTDM_TX_MAX         256   /* in bytes, valid for <= 1p3 */
 -#define UARTDM_RX_SIZE                (UART_XMIT_SIZE / 4)
 +#include <linux/wait.h>
 +
 +#define UART_MR1                      0x0000
 +
 +#define UART_MR1_AUTO_RFR_LEVEL0      0x3F
 +#define UART_MR1_AUTO_RFR_LEVEL1      0x3FF00
 +#define UART_DM_MR1_AUTO_RFR_LEVEL1   0xFFFFFF00
 +#define UART_MR1_RX_RDY_CTL           BIT(7)
 +#define UART_MR1_CTS_CTL              BIT(6)
 +
 +#define UART_MR2                      0x0004
 +#define UART_MR2_ERROR_MODE           BIT(6)
 +#define UART_MR2_BITS_PER_CHAR                0x30
 +#define UART_MR2_BITS_PER_CHAR_5      (0x0 << 4)
 +#define UART_MR2_BITS_PER_CHAR_6      (0x1 << 4)
 +#define UART_MR2_BITS_PER_CHAR_7      (0x2 << 4)
 +#define UART_MR2_BITS_PER_CHAR_8      (0x3 << 4)
 +#define UART_MR2_STOP_BIT_LEN_ONE     (0x1 << 2)
 +#define UART_MR2_STOP_BIT_LEN_TWO     (0x3 << 2)
 +#define UART_MR2_PARITY_MODE_NONE     0x0
 +#define UART_MR2_PARITY_MODE_ODD      0x1
 +#define UART_MR2_PARITY_MODE_EVEN     0x2
 +#define UART_MR2_PARITY_MODE_SPACE    0x3
 +#define UART_MR2_PARITY_MODE          0x3
 +
 +#define UART_CSR                      0x0008
 +
 +#define UART_TF                               0x000C
 +#define UARTDM_TF                     0x0070
 +
 +#define UART_CR                               0x0010
 +#define UART_CR_CMD_NULL              (0 << 4)
 +#define UART_CR_CMD_RESET_RX          (1 << 4)
 +#define UART_CR_CMD_RESET_TX          (2 << 4)
 +#define UART_CR_CMD_RESET_ERR         (3 << 4)
 +#define UART_CR_CMD_RESET_BREAK_INT   (4 << 4)
 +#define UART_CR_CMD_START_BREAK               (5 << 4)
 +#define UART_CR_CMD_STOP_BREAK                (6 << 4)
 +#define UART_CR_CMD_RESET_CTS         (7 << 4)
 +#define UART_CR_CMD_RESET_STALE_INT   (8 << 4)
 +#define UART_CR_CMD_PACKET_MODE               (9 << 4)
 +#define UART_CR_CMD_MODE_RESET                (12 << 4)
 +#define UART_CR_CMD_SET_RFR           (13 << 4)
 +#define UART_CR_CMD_RESET_RFR         (14 << 4)
 +#define UART_CR_CMD_PROTECTION_EN     (16 << 4)
 +#define UART_CR_CMD_STALE_EVENT_DISABLE       (6 << 8)
 +#define UART_CR_CMD_STALE_EVENT_ENABLE        (80 << 4)
 +#define UART_CR_CMD_FORCE_STALE               (4 << 8)
 +#define UART_CR_CMD_RESET_TX_READY    (3 << 8)
 +#define UART_CR_TX_DISABLE            BIT(3)
 +#define UART_CR_TX_ENABLE             BIT(2)
 +#define UART_CR_RX_DISABLE            BIT(1)
 +#define UART_CR_RX_ENABLE             BIT(0)
 +#define UART_CR_CMD_RESET_RXBREAK_START       ((1 << 11) | (2 << 4))
 +
 +#define UART_IMR                      0x0014
 +#define UART_IMR_TXLEV                        BIT(0)
 +#define UART_IMR_RXSTALE              BIT(3)
 +#define UART_IMR_RXLEV                        BIT(4)
 +#define UART_IMR_DELTA_CTS            BIT(5)
 +#define UART_IMR_CURRENT_CTS          BIT(6)
 +#define UART_IMR_RXBREAK_START                BIT(10)
 +
 +#define UART_IPR_RXSTALE_LAST         0x20
 +#define UART_IPR_STALE_LSB            0x1F
 +#define UART_IPR_STALE_TIMEOUT_MSB    0x3FF80
 +#define UART_DM_IPR_STALE_TIMEOUT_MSB 0xFFFFFF80
 +
 +#define UART_IPR                      0x0018
 +#define UART_TFWR                     0x001C
 +#define UART_RFWR                     0x0020
 +#define UART_HCR                      0x0024
 +
 +#define UART_MREG                     0x0028
 +#define UART_NREG                     0x002C
 +#define UART_DREG                     0x0030
 +#define UART_MNDREG                   0x0034
 +#define UART_IRDA                     0x0038
 +#define UART_MISR_MODE                        0x0040
 +#define UART_MISR_RESET                       0x0044
 +#define UART_MISR_EXPORT              0x0048
 +#define UART_MISR_VAL                 0x004C
 +#define UART_TEST_CTRL                        0x0050
 +
 +#define UART_SR                               0x0008
 +#define UART_SR_HUNT_CHAR             BIT(7)
 +#define UART_SR_RX_BREAK              BIT(6)
 +#define UART_SR_PAR_FRAME_ERR         BIT(5)
 +#define UART_SR_OVERRUN                       BIT(4)
 +#define UART_SR_TX_EMPTY              BIT(3)
 +#define UART_SR_TX_READY              BIT(2)
 +#define UART_SR_RX_FULL                       BIT(1)
 +#define UART_SR_RX_READY              BIT(0)
 +
 +#define UART_RF                               0x000C
 +#define UARTDM_RF                     0x0070
 +#define UART_MISR                     0x0010
 +#define UART_ISR                      0x0014
 +#define UART_ISR_TX_READY             BIT(7)
 +
 +#define UARTDM_RXFS                   0x50
 +#define UARTDM_RXFS_BUF_SHIFT         0x7
 +#define UARTDM_RXFS_BUF_MASK          0x7
 +
 +#define UARTDM_DMEN                   0x3C
 +#define UARTDM_DMEN_RX_SC_ENABLE      BIT(5)
 +#define UARTDM_DMEN_TX_SC_ENABLE      BIT(4)
 +
 +#define UARTDM_DMEN_TX_BAM_ENABLE     BIT(2)  /* UARTDM_1P4 */
 +#define UARTDM_DMEN_TX_DM_ENABLE      BIT(0)  /* < UARTDM_1P4 */
 +
 +#define UARTDM_DMEN_RX_BAM_ENABLE     BIT(3)  /* UARTDM_1P4 */
 +#define UARTDM_DMEN_RX_DM_ENABLE      BIT(1)  /* < UARTDM_1P4 */
 +
 +#define UARTDM_DMRX                   0x34
 +#define UARTDM_NCF_TX                 0x40
 +#define UARTDM_RX_TOTAL_SNAP          0x38
 +
 +#define UARTDM_BURST_SIZE             16   /* in bytes */
 +#define UARTDM_TX_AIGN(x)             ((x) & ~0x3) /* valid for > 1p3 */
 +#define UARTDM_TX_MAX                 256   /* in bytes, valid for <= 1p3 */
 +#define UARTDM_RX_SIZE                        (UART_XMIT_SIZE / 4)
  
  enum {
        UARTDM_1P1 = 1,
@@@ -192,65 -78,10 +192,65 @@@ struct msm_port 
        struct msm_dma          rx_dma;
  };
  
 +#define UART_TO_MSM(uart_port)        container_of(uart_port, struct msm_port, uart)
 +
 +static
 +void msm_write(struct uart_port *port, unsigned int val, unsigned int off)
 +{
 +      writel_relaxed_no_log(val, port->membase + off);
 +}
 +
 +static
 +unsigned int msm_read(struct uart_port *port, unsigned int off)
 +{
 +      return readl_relaxed_no_log(port->membase + off);
 +}
 +
 +/*
 + * Setup the MND registers to use the TCXO clock.
 + */
 +static void msm_serial_set_mnd_regs_tcxo(struct uart_port *port)
 +{
 +      msm_write(port, 0x06, UART_MREG);
 +      msm_write(port, 0xF1, UART_NREG);
 +      msm_write(port, 0x0F, UART_DREG);
 +      msm_write(port, 0x1A, UART_MNDREG);
 +      port->uartclk = 1843200;
 +}
 +
 +/*
 + * Setup the MND registers to use the TCXO clock divided by 4.
 + */
 +static void msm_serial_set_mnd_regs_tcxoby4(struct uart_port *port)
 +{
 +      msm_write(port, 0x18, UART_MREG);
 +      msm_write(port, 0xF6, UART_NREG);
 +      msm_write(port, 0x0F, UART_DREG);
 +      msm_write(port, 0x0A, UART_MNDREG);
 +      port->uartclk = 1843200;
 +}
 +
 +static void msm_serial_set_mnd_regs(struct uart_port *port)
 +{
 +      struct msm_port *msm_port = UART_TO_MSM(port);
 +
 +      /*
 +       * These registers don't exist so we change the clk input rate
 +       * on uartdm hardware instead
 +       */
 +      if (msm_port->is_uartdm)
 +              return;
 +
 +      if (port->uartclk == 19200000)
 +              msm_serial_set_mnd_regs_tcxo(port);
 +      else if (port->uartclk == 4800000)
 +              msm_serial_set_mnd_regs_tcxoby4(port);
 +}
 +
  static void msm_handle_tx(struct uart_port *port);
  static void msm_start_rx_dma(struct msm_port *msm_port);
  
 -void msm_stop_dma(struct uart_port *port, struct msm_dma *dma)
 +static void msm_stop_dma(struct uart_port *port, struct msm_dma *dma)
  {
        struct device *dev = port->dev;
        unsigned int mapped;
@@@ -303,17 -134,15 +303,17 @@@ static void msm_request_tx_dma(struct m
        struct device *dev = msm_port->uart.dev;
        struct dma_slave_config conf;
        struct msm_dma *dma;
 +      struct dma_chan *dma_chan;
        u32 crci = 0;
        int ret;
  
        dma = &msm_port->tx_dma;
  
        /* allocate DMA resources, if available */
 -      dma->chan = dma_request_slave_channel_reason(dev, "tx");
 -      if (IS_ERR(dma->chan))
 +      dma_chan = dma_request_slave_channel_reason(dev, "tx");
 +      if (IS_ERR(dma_chan))
                goto no_tx;
 +      dma->chan = dma_chan;
  
        of_property_read_u32(dev->of_node, "qcom,tx-crci", &crci);
  
@@@ -348,17 -177,15 +348,17 @@@ static void msm_request_rx_dma(struct m
        struct device *dev = msm_port->uart.dev;
        struct dma_slave_config conf;
        struct msm_dma *dma;
 +      struct dma_chan *dma_chan;
        u32 crci = 0;
        int ret;
  
        dma = &msm_port->rx_dma;
  
        /* allocate DMA resources, if available */
 -      dma->chan = dma_request_slave_channel_reason(dev, "rx");
 -      if (IS_ERR(dma->chan))
 +      dma_chan = dma_request_slave_channel_reason(dev, "rx");
 +      if (IS_ERR(dma_chan))
                goto no_rx;
 +      dma->chan = dma_chan;
  
        of_property_read_u32(dev->of_node, "qcom,rx-crci", &crci);
  
@@@ -565,6 -392,10 +565,6 @@@ static void msm_complete_rx_dma(void *a
        val &= ~dma->enable_bit;
        msm_write(port, val, UARTDM_DMEN);
  
 -      /* Restore interrupts */
 -      msm_port->imr |= UART_IMR_RXLEV | UART_IMR_RXSTALE;
 -      msm_write(port, msm_port->imr, UART_IMR);
 -
        if (msm_read(port, UART_SR) & UART_SR_OVERRUN) {
                port->icount.overrun++;
                tty_insert_flip_char(tport, 0, TTY_OVERRUN);
@@@ -992,6 -823,7 +992,7 @@@ static unsigned int msm_get_mctrl(struc
  static void msm_reset(struct uart_port *port)
  {
        struct msm_port *msm_port = UART_TO_MSM(port);
+       unsigned int mr;
  
        /* reset everything */
        msm_write(port, UART_CR_CMD_RESET_RX, UART_CR);
        msm_write(port, UART_CR_CMD_RESET_ERR, UART_CR);
        msm_write(port, UART_CR_CMD_RESET_BREAK_INT, UART_CR);
        msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
-       msm_write(port, UART_CR_CMD_SET_RFR, UART_CR);
+       msm_write(port, UART_CR_CMD_RESET_RFR, UART_CR);
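 +      /* deassert RFR and take it out of automatic hardware control */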
+       mr = msm_read(port, UART_MR1);
+       mr &= ~UART_MR1_RX_RDY_CTL;
+       msm_write(port, mr, UART_MR1);
  
        /* Disable DM modes */
        if (msm_port->is_uartdm)
@@@ -1037,72 -872,37 +1041,72 @@@ struct msm_baud_map 
  };
  
  static const struct msm_baud_map *
 -msm_find_best_baud(struct uart_port *port, unsigned int baud)
 +msm_find_best_baud(struct uart_port *port, unsigned int baud,
 +                 unsigned long *rate)
  {
 -      unsigned int i, divisor;
 -      const struct msm_baud_map *entry;
 +      struct msm_port *msm_port = UART_TO_MSM(port);
 +      unsigned int divisor, result;
 +      unsigned long target, old, best_rate = 0, diff, best_diff = ULONG_MAX;
 +      const struct msm_baud_map *entry, *end, *best;
        static const struct msm_baud_map table[] = {
 -              { 1536, 0x00,  1 },
 -              {  768, 0x11,  1 },
 -              {  384, 0x22,  1 },
 -              {  192, 0x33,  1 },
 -              {   96, 0x44,  1 },
 -              {   48, 0x55,  1 },
 -              {   32, 0x66,  1 },
 -              {   24, 0x77,  1 },
 -              {   16, 0x88,  1 },
 -              {   12, 0x99,  6 },
 -              {    8, 0xaa,  6 },
 -              {    6, 0xbb,  6 },
 -              {    4, 0xcc,  6 },
 -              {    3, 0xdd,  8 },
 -              {    2, 0xee, 16 },
                {    1, 0xff, 31 },
 -              {    0, 0xff, 31 },
 +              {    2, 0xee, 16 },
 +              {    3, 0xdd,  8 },
 +              {    4, 0xcc,  6 },
 +              {    6, 0xbb,  6 },
 +              {    8, 0xaa,  6 },
 +              {   12, 0x99,  6 },
 +              {   16, 0x88,  1 },
 +              {   24, 0x77,  1 },
 +              {   32, 0x66,  1 },
 +              {   48, 0x55,  1 },
 +              {   96, 0x44,  1 },
 +              {  192, 0x33,  1 },
 +              {  384, 0x22,  1 },
 +              {  768, 0x11,  1 },
 +              { 1536, 0x00,  1 },
        };
  
 -      divisor = uart_get_divisor(port, baud);
 +      best = table; /* Default to smallest divider */
 +      target = clk_round_rate(msm_port->clk, 16 * baud);
 +      divisor = DIV_ROUND_CLOSEST(target, 16 * baud);
 +
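 +      /*
 +       * Walk the divisor table from the smallest divider upwards and
 +       * track the entry whose resulting rate is closest to the requested
 +       * baud.  If every remaining divider is too large for the current
 +       * clock rate, ask the clock framework for the next higher rate and
 +       * restart the search.
 +       */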
 +      end = table + ARRAY_SIZE(table);
 +      entry = table;
 +      while (entry < end) {
 +              if (entry->divisor <= divisor) {
 +                      result = target / entry->divisor / 16;
 +                      diff = abs(result - baud);
 +
 +                      /* Keep track of best entry */
 +                      if (diff < best_diff) {
 +                              best_diff = diff;
 +                              best = entry;
 +                              best_rate = target;
 +                      }
  
 -      for (i = 0, entry = table; i < ARRAY_SIZE(table); i++, entry++)
 -              if (entry->divisor <= divisor)
 -                      break;
 +                      if (result == baud)
 +                              break;
 +              } else if (entry->divisor > divisor) {
 +                      old = target;
 +                      target = clk_round_rate(msm_port->clk, old + 1);
 +                      /*
 +                       * The rate didn't get any faster so we can't do
 +                       * better at dividing it down
 +                       */
 +                      if (target == old)
 +                              break;
 +
 +                      /* Start the divisor search over at this new rate */
 +                      entry = table;
 +                      divisor = DIV_ROUND_CLOSEST(target, 16 * baud);
 +                      continue;
 +              }
 +              entry++;
 +      }
  
 -      return entry; /* Default to smallest divider */
 +      *rate = best_rate;
 +      return best;
  }
  
  static int msm_set_baud_rate(struct uart_port *port, unsigned int baud,
        unsigned int rxstale, watermark, mask;
        struct msm_port *msm_port = UART_TO_MSM(port);
        const struct msm_baud_map *entry;
 -      unsigned long flags;
 -
 -      entry = msm_find_best_baud(port, baud);
 -
 -      msm_write(port, entry->code, UART_CSR);
 -
 -      if (baud > 460800)
 -              port->uartclk = baud * 16;
 +      unsigned long flags, rate;
  
        flags = *saved_flags;
        spin_unlock_irqrestore(&port->lock, flags);
  
 -      clk_set_rate(msm_port->clk, port->uartclk);
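 +      /* clk_round_rate()/clk_set_rate() may sleep, hence the lock is dropped */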
 +      entry = msm_find_best_baud(port, baud, &rate);
 +      clk_set_rate(msm_port->clk, rate);
 +      baud = rate / 16 / entry->divisor;
  
        spin_lock_irqsave(&port->lock, flags);
        *saved_flags = flags;
 +      port->uartclk = rate;
 +
 +      msm_write(port, entry->code, UART_CSR);
  
        /* RX stale watermark */
        rxstale = entry->rxstale;
        return baud;
  }
  
 -static void msm_init_clock(struct uart_port *port)
 -{
 -      struct msm_port *msm_port = UART_TO_MSM(port);
 -
 -      clk_prepare_enable(msm_port->clk);
 -      clk_prepare_enable(msm_port->pclk);
 -      msm_serial_set_mnd_regs(port);
 -}
 -
  static int msm_startup(struct uart_port *port)
  {
        struct msm_port *msm_port = UART_TO_MSM(port);
        snprintf(msm_port->name, sizeof(msm_port->name),
                 "msm_serial%d", port->line);
  
 -      ret = request_irq(port->irq, msm_uart_irq, IRQF_TRIGGER_HIGH,
 -                        msm_port->name, port);
 -      if (unlikely(ret))
 +      /*
 +       * UART clk must be kept enabled to
 +       * avoid losing received characters
 +       */
 +      ret = clk_prepare_enable(msm_port->clk);
 +      if (ret)
                return ret;
  
 -      msm_init_clock(port);
 +      ret = clk_prepare_enable(msm_port->pclk);
 +      if (ret)
 +              goto err_pclk;
 +
 +      msm_serial_set_mnd_regs(port);
  
        if (likely(port->fifosize > 12))
                rfr_level = port->fifosize - 12;
                msm_request_rx_dma(msm_port, msm_port->uart.mapbase);
        }
  
 +      ret = request_irq(port->irq, msm_uart_irq, IRQF_TRIGGER_HIGH,
 +                        msm_port->name, port);
 +      if (unlikely(ret))
 +              goto err_irq;
 +
        return 0;
 +
 +err_irq:
 +      if (msm_port->is_uartdm)
 +              msm_release_dma(msm_port);
 +
 +      clk_disable_unprepare(msm_port->pclk);
 +
 +err_pclk:
 +      clk_disable_unprepare(msm_port->clk);
 +
 +      return ret;
  }
  
  static void msm_shutdown(struct uart_port *port)
        if (msm_port->is_uartdm)
                msm_release_dma(msm_port);
  
 +      clk_disable_unprepare(msm_port->pclk);
        clk_disable_unprepare(msm_port->clk);
  
        free_irq(port->irq, port);
@@@ -1411,16 -1198,8 +1415,16 @@@ static void msm_power(struct uart_port 
  
        switch (state) {
        case 0:
 -              clk_prepare_enable(msm_port->clk);
 -              clk_prepare_enable(msm_port->pclk);
 +              /*
 +               * UART clk must be kept enabled to
 +               * avoid losing received characters
 +               */
 +              if (clk_prepare_enable(msm_port->clk))
 +                      return;
 +              if (clk_prepare_enable(msm_port->pclk)) {
 +                      clk_disable_unprepare(msm_port->clk);
 +                      return;
 +              }
                break;
        case 3:
                clk_disable_unprepare(msm_port->clk);
@@@ -1623,7 -1402,6 +1627,7 @@@ static void __msm_console_write(struct 
                int j;
                unsigned int num_chars;
                char buf[4] = { 0 };
 +              const u32 *buffer;
  
                if (is_uartdm)
                        num_chars = min(count - i, (unsigned int)sizeof(buf));
                while (!(msm_read(port, UART_SR) & UART_SR_TX_READY))
                        cpu_relax();
  
 -              iowrite32_rep(tf, buf, 1);
 +              buffer = (const u32 *)buf;
 +              writel_relaxed_no_log(*buffer, tf);
                i += num_chars;
        }
        spin_unlock(&port->lock);
@@@ -1685,7 -1462,7 +1689,7 @@@ static int __init msm_console_setup(str
        if (unlikely(!port->membase))
                return -ENXIO;
  
 -      msm_init_clock(port);
 +      msm_serial_set_mnd_regs(port);
  
        if (options)
                uart_parse_options(options, &baud, &parity, &bits, &flow);
@@@ -1712,6 -1489,7 +1716,6 @@@ msm_serial_early_console_setup(struct e
        device->con->write = msm_serial_early_write;
        return 0;
  }
 -EARLYCON_DECLARE(msm_serial, msm_serial_early_console_setup);
  OF_EARLYCON_DECLARE(msm_serial, "qcom,msm-uart",
                    msm_serial_early_console_setup);
  
@@@ -1733,6 -1511,7 +1737,6 @@@ msm_serial_early_console_setup_dm(struc
        device->con->write = msm_serial_early_write_dm;
        return 0;
  }
 -EARLYCON_DECLARE(msm_serial_dm, msm_serial_early_console_setup_dm);
  OF_EARLYCON_DECLARE(msm_serial_dm, "qcom,msm-uartdm",
                    msm_serial_early_console_setup_dm);
  
@@@ -1811,6 -1590,8 +1815,6 @@@ static int msm_serial_probe(struct plat
                msm_port->pclk = devm_clk_get(&pdev->dev, "iface");
                if (IS_ERR(msm_port->pclk))
                        return PTR_ERR(msm_port->pclk);
 -
 -              clk_set_rate(msm_port->clk, 1843200);
        }
  
        port->uartclk = clk_get_rate(msm_port->clk);
@@@ -1847,37 -1628,12 +1851,37 @@@ static const struct of_device_id msm_ma
  };
  MODULE_DEVICE_TABLE(of, msm_match_table);
  
 +#ifdef CONFIG_PM_SLEEP
 +static int msm_serial_suspend(struct device *dev)
 +{
 +      struct uart_port *port = dev_get_drvdata(dev);
 +
 +      uart_suspend_port(&msm_uart_driver, port);
 +
 +      return 0;
 +}
 +
 +static int msm_serial_resume(struct device *dev)
 +{
 +      struct uart_port *port = dev_get_drvdata(dev);
 +
 +      uart_resume_port(&msm_uart_driver, port);
 +
 +      return 0;
 +}
 +#endif
 +
 +static const struct dev_pm_ops msm_serial_pm_ops = {
 +      SET_SYSTEM_SLEEP_PM_OPS(msm_serial_suspend, msm_serial_resume)
 +};
 +
  static struct platform_driver msm_platform_driver = {
        .remove = msm_serial_remove,
        .probe = msm_serial_probe,
        .driver = {
                .name = "msm_serial",
                .of_match_table = msm_match_table,
 +              .pm = &msm_serial_pm_ops,
        },
  };
  
diff --combined drivers/tty/serial/serial_core.c
@@@ -95,9 -95,6 +95,9 @@@ static void __uart_start(struct tty_str
        struct uart_state *state = tty->driver_data;
        struct uart_port *port = state->uart_port;
  
 +      if (port->ops->wake_peer)
 +              port->ops->wake_peer(port);
 +
        if (!uart_tx_stopped(port))
                port->ops->start_tx(port);
  }
@@@ -1018,7 -1015,7 +1018,7 @@@ static int uart_break_ctl(struct tty_st
  
        mutex_lock(&port->mutex);
  
-       if (uport->type != PORT_UNKNOWN)
+       if (uport->type != PORT_UNKNOWN && uport->ops->break_ctl)
                uport->ops->break_ctl(uport, break_state);
  
        mutex_unlock(&port->mutex);
diff --combined drivers/usb/core/hub.c
@@@ -36,8 -36,6 +36,8 @@@
  #define USB_VENDOR_GENESYS_LOGIC              0x05e3
  #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND      0x01
  
 +extern int deny_new_usb;
 +
  /* Protect struct usb_device->state and ->children members
   * Note: Both are also protected by ->dev.sem, except that ->state can
   * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */
@@@ -50,11 -48,6 +50,11 @@@ static void hub_event(struct work_struc
  /* synchronize hub-port add/remove and peering operations */
  DEFINE_MUTEX(usb_port_peer_mutex);
  
 +static bool skip_extended_resume_delay = 1;
 +module_param(skip_extended_resume_delay, bool, S_IRUGO | S_IWUSR);
 +MODULE_PARM_DESC(skip_extended_resume_delay,
 +              "removes extra delay added to finish bus resume");
 +
  /* cycle leds on hubs that aren't blinking for attention */
  static bool blinkenlights = 0;
  module_param(blinkenlights, bool, S_IRUGO);
@@@ -112,11 -105,6 +112,11 @@@ static int hub_port_disable(struct usb_
  static bool hub_port_warm_reset_required(struct usb_hub *hub, int port1,
                u16 portstatus);
  
 +#define USB_VENDOR_XIAOMI             0x2717
 +#define USB_PRODUCT_XIAOMI_HEADSET    0x3801
 +
 +bool is_xiaomi_headset = false;
 +
  static inline char *portspeed(struct usb_hub *hub, int portstatus)
  {
        if (hub_is_superspeed(hub->hdev))
@@@ -634,12 -622,6 +634,12 @@@ void usb_kick_hub_wq(struct usb_device 
                kick_hub_wq(hub);
  }
  
 +void usb_flush_hub_wq(void)
 +{
 +      flush_workqueue(hub_wq);
 +}
 +EXPORT_SYMBOL(usb_flush_hub_wq);
 +
  /*
   * Let the USB core know that a USB 3.0 device has sent a Function Wake Device
   * Notification, which indicates it had initiated remote wakeup.
@@@ -1698,6 -1680,47 +1698,6 @@@ static int hub_probe(struct usb_interfa
        hdev = interface_to_usbdev(intf);
  
        /*
 -       * Set default autosuspend delay as 0 to speedup bus suspend,
 -       * based on the below considerations:
 -       *
 -       * - Unlike other drivers, the hub driver does not rely on the
 -       *   autosuspend delay to provide enough time to handle a wakeup
 -       *   event, and the submitted status URB is just to check future
 -       *   change on hub downstream ports, so it is safe to do it.
 -       *
 -       * - The patch might cause one or more auto supend/resume for
 -       *   below very rare devices when they are plugged into hub
 -       *   first time:
 -       *
 -       *      devices having trouble initializing, and disconnect
 -       *      themselves from the bus and then reconnect a second
 -       *      or so later
 -       *
 -       *      devices just for downloading firmware, and disconnects
 -       *      themselves after completing it
 -       *
 -       *   For these quite rare devices, their drivers may change the
 -       *   autosuspend delay of their parent hub in the probe() to one
 -       *   appropriate value to avoid the subtle problem if someone
 -       *   does care it.
 -       *
 -       * - The patch may cause one or more auto suspend/resume on
 -       *   hub during running 'lsusb', but it is probably too
 -       *   infrequent to worry about.
 -       *
 -       * - Change autosuspend delay of hub can avoid unnecessary auto
 -       *   suspend timer for hub, also may decrease power consumption
 -       *   of USB bus.
 -       *
 -       * - If user has indicated to prevent autosuspend by passing
 -       *   usbcore.autosuspend = -1 then keep autosuspend disabled.
 -       */
 -#ifdef CONFIG_PM
 -      if (hdev->dev.power.autosuspend_delay >= 0)
 -              pm_runtime_set_autosuspend_delay(&hdev->dev, 0);
 -#endif
 -
 -      /*
         * Hubs have proper suspend/resume support, except for root hubs
         * where the controller driver doesn't have bus_suspend and
         * bus_resume methods.
@@@ -2089,11 -2112,6 +2089,11 @@@ void usb_disconnect(struct usb_device *
        dev_info(&udev->dev, "USB disconnect, device number %d\n",
                        udev->devnum);
  
 +      if (is_xiaomi_headset) {
 +              dev_info(&udev->dev, "xiaomi headset removed, devnum %d\n", udev->devnum);
 +              is_xiaomi_headset = false;
 +      }
 +
        /*
         * Ensure that the pm runtime code knows that the USB device
         * is in the process of being disconnected.
@@@ -2421,12 -2439,6 +2421,12 @@@ int usb_new_device(struct usb_device *u
        udev->dev.devt = MKDEV(USB_DEVICE_MAJOR,
                        (((udev->bus->busnum-1) * 128) + (udev->devnum-1)));
  
 +      if (USB_VENDOR_XIAOMI == le16_to_cpu(udev->descriptor.idVendor)
 +                       && USB_PRODUCT_XIAOMI_HEADSET == le16_to_cpu(udev->descriptor.idProduct)) {
 +              dev_info(&udev->dev, "xiaomi headset identified, devnum %d\n", udev->devnum);
 +              is_xiaomi_headset = true;
 +      }
 +
        /* Tell the world! */
        announce_device(udev);
  
@@@ -3397,9 -3409,7 +3397,9 @@@ int usb_port_resume(struct usb_device *
                /* drive resume for USB_RESUME_TIMEOUT msec */
                dev_dbg(&udev->dev, "usb %sresume\n",
                                (PMSG_IS_AUTO(msg) ? "auto-" : ""));
 -              msleep(USB_RESUME_TIMEOUT);
 +              if (!skip_extended_resume_delay)
 +                      usleep_range(USB_RESUME_TIMEOUT * 1000,
 +                                      (USB_RESUME_TIMEOUT + 1) * 1000);
  
                /* Virtual root hubs can trigger on GET_PORT_STATUS to
                 * stop resume signaling.  Then finish the resume
                status = hub_port_status(hub, port1, &portstatus, &portchange);
  
                /* TRSMRCY = 10 msec */
 -              msleep(10);
 +              usleep_range(10000, 10500);
        }
  
   SuspendCleared:
@@@ -4310,8 -4320,6 +4310,8 @@@ hub_port_init(struct usb_hub *hub, stru
        enum usb_device_speed   oldspeed = udev->speed;
        const char              *speed;
        int                     devnum = udev->devnum;
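 +      /*
 +       * uevent payload reported to the parent hub when the device
 +       * fails to answer the descriptor reads below.
 +       */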
 +      char                    *error_event[] = {
 +                              "USB_DEVICE_ERROR=Device_No_Response", NULL };
  
        /* root hub ports have a slightly longer reset period
         * (from USB 2.0 spec, section 7.1.7.5)
                                if (r != -ENODEV)
                                        dev_err(&udev->dev, "device descriptor read/64, error %d\n",
                                                        r);
 +                              kobject_uevent_env(&udev->parent->dev.kobj,
 +                                              KOBJ_CHANGE, error_event);
                                retval = -EMSGSIZE;
                                continue;
                        }
                                dev_err(&udev->dev,
                                        "device descriptor read/8, error %d\n",
                                        retval);
 +                      kobject_uevent_env(&udev->parent->dev.kobj,
 +                                              KOBJ_CHANGE, error_event);
                        if (retval >= 0)
                                retval = -EMSGSIZE;
                } else {
@@@ -4758,12 -4762,6 +4758,12 @@@ static void hub_port_connect(struct usb
                        goto done;
                return;
        }
 +
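 +      /* refuse to enumerate newly attached devices while deny_new_usb is set */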
 +      if (deny_new_usb) {
 +              dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1);
 +              goto done;
 +      }
 +
        if (hub_is_superspeed(hub->hdev))
                unit_load = 150;
        else
@@@ -5556,7 -5554,7 +5556,7 @@@ re_enumerate_no_bos
  
  /**
   * usb_reset_device - warn interface drivers and perform a USB port reset
-  * @udev: device to reset (not in SUSPENDED or NOTATTACHED state)
+  * @udev: device to reset (not in NOTATTACHED state)
   *
   * Warns all drivers bound to registered interfaces (using their pre_reset
   * method), performs the port reset, and then lets the drivers know that
@@@ -5584,8 -5582,7 +5584,7 @@@ int usb_reset_device(struct usb_device 
        struct usb_host_config *config = udev->actconfig;
        struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent);
  
-       if (udev->state == USB_STATE_NOTATTACHED ||
-                       udev->state == USB_STATE_SUSPENDED) {
+       if (udev->state == USB_STATE_NOTATTACHED) {
                dev_dbg(&udev->dev, "device reset not allowed in state %d\n",
                                udev->state);
                return -EINVAL;
diff --combined drivers/usb/gadget/configfs.c
@@@ -8,33 -8,6 +8,33 @@@
  #include "configfs.h"
  #include "u_f.h"
  #include "u_os_desc.h"
 +#include "debug.h"
 +
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +#include <linux/platform_device.h>
 +#include <linux/kdev_t.h>
 +#include <linux/usb/ch9.h>
 +
 +#ifdef CONFIG_USB_CONFIGFS_F_ACC
 +extern int acc_ctrlrequest(struct usb_composite_dev *cdev,
 +                              const struct usb_ctrlrequest *ctrl);
 +void acc_disconnect(void);
 +#endif
 +static struct class *android_class;
 +static struct device *android_device;
 +static int index;
 +static int gadget_index;
 +
 +struct device *create_function_device(char *name)
 +{
 +      if (android_device && !IS_ERR(android_device))
 +              return device_create(android_class, android_device,
 +                      MKDEV(0, index++), NULL, name);
 +      else
 +              return ERR_PTR(-EINVAL);
 +}
 +EXPORT_SYMBOL_GPL(create_function_device);
 +#endif
  
  int check_user_usb_string(const char *name,
                struct usb_gadget_strings *stringtab_dev)
@@@ -87,17 -60,10 +87,17 @@@ struct gadget_info 
        struct usb_composite_driver composite;
        struct usb_composite_dev cdev;
        bool use_os_desc;
 +      bool unbinding;
        char b_vendor_code;
        char qw_sign[OS_STRING_QW_SIGN_LEN];
        spinlock_t spinlock;
        bool unbind;
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +      bool connected;
 +      bool sw_connected;
 +      struct work_struct work;
 +      struct device *dev;
 +#endif
  };
  
  static inline struct gadget_info *to_gadget_info(struct config_item *item)
@@@ -145,28 -111,21 +145,28 @@@ struct gadget_config_name 
        struct list_head list;
  };
  
 +#define MAX_USB_STRING_LEN    126
 +#define MAX_USB_STRING_WITH_NULL_LEN  (MAX_USB_STRING_LEN+1)
 +
  static int usb_string_copy(const char *s, char **s_copy)
  {
        int ret;
        char *str;
        char *copy = *s_copy;
        ret = strlen(s);
 -      if (ret > 126)
 +      if (ret > MAX_USB_STRING_LEN)
                return -EOVERFLOW;
  
 -      str = kstrdup(s, GFP_KERNEL);
 -      if (!str)
 -              return -ENOMEM;
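 +      /*
 +       * Reuse the previously allocated maximum-sized buffer when one
 +       * exists; otherwise allocate one large enough for any permitted
 +       * USB string, so later updates never need to free it.
 +       */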
 +      if (copy) {
 +              str = copy;
 +      } else {
 +              str = kmalloc(MAX_USB_STRING_WITH_NULL_LEN, GFP_KERNEL);
 +              if (!str)
 +                      return -ENOMEM;
 +      }
 +      strlcpy(str, s, MAX_USB_STRING_WITH_NULL_LEN);
        if (str[ret - 1] == '\n')
                str[ret - 1] = '\0';
 -      kfree(copy);
        *s_copy = str;
        return 0;
  }
@@@ -286,11 -245,9 +286,11 @@@ static int unregister_gadget(struct gad
        if (!gi->udc_name)
                return -ENODEV;
  
 +      gi->unbinding = true;
        ret = usb_gadget_unregister_driver(&gi->composite.gadget_driver);
        if (ret)
                return ret;
 +      gi->unbinding = false;
        kfree(gi->udc_name);
        gi->udc_name = NULL;
        return 0;
@@@ -311,7 -268,7 +311,7 @@@ static ssize_t gadget_dev_desc_UDC_stor
  
        mutex_lock(&gi->lock);
  
 -      if (!strlen(name)) {
 +      if (!strlen(name) || strcmp(name, "none") == 0) {
                ret = unregister_gadget(gi);
                if (ret)
                        goto err;
@@@ -1261,12 -1218,12 +1261,12 @@@ static void purge_configs_funcs(struct 
  
                cfg = container_of(c, struct config_usb_cfg, c);
  
 -              list_for_each_entry_safe(f, tmp, &c->functions, list) {
 +              list_for_each_entry_safe_reverse(f, tmp, &c->functions, list) {
  
 -                      list_move_tail(&f->list, &cfg->func_list);
 +                      list_move(&f->list, &cfg->func_list);
                        if (f->unbind) {
                                dev_err(&gi->cdev.gadget->dev, "unbind function"
 -                                              " '%s'/%p\n", f->name, f);
 +                                              " '%s'/%pK\n", f->name, f);
                                f->unbind(c, f);
                        }
                }
@@@ -1291,6 -1248,7 +1291,6 @@@ static int configfs_composite_bind(stru
        int                             ret;
  
        /* the gi->lock is hold by the caller */
 -      gi->unbind = 0;
        cdev->gadget = gadget;
        set_gadget_data(gadget, cdev);
        ret = composite_dev_prepare(composite, cdev);
@@@ -1419,269 -1377,139 +1419,269 @@@ err_comp_cleanup
        return ret;
  }
  
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +static void android_work(struct work_struct *data)
 +{
 +      struct gadget_info *gi = container_of(data, struct gadget_info, work);
 +      struct usb_composite_dev *cdev = &gi->cdev;
 +      char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
 +      char *connected[2]    = { "USB_STATE=CONNECTED", NULL };
 +      char *configured[2]   = { "USB_STATE=CONFIGURED", NULL };
 +      /* 0-connected 1-configured 2-disconnected */
 +      bool status[3] = { false, false, false };
 +      unsigned long flags;
 +      bool uevent_sent = false;
 +
 +      spin_lock_irqsave(&cdev->lock, flags);
 +      if (cdev->config)
 +              status[1] = true;
 +
 +      if (gi->connected != gi->sw_connected) {
 +              if (gi->connected)
 +                      status[0] = true;
 +              else
 +                      status[2] = true;
 +              gi->sw_connected = gi->connected;
 +      }
 +      spin_unlock_irqrestore(&cdev->lock, flags);
 +
 +      if (status[0]) {
 +              kobject_uevent_env(&gi->dev->kobj,
 +                                      KOBJ_CHANGE, connected);
 +              pr_info("%s: sent uevent %s\n", __func__, connected[0]);
 +              uevent_sent = true;
 +      }
 +
 +      if (status[1]) {
 +              kobject_uevent_env(&gi->dev->kobj,
 +                                      KOBJ_CHANGE, configured);
 +              pr_info("%s: sent uevent %s\n", __func__, configured[0]);
 +              uevent_sent = true;
 +      }
 +
 +      if (status[2]) {
 +              kobject_uevent_env(&gi->dev->kobj,
 +                                      KOBJ_CHANGE, disconnected);
 +              pr_info("%s: sent uevent %s\n", __func__, disconnected[0]);
 +              uevent_sent = true;
 +      }
 +
 +      if (!uevent_sent) {
 +              pr_info("%s: did not send uevent (%d %d %pK)\n", __func__,
 +                      gi->connected, gi->sw_connected, cdev->config);
 +      }
 +}
 +#endif
 +
  static void configfs_composite_unbind(struct usb_gadget *gadget)
  {
        struct usb_composite_dev        *cdev;
        struct gadget_info              *gi;
 -      unsigned long flags;
  
        /* the gi->lock is hold by the caller */
  
        cdev = get_gadget_data(gadget);
        gi = container_of(cdev, struct gadget_info, cdev);
 -      spin_lock_irqsave(&gi->spinlock, flags);
 -      gi->unbind = 1;
 -      spin_unlock_irqrestore(&gi->spinlock, flags);
  
        kfree(otg_desc[0]);
        otg_desc[0] = NULL;
        purge_configs_funcs(gi);
        composite_dev_cleanup(cdev);
        usb_ep_autoconfig_reset(cdev->gadget);
 -      spin_lock_irqsave(&gi->spinlock, flags);
        cdev->gadget = NULL;
        set_gadget_data(gadget, NULL);
 -      spin_unlock_irqrestore(&gi->spinlock, flags);
  }
  
 -static int configfs_composite_setup(struct usb_gadget *gadget,
 -              const struct usb_ctrlrequest *ctrl)
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +static int android_setup(struct usb_gadget *gadget,
 +                      const struct usb_ctrlrequest *c)
  {
 -      struct usb_composite_dev *cdev;
 -      struct gadget_info *gi;
 +      struct usb_composite_dev *cdev = get_gadget_data(gadget);
        unsigned long flags;
 -      int ret;
 +      struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
 +      int value = -EOPNOTSUPP;
 +      struct usb_function_instance *fi;
  
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev)
 -              return 0;
 +      spin_lock_irqsave(&cdev->lock, flags);
 +      if (!gi->connected) {
 +              gi->connected = 1;
 +              schedule_work(&gi->work);
 +      }
 +      spin_unlock_irqrestore(&cdev->lock, flags);
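 +      /*
 +       * Let each available function instance try to handle the request
 +       * before falling back to the accessory and composite handlers.
 +       */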
 +      list_for_each_entry(fi, &gi->available_func, cfs_list) {
 +              if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) {
 +                      value = fi->f->setup(fi->f, c);
 +                      if (value >= 0)
 +                              break;
 +              }
 +      }
  
 -      gi = container_of(cdev, struct gadget_info, cdev);
 -      spin_lock_irqsave(&gi->spinlock, flags);
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev || gi->unbind) {
 -              spin_unlock_irqrestore(&gi->spinlock, flags);
 -              return 0;
 +#ifdef CONFIG_USB_CONFIGFS_F_ACC
 +      if (value < 0)
 +              value = acc_ctrlrequest(cdev, c);
 +#endif
 +
 +      if (value < 0)
 +              value = composite_setup(gadget, c);
 +
 +      spin_lock_irqsave(&cdev->lock, flags);
 +      if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
 +                                              cdev->config) {
 +              schedule_work(&gi->work);
        }
 +      spin_unlock_irqrestore(&cdev->lock, flags);
  
 -      ret = composite_setup(gadget, ctrl);
 -      spin_unlock_irqrestore(&gi->spinlock, flags);
 -      return ret;
 +      return value;
  }
  
 -static void configfs_composite_disconnect(struct usb_gadget *gadget)
 +static void android_disconnect(struct usb_gadget *gadget)
  {
 -      struct usb_composite_dev *cdev;
 +      struct usb_composite_dev        *cdev = get_gadget_data(gadget);
        struct gadget_info *gi;
 -      unsigned long flags;
  
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev)
 +      if (!cdev) {
 +              pr_err("%s: gadget is not connected\n", __func__);
                return;
 +      }
  
        gi = container_of(cdev, struct gadget_info, cdev);
 -      spin_lock_irqsave(&gi->spinlock, flags);
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev || gi->unbind) {
 -              spin_unlock_irqrestore(&gi->spinlock, flags);
 +
 +      /* FIXME: There's a race between usb_gadget_udc_stop() which is likely
 +       * to set the gadget driver to NULL in the udc driver and this driver's
 +       * gadget disconnect fn which likely checks for the gadget driver to
 +       * be a null ptr. It happens that unbind (doing set_gadget_data(NULL))
 +       * is called before the gadget driver is set to NULL and the udc driver
 +       * calls disconnect fn which results in cdev being a null ptr.
 +       */
 +      if (cdev == NULL) {
 +              WARN(1, "%s: gadget driver already disconnected\n", __func__);
                return;
        }
  
 +      /*
 +       * Accessory HID support can be active while the accessory
 +       * function is not actually enabled, so we need to inform it
 +       * when we are disconnected.
 +       */
 +
 +#ifdef CONFIG_USB_CONFIGFS_F_ACC
 +      acc_disconnect();
 +#endif
 +      gi->connected = 0;
 +      if (!gi->unbinding)
 +              schedule_work(&gi->work);
        composite_disconnect(gadget);
 -      spin_unlock_irqrestore(&gi->spinlock, flags);
  }
 +#endif
 +
 +static const struct usb_gadget_driver configfs_driver_template = {
 +      .bind           = configfs_composite_bind,
 +      .unbind         = configfs_composite_unbind,
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +      .setup          = android_setup,
 +      .reset          = android_disconnect,
 +      .disconnect     = android_disconnect,
 +#else
 +      .setup          = composite_setup,
 +      .reset          = composite_disconnect,
 +      .disconnect     = composite_disconnect,
 +#endif
 +      .suspend        = composite_suspend,
 +      .resume         = composite_resume,
 +
 +      .max_speed      = USB_SPEED_SUPER,
 +      .driver = {
 +              .owner          = THIS_MODULE,
 +              .name           = "configfs-gadget",
 +      },
 +};
  
 -static void configfs_composite_suspend(struct usb_gadget *gadget)
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
 +                      char *buf)
  {
 +      struct gadget_info *dev = dev_get_drvdata(pdev);
        struct usb_composite_dev *cdev;
 -      struct gadget_info *gi;
 +      char *state = "DISCONNECTED";
        unsigned long flags;
  
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev)
 -              return;
 +      if (!dev)
 +              goto out;
  
 -      gi = container_of(cdev, struct gadget_info, cdev);
 -      spin_lock_irqsave(&gi->spinlock, flags);
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev || gi->unbind) {
 -              spin_unlock_irqrestore(&gi->spinlock, flags);
 -              return;
 -      }
 +      cdev = &dev->cdev;
  
 -      composite_suspend(gadget);
 -      spin_unlock_irqrestore(&gi->spinlock, flags);
 +      if (!cdev)
 +              goto out;
 +
 +      spin_lock_irqsave(&cdev->lock, flags);
 +      if (cdev->config)
 +              state = "CONFIGURED";
 +      else if (dev->connected)
 +              state = "CONNECTED";
 +      spin_unlock_irqrestore(&cdev->lock, flags);
 +out:
 +      return sprintf(buf, "%s\n", state);
  }
  
 -static void configfs_composite_resume(struct usb_gadget *gadget)
 -{
 -      struct usb_composite_dev *cdev;
 -      struct gadget_info *gi;
 -      unsigned long flags;
 +static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
  
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev)
 -              return;
 +static struct device_attribute *android_usb_attributes[] = {
 +      &dev_attr_state,
 +      NULL
 +};
  
 -      gi = container_of(cdev, struct gadget_info, cdev);
 -      spin_lock_irqsave(&gi->spinlock, flags);
 -      cdev = get_gadget_data(gadget);
 -      if (!cdev || gi->unbind) {
 -              spin_unlock_irqrestore(&gi->spinlock, flags);
 -              return;
 +static int android_device_create(struct gadget_info *gi)
 +{
 +      struct device_attribute **attrs;
 +      struct device_attribute *attr;
 +      char str[10];
 +
 +      INIT_WORK(&gi->work, android_work);
 +      snprintf(str, sizeof(str), "android%d", gadget_index - 1);
 +      pr_debug("Creating android device %s\n", str);
 +      gi->dev = device_create(android_class, NULL,
 +                              MKDEV(0, 0), NULL, str);
 +      if (IS_ERR(gi->dev))
 +              return PTR_ERR(gi->dev);
 +
 +      dev_set_drvdata(gi->dev, gi);
 +      if (gadget_index == 1)
 +              android_device = gi->dev;
 +
 +      attrs = android_usb_attributes;
 +      while ((attr = *attrs++)) {
 +              int err;
 +
 +              err = device_create_file(gi->dev, attr);
 +              if (err) {
 +                      device_destroy(gi->dev->class,
 +                                     gi->dev->devt);
 +                      return err;
 +              }
        }
  
 -      composite_resume(gadget);
 -      spin_unlock_irqrestore(&gi->spinlock, flags);
 +      return 0;
  }
  
 -static const struct usb_gadget_driver configfs_driver_template = {
 -      .bind           = configfs_composite_bind,
 -      .unbind         = configfs_composite_unbind,
 -
 -      .setup          = configfs_composite_setup,
 -      .reset          = configfs_composite_disconnect,
 -      .disconnect     = configfs_composite_disconnect,
 +static void android_device_destroy(struct device *dev)
 +{
 +      struct device_attribute **attrs;
 +      struct device_attribute *attr;
  
 -      .suspend        = configfs_composite_suspend,
 -      .resume         = configfs_composite_resume,
 +      attrs = android_usb_attributes;
 +      while ((attr = *attrs++))
 +              device_remove_file(dev, attr);
 +      device_destroy(dev->class, dev->devt);
 +}
 +#else
 +static inline int android_device_create(struct gadget_info *gi)
 +{
 +      return 0;
 +}
  
 -      .max_speed      = USB_SPEED_SUPER,
 -      .driver = {
 -              .owner          = THIS_MODULE,
 -              .name           = "configfs-gadget",
 -      },
 -};
 +static inline void android_device_destroy(struct device *dev)
 +{
 +}
 +#endif
  
  static struct config_group *gadgets_make(
                struct config_group *group,
        gi = kzalloc(sizeof(*gi), GFP_KERNEL);
        if (!gi)
                return ERR_PTR(-ENOMEM);
 -
        gi->group.default_groups = gi->default_groups;
        gi->group.default_groups[0] = &gi->functions_group;
        gi->group.default_groups[1] = &gi->configs_group;
        gi->composite.resume = NULL;
        gi->composite.max_speed = USB_SPEED_SUPER;
  
+       spin_lock_init(&gi->spinlock);
        mutex_init(&gi->lock);
        INIT_LIST_HEAD(&gi->string_list);
        INIT_LIST_HEAD(&gi->available_func);
        if (!gi->composite.gadget_driver.function)
                goto err;
  
 +      gadget_index++;
 +      pr_debug("Creating gadget index %d\n", gadget_index);
 +      if (android_device_create(gi) < 0)
 +              goto err;
 +
        config_group_init_type_name(&gi->group, name,
                                &gadget_root_type);
        return &gi->group;
 +
  err:
        kfree(gi);
        return ERR_PTR(-ENOMEM);
  
  static void gadgets_drop(struct config_group *group, struct config_item *item)
  {
 +      struct gadget_info *gi;
 +
 +      gi = container_of(to_config_group(item), struct gadget_info, group);
        config_item_put(item);
 +      if (gi->dev) {
 +              android_device_destroy(gi->dev);
 +              gi->dev = NULL;
 +      }
  }
  
  static struct configfs_group_operations gadgets_ops = {
@@@ -1780,7 -1597,6 +1781,7 @@@ void unregister_gadget_item(struct conf
  {
        struct gadget_info *gi = to_gadget_info(item);
  
 +      /* protect against a race with gadget_dev_desc_UDC_store() */
        mutex_lock(&gi->lock);
        unregister_gadget(gi);
        mutex_unlock(&gi->lock);
@@@ -1793,28 -1609,13 +1794,28 @@@ static int __init gadget_cfs_init(void
  
        config_group_init(&gadget_subsys.su_group);
  
 +      debug_debugfs_init();
 +
        ret = configfs_register_subsystem(&gadget_subsys);
 +
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +      android_class = class_create(THIS_MODULE, "android_usb");
 +      if (IS_ERR(android_class))
 +              return PTR_ERR(android_class);
 +#endif
 +
        return ret;
  }
  module_init(gadget_cfs_init);
  
  static void __exit gadget_cfs_exit(void)
  {
 +      debug_debugfs_exit();
        configfs_unregister_subsystem(&gadget_subsys);
 +#ifdef CONFIG_USB_CONFIGFS_UEVENT
 +      if (!IS_ERR(android_class))
 +              class_destroy(android_class);
 +#endif
 +
  }
  module_exit(gadget_cfs_exit);
diff --combined drivers/usb/gadget/function/u_serial.c
@@@ -4,7 -4,6 +4,7 @@@
   * Copyright (C) 2003 Al Borchers (alborchers@steinerpoint.com)
   * Copyright (C) 2008 David Brownell
   * Copyright (C) 2008 by Nokia Corporation
 + * Copyright (c) 2013-2017 The Linux Foundation. All rights reserved.
   *
   * This code also borrows from usbserial.c, which is
   * Copyright (C) 1999 - 2002 Greg Kroah-Hartman (greg@kroah.com)
@@@ -28,8 -27,6 +28,8 @@@
  #include <linux/slab.h>
  #include <linux/export.h>
  #include <linux/module.h>
 +#include <linux/debugfs.h>
 +#include <linux/workqueue.h>
  
  #include "u_serial.h"
  
   * next layer of buffering.  For TX that's a circular buffer; for RX
   * consider it a NOP.  A third layer is provided by the TTY code.
   */
 -#define QUEUE_SIZE            16
 +#define TX_QUEUE_SIZE          8
 +#define TX_BUF_SIZE            4096
  #define WRITE_BUF_SIZE                8192            /* TX only */
  
 +#define RX_QUEUE_SIZE          8
 +#define RX_BUF_SIZE            4096
 +
  /* circular buffer */
  struct gs_buf {
        unsigned                buf_size;
@@@ -113,7 -106,7 +113,7 @@@ struct gs_port 
        int read_allocated;
        struct list_head        read_queue;
        unsigned                n_read;
 -      struct tasklet_struct   push;
 +      struct work_struct      push;
  
        struct list_head        write_pool;
        int write_started;
  
        /* REVISIT this state ... */
        struct usb_cdc_line_coding port_line_coding;    /* 8-N-1 etc */
 +      unsigned long   nbytes_from_host;
 +      unsigned long   nbytes_to_tty;
 +      unsigned long   nbytes_from_tty;
 +      unsigned long   nbytes_to_host;
  };
  
  static struct portmaster {
        struct gs_port  *port;
  } ports[MAX_U_SERIAL_PORTS];
  
 +static struct workqueue_struct *gserial_wq;
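 +/* RX data is pushed to the TTY from this workqueue (was a tasklet) */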
  #define GS_CLOSE_TIMEOUT              15              /* seconds */
  
  
@@@ -372,50 -360,26 +372,50 @@@ __releases(&port->port_lock
  __acquires(&port->port_lock)
  */
  {
 -      struct list_head        *pool = &port->write_pool;
 +      struct list_head        *pool;
        struct usb_ep           *in;
        int                     status = 0;
 +      static long             prev_len;
        bool                    do_tty_wake = false;
  
 -      if (!port->port_usb)
 -              return status;
 +      if (!port || !port->port_usb) {
 +              pr_err("Error - port or port->port_usb is NULL\n");
 +              return -EIO;
 +      }
  
 -      in = port->port_usb->in;
 +      pool = &port->write_pool;
 +      in   = port->port_usb->in;
  
        while (!port->write_busy && !list_empty(pool)) {
                struct usb_request      *req;
                int                     len;
  
 -              if (port->write_started >= QUEUE_SIZE)
 +              if (port->write_started >= TX_QUEUE_SIZE)
                        break;
  
                req = list_entry(pool->next, struct usb_request, list);
 -              len = gs_send_packet(port, req->buf, in->maxpacket);
 +              len = gs_send_packet(port, req->buf, TX_BUF_SIZE);
                if (len == 0) {
 +                      /* Queue zero length packet explicitly to make it
 +                       * work with UDCs which don't support req->zero flag
 +                       */
 +                      if (prev_len && (prev_len % in->maxpacket == 0)) {
 +                              req->length = 0;
 +                              list_del(&req->list);
 +                              spin_unlock(&port->port_lock);
 +                              status = usb_ep_queue(in, req, GFP_ATOMIC);
 +                              spin_lock(&port->port_lock);
 +                              if (!port->port_usb) {
 +                                      gs_free_req(in, req);
 +                                      break;
 +                              }
 +                              if (status) {
 +                                      printk(KERN_ERR "%s: %s err %d\n",
 +                                              __func__, "queue", status);
 +                                      list_add(&req->list, pool);
 +                              }
 +                              prev_len = 0;
 +                      }
                        wake_up_interruptible(&port->drain_wait);
                        break;
                }
  
                req->length = len;
                list_del(&req->list);
 -              req->zero = (gs_buf_data_avail(&port->port_write_buf) == 0);
  
                pr_vdebug("ttyGS%d: tx len=%d, 0x%02x 0x%02x 0x%02x ...\n",
                          port->port_num, len, *((u8 *)req->buf),
                status = usb_ep_queue(in, req, GFP_ATOMIC);
                spin_lock(&port->port_lock);
                port->write_busy = false;
 +              /*
 +               * If port_usb is NULL, gserial disconnect is called
 +               * while the spinlock is dropped and all requests are
 +               * freed. Free the current request here.
 +               */
 +              if (!port->port_usb) {
 +                      do_tty_wake = false;
 +                      gs_free_req(in, req);
 +                      break;
 +              }
  
                if (status) {
                        pr_debug("%s: %s %s err %d\n",
                        break;
                }
  
 -              port->write_started++;
 +              prev_len = req->length;
 +              port->nbytes_from_tty += req->length;
  
 -              /* abort immediately after disconnect */
 -              if (!port->port_usb)
 -                      break;
 +              port->write_started++;
        }
  
        if (do_tty_wake && port->port.tty)
@@@ -478,17 -434,8 +478,17 @@@ __releases(&port->port_lock
  __acquires(&port->port_lock)
  */
  {
 -      struct list_head        *pool = &port->read_pool;
 -      struct usb_ep           *out = port->port_usb->out;
 +      struct list_head        *pool;
 +      struct usb_ep           *out;
 +      unsigned                started = 0;
 +
 +      if (!port || !port->port_usb) {
 +              pr_err("Error - port or port->port_usb is NULL\n");
 +              return -EIO;
 +      }
 +
 +      pool = &port->read_pool;
 +      out  = port->port_usb->out;
  
        while (!list_empty(pool)) {
                struct usb_request      *req;
                if (!tty)
                        break;
  
 -              if (port->read_started >= QUEUE_SIZE)
 +              if (port->read_started >= RX_QUEUE_SIZE)
                        break;
  
                req = list_entry(pool->next, struct usb_request, list);
                list_del(&req->list);
 -              req->length = out->maxpacket;
 +              req->length = RX_BUF_SIZE;
  
                /* drop lock while we call out; the controller driver
                 * may need to call us back (e.g. for disconnect)
                status = usb_ep_queue(out, req, GFP_ATOMIC);
                spin_lock(&port->port_lock);
  
 +              /*
 +               * If port_usb is NULL, gserial disconnect is called
 +               * while the spinlock is dropped and all requests are
 +               * freed. Free the current request here.
 +               */
 +              if (!port->port_usb) {
 +                      started = 0;
 +                      gs_free_req(out, req);
 +                      break;
 +              }
 +
                if (status) {
                        pr_debug("%s: %s %s err %d\n",
                                        __func__, "queue", out->name, status);
                        break;
                }
                port->read_started++;
 -
 -              /* abort immediately after disconnect */
 -              if (!port->port_usb)
 -                      break;
        }
        return port->read_started;
  }
   * So QUEUE_SIZE packets plus however many the FIFO holds (usually two)
   * can be buffered before the TTY layer's buffers (currently 64 KB).
   */
 -static void gs_rx_push(unsigned long _port)
 +static void gs_rx_push(struct work_struct *w)
  {
 -      struct gs_port          *port = (void *)_port;
 +      struct gs_port          *port = container_of(w, struct gs_port, push);
        struct tty_struct       *tty;
        struct list_head        *queue = &port->read_queue;
        bool                    disconnect = false;
  
                        count = tty_insert_flip_string(&port->port, packet,
                                        size);
 +                      port->nbytes_to_tty += count;
                        if (count)
                                do_push = true;
                        if (count != size) {
         * this time around, there may be trouble unless there's an
         * implicit tty_unthrottle() call on its way...
         *
 -       * REVISIT we should probably add a timer to keep the tasklet
 +       * REVISIT we should probably add a timer to keep the work queue
         * from starving ... but it's not clear that case ever happens.
         */
        if (!list_empty(queue) && tty) {
                if (!test_bit(TTY_THROTTLED, &tty->flags)) {
                        if (do_push)
 -                              tasklet_schedule(&port->push);
 +                              queue_work(gserial_wq, &port->push);
                        else
                                pr_warn("ttyGS%d: RX not scheduled?\n",
                                        port->port_num);
  static void gs_read_complete(struct usb_ep *ep, struct usb_request *req)
  {
        struct gs_port  *port = ep->driver_data;
 +      unsigned long flags;
  
        /* Queue all received data until the tty layer is ready for it. */
 -      spin_lock(&port->port_lock);
 +      spin_lock_irqsave(&port->port_lock, flags);
 +      port->nbytes_from_host += req->actual;
        list_add_tail(&req->list, &port->read_queue);
 -      tasklet_schedule(&port->push);
 -      spin_unlock(&port->port_lock);
 +      queue_work(gserial_wq, &port->push);
 +      spin_unlock_irqrestore(&port->port_lock, flags);
  }
  
  static void gs_write_complete(struct usb_ep *ep, struct usb_request *req)
  {
        struct gs_port  *port = ep->driver_data;
 +      unsigned long flags;
  
 -      spin_lock(&port->port_lock);
 +      spin_lock_irqsave(&port->port_lock, flags);
 +      port->nbytes_to_host += req->actual;
        list_add(&req->list, &port->write_pool);
        port->write_started--;
  
                /* FALL THROUGH */
        case 0:
                /* normal completion */
 -              gs_start_tx(port);
 +              if (port->port_usb)
 +                      gs_start_tx(port);
                break;
  
        case -ESHUTDOWN:
                break;
        }
  
 -      spin_unlock(&port->port_lock);
 +      spin_unlock_irqrestore(&port->port_lock, flags);
  }
  
  static void gs_free_requests(struct usb_ep *ep, struct list_head *head,
  }
  
  static int gs_alloc_requests(struct usb_ep *ep, struct list_head *head,
 +              int queue_size, int req_size,
                void (*fn)(struct usb_ep *, struct usb_request *),
                int *allocated)
  {
        int                     i;
        struct usb_request      *req;
 -      int n = allocated ? QUEUE_SIZE - *allocated : QUEUE_SIZE;
 +      int n = allocated ? queue_size - *allocated : queue_size;
  
        /* Pre-allocate up to QUEUE_SIZE transfers, but if we can't
         * do quite that many this time, don't fail ... we just won't
         * be as speedy as we might otherwise be.
         */
        for (i = 0; i < n; i++) {
 -              req = gs_alloc_req(ep, ep->maxpacket, GFP_ATOMIC);
 +              req = gs_alloc_req(ep, req_size, GFP_ATOMIC);
                if (!req)
                        return list_empty(head) ? -ENOMEM : 0;
                req->complete = fn;
   */
  static int gs_start_io(struct gs_port *port)
  {
 -      struct list_head        *head = &port->read_pool;
 -      struct usb_ep           *ep = port->port_usb->out;
 +      struct list_head        *head;
 +      struct usb_ep           *ep;
        int                     status;
        unsigned                started;
  
 +      if (!port || !port->port_usb) {
 +              pr_err("Error - port or port->port_usb is NULL.\n");
 +              return -EIO;
 +      }
 +
 +      head = &port->read_pool;
 +      ep = port->port_usb->out;
 +
        /* Allocate RX and TX I/O buffers.  We can't easily do this much
         * earlier (with GFP_KERNEL) because the requests are coupled to
         * endpoints, as are the packet sizes we'll be using.  Different
         * configurations may use different endpoints with a given port;
         * and high speed vs full speed changes packet sizes too.
         */
 -      status = gs_alloc_requests(ep, head, gs_read_complete,
 -              &port->read_allocated);
 +      status = gs_alloc_requests(ep, head, RX_QUEUE_SIZE, RX_BUF_SIZE,
 +                      gs_read_complete, &port->read_allocated);
        if (status)
                return status;
  
        status = gs_alloc_requests(port->port_usb->in, &port->write_pool,
 +                      TX_QUEUE_SIZE, TX_BUF_SIZE,
                        gs_write_complete, &port->write_allocated);
        if (status) {
                gs_free_requests(ep, head, &port->read_allocated);
        port->n_read = 0;
        started = gs_start_rx(port);
  
 +      if (!port->port_usb)
 +              return -EIO;
 +
        /* unblock any pending writes into our circular buffer */
        if (started) {
                tty_wakeup(port->port.tty);
@@@ -862,7 -783,7 +862,7 @@@ static int gs_open(struct tty_struct *t
                spin_lock_irq(&port->port_lock);
  
                if (status) {
 -                      pr_debug("gs_open: ttyGS%d (%p,%p) no buffer\n",
 +                      pr_debug("gs_open: ttyGS%d (%pK,%pK) no buffer\n",
                                port->port_num, tty, file);
                        port->openclose = false;
                        goto exit_unlock_port;
                        gser->connect(gser);
        }
  
 -      pr_debug("gs_open: ttyGS%d (%p,%p)\n", port->port_num, tty, file);
 +      pr_debug("gs_open: ttyGS%d (%pK,%pK)\n", port->port_num, tty, file);
  
        status = 0;
  
@@@ -928,8 -849,7 +928,8 @@@ static void gs_close(struct tty_struct 
                goto exit;
        }
  
 -      pr_debug("gs_close: ttyGS%d (%p,%p) ...\n", port->port_num, tty, file);
 +      pr_debug("gs_close: ttyGS%d (%pK,%pK) ...\n",
 +                      port->port_num, tty, file);
  
        /* mark port as closing but in use; we can drop port lock
         * and sleep if necessary
  
        /* Iff we're disconnected, there can be no I/O in flight so it's
         * ok to free the circular buffer; else just scrub it.  And don't
 -       * let the push tasklet fire again until we're re-opened.
 +       * let the push work queue fire again until we're re-opened.
         */
        if (gser == NULL)
                gs_buf_free(&port->port_write_buf);
  
        port->openclose = false;
  
 -      pr_debug("gs_close: ttyGS%d (%p,%p) done!\n",
 +      pr_debug("gs_close: ttyGS%d (%pK,%pK) done!\n",
                        port->port_num, tty, file);
  
        wake_up(&port->close_wait);
@@@ -980,10 -900,7 +980,10 @@@ static int gs_write(struct tty_struct *
        unsigned long   flags;
        int             status;
  
 -      pr_vdebug("gs_write: ttyGS%d (%p) writing %d bytes\n",
 +      if (!port)
 +              return 0;
 +
 +      pr_vdebug("gs_write: ttyGS%d (%pK) writing %d bytes\n",
                        port->port_num, tty, count);
  
        spin_lock_irqsave(&port->port_lock, flags);
@@@ -1003,9 -920,7 +1003,9 @@@ static int gs_put_char(struct tty_struc
        unsigned long   flags;
        int             status;
  
 -      pr_vdebug("gs_put_char: (%d,%p) char=0x%x, called from %ps\n",
 +      if (!port)
 +              return 0;
 +      pr_vdebug("gs_put_char: (%d,%pK) char=0x%x, called from %pKs\n",
                port->port_num, tty, ch, __builtin_return_address(0));
  
        spin_lock_irqsave(&port->port_lock, flags);
@@@ -1020,9 -935,7 +1020,9 @@@ static void gs_flush_chars(struct tty_s
        struct gs_port  *port = tty->driver_data;
        unsigned long   flags;
  
 -      pr_vdebug("gs_flush_chars: (%d,%p)\n", port->port_num, tty);
 +      if (!port)
 +              return;
 +      pr_vdebug("gs_flush_chars: (%d,%pK)\n", port->port_num, tty);
  
        spin_lock_irqsave(&port->port_lock, flags);
        if (port->port_usb)
@@@ -1036,14 -949,12 +1036,14 @@@ static int gs_write_room(struct tty_str
        unsigned long   flags;
        int             room = 0;
  
 +      if (!port)
 +              return 0;
        spin_lock_irqsave(&port->port_lock, flags);
        if (port->port_usb)
                room = gs_buf_space_avail(&port->port_write_buf);
        spin_unlock_irqrestore(&port->port_lock, flags);
  
 -      pr_vdebug("gs_write_room: (%d,%p) room=%d\n",
 +      pr_vdebug("gs_write_room: (%d,%pK) room=%d\n",
                port->port_num, tty, room);
  
        return room;
@@@ -1059,7 -970,7 +1059,7 @@@ static int gs_chars_in_buffer(struct tt
        chars = gs_buf_data_avail(&port->port_write_buf);
        spin_unlock_irqrestore(&port->port_lock, flags);
  
 -      pr_vdebug("gs_chars_in_buffer: (%d,%p) chars=%d\n",
 +      pr_vdebug("gs_chars_in_buffer: (%d,%pK) chars=%d\n",
                port->port_num, tty, chars);
  
        return chars;
@@@ -1071,20 -982,13 +1071,20 @@@ static void gs_unthrottle(struct tty_st
        struct gs_port          *port = tty->driver_data;
        unsigned long           flags;
  
 +      /*
 +       * tty's driver data is set to NULL during port close.  Nothing
 +       * to do here.
 +       */
 +      if (!port)
 +              return;
 +
        spin_lock_irqsave(&port->port_lock, flags);
        if (port->port_usb) {
                /* Kickstart read queue processing.  We don't do xon/xoff,
                 * rts/cts, or other handshaking with the host, but if the
                 * read queue backs up enough we'll be NAKing OUT packets.
                 */
 -              tasklet_schedule(&port->push);
 +              queue_work(gserial_wq, &port->push);
                pr_vdebug("ttyGS%d: unthrottle\n", port->port_num);
        }
        spin_unlock_irqrestore(&port->port_lock, flags);
@@@ -1096,8 -1000,6 +1096,8 @@@ static int gs_break_ctl(struct tty_stru
        int             status = 0;
        struct gserial  *gser;
  
 +      if (!port)
 +              return 0;
        pr_vdebug("gs_break_ctl: ttyGS%d, send break (%d) \n",
                        port->port_num, duration);
  
        return status;
  }
  
 +static int gs_tiocmget(struct tty_struct *tty)
 +{
 +      struct gs_port  *port = tty->driver_data;
 +      struct gserial  *gser;
 +      int result = 0;
 +
 +      if (!port)
 +              return -ENODEV;
 +
 +      spin_lock_irq(&port->port_lock);
 +      gser = port->port_usb;
 +      if (!gser) {
 +              result = -ENODEV;
 +              goto fail;
 +      }
 +
 +      if (gser->get_dtr)
 +              result |= (gser->get_dtr(gser) ? TIOCM_DTR : 0);
 +
 +      if (gser->get_rts)
 +              result |= (gser->get_rts(gser) ? TIOCM_RTS : 0);
 +
 +      if (gser->serial_state & TIOCM_CD)
 +              result |= TIOCM_CD;
 +
 +      if (gser->serial_state & TIOCM_RI)
 +              result |= TIOCM_RI;
 +
 +fail:
 +      spin_unlock_irq(&port->port_lock);
 +      return result;
 +}
 +
 +static int gs_tiocmset(struct tty_struct *tty,
 +              unsigned int set, unsigned int clear)
 +{
 +      struct gs_port  *port = tty->driver_data;
 +      struct gserial *gser;
 +      int status = 0;
 +
 +      if (!port)
 +              return -ENODEV;
 +
 +      spin_lock_irq(&port->port_lock);
 +      gser = port->port_usb;
 +
 +      if (!gser) {
 +              status = -ENODEV;
 +              goto fail;
 +      }
 +
 +      if (set & TIOCM_RI) {
 +              if (gser->send_ring_indicator) {
 +                      gser->serial_state |= TIOCM_RI;
 +                      status = gser->send_ring_indicator(gser, 1);
 +              }
 +      }
 +
 +      if (clear & TIOCM_RI) {
 +              if (gser->send_ring_indicator) {
 +                      gser->serial_state &= ~TIOCM_RI;
 +                      status = gser->send_ring_indicator(gser, 0);
 +              }
 +      }
 +
 +      if (set & TIOCM_CD) {
 +              if (gser->send_carrier_detect) {
 +                      gser->serial_state |= TIOCM_CD;
 +                      status = gser->send_carrier_detect(gser, 1);
 +              }
 +      }
 +
 +      if (clear & TIOCM_CD) {
 +              if (gser->send_carrier_detect) {
 +                      gser->serial_state &= ~TIOCM_CD;
 +                      status = gser->send_carrier_detect(gser, 0);
 +              }
 +      }
 +fail:
 +      spin_unlock_irq(&port->port_lock);
 +      return status;
 +}
 +
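For reference, a minimal user-space sketch of how the new tiocmget/tiocmset handlers could be exercised. It runs on the gadget side, and it assumes the function driver exposed the port as /dev/ttyGS0 (the node name is an assumption; use whichever ttyGS* node was actually created):

/* Hypothetical gadget-side test: read the modem bits, then toggle CD.
 * Assumes /dev/ttyGS0 exists and the caller is allowed to open it.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <termios.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/ttyGS0", O_RDWR | O_NOCTTY);
        int bits = 0, cd = TIOCM_CD;

        if (fd < 0)
                return 1;

        if (ioctl(fd, TIOCMGET, &bits) == 0)    /* tty core routes this to gs_tiocmget() */
                printf("DTR=%d RTS=%d CD=%d RI=%d\n",
                       !!(bits & TIOCM_DTR), !!(bits & TIOCM_RTS),
                       !!(bits & TIOCM_CD), !!(bits & TIOCM_RI));

        ioctl(fd, TIOCMBIS, &cd);       /* gs_tiocmset(set = TIOCM_CD, clear = 0) */
        ioctl(fd, TIOCMBIC, &cd);       /* gs_tiocmset(set = 0, clear = TIOCM_CD) */
        close(fd);
        return 0;
}

Note that CD and RI are only propagated when the bound function driver supplies send_carrier_detect/send_ring_indicator callbacks; otherwise gs_tiocmset() just updates serial_state.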
  static const struct tty_operations gs_tty_ops = {
        .open =                 gs_open,
        .close =                gs_close,
        .chars_in_buffer =      gs_chars_in_buffer,
        .unthrottle =           gs_unthrottle,
        .break_ctl =            gs_break_ctl,
 +      .tiocmget =             gs_tiocmget,
 +      .tiocmset =             gs_tiocmset,
  };
  
  /*-------------------------------------------------------------------------*/
@@@ -1228,7 -1051,7 +1228,7 @@@ gs_port_alloc(unsigned port_num, struc
        init_waitqueue_head(&port->drain_wait);
        init_waitqueue_head(&port->close_wait);
  
 -      tasklet_init(&port->push, gs_rx_push, (unsigned long) port);
 +      INIT_WORK(&port->push, gs_rx_push);
  
        INIT_LIST_HEAD(&port->read_pool);
        INIT_LIST_HEAD(&port->read_queue);
        return ret;
  }
  
 +#if defined(CONFIG_DEBUG_FS)
 +
 +#define BUF_SIZE      512
 +
 +static ssize_t debug_read_status(struct file *file, char __user *ubuf,
 +                              size_t count, loff_t *ppos)
 +{
 +      struct gs_port *ui_dev = file->private_data;
 +      struct tty_struct       *tty;
 +      struct gserial          *gser;
 +      char *buf;
 +      unsigned long flags;
 +      int i = 0;
 +      int ret;
 +      int result = 0;
 +
 +      if (!ui_dev)
 +              return -EINVAL;
 +
 +      tty = ui_dev->port.tty;
 +      gser = ui_dev->port_usb;
 +
 +      buf = kzalloc(sizeof(char) * BUF_SIZE, GFP_KERNEL);
 +      if (!buf)
 +              return -ENOMEM;
 +
 +      spin_lock_irqsave(&ui_dev->port_lock, flags);
 +
 +      i += scnprintf(buf + i, BUF_SIZE - i,
 +                      "nbytes_from_host: %lu\n", ui_dev->nbytes_from_host);
 +
 +      i += scnprintf(buf + i, BUF_SIZE - i,
 +                      "nbytes_to_tty: %lu\n", ui_dev->nbytes_to_tty);
 +
 +      i += scnprintf(buf + i, BUF_SIZE - i, "nbytes_with_usb_OUT_txr: %lu\n",
 +                      (ui_dev->nbytes_from_host - ui_dev->nbytes_to_tty));
 +
 +      i += scnprintf(buf + i, BUF_SIZE - i,
 +                      "nbytes_from_tty: %lu\n", ui_dev->nbytes_from_tty);
 +
 +      i += scnprintf(buf + i, BUF_SIZE - i,
 +                      "nbytes_to_host: %lu\n", ui_dev->nbytes_to_host);
 +
 +      i += scnprintf(buf + i, BUF_SIZE - i, "nbytes_with_usb_IN_txr: %lu\n",
 +                      (ui_dev->nbytes_from_tty - ui_dev->nbytes_to_host));
 +
 +      if (tty)
 +              i += scnprintf(buf + i, BUF_SIZE - i,
 +                      "tty_flags: %lu\n", tty->flags);
 +
 +      if (gser && gser->get_dtr) {
 +              result |= (gser->get_dtr(gser) ? TIOCM_DTR : 0);
 +              i += scnprintf(buf + i, BUF_SIZE - i,
 +                      "DTR_status: %d\n", result);
 +      }
 +
 +      spin_unlock_irqrestore(&ui_dev->port_lock, flags);
 +      ret = simple_read_from_buffer(ubuf, count, ppos, buf, i);
 +      kfree(buf);
 +      return ret;
 +}
 +
 +static ssize_t debug_write_reset(struct file *file, const char __user *buf,
 +              size_t count, loff_t *ppos)
 +{
 +      struct gs_port *ui_dev = file->private_data;
 +      unsigned long flags;
 +
 +      if (!ui_dev)
 +              return -EINVAL;
 +
 +      spin_lock_irqsave(&ui_dev->port_lock, flags);
 +      ui_dev->nbytes_from_host = ui_dev->nbytes_to_tty =
 +              ui_dev->nbytes_from_tty = ui_dev->nbytes_to_host = 0;
 +      spin_unlock_irqrestore(&ui_dev->port_lock, flags);
 +
 +      return count;
 +}
 +
 +static int serial_debug_open(struct inode *inode, struct file *file)
 +{
 +      file->private_data = inode->i_private;
 +      return 0;
 +}
 +
 +static const struct file_operations debug_rst_ops = {
 +      .open = serial_debug_open,
 +      .write = debug_write_reset,
 +};
 +
 +static const struct file_operations debug_adb_ops = {
 +      .open = serial_debug_open,
 +      .read = debug_read_status,
 +};
 +
 +static struct dentry *gs_dent;
 +static void usb_debugfs_init(struct gs_port *ui_dev, int port_num)
 +{
 +      char buf[48];
 +
 +      if (!ui_dev)
 +              return;
 +
 +      snprintf(buf, 48, "usb_serial%d", port_num);
 +      gs_dent = debugfs_create_dir(buf, 0);
 +      if (!gs_dent || IS_ERR(gs_dent))
 +              return;
 +
 +      debugfs_create_file("readstatus", 0444, gs_dent, ui_dev,
 +                      &debug_adb_ops);
 +      debugfs_create_file("reset", S_IRUGO | S_IWUSR,
 +                      gs_dent, ui_dev, &debug_rst_ops);
 +}
 +
 +static void usb_debugfs_remove(void)
 +{
 +      debugfs_remove_recursive(gs_dent);
 +}
 +#else
 +static inline void usb_debugfs_init(struct gs_port *ui_dev, int port_num) {}
 +static inline void usb_debugfs_remove(void) {}
 +#endif
 +
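A small sketch of how the byte counters exported above might be read and reset from user space. The path assumes debugfs is mounted at /sys/kernel/debug and that this is port 0; both are assumptions, adjust as needed:

/* Hypothetical consumer of the usb_serial0 debugfs files created above.
 * The mount point /sys/kernel/debug and port number 0 are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[512];
        ssize_t n;
        int fd;

        fd = open("/sys/kernel/debug/usb_serial0/readstatus", O_RDONLY);
        if (fd < 0)
                return 1;
        n = read(fd, buf, sizeof(buf) - 1);     /* served by debug_read_status() */
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);
        }
        close(fd);

        fd = open("/sys/kernel/debug/usb_serial0/reset", O_WRONLY);
        if (fd >= 0) {
                write(fd, "1", 1);              /* debug_write_reset() zeroes the counters */
                close(fd);
        }
        return 0;
}

The readstatus output mirrors debug_read_status() above: the per-direction byte counts, the derived in-flight deltas, the tty flags, and the DTR state when the function driver provides get_dtr.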
  static int gs_closed(struct gs_port *port)
  {
        int cond;
  
  static void gserial_free_port(struct gs_port *port)
  {
 -      tasklet_kill(&port->push);
 +      cancel_work_sync(&port->push);
        /* wait for old opens to finish */
        wait_event(port->close_wait, gs_closed(port));
        WARN_ON(port->port_usb != NULL);
@@@ -1437,8 -1137,10 +1437,10 @@@ int gserial_alloc_line(unsigned char *l
                                __func__, port_num, PTR_ERR(tty_dev));
  
                ret = PTR_ERR(tty_dev);
+               mutex_lock(&ports[port_num].lock);
                port = ports[port_num].port;
                ports[port_num].port = NULL;
+               mutex_unlock(&ports[port_num].lock);
                gserial_free_port(port);
                goto err;
        }
@@@ -1582,9 -1284,6 +1584,9 @@@ void gserial_disconnect(struct gserial 
        port->read_allocated = port->read_started =
                port->write_allocated = port->write_started = 0;
  
 +      port->nbytes_from_host = port->nbytes_to_tty =
 +              port->nbytes_from_tty = port->nbytes_to_host = 0;
 +
        spin_unlock_irqrestore(&port->port_lock, flags);
  }
  EXPORT_SYMBOL_GPL(gserial_disconnect);
@@@ -1604,8 -1303,7 +1606,8 @@@ static int userial_init(void
  
        gs_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
        gs_tty_driver->subtype = SERIAL_TYPE_NORMAL;
 -      gs_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
 +      gs_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV
 +                              | TTY_DRIVER_RESET_TERMIOS;
        gs_tty_driver->init_termios = tty_std_termios;
  
        /* 9600-8-N-1 ... matches defaults expected by "usbser.sys" on
        for (i = 0; i < MAX_U_SERIAL_PORTS; i++)
                mutex_init(&ports[i].lock);
  
 +      gserial_wq = create_singlethread_workqueue("k_gserial");
 +      if (!gserial_wq) {
 +              status = -ENOMEM;
 +              goto fail;
 +      }
 +
        /* export the driver ... */
        status = tty_register_driver(gs_tty_driver);
        if (status) {
                goto fail;
        }
  
 +      for (i = 0; i < MAX_U_SERIAL_PORTS; i++)
 +              usb_debugfs_init(ports[i].port, i);
 +
        pr_debug("%s: registered %d ttyGS* device%s\n", __func__,
                        MAX_U_SERIAL_PORTS,
                        (MAX_U_SERIAL_PORTS == 1) ? "" : "s");
        return status;
  fail:
        put_tty_driver(gs_tty_driver);
 +      if (gserial_wq)
 +              destroy_workqueue(gserial_wq);
        gs_tty_driver = NULL;
        return status;
  }
@@@ -1654,8 -1341,6 +1656,8 @@@ module_init(userial_init)
  
  static void userial_cleanup(void)
  {
 +      usb_debugfs_remove();
 +      destroy_workqueue(gserial_wq);
        tty_unregister_driver(gs_tty_driver);
        put_tty_driver(gs_tty_driver);
        gs_tty_driver = NULL;
@@@ -20,7 -20,7 +20,7 @@@
   * Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   */
  
 -
 +#include <linux/gfp.h>
  #include <linux/slab.h>
  #include <asm/unaligned.h>
  
@@@ -376,6 -376,10 +376,6 @@@ static int xhci_stop_device(struct xhci
        int i;
  
        ret = 0;
 -      virt_dev = xhci->devs[slot_id];
 -      if (!virt_dev)
 -              return -ENODEV;
 -
        cmd = xhci_alloc_command(xhci, false, true, GFP_NOIO);
        if (!cmd) {
                xhci_dbg(xhci, "Couldn't allocate command structure.\n");
        }
  
        spin_lock_irqsave(&xhci->lock, flags);
 +      virt_dev = xhci->devs[slot_id];
 +      if (!virt_dev) {
 +              spin_unlock_irqrestore(&xhci->lock, flags);
 +              xhci_free_command(xhci, cmd);
 +              return -ENODEV;
 +      }
 +
        for (i = LAST_EP_INDEX; i > 0; i--) {
                if (virt_dev->eps[i].ring && virt_dev->eps[i].ring->dequeue) {
                        struct xhci_command *command;
@@@ -707,7 -704,7 +707,7 @@@ static u32 xhci_get_port_status(struct 
                struct xhci_bus_state *bus_state,
                __le32 __iomem **port_array,
                u16 wIndex, u32 raw_port_status,
-               unsigned long flags)
+               unsigned long *flags)
        __releases(&xhci->lock)
        __acquires(&xhci->lock)
  {
                        status |= USB_PORT_STAT_C_BH_RESET << 16;
                if ((raw_port_status & PORT_CEC))
                        status |= USB_PORT_STAT_C_CONFIG_ERROR << 16;
+               /* USB3 remote wake resume signaling completed */
+               if (bus_state->port_remote_wakeup & (1 << wIndex) &&
+                   (raw_port_status & PORT_PLS_MASK) != XDEV_RESUME &&
+                   (raw_port_status & PORT_PLS_MASK) != XDEV_RECOVERY) {
+                       bus_state->port_remote_wakeup &= ~(1 << wIndex);
+                       usb_hcd_end_port_resume(&hcd->self, wIndex);
+               }
        }
  
        if (hcd->speed < HCD_USB3) {
                        xhci_set_link_state(xhci, port_array, wIndex,
                                        XDEV_U0);
  
-                       spin_unlock_irqrestore(&xhci->lock, flags);
+                       spin_unlock_irqrestore(&xhci->lock, *flags);
                        time_left = wait_for_completion_timeout(
                                        &bus_state->rexit_done[wIndex],
                                        msecs_to_jiffies(
                                                XHCI_MAX_REXIT_TIMEOUT_MS));
-                       spin_lock_irqsave(&xhci->lock, flags);
+                       spin_lock_irqsave(&xhci->lock, *flags);
  
                        if (time_left) {
                                slot_id = xhci_find_slot_id_by_port(hcd,
        return status;
  }
  
 +static void xhci_single_step_completion(struct urb *urb)
 +{
 +      struct completion *done = urb->context;
 +
 +      complete(done);
 +}
 +
 +/*
 + * Allocate a URB and initialize its various fields.
 + * This API is used by the single_step_set_feature test of
 + * EHSET, where the IN packet of the GetDescriptor request is
 + * sent 15 seconds after the SETUP packet.
 + * Returns NULL on failure.
 + */
 +static struct urb *xhci_request_single_step_set_feature_urb(
 +              struct usb_device *udev,
 +              void *dr,
 +              void *buf,
 +              struct completion *done)
 +{
 +      struct urb *urb;
 +      struct usb_hcd *hcd = bus_to_hcd(udev->bus);
 +      struct usb_host_endpoint *ep;
 +
 +      urb = usb_alloc_urb(0, GFP_KERNEL);
 +      if (!urb)
 +              return NULL;
 +
 +      urb->pipe = usb_rcvctrlpipe(udev, 0);
 +      ep = udev->ep_in[usb_pipeendpoint(urb->pipe)];
 +      if (!ep) {
 +              usb_free_urb(urb);
 +              return NULL;
 +      }
 +
 +      /*
 +       * Initialize the various URB fields as these are used by the HCD
 +       * driver to queue it and as well as when completion happens.
 +       */
 +      urb->ep = ep;
 +      urb->dev = udev;
 +      urb->setup_packet = dr;
 +      urb->transfer_buffer = buf;
 +      urb->transfer_buffer_length = USB_DT_DEVICE_SIZE;
 +      urb->complete = xhci_single_step_completion;
 +      urb->status = -EINPROGRESS;
 +      urb->actual_length = 0;
 +      urb->transfer_flags = URB_DIR_IN;
 +      usb_get_urb(urb);
 +      atomic_inc(&urb->use_count);
 +      atomic_inc(&urb->dev->urbnum);
 +      usb_hcd_map_urb_for_dma(hcd, urb, GFP_KERNEL);
 +      urb->context = done;
 +      return urb;
 +}
 +
 +/*
 + * This function implements the USB_PORT_FEAT_TEST handling of the
 + * SINGLE_STEP_SET_FEATURE test mode as defined in the Embedded
 + * High-Speed Electrical Test (EHSET) specification. This simply
 + * issues a GetDescriptor control transfer, with an inserted 15-second
 + * delay after the end of the SETUP stage and before the IN token of
 + * the DATA stage is set. The idea is that this gives the test operator
 + * enough time to configure the oscilloscope to perform a measurement
 + * of the response time between the DATA and ACK packets that follow.
 + */
 +static int xhci_ehset_single_step_set_feature(struct usb_hcd *hcd, int port)
 +{
 +      int retval;
 +      struct usb_ctrlrequest *dr;
 +      struct urb *urb;
 +      struct usb_device *udev;
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 +      struct usb_device_descriptor *buf;
 +      unsigned long flags;
 +      DECLARE_COMPLETION_ONSTACK(done);
 +
 +      /* Obtain udev of the rhub's child port */
 +      udev = usb_hub_find_child(hcd->self.root_hub, port);
 +      if (!udev) {
 +              xhci_err(xhci, "No device attached to the RootHub\n");
 +              return -ENODEV;
 +      }
 +      buf = kmalloc(USB_DT_DEVICE_SIZE, GFP_KERNEL);
 +      if (!buf)
 +              return -ENOMEM;
 +
 +      dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL);
 +      if (!dr) {
 +              kfree(buf);
 +              return -ENOMEM;
 +      }
 +
 +      /* Fill Setup packet for GetDescriptor */
 +      dr->bRequestType = USB_DIR_IN;
 +      dr->bRequest = USB_REQ_GET_DESCRIPTOR;
 +      dr->wValue = cpu_to_le16(USB_DT_DEVICE << 8);
 +      dr->wIndex = 0;
 +      dr->wLength = cpu_to_le16(USB_DT_DEVICE_SIZE);
 +      urb = xhci_request_single_step_set_feature_urb(udev, dr, buf, &done);
 +      if (!urb) {
 +              retval = -ENOMEM;
 +              goto cleanup;
 +      }
 +
 +      /* Now complete just the SETUP stage */
 +      spin_lock_irqsave(&xhci->lock, flags);
 +      retval = xhci_submit_single_step_set_feature(hcd, urb, 1);
 +      spin_unlock_irqrestore(&xhci->lock, flags);
 +      if (retval)
 +              goto out1;
 +
 +      if (!wait_for_completion_timeout(&done, msecs_to_jiffies(2000))) {
 +              usb_kill_urb(urb);
 +              retval = -ETIMEDOUT;
 +              xhci_err(xhci, "%s SETUP stage timed out on ep0\n", __func__);
 +              goto out1;
 +      }
 +
 +      /* Sleep for 15 seconds; HC will send SOFs during this period */
 +      msleep(15 * 1000);
 +
 +      /* Complete remaining DATA and status stages. Re-use same URB */
 +      urb->status = -EINPROGRESS;
 +      usb_get_urb(urb);
 +      atomic_inc(&urb->use_count);
 +      atomic_inc(&urb->dev->urbnum);
 +
 +      spin_lock_irqsave(&xhci->lock, flags);
 +      retval = xhci_submit_single_step_set_feature(hcd, urb, 0);
 +      spin_unlock_irqrestore(&xhci->lock, flags);
 +      if (!retval && !wait_for_completion_timeout(&done,
 +                                              msecs_to_jiffies(2000))) {
 +              usb_kill_urb(urb);
 +              retval = -ETIMEDOUT;
 +              xhci_err(xhci, "%s IN stage timed out on ep0\n", __func__);
 +      }
 +out1:
 +      usb_free_urb(urb);
 +cleanup:
 +      kfree(dr);
 +      kfree(buf);
 +      return retval;
 +}
 +
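For orientation, a rough sketch of the SetPortFeature(PORT_TEST) request that reaches this path with test selector 6, matching the wIndex decoding in xhci_hub_control() further below. It assumes the xHCI root hub can be opened through usbfs (the /dev/bus/usb path and port number are assumptions) and that the platform permits hub class requests from user space; in practice this request is normally issued from a kernel test driver such as drivers/usb/misc/ehset.c.

/* Hypothetical trigger for SINGLE_STEP_SET_FEATURE (test selector 6).
 * DEVNODE is an assumption: the usbfs node of the xHCI root hub under test.
 */
#include <fcntl.h>
#include <linux/usbdevice_fs.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define DEVNODE "/dev/bus/usb/002/001"

int main(void)
{
        struct usbdevfs_ctrltransfer ctrl = {
                .bRequestType = 0x23,           /* host-to-device, class, other (port) */
                .bRequest     = 0x03,           /* SET_FEATURE */
                .wValue       = 21,             /* USB_PORT_FEAT_TEST */
                .wIndex       = (6 << 8) | 1,   /* selector 6, root-hub port 1 */
                .wLength      = 0,
                .timeout      = 60 * 1000,      /* the handler sleeps 15 s, allow for it */
                .data         = NULL,
        };
        int fd = open(DEVNODE, O_RDWR);

        if (fd < 0)
                return 1;
        /* Routed via xhci_hub_control() to xhci_ehset_single_step_set_feature(). */
        if (ioctl(fd, USBDEVFS_CONTROL, &ctrl) < 0) {
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}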
  int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
                u16 wIndex, char *buf, u16 wLength)
  {
        u16 link_state = 0;
        u16 wake_mask = 0;
        u16 timeout = 0;
 +      u16 test_mode = 0;
  
        max_ports = xhci_get_ports(hcd, &port_array);
        bus_state = &xhci->bus_state[hcd_index(hcd)];
                        break;
                }
                status = xhci_get_port_status(hcd, bus_state, port_array,
-                               wIndex, temp, flags);
+                               wIndex, temp, &flags);
                if (status == 0xffffffff)
                        goto error;
  
                        link_state = (wIndex & 0xff00) >> 3;
                if (wValue == USB_PORT_FEAT_REMOTE_WAKE_MASK)
                        wake_mask = wIndex & 0xff00;
 -              /* The MSB of wIndex is the U1/U2 timeout */
 -              timeout = (wIndex & 0xff00) >> 8;
 +              /* The MSB of wIndex is the U1/U2 timeout OR TEST mode */
 +              test_mode = timeout = (wIndex & 0xff00) >> 8;
                wIndex &= 0xff;
                if (!wIndex || wIndex > max_ports)
                        goto error;
                                temp = readl(port_array[wIndex]);
                                break;
                        }
 +
 +                      /*
 +                       * For xHCI 1.1 according to section 4.19.1.2.4.1 a
 +                       * root hub port's transition to compliance mode upon
 +                       * detecting LFPS timeout may be controlled by a
 +                       * Compliance Transition Enabled (CTE) flag (not
 +                       * software visible). This flag is set by writing 0xA
 +                       * to PORTSC PLS field which will allow transition to
 +                       * compliance mode the next time LFPS timeout is
 +                       * encountered. A warm reset will clear it.
 +                       *
 +                       * The CTE flag is only supported if the HCCPARAMS2 CTC
 +                       * flag is set, otherwise, the compliance substate is
 +                       * automatically entered as on 1.0 and prior.
 +                       */
 +                      if (link_state == USB_SS_PORT_LS_COMP_MOD) {
 +                              if (!HCC2_CTC(xhci->hcc_params2)) {
 +                                      xhci_dbg(xhci, "CTC flag is 0, port already supports entering compliance mode\n");
 +                                      break;
 +                              }
 +
 +                              if ((temp & PORT_CONNECT)) {
 +                                      xhci_warn(xhci, "Can't set compliance mode when port is connected\n");
 +                                      goto error;
 +                              }
 +
 +                              xhci_dbg(xhci, "Enable compliance mode transition for port %d\n",
 +                                              wIndex);
 +                              xhci_set_link_state(xhci, port_array, wIndex,
 +                                              link_state);
 +                              temp = readl(port_array[wIndex]);
 +                              break;
 +                      }
 +
                        /* Port must be enabled */
                        if (!(temp & PORT_PE)) {
                                retval = -ENODEV;
                        temp |= PORT_U2_TIMEOUT(timeout);
                        writel(temp, port_array[wIndex] + PORTPMSC);
                        break;
 +              case USB_PORT_FEAT_TEST:
 +                      slot_id = xhci_find_slot_id_by_port(hcd, xhci,
 +                                                      wIndex + 1);
 +                      if (test_mode && test_mode <= 5) {
 +                              /* unlock to execute stop endpoint commands */
 +                              spin_unlock_irqrestore(&xhci->lock, flags);
 +                              xhci_stop_device(xhci, slot_id, 1);
 +                              spin_lock_irqsave(&xhci->lock, flags);
 +                              xhci_halt(xhci);
 +
 +                              temp = readl_relaxed(port_array[wIndex] +
 +                                                              PORTPMSC);
 +                              temp |= test_mode << 28;
 +                              writel_relaxed(temp, port_array[wIndex] +
 +                                                              PORTPMSC);
 +                              /* to make sure above write goes through */
 +                              mb();
 +                      } else if (test_mode == 6) {
 +                              spin_unlock_irqrestore(&xhci->lock, flags);
 +                              retval = xhci_ehset_single_step_set_feature(hcd,
 +                                                                      wIndex);
 +                              spin_lock_irqsave(&xhci->lock, flags);
 +                      } else {
 +                              goto error;
 +                      }
 +                      break;
                default:
                        goto error;
                }
                                xhci_set_link_state(xhci, port_array, wIndex,
                                                        XDEV_RESUME);
                                spin_unlock_irqrestore(&xhci->lock, flags);
 -                              msleep(USB_RESUME_TIMEOUT);
 +                              usleep_range(21000, 21500);
                                spin_lock_irqsave(&xhci->lock, flags);
                                xhci_set_link_state(xhci, port_array, wIndex,
                                                        XDEV_U0);
@@@ -1695,7 -1494,7 +1703,7 @@@ int xhci_bus_resume(struct usb_hcd *hcd
  
        if (need_usb2_u3_exit) {
                spin_unlock_irqrestore(&xhci->lock, flags);
 -              msleep(USB_RESUME_TIMEOUT);
 +              usleep_range(21000, 21500);
                spin_lock_irqsave(&xhci->lock, flags);
        }
  
@@@ -1064,7 -1064,7 +1064,7 @@@ int xhci_alloc_virt_device(struct xhci_
  
        /* Point to output device context in dcbaa. */
        xhci->dcbaa->dev_context_ptrs[slot_id] = cpu_to_le64(dev->out_ctx->dma);
 -      xhci_dbg(xhci, "Set slot id %d dcbaa entry %p to 0x%llx\n",
 +      xhci_dbg(xhci, "Set slot id %d dcbaa entry %pK to 0x%llx\n",
                 slot_id,
                 &xhci->dcbaa->dev_context_ptrs[slot_id],
                 le64_to_cpu(xhci->dcbaa->dev_context_ptrs[slot_id]));
@@@ -1235,7 -1235,7 +1235,7 @@@ int xhci_setup_addressable_virt_dev(str
                if (udev->tt->multi)
                        slot_ctx->dev_info |= cpu_to_le32(DEV_MTT);
        }
 -      xhci_dbg(xhci, "udev->tt = %p\n", udev->tt);
 +      xhci_dbg(xhci, "udev->tt = %pK\n", udev->tt);
        xhci_dbg(xhci, "udev->ttport = 0x%x\n", udev->ttport);
  
        /* Step 4 - ring already allocated */
@@@ -1527,8 -1527,6 +1527,8 @@@ int xhci_endpoint_init(struct xhci_hcd 
                }
                break;
        case USB_SPEED_FULL:
 +              if (usb_endpoint_xfer_bulk(&ep->desc) && max_packet < 8)
 +                      max_packet = 8;
        case USB_SPEED_LOW:
                break;
        default:
@@@ -1842,151 -1840,25 +1842,151 @@@ void xhci_free_command(struct xhci_hcd 
        kfree(command);
  }
  
 -void xhci_mem_cleanup(struct xhci_hcd *xhci)
 +void xhci_handle_sec_intr_events(struct xhci_hcd *xhci, int intr_num)
  {
 +      union xhci_trb *erdp_trb, *current_trb;
 +      struct xhci_segment     *seg;
 +      u64 erdp_reg;
 +      u32 iman_reg;
 +      dma_addr_t deq;
 +      unsigned long segment_offset;
 +
 +      /* disable irq, ack pending interrupt and ack all pending events */
 +
 +      iman_reg =
 +              readl_relaxed(&xhci->sec_ir_set[intr_num]->irq_pending);
 +      iman_reg &= ~IMAN_IE;
 +      writel_relaxed(iman_reg,
 +                      &xhci->sec_ir_set[intr_num]->irq_pending);
 +      iman_reg =
 +              readl_relaxed(&xhci->sec_ir_set[intr_num]->irq_pending);
 +      if (iman_reg & IMAN_IP)
 +              writel_relaxed(iman_reg,
 +                      &xhci->sec_ir_set[intr_num]->irq_pending);
 +
 +      /* last acked event trb is in erdp reg  */
 +      erdp_reg =
 +              xhci_read_64(xhci, &xhci->sec_ir_set[intr_num]->erst_dequeue);
 +      deq = (dma_addr_t)(erdp_reg & ~ERST_PTR_MASK);
 +      if (!deq) {
 +              pr_debug("%s: event ring handling not required\n", __func__);
 +              return;
 +      }
 +
 +      seg = xhci->sec_event_ring[intr_num]->first_seg;
 +      segment_offset = deq - seg->dma;
 +
 +      /* find out virtual address of the last acked event trb */
 +      erdp_trb = current_trb = &seg->trbs[0] +
 +                              (segment_offset/sizeof(*current_trb));
 +
 +      /* read cycle state of the last acked trb to find out CCS */
 +      xhci->sec_event_ring[intr_num]->cycle_state =
 +                              (le32_to_cpu(current_trb->event_cmd.flags) & TRB_CYCLE);
 +
 +      while (1) {
 +              /* last trb of the event ring: toggle cycle state */
 +              if (current_trb == &seg->trbs[TRBS_PER_SEGMENT - 1]) {
 +                      xhci->sec_event_ring[intr_num]->cycle_state ^= 1;
 +                      current_trb = &seg->trbs[0];
 +              } else {
 +                      current_trb++;
 +              }
 +
 +              /* cycle state transition */
 +              if ((le32_to_cpu(current_trb->event_cmd.flags) & TRB_CYCLE) !=
 +                  xhci->sec_event_ring[intr_num]->cycle_state)
 +                      break;
 +      }
 +
 +      if (erdp_trb != current_trb) {
 +              deq =
 +              xhci_trb_virt_to_dma(xhci->sec_event_ring[intr_num]->deq_seg,
 +                                      current_trb);
 +              if (deq == 0)
 +                      xhci_warn(xhci,
 +                              "WARN invalid SW event ring dequeue ptr.\n");
 +              /* Update HC event ring dequeue pointer */
 +              erdp_reg &= ERST_PTR_MASK;
 +              erdp_reg |= ((u64) deq & (u64) ~ERST_PTR_MASK);
 +      }
 +
 +      /* Clear the event handler busy flag (RW1C); event ring is empty. */
 +      erdp_reg |= ERST_EHB;
 +      xhci_write_64(xhci, erdp_reg,
 +                      &xhci->sec_ir_set[intr_num]->erst_dequeue);
 +}
 +
 +int xhci_sec_event_ring_cleanup(struct usb_hcd *hcd, unsigned intr_num)
 +{
 +      int size;
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
        struct device   *dev = xhci_to_hcd(xhci)->self.controller;
 +
 +      if (intr_num >= xhci->max_interrupters) {
 +              xhci_err(xhci, "invalid secondary interrupter num %d\n",
 +                      intr_num);
 +              return -EINVAL;
 +      }
 +
 +      size =
 +      sizeof(struct xhci_erst_entry)*(xhci->sec_erst[intr_num].num_entries);
 +      if (xhci->sec_erst[intr_num].entries) {
 +              xhci_handle_sec_intr_events(xhci, intr_num);
 +              dma_free_coherent(dev, size, xhci->sec_erst[intr_num].entries,
 +                              xhci->sec_erst[intr_num].erst_dma_addr);
 +              xhci->sec_erst[intr_num].entries = NULL;
 +      }
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed SEC ERST#%d",
 +              intr_num);
 +      if (xhci->sec_event_ring[intr_num])
 +              xhci_ring_free(xhci, xhci->sec_event_ring[intr_num]);
 +
 +      xhci->sec_event_ring[intr_num] = NULL;
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "Freed sec event ring");
 +
 +      return 0;
 +}
 +
 +void xhci_event_ring_cleanup(struct xhci_hcd *xhci)
 +{
        int size;
 -      int i, j, num_ports;
 +      unsigned int i;
 +      struct device   *dev = xhci_to_hcd(xhci)->self.controller;
  
 -      cancel_delayed_work_sync(&xhci->cmd_timer);
 +      /* sec event ring clean up */
 +      for (i = 1; i < xhci->max_interrupters; i++)
 +              xhci_sec_event_ring_cleanup(xhci_to_hcd(xhci), i);
  
 -      /* Free the Event Ring Segment Table and the actual Event Ring */
 +      kfree(xhci->sec_ir_set);
 +      xhci->sec_ir_set = NULL;
 +      kfree(xhci->sec_erst);
 +      xhci->sec_erst = NULL;
 +      kfree(xhci->sec_event_ring);
 +      xhci->sec_event_ring = NULL;
 +
 +      /* primary event ring clean up */
        size = sizeof(struct xhci_erst_entry)*(xhci->erst.num_entries);
        if (xhci->erst.entries)
                dma_free_coherent(dev, size,
                                xhci->erst.entries, xhci->erst.erst_dma_addr);
        xhci->erst.entries = NULL;
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed ERST");
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed primary ERST");
        if (xhci->event_ring)
                xhci_ring_free(xhci, xhci->event_ring);
        xhci->event_ring = NULL;
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed event ring");
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed primary event ring");
 +}
 +
 +void xhci_mem_cleanup(struct xhci_hcd *xhci)
 +{
 +      struct device   *dev = xhci_to_hcd(xhci)->self.controller;
 +      int i, j, num_ports;
 +
 +      cancel_delayed_work_sync(&xhci->cmd_timer);
 +
 +      xhci_event_ring_cleanup(xhci);
  
        if (xhci->lpm_command)
                xhci_free_command(xhci, xhci->lpm_command);
@@@ -2056,10 -1928,14 +2056,14 @@@ no_bw
        kfree(xhci->port_array);
        kfree(xhci->rh_bw);
        kfree(xhci->ext_caps);
+       kfree(xhci->usb2_rhub.psi);
+       kfree(xhci->usb3_rhub.psi);
  
        xhci->usb2_ports = NULL;
        xhci->usb3_ports = NULL;
        xhci->port_array = NULL;
+       xhci->usb2_rhub.psi = NULL;
+       xhci->usb3_rhub.psi = NULL;
        xhci->rh_bw = NULL;
        xhci->ext_caps = NULL;
  
@@@ -2088,15 -1964,15 +2092,15 @@@ static int xhci_test_trb_in_td(struct x
        if (seg != result_seg) {
                xhci_warn(xhci, "WARN: %s TRB math test %d failed!\n",
                                test_name, test_number);
 -              xhci_warn(xhci, "Tested TRB math w/ seg %p and "
 +              xhci_warn(xhci, "Tested TRB math w/ seg %pK and "
                                "input DMA 0x%llx\n",
                                input_seg,
                                (unsigned long long) input_dma);
 -              xhci_warn(xhci, "starting TRB %p (0x%llx DMA), "
 -                              "ending TRB %p (0x%llx DMA)\n",
 +              xhci_warn(xhci, "starting TRB %pK (0x%llx DMA), "
 +                              "ending TRB %pK (0x%llx DMA)\n",
                                start_trb, start_dma,
                                end_trb, end_dma);
 -              xhci_warn(xhci, "Expected seg %p, got seg %p\n",
 +              xhci_warn(xhci, "Expected seg %pK, got seg %pK\n",
                                result_seg, seg);
                trb_in_td(xhci, input_seg, start_trb, end_trb, input_dma,
                          true);
@@@ -2227,6 -2103,30 +2231,6 @@@ static int xhci_check_trb_in_td_math(st
        return 0;
  }
  
 -static void xhci_set_hc_event_deq(struct xhci_hcd *xhci)
 -{
 -      u64 temp;
 -      dma_addr_t deq;
 -
 -      deq = xhci_trb_virt_to_dma(xhci->event_ring->deq_seg,
 -                      xhci->event_ring->dequeue);
 -      if (deq == 0 && !in_interrupt())
 -              xhci_warn(xhci, "WARN something wrong with SW event ring "
 -                              "dequeue ptr.\n");
 -      /* Update HC event ring dequeue pointer */
 -      temp = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue);
 -      temp &= ERST_PTR_MASK;
 -      /* Don't clear the EHB bit (which is RW1C) because
 -       * there might be more events to service.
 -       */
 -      temp &= ~ERST_EHB;
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Write event ring dequeue pointer, "
 -                      "preserving EHB bit");
 -      xhci_write_64(xhci, ((u64) deq & (u64) ~ERST_PTR_MASK) | temp,
 -                      &xhci->ir_set->erst_dequeue);
 -}
 -
  static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports,
                __le32 __iomem *addr, u8 major_revision, int max_caps)
  {
                rhub = &xhci->usb2_rhub;
        } else {
                xhci_warn(xhci, "Ignoring unknown port speed, "
 -                              "Ext Cap %p, revision = 0x%x\n",
 +                              "Ext Cap %pK, revision = 0x%x\n",
                                addr, major_revision);
                /* Ignoring port protocol we can't understand. FIXME */
                return;
        port_offset = XHCI_EXT_PORT_OFF(temp);
        port_count = XHCI_EXT_PORT_COUNT(temp);
        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "Ext Cap %p, port offset = %u, "
 +                      "Ext Cap %pK, port offset = %u, "
                        "count = %u, revision = 0x%x",
                        addr, port_offset, port_count, major_revision);
        /* Port count includes the current port offset */
        for (i = port_offset; i < (port_offset + port_count); i++) {
                /* Duplicate entry.  Ignore the port if the revisions differ. */
                if (xhci->port_array[i] != 0) {
 -                      xhci_warn(xhci, "Duplicate port entry, Ext Cap %p,"
 +                      xhci_warn(xhci, "Duplicate port entry, Ext Cap %pK,"
                                        " port %u\n", addr, i);
                        xhci_warn(xhci, "Port was marked as USB %u, "
                                        "duplicated as USB %u\n",
@@@ -2473,7 -2373,7 +2477,7 @@@ static int xhci_setup_port_arrays(struc
                                NUM_PORT_REGS*i;
                        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
                                        "USB 2.0 port at index %u, "
 -                                      "addr = %p", i,
 +                                      "addr = %pK", i,
                                        xhci->usb2_ports[port_index]);
                        port_index++;
                        if (port_index == xhci->num_usb2_ports)
                                        NUM_PORT_REGS*i;
                                xhci_dbg_trace(xhci, trace_xhci_dbg_init,
                                                "USB 3.0 port at index %u, "
 -                                              "addr = %p", i,
 +                                              "addr = %pK", i,
                                                xhci->usb3_ports[port_index]);
                                port_index++;
                                if (port_index == xhci->num_usb3_ports)
        return 0;
  }
  
 +int xhci_event_ring_setup(struct xhci_hcd *xhci, struct xhci_ring **er,
 +      struct xhci_intr_reg __iomem *ir_set, struct xhci_erst *erst,
 +      unsigned int intr_num, gfp_t flags)
 +{
 +      dma_addr_t dma, deq;
 +      u64 val_64;
 +      unsigned int val;
 +      struct xhci_segment *seg;
 +      struct device *dev = xhci_to_hcd(xhci)->self.controller;
 +
 +      *er = xhci_ring_alloc(xhci, ERST_NUM_SEGS, 1,
 +                              TYPE_EVENT, flags);
 +      if (!*er)
 +              return -ENOMEM;
 +
 +      erst->entries = dma_alloc_coherent(dev,
 +                      sizeof(struct xhci_erst_entry) * ERST_NUM_SEGS, &dma,
 +                      flags);
 +      if (!erst->entries) {
 +              xhci_ring_free(xhci, *er);
 +              return -ENOMEM;
 +      }
 +
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "intr# %d: Allocated event ring segment table at 0x%llx",
 +              intr_num, (unsigned long long)dma);
 +
 +      memset(erst->entries, 0, sizeof(struct xhci_erst_entry)*ERST_NUM_SEGS);
 +      erst->num_entries = ERST_NUM_SEGS;
 +      erst->erst_dma_addr = dma;
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "intr# %d: num segs = %i, virt addr = %pK, dma addr = 0x%llx",
 +                      intr_num,
 +                      erst->num_entries,
 +                      erst->entries,
 +                      (unsigned long long)erst->erst_dma_addr);
 +
 +      /* set ring base address and size for each segment table entry */
 +      for (val = 0, seg = (*er)->first_seg; val < ERST_NUM_SEGS; val++) {
 +              struct xhci_erst_entry *entry = &erst->entries[val];
 +
 +              entry->seg_addr = cpu_to_le64(seg->dma);
 +              entry->seg_size = cpu_to_le32(TRBS_PER_SEGMENT);
 +              entry->rsvd = 0;
 +              seg = seg->next;
 +      }
 +
 +      /* set ERST count with the number of entries in the segment table */
 +      val = readl_relaxed(&ir_set->erst_size);
 +      val &= ERST_SIZE_MASK;
 +      val |= ERST_NUM_SEGS;
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "Write ERST size = %i to ir_set %d (some bits preserved)", val,
 +              intr_num);
 +      writel_relaxed(val, &ir_set->erst_size);
 +
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +                      "intr# %d: Set ERST entries to point to event ring.",
 +                      intr_num);
 +      /* set the segment table base address */
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +                      "Set ERST base address for ir_set %d = 0x%llx",
 +                      intr_num,
 +                      (unsigned long long)erst->erst_dma_addr);
 +      val_64 = xhci_read_64(xhci, &ir_set->erst_base);
 +      val_64 &= ERST_PTR_MASK;
 +      val_64 |= (erst->erst_dma_addr & (u64) ~ERST_PTR_MASK);
 +      xhci_write_64(xhci, val_64, &ir_set->erst_base);
 +
 +      /* Set the event ring dequeue address */
 +      deq = xhci_trb_virt_to_dma((*er)->deq_seg, (*er)->dequeue);
 +      if (deq == 0 && !in_interrupt())
 +              xhci_warn(xhci,
 +              "intr# %d: WARN something wrong with SW event ring deq ptr.\n",
 +              intr_num);
 +      /* Update HC event ring dequeue pointer */
 +      val_64 = xhci_read_64(xhci, &ir_set->erst_dequeue);
 +      val_64 &= ERST_PTR_MASK;
 +      /* Don't clear the EHB bit (which is RW1C) because
 +       * there might be more events to service.
 +       */
 +      val_64 &= ~ERST_EHB;
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "intr# %d: Write event ring dequeue pointer, preserving EHB bit",
 +              intr_num);
 +      xhci_write_64(xhci, ((u64) deq & (u64) ~ERST_PTR_MASK) | val_64,
 +                      &ir_set->erst_dequeue);
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +                      "Wrote ERST address to ir_set %d.", intr_num);
 +      xhci_print_ir_set(xhci, intr_num);
 +
 +      return 0;
 +}
 +
 +int xhci_sec_event_ring_setup(struct usb_hcd *hcd, unsigned intr_num)
 +{
 +      int ret;
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 +
 +      if ((xhci->xhc_state & XHCI_STATE_HALTED) || !xhci->sec_ir_set
 +              || !xhci->sec_event_ring || !xhci->sec_erst ||
 +              intr_num >= xhci->max_interrupters) {
 +              xhci_err(xhci,
 +              "%s:state %x ir_set %pK evt_ring %pK erst %pK intr# %d\n",
 +              __func__, xhci->xhc_state, xhci->sec_ir_set,
 +              xhci->sec_event_ring, xhci->sec_erst, intr_num);
 +              return -EINVAL;
 +      }
 +
 +      if (xhci->sec_event_ring && xhci->sec_event_ring[intr_num]
 +              && xhci->sec_event_ring[intr_num]->first_seg)
 +              goto done;
 +
 +      xhci->sec_ir_set[intr_num] = &xhci->run_regs->ir_set[intr_num];
 +      ret = xhci_event_ring_setup(xhci,
 +                              &xhci->sec_event_ring[intr_num],
 +                              xhci->sec_ir_set[intr_num],
 +                              &xhci->sec_erst[intr_num],
 +                              intr_num, GFP_KERNEL);
 +      if (ret) {
 +              xhci_err(xhci, "sec event ring setup failed intr#%d\n",
 +                      intr_num);
 +              return ret;
 +      }
 +done:
 +      return 0;
 +}
 +
 +int xhci_event_ring_init(struct xhci_hcd *xhci, gfp_t flags)
 +{
 +      int ret = 0;
 +
 +      /* primary + secondary */
 +      xhci->max_interrupters = HCS_MAX_INTRS(xhci->hcs_params1);
 +
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "// Allocating primary event ring");
 +
 +      /* Set ir_set to interrupt register set 0 */
 +      xhci->ir_set = &xhci->run_regs->ir_set[0];
 +      ret = xhci_event_ring_setup(xhci, &xhci->event_ring, xhci->ir_set,
 +              &xhci->erst, 0, flags);
 +      if (ret) {
 +              xhci_err(xhci, "failed to setup primary event ring\n");
 +              goto fail;
 +      }
 +
 +      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +              "// Allocating sec event ring related pointers");
 +
 +      xhci->sec_ir_set = kcalloc(xhci->max_interrupters,
 +                              sizeof(*xhci->sec_ir_set), flags);
 +      if (!xhci->sec_ir_set) {
 +              ret = -ENOMEM;
 +              goto fail;
 +      }
 +
 +      xhci->sec_event_ring = kcalloc(xhci->max_interrupters,
 +                              sizeof(*xhci->sec_event_ring), flags);
 +      if (!xhci->sec_event_ring) {
 +              ret = -ENOMEM;
 +              goto fail;
 +      }
 +
 +      xhci->sec_erst = kcalloc(xhci->max_interrupters,
 +                              sizeof(*xhci->sec_erst), flags);
 +      if (!xhci->sec_erst)
 +              ret = -ENOMEM;
 +fail:
 +      return ret;
 +}
 +
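For orientation, a minimal sketch of how a platform glue or offload driver might consume the secondary event ring helpers introduced here. The caller, the choice of interrupter 1, and the assumption that the prototypes are exposed via xhci.h (or an equivalent header) are all illustrative:

/* Hypothetical client of the secondary event ring API added above.
 * Assumes "hcd" is the primary usb_hcd of this xHCI controller and that
 * interrupter 1 has been reserved for the client.
 */
#include <linux/usb/hcd.h>

static int example_claim_sec_interrupter(struct usb_hcd *hcd)
{
        int ret;

        /* Allocates the event ring and ERST for interrupter 1; calling it
         * again returns 0 without reallocating (see the "done" path above).
         */
        ret = xhci_sec_event_ring_setup(hcd, 1);
        if (ret)
                return ret;

        /* ... program the offload engine with the interrupter-1 ERST/ERDP ... */
        return 0;
}

static void example_release_sec_interrupter(struct usb_hcd *hcd)
{
        /* Acks any pending events, then frees the ring and ERST. */
        xhci_sec_event_ring_cleanup(hcd, 1);
}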
  int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags)
  {
        dma_addr_t      dma;
        struct device   *dev = xhci_to_hcd(xhci)->self.controller;
        unsigned int    val, val2;
        u64             val_64;
 -      struct xhci_segment     *seg;
        u32 page_size, temp;
        int i;
  
        memset(xhci->dcbaa, 0, sizeof *(xhci->dcbaa));
        xhci->dcbaa->dma = dma;
        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Device context base array address = 0x%llx (DMA), %p (virt)",
 +                      "// Device context base array address = 0x%llx (DMA), %pK (virt)",
                        (unsigned long long)xhci->dcbaa->dma, xhci->dcbaa);
        xhci_write_64(xhci, dma, &xhci->op_regs->dcbaa_ptr);
  
        if (!xhci->cmd_ring)
                goto fail;
        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "Allocated command ring at %p", xhci->cmd_ring);
 +                      "Allocated command ring at %pK", xhci->cmd_ring);
        xhci_dbg_trace(xhci, trace_xhci_dbg_init, "First segment DMA is 0x%llx",
                        (unsigned long long)xhci->cmd_ring->first_seg->dma);
  
        xhci->dba = (void __iomem *) xhci->cap_regs + val;
        xhci_dbg_regs(xhci);
        xhci_print_run_regs(xhci);
 -      /* Set ir_set to interrupt register set 0 */
 -      xhci->ir_set = &xhci->run_regs->ir_set[0];
  
        /*
         * Event ring setup: Allocate a normal ring, but also setup
         * the event ring segment table (ERST).  Section 4.9.3.
         */
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init, "// Allocating event ring");
 -      xhci->event_ring = xhci_ring_alloc(xhci, ERST_NUM_SEGS, 1, TYPE_EVENT,
 -                                              flags);
 -      if (!xhci->event_ring)
 -              goto fail;
 -      if (xhci_check_trb_in_td_math(xhci) < 0)
 +      if (xhci_event_ring_init(xhci, GFP_KERNEL))
                goto fail;
  
 -      xhci->erst.entries = dma_alloc_coherent(dev,
 -                      sizeof(struct xhci_erst_entry) * ERST_NUM_SEGS, &dma,
 -                      flags);
 -      if (!xhci->erst.entries)
 +      if (xhci_check_trb_in_td_math(xhci) < 0)
                goto fail;
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Allocated event ring segment table at 0x%llx",
 -                      (unsigned long long)dma);
 -
 -      memset(xhci->erst.entries, 0, sizeof(struct xhci_erst_entry)*ERST_NUM_SEGS);
 -      xhci->erst.num_entries = ERST_NUM_SEGS;
 -      xhci->erst.erst_dma_addr = dma;
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "Set ERST to 0; private num segs = %i, virt addr = %p, dma addr = 0x%llx",
 -                      xhci->erst.num_entries,
 -                      xhci->erst.entries,
 -                      (unsigned long long)xhci->erst.erst_dma_addr);
 -
 -      /* set ring base address and size for each segment table entry */
 -      for (val = 0, seg = xhci->event_ring->first_seg; val < ERST_NUM_SEGS; val++) {
 -              struct xhci_erst_entry *entry = &xhci->erst.entries[val];
 -              entry->seg_addr = cpu_to_le64(seg->dma);
 -              entry->seg_size = cpu_to_le32(TRBS_PER_SEGMENT);
 -              entry->rsvd = 0;
 -              seg = seg->next;
 -      }
 -
 -      /* set ERST count with the number of entries in the segment table */
 -      val = readl(&xhci->ir_set->erst_size);
 -      val &= ERST_SIZE_MASK;
 -      val |= ERST_NUM_SEGS;
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Write ERST size = %i to ir_set 0 (some bits preserved)",
 -                      val);
 -      writel(val, &xhci->ir_set->erst_size);
 -
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Set ERST entries to point to event ring.");
 -      /* set the segment table base address */
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Set ERST base address for ir_set 0 = 0x%llx",
 -                      (unsigned long long)xhci->erst.erst_dma_addr);
 -      val_64 = xhci_read_64(xhci, &xhci->ir_set->erst_base);
 -      val_64 &= ERST_PTR_MASK;
 -      val_64 |= (xhci->erst.erst_dma_addr & (u64) ~ERST_PTR_MASK);
 -      xhci_write_64(xhci, val_64, &xhci->ir_set->erst_base);
 -
 -      /* Set the event ring dequeue address */
 -      xhci_set_hc_event_deq(xhci);
 -      xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "Wrote ERST address to ir_set 0.");
 -      xhci_print_ir_set(xhci, 0);
  
        /*
         * XXX: Might need to set the Interrupter Moderation Register to
diff --combined drivers/usb/host/xhci-ring.c
@@@ -68,8 -68,6 +68,8 @@@
  #include <linux/slab.h>
  #include "xhci.h"
  #include "xhci-trace.h"
 +extern void kick_usbpd_vbus_sm(void);
 +extern bool is_xiaomi_headset;
  
  /*
   * Returns zero if the TRB isn't in this segment, otherwise it returns the DMA
@@@ -284,9 -282,6 +284,9 @@@ void xhci_ring_cmd_db(struct xhci_hcd *
  
  static bool xhci_mod_cmd_timer(struct xhci_hcd *xhci, unsigned long delay)
  {
 +      if (is_xiaomi_headset)
 +              delay = msecs_to_jiffies(1000);
 +
        return mod_delayed_work(system_wq, &xhci->cmd_timer, delay);
  }
  
@@@ -315,7 -310,7 +315,7 @@@ static void xhci_handle_stopped_cmd_rin
  
                i_cmd->status = COMP_CMD_STOP;
  
 -              xhci_dbg(xhci, "Turn aborted command %p to no-op\n",
 +              xhci_dbg(xhci, "Turn aborted command %pK to no-op\n",
                         i_cmd->command_trb);
                /* get cycle state from the original cmd trb */
                cycle_state = le32_to_cpu(
@@@ -349,7 -344,6 +349,7 @@@ static int xhci_abort_cmd_ring(struct x
  {
        u64 temp_64;
        int ret;
 +      int delay;
  
        xhci_dbg(xhci, "Abort command ring\n");
  
                        &xhci->op_regs->cmd_ring);
  
        /* Section 4.6.1.2 of xHCI 1.0 spec says software should
 -       * time the completion od all xHCI commands, including
 +       * time the completion of all xHCI commands, including
         * the Command Abort operation. If software doesn't see
 -       * CRR negated in a timely manner (e.g. longer than 5
 -       * seconds), then it should assume that the there are
 -       * larger problems with the xHC and assert HCRST.
 +       * CRR negated in a timely manner, then it should assume
 +       * that there are larger problems with the xHC and assert HCRST.
         */
 -      ret = xhci_handshake(&xhci->op_regs->cmd_ring,
 -                      CMD_RING_RUNNING, 0, 5 * 1000 * 1000);
 +      if (is_xiaomi_headset) {
 +              delay = 500 * 1000;
 +      } else {
 +              delay = 5000 * 1000;
 +      }
 +
 +      ret = xhci_handshake_check_state(xhci, &xhci->op_regs->cmd_ring,
 +                      CMD_RING_RUNNING, 0, 1000 * 1000);
        if (ret < 0) {
 -              /* we are about to kill xhci, give it one more chance */
 -              xhci_write_64(xhci, temp_64 | CMD_RING_ABORT,
 -                            &xhci->op_regs->cmd_ring);
 -              udelay(1000);
 -              ret = xhci_handshake(&xhci->op_regs->cmd_ring,
 -                                   CMD_RING_RUNNING, 0, 3 * 1000 * 1000);
 -              if (ret < 0) {
 -                      xhci_err(xhci, "Stopped the command ring failed, "
 -                               "maybe the host is dead\n");
 -                      xhci->xhc_state |= XHCI_STATE_DYING;
 -                      xhci_quiesce(xhci);
 -                      xhci_halt(xhci);
 -                      return -ESHUTDOWN;
 -              }
 +              if (is_xiaomi_headset)
 +                      return -EPERM;
 +              xhci_err(xhci,
 +                       "Stop command ring failed, maybe the host is dead\n");
 +              xhci->xhc_state |= XHCI_STATE_DYING;
 +              xhci_quiesce(xhci);
 +              xhci_halt(xhci);
 +              return -ESHUTDOWN;
        }
        /*
         * Writing the CMD_RING_ABORT bit should cause a cmd completion event,
@@@ -597,7 -592,7 +597,7 @@@ void xhci_find_new_dequeue_state(struc
                        "Cycle state = 0x%x", state->new_cycle_state);
  
        xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
 -                      "New dequeue segment = %p (virtual)",
 +                      "New dequeue segment = %pK (virtual)",
                        state->new_deq_seg);
        addr = xhci_trb_virt_to_dma(state->new_deq_seg, state->new_deq_ptr);
        xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
@@@ -632,8 -627,8 +632,8 @@@ static void td_to_noop(struct xhci_hcd 
                        xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
                                        "Cancel (unchain) link TRB");
                        xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
 -                                      "Address = %p (0x%llx dma); "
 -                                      "in seg %p (0x%llx dma)",
 +                                      "Address = %pK (0x%llx dma); "
 +                                      "in seg %pK (0x%llx dma)",
                                        cur_trb,
                                        (unsigned long long)xhci_trb_virt_to_dma(cur_seg, cur_trb),
                                        cur_seg,
@@@ -769,7 -764,7 +769,7 @@@ static void xhci_handle_cmd_stop_ep(str
                         * short, don't muck with the stream ID after
                         * submission.
                         */
 -                      xhci_warn(xhci, "WARN Cancelled URB %p "
 +                      xhci_warn(xhci, "WARN Cancelled URB %pK "
                                        "has invalid stream ID %u.\n",
                                        cur_td->urb,
                                        cur_td->urb->stream_id);
@@@ -1108,7 -1103,7 +1108,7 @@@ static void xhci_handle_cmd_set_deq(str
                                ep_ring, ep_index);
                } else {
                        xhci_warn(xhci, "Mismatch between completed Set TR Deq Ptr command & xHCI internal state.\n");
 -                      xhci_warn(xhci, "ep deq seg = %p, deq ptr = %p\n",
 +                      xhci_warn(xhci, "ep deq seg = %pK, deq ptr = %pK\n",
                                  ep->queued_deq_seg, ep->queued_deq_ptr);
                }
        }
@@@ -1302,14 -1297,6 +1302,14 @@@ void xhci_handle_command_timeout(struc
                xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
                xhci_dbg(xhci, "Command timeout\n");
                ret = xhci_abort_cmd_ring(xhci, flags);
 +              if (ret == -EPERM) {
 +                      xhci_err(xhci, "Abort command ring failed, reset usb device\n");
 +                      xhci_cleanup_command_queue(xhci);
 +                      spin_unlock_irqrestore(&xhci->lock, flags);
 +                      kick_usbpd_vbus_sm();
 +                      return;
 +              }
 +
                if (unlikely(ret == -ESHUTDOWN)) {
                        xhci_err(xhci, "Abort command ring failed\n");
                        xhci_cleanup_command_queue(xhci);
@@@ -1615,9 -1602,6 +1615,6 @@@ static void handle_port_status(struct x
                usb_hcd_resume_root_hub(hcd);
        }
  
-       if (hcd->speed >= HCD_USB3 && (temp & PORT_PLS_MASK) == XDEV_INACTIVE)
-               bus_state->port_remote_wakeup &= ~(1 << faked_port_index);
        if ((temp & PORT_PLC) && (temp & PORT_PLS_MASK) == XDEV_RESUME) {
                xhci_dbg(xhci, "port resume event for port %d\n", port_id);
  
                        bus_state->port_remote_wakeup |= 1 << faked_port_index;
                        xhci_test_and_clear_bit(xhci, port_array,
                                        faked_port_index, PORT_PLC);
+                       usb_hcd_start_port_resume(&hcd->self, faked_port_index);
                        xhci_set_link_state(xhci, port_array, faked_port_index,
                                                XDEV_U0);
                        /* Need to wait until the next link state change
                if (slot_id && xhci->devs[slot_id])
                        xhci_ring_device(xhci, slot_id);
                if (bus_state->port_remote_wakeup & (1 << faked_port_index)) {
-                       bus_state->port_remote_wakeup &=
-                               ~(1 << faked_port_index);
                        xhci_test_and_clear_bit(xhci, port_array,
                                        faked_port_index, PORT_PLC);
                        usb_wakeup_notification(hcd->self.root_hub,
@@@ -2640,7 -2623,7 +2636,7 @@@ cleanup
                                                 URB_SHORT_NOT_OK)) ||
                                        (status != 0 &&
                                         !usb_endpoint_xfer_isoc(&urb->ep->desc)))
 -                              xhci_dbg(xhci, "Giveback URB %p, len = %d, "
 +                              xhci_dbg(xhci, "Giveback URB %pK, len = %d, "
                                                "expected = %d, status = %d\n",
                                                urb, urb->actual_length,
                                                urb->transfer_buffer_length,
@@@ -3590,156 -3573,6 +3586,156 @@@ int xhci_queue_ctrl_tx(struct xhci_hcd 
        return 0;
  }
  
 +/*
 + * Variant of xhci_queue_ctrl_tx() used to implement EHSET
 + * SINGLE_STEP_SET_FEATURE test mode. It differs in that the control
 + * transfer is broken up so that the SETUP stage can happen and call
 + * the URB's completion handler before the DATA/STATUS stages are
 + * executed by the xHC hardware. This assumes the control transfer is a
 + * GetDescriptor, with a DATA stage in the IN direction, and an OUT
 + * STATUS stage.
 + *
 + * This function is called twice, usually with a 15-second delay in between.
 + * - with is_setup==true, the SETUP stage for the control request
 + *   (GetDescriptor) is queued in the TRB ring and sent to HW immediately
 + * - with is_setup==false, the DATA and STATUS TRBs are queued and executed
 + *
 + * Caller must have locked xhci->lock
 + */
 +int xhci_submit_single_step_set_feature(struct usb_hcd *hcd, struct urb *urb,
 +                                      int is_setup)
 +{
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 +      struct xhci_ring *ep_ring;
 +      int num_trbs;
 +      int ret;
 +      unsigned int slot_id, ep_index;
 +      struct usb_ctrlrequest *setup;
 +      struct xhci_generic_trb *start_trb;
 +      int start_cycle;
 +      u32 field, length_field, remainder;
 +      struct urb_priv *urb_priv;
 +      struct xhci_td *td;
 +
 +      ep_ring = xhci_urb_to_transfer_ring(xhci, urb);
 +      if (!ep_ring)
 +              return -EINVAL;
 +
 +      /* Need buffer for data stage */
 +      if (urb->transfer_buffer_length <= 0)
 +              return -EINVAL;
 +
 +      /*
 +       * Need to copy setup packet into setup TRB, so we can't use the setup
 +       * DMA address.
 +       */
 +      if (!urb->setup_packet)
 +              return -EINVAL;
 +      setup = (struct usb_ctrlrequest *) urb->setup_packet;
 +
 +      slot_id = urb->dev->slot_id;
 +      ep_index = xhci_get_endpoint_index(&urb->ep->desc);
 +
 +      urb_priv = kzalloc(sizeof(struct urb_priv) +
 +                                sizeof(struct xhci_td *), GFP_ATOMIC);
 +      if (!urb_priv)
 +              return -ENOMEM;
 +
 +      td = urb_priv->td[0] = kzalloc(sizeof(struct xhci_td), GFP_ATOMIC);
 +      if (!td) {
 +              kfree(urb_priv);
 +              return -ENOMEM;
 +      }
 +
 +      urb_priv->length = 1;
 +      urb_priv->td_cnt = 0;
 +      urb->hcpriv = urb_priv;
 +
 +      num_trbs = is_setup ? 1 : 2;
 +
 +      ret = prepare_transfer(xhci, xhci->devs[slot_id],
 +                      ep_index, urb->stream_id,
 +                      num_trbs, urb, 0, GFP_ATOMIC);
 +      if (ret < 0) {
 +              kfree(td);
 +              kfree(urb_priv);
 +              return ret;
 +      }
 +
 +      /*
 +       * Don't give the first TRB to the hardware (by toggling the cycle bit)
 +       * until we've finished creating all the other TRBs.  The ring's cycle
 +       * state may change as we enqueue the other TRBs, so save it too.
 +       */
 +      start_trb = &ep_ring->enqueue->generic;
 +      start_cycle = ep_ring->cycle_state;
 +
 +      if (is_setup) {
 +              /* Queue only the setup TRB */
 +              field = TRB_IDT | TRB_IOC | TRB_TYPE(TRB_SETUP);
 +              if (start_cycle == 0)
 +                      field |= 0x1;
 +
 +              /* xHCI 1.0 6.4.1.2.1: Transfer Type field */
 +              if (xhci->hci_version == 0x100) {
 +                      if (setup->bRequestType & USB_DIR_IN)
 +                              field |= TRB_TX_TYPE(TRB_DATA_IN);
 +                      else
 +                              field |= TRB_TX_TYPE(TRB_DATA_OUT);
 +              }
 +
 +              /* Save the DMA address of the last TRB in the TD */
 +              td->last_trb = ep_ring->enqueue;
 +
 +              queue_trb(xhci, ep_ring, false,
 +                        setup->bRequestType | setup->bRequest << 8 |
 +                              le16_to_cpu(setup->wValue) << 16,
 +                        le16_to_cpu(setup->wIndex) |
 +                              le16_to_cpu(setup->wLength) << 16,
 +                        TRB_LEN(8) | TRB_INTR_TARGET(0),
 +                        field);
 +      } else {
 +              /* Queue data TRB */
 +              field = TRB_ISP | TRB_TYPE(TRB_DATA);
 +              if (start_cycle == 0)
 +                      field |= 0x1;
 +              if (setup->bRequestType & USB_DIR_IN)
 +                      field |= TRB_DIR_IN;
 +
 +              remainder = xhci_td_remainder(xhci, 0,
 +                                         urb->transfer_buffer_length,
 +                                         urb->transfer_buffer_length,
 +                                         urb, 1);
 +
 +              length_field = TRB_LEN(urb->transfer_buffer_length) |
 +                      TRB_TD_SIZE(remainder) |
 +                      TRB_INTR_TARGET(0);
 +
 +              queue_trb(xhci, ep_ring, true,
 +                        lower_32_bits(urb->transfer_dma),
 +                        upper_32_bits(urb->transfer_dma),
 +                        length_field,
 +                        field);
 +
 +              /* Save the DMA address of the last TRB in the TD */
 +              td->last_trb = ep_ring->enqueue;
 +
 +              /* Queue status TRB */
 +              field = TRB_IOC | TRB_TYPE(TRB_STATUS);
 +              if (!(setup->bRequestType & USB_DIR_IN))
 +                      field |= TRB_DIR_IN;
 +
 +              queue_trb(xhci, ep_ring, false,
 +                        0,
 +                        0,
 +                        TRB_INTR_TARGET(0),
 +                        field | ep_ring->cycle_state);
 +      }
 +
 +      giveback_first_trb(xhci, slot_id, ep_index, 0, start_cycle, start_trb);
 +      return 0;
 +}
 +
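
  A minimal caller sketch for the two-phase helper above, assuming a prepared
  GetDescriptor control URB; ehset_run_single_step() and the exact pause length
  are illustrative only (needs <linux/delay.h> and "xhci.h"):

	static int ehset_run_single_step(struct usb_hcd *hcd, struct urb *urb)
	{
		struct xhci_hcd *xhci = hcd_to_xhci(hcd);
		unsigned long flags;
		int ret;

		/* Phase 1: queue only the SETUP stage; the helper requires xhci->lock */
		spin_lock_irqsave(&xhci->lock, flags);
		ret = xhci_submit_single_step_set_feature(hcd, urb, 1);
		spin_unlock_irqrestore(&xhci->lock, flags);
		if (ret)
			return ret;

		/* EHSET expects roughly a 15 second pause before the next phase */
		msleep(15 * 1000);

		/* Phase 2: queue the DATA and STATUS stages */
		spin_lock_irqsave(&xhci->lock, flags);
		ret = xhci_submit_single_step_set_feature(hcd, urb, 0);
		spin_unlock_irqrestore(&xhci->lock, flags);
		return ret;
	}
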
  static int count_isoc_trbs_needed(struct xhci_hcd *xhci,
                struct urb *urb, int i)
  {
@@@ -4344,7 -4177,7 +4340,7 @@@ void xhci_queue_new_dequeue_state(struc
        int ret;
  
        xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
 -              "Set TR Deq Ptr cmd, new deq seg = %p (0x%llx dma), new deq ptr = %p (0x%llx dma), new cycle = %u",
 +              "Set TR Deq Ptr cmd, new deq seg = %pK (0x%llx dma), new deq ptr = %pK (0x%llx dma), new cycle = %u",
                deq_state->new_deq_seg,
                (unsigned long long)deq_state->new_deq_seg->dma,
                deq_state->new_deq_ptr,
                                    deq_state->new_deq_ptr);
        if (addr == 0) {
                xhci_warn(xhci, "WARN Cannot submit Set TR Deq Ptr\n");
 -              xhci_warn(xhci, "WARN deq seg = %p, deq pt = %p\n",
 +              xhci_warn(xhci, "WARN deq seg = %pK, deq pt = %pK\n",
                          deq_state->new_deq_seg, deq_state->new_deq_ptr);
                return;
        }
diff --combined drivers/usb/host/xhci.c
@@@ -75,27 -75,6 +75,27 @@@ int xhci_handshake(void __iomem *ptr, u
        return ret;
  }
  
 +int xhci_handshake_check_state(struct xhci_hcd *xhci,
 +              void __iomem *ptr, u32 mask, u32 done, int usec)
 +{
 +      u32     result;
 +
 +      do {
 +              result = readl_relaxed(ptr);
 +              if (result == ~(u32)0) /* card removed */
 +                      return -ENODEV;
 +              /* host removed. Bail out */
 +              if (xhci->xhc_state & XHCI_STATE_REMOVING)
 +                      return -ENODEV;
 +              result &= mask;
 +              if (result == done)
 +                      return 0;
 +              udelay(1);
 +              usec--;
 +      } while (usec > 0);
 +      return -ETIMEDOUT;
 +}
 +
  /*
   * Disable interrupts and begin the xHCI halting process.
   */
@@@ -133,20 -112,10 +133,20 @@@ int xhci_halt(struct xhci_hcd *xhci
                        STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC);
        if (!ret) {
                xhci->xhc_state |= XHCI_STATE_HALTED;
 -              xhci->cmd_ring_state = CMD_RING_STATE_STOPPED;
 -      } else
 +      } else {
                xhci_warn(xhci, "Host not halted after %u microseconds.\n",
                                XHCI_MAX_HALT_USEC);
 +      }
 +
 +      xhci->cmd_ring_state = CMD_RING_STATE_STOPPED;
 +
 +      if (delayed_work_pending(&xhci->cmd_timer)) {
 +              xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 +                              "Cleanup command queue");
 +              cancel_delayed_work(&xhci->cmd_timer);
 +              xhci_cleanup_command_queue(xhci);
 +      }
 +
        return ret;
  }
  
@@@ -157,13 -126,7 +157,13 @@@ static int xhci_start(struct xhci_hcd *
  {
        u32 temp;
        int ret;
 +      struct usb_hcd *hcd = xhci_to_hcd(xhci);
  
 +      /*
 +       * disable irq to avoid xhci_irq flooding due to unhandled port
 +       * change event in halt state, as soon as xhci_start clears halt bit
 +       */
 +      disable_irq(hcd->irq);
        temp = readl(&xhci->op_regs->command);
        temp |= (CMD_RUN);
        xhci_dbg_trace(xhci, trace_xhci_dbg_init, "// Turn on HC, cmd = 0x%x.",
                /* clear state flags. Including dying, halted or removing */
                xhci->xhc_state = 0;
  
 +      enable_irq(hcd->irq);
 +
        return ret;
  }
  
@@@ -684,7 -645,7 +684,7 @@@ int xhci_run(struct usb_hcd *hcd
  
        temp = readl(&xhci->ir_set->irq_pending);
        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
 -                      "// Enabling event ring interrupter %p by writing 0x%x to irq_pending",
 +                      "// Enabling event ring interrupter %pK by writing 0x%x to irq_pending",
                        xhci->ir_set, (unsigned int) ER_IRQ_ENABLE(temp));
        writel(ER_IRQ_ENABLE(temp), &xhci->ir_set->irq_pending);
        xhci_print_ir_set(xhci, 0);
@@@ -782,10 -743,6 +782,10 @@@ void xhci_shutdown(struct usb_hcd *hcd
                usb_disable_xhci_ports(to_pci_dev(hcd->self.controller));
  
        spin_lock_irq(&xhci->lock);
 +      if (!HCD_HW_ACCESSIBLE(hcd)) {
 +              spin_unlock_irq(&xhci->lock);
 +              return;
 +      }
        xhci_halt(xhci);
        /* Workaround for spurious wakeups at shutdown with HSW */
        if (xhci->quirks & XHCI_SPURIOUS_WAKEUP)
        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
                        "xhci_shutdown completed - status = %x",
                        readl(&xhci->op_regs->status));
-       /* Yet another workaround for spurious wakeups at shutdown with HSW */
-       if (xhci->quirks & XHCI_SPURIOUS_WAKEUP)
-               pci_set_power_state(to_pci_dev(hcd->self.controller), PCI_D3hot);
  }
+ EXPORT_SYMBOL_GPL(xhci_shutdown);
  
  #ifdef CONFIG_PM
  static void xhci_save_registers(struct xhci_hcd *xhci)
@@@ -972,11 -926,11 +969,11 @@@ static bool xhci_pending_portevent(stru
  int xhci_suspend(struct xhci_hcd *xhci, bool do_wakeup)
  {
        int                     rc = 0;
-       unsigned int            delay = XHCI_MAX_HALT_USEC;
+       unsigned int            delay = XHCI_MAX_HALT_USEC * 2;
        struct usb_hcd          *hcd = xhci_to_hcd(xhci);
        u32                     command;
  
 -      if (!hcd->state)
 +      if (!hcd->state || xhci->suspended)
                return 0;
  
        if (hcd->state != HC_STATE_SUSPENDED ||
        /* step 5: remove core well power */
        /* synchronize irq when using MSI-X */
        xhci_msix_sync_irqs(xhci);
 +      xhci->suspended = true;
  
        return rc;
  }
@@@ -1066,7 -1019,7 +1063,7 @@@ int xhci_resume(struct xhci_hcd *xhci, 
        int                     retval = 0;
        bool                    comp_timer_running = false;
  
 -      if (!hcd->state)
 +      if (!hcd->state || !xhci->suspended)
                return 0;
  
        /* Wait a bit if either of the roothubs need to settle from the
  
        /* Re-enable port polling. */
        xhci_dbg(xhci, "%s: starting port polling.\n", __func__);
 +      xhci->suspended = false;
        set_bit(HCD_FLAG_POLL_RH, &xhci->shared_hcd->flags);
        usb_hcd_poll_rh_status(xhci->shared_hcd);
        set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
@@@ -1540,7 -1492,7 +1537,7 @@@ int xhci_urb_enqueue(struct usb_hcd *hc
  exit:
        return ret;
  dying:
 -      xhci_dbg(xhci, "Ep 0x%x: URB %p submitted for "
 +      xhci_dbg(xhci, "Ep 0x%x: URB %pK submitted for "
                        "non-responsive xHCI host.\n",
                        urb->ep->desc.bEndpointAddress, urb);
        ret = -ESHUTDOWN;
@@@ -1676,7 -1628,7 +1673,7 @@@ int xhci_urb_dequeue(struct usb_hcd *hc
        i = urb_priv->td_cnt;
        if (i < urb_priv->length)
                xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
 -                              "Cancel URB %p, dev %s, ep 0x%x, "
 +                              "Cancel URB %pK, dev %s, ep 0x%x, "
                                "starting at offset 0x%llx",
                                urb, urb->dev->devpath,
                                urb->ep->desc.bEndpointAddress,
@@@ -1744,7 -1696,7 +1741,7 @@@ int xhci_drop_endpoint(struct usb_hcd *
        if (xhci->xhc_state & XHCI_STATE_DYING)
                return -ENODEV;
  
 -      xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
 +      xhci_dbg(xhci, "%s called for udev %pK\n", __func__, udev);
        drop_flag = xhci_get_endpoint_flag(&ep->desc);
        if (drop_flag == SLOT_FLAG || drop_flag == EP0_FLAG) {
                xhci_dbg(xhci, "xHCI %s - can't drop slot or ep 0 %#x\n",
            xhci_get_endpoint_flag(&ep->desc)) {
                /* Do not warn when called after a usb_device_reset */
                if (xhci->devs[udev->slot_id]->eps[ep_index].ring != NULL)
 -                      xhci_warn(xhci, "xHCI %s called with disabled ep %p\n",
 +                      xhci_warn(xhci, "xHCI %s called with disabled ep %pK\n",
                                  __func__, ep);
                return 0;
        }
@@@ -1864,7 -1816,7 +1861,7 @@@ int xhci_add_endpoint(struct usb_hcd *h
         * ignore this request.
         */
        if (le32_to_cpu(ctrl_ctx->add_flags) & added_ctxs) {
 -              xhci_warn(xhci, "xHCI %s called with enabled ep %p\n",
 +              xhci_warn(xhci, "xHCI %s called with enabled ep %pK\n",
                                __func__, ep);
                return 0;
        }
@@@ -2845,7 -2797,7 +2842,7 @@@ int xhci_check_bandwidth(struct usb_hc
                (xhci->xhc_state & XHCI_STATE_REMOVING))
                return -ENODEV;
  
 -      xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
 +      xhci_dbg(xhci, "%s called for udev %pK\n", __func__, udev);
        virt_dev = xhci->devs[udev->slot_id];
  
        command = xhci_alloc_command(xhci, false, true, GFP_KERNEL);
@@@ -2942,7 -2894,7 +2939,7 @@@ void xhci_reset_bandwidth(struct usb_hc
                return;
        xhci = hcd_to_xhci(hcd);
  
 -      xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
 +      xhci_dbg(xhci, "%s called for udev %pK\n", __func__, udev);
        virt_dev = xhci->devs[udev->slot_id];
        /* Free any rings allocated for added endpoints */
        for (i = 0; i < 31; ++i) {
@@@ -2995,7 -2947,7 +2992,7 @@@ static void xhci_setup_input_ctx_for_qu
        if (addr == 0) {
                xhci_warn(xhci, "WARN Cannot submit config ep after "
                                "reset ep command\n");
 -              xhci_warn(xhci, "WARN deq seg = %p, deq ptr = %p\n",
 +              xhci_warn(xhci, "WARN deq seg = %pK, deq ptr = %pK\n",
                                deq_state->new_deq_seg,
                                deq_state->new_deq_ptr);
                return;
@@@ -3729,7 -3681,6 +3726,7 @@@ void xhci_free_dev(struct usb_hcd *hcd
                del_timer_sync(&virt_dev->eps[i].stop_cmd_timer);
        }
  
 +      virt_dev->udev = NULL;
        spin_lock_irqsave(&xhci->lock, flags);
  
        virt_dev->udev = NULL;
@@@ -4023,7 -3974,7 +4020,7 @@@ static int xhci_setup_device(struct usb
        xhci_dbg_trace(xhci, trace_xhci_dbg_address,
                        "Op regs DCBAA ptr = %#016llx", temp_64);
        xhci_dbg_trace(xhci, trace_xhci_dbg_address,
 -              "Slot ID %d dcbaa entry @%p = %#016llx",
 +              "Slot ID %d dcbaa entry @%pK = %#016llx",
                udev->slot_id,
                &xhci->dcbaa->dev_context_ptrs[udev->slot_id],
                (unsigned long long)
@@@ -5072,61 -5023,6 +5069,61 @@@ int xhci_gen_setup(struct usb_hcd *hcd
  }
  EXPORT_SYMBOL_GPL(xhci_gen_setup);
  
 +dma_addr_t xhci_get_sec_event_ring_dma_addr(struct usb_hcd *hcd,
 +      unsigned intr_num)
 +{
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 +
 +      if (intr_num >= xhci->max_interrupters) {
 +              xhci_err(xhci, "intr num %d >= max intrs %d\n", intr_num,
 +                      xhci->max_interrupters);
 +              return 0;
 +      }
 +
 +      if (!(xhci->xhc_state & XHCI_STATE_HALTED) &&
 +              xhci->sec_event_ring && xhci->sec_event_ring[intr_num]
 +              && xhci->sec_event_ring[intr_num]->first_seg)
 +              return xhci->sec_event_ring[intr_num]->first_seg->dma;
 +
 +      return 0;
 +}
 +
 +static dma_addr_t xhci_get_dcba_dma_addr(struct usb_hcd *hcd,
 +      struct usb_device *udev)
 +{
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 +
 +      if (!(xhci->xhc_state & XHCI_STATE_HALTED) && xhci->dcbaa)
 +              return xhci->dcbaa->dev_context_ptrs[udev->slot_id];
 +
 +      return 0;
 +}
 +
 +dma_addr_t xhci_get_xfer_ring_dma_addr(struct usb_hcd *hcd,
 +      struct usb_device *udev, struct usb_host_endpoint *ep)
 +{
 +      int ret;
 +      unsigned int ep_index;
 +      struct xhci_virt_device *virt_dev;
 +
 +      struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 +
 +      ret = xhci_check_args(hcd, udev, ep, 1, true, __func__);
 +      if (ret <= 0) {
 +              xhci_err(xhci, "%s: invalid args\n", __func__);
 +              return 0;
 +      }
 +
 +      virt_dev = xhci->devs[udev->slot_id];
 +      ep_index = xhci_get_endpoint_index(&ep->desc);
 +
 +      if (virt_dev->eps[ep_index].ring &&
 +              virt_dev->eps[ep_index].ring->first_seg)
 +              return virt_dev->eps[ep_index].ring->first_seg->dma;
 +
 +      return 0;
 +}
 +
  static const struct hc_driver xhci_hc_driver = {
        .description =          "xhci-hcd",
        .product_desc =         "xHCI Host Controller",
        .enable_usb3_lpm_timeout =      xhci_enable_usb3_lpm_timeout,
        .disable_usb3_lpm_timeout =     xhci_disable_usb3_lpm_timeout,
        .find_raw_port_number = xhci_find_raw_port_number,
 +      .sec_event_ring_setup =         xhci_sec_event_ring_setup,
 +      .sec_event_ring_cleanup =       xhci_sec_event_ring_cleanup,
 +      .get_sec_event_ring_dma_addr =  xhci_get_sec_event_ring_dma_addr,
 +      .get_xfer_ring_dma_addr =       xhci_get_xfer_ring_dma_addr,
 +      .get_dcba_dma_addr =            xhci_get_dcba_dma_addr,
  };
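
  A rough client-side sketch of the secondary event ring hooks registered
  above; example_offload_prepare(), the interrupter number 1 and the offload
  hand-off are assumptions, with the struct hc_driver members being the ones
  added by this patch series:

	static int example_offload_prepare(struct usb_device *udev,
					   struct usb_host_endpoint *ep)
	{
		struct usb_hcd *hcd = bus_to_hcd(udev->bus);
		dma_addr_t er_dma, xfer_dma;
		int ret;

		/* allocate/reuse secondary event ring on interrupter #1 */
		ret = hcd->driver->sec_event_ring_setup(hcd, 1);
		if (ret)
			return ret;

		er_dma = hcd->driver->get_sec_event_ring_dma_addr(hcd, 1);
		xfer_dma = hcd->driver->get_xfer_ring_dma_addr(hcd, udev, ep);
		if (!er_dma || !xfer_dma) {
			hcd->driver->sec_event_ring_cleanup(hcd, 1);
			return -EINVAL;
		}

		/* hand er_dma/xfer_dma to the DSP or other offload engine */
		return 0;
	}
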
  
  void xhci_init_driver(struct hc_driver *drv,
diff --combined drivers/usb/host/xhci.h
@@@ -314,6 -314,7 +314,7 @@@ struct xhci_op_regs 
  #define XDEV_U3               (0x3 << 5)
  #define XDEV_INACTIVE (0x6 << 5)
  #define XDEV_POLLING  (0x7 << 5)
+ #define XDEV_RECOVERY (0x8 << 5)
  #define XDEV_COMP_MODE  (0xa << 5)
  #define XDEV_RESUME   (0xf << 5)
  /* true: port has power (see HCC_PPC) */
@@@ -1521,9 -1522,6 +1522,9 @@@ struct xhci_hcd 
        /* Our HCD's current interrupter register set */
        struct  xhci_intr_reg __iomem *ir_set;
  
 +      /* secondary interrupter */
 +      struct  xhci_intr_reg __iomem **sec_ir_set;
 +
        /* Cached register copies of read-only HC data */
        __u32           hcs_params1;
        __u32           hcs_params2;
        struct xhci_command     *current_cmd;
        struct xhci_ring        *event_ring;
        struct xhci_erst        erst;
 +
 +      /* secondary event ring and erst */
 +      struct xhci_ring        **sec_event_ring;
 +      struct xhci_erst        *sec_erst;
 +
        /* Scratchpad */
        struct xhci_scratchpad  *scratchpad;
        /* Store LPM test failed devices' information */
        /* Compliance Mode Recovery Data */
        struct timer_list       comp_mode_recovery_timer;
        u32                     port_status_u0;
 +      bool                    suspended;
  /* Compliance Mode Timer Triggered every 2 seconds */
  #define COMP_MODE_RCVRY_MSECS 2000
  };
@@@ -1830,14 -1822,10 +1831,14 @@@ struct xhci_command *xhci_alloc_command
  void xhci_urb_free_priv(struct urb_priv *urb_priv);
  void xhci_free_command(struct xhci_hcd *xhci,
                struct xhci_command *command);
 +int xhci_sec_event_ring_setup(struct usb_hcd *hcd, unsigned intr_num);
 +int xhci_sec_event_ring_cleanup(struct usb_hcd *hcd, unsigned intr_num);
  
  /* xHCI host controller glue */
  typedef void (*xhci_get_quirks_t)(struct device *, struct xhci_hcd *);
  int xhci_handshake(void __iomem *ptr, u32 mask, u32 done, int usec);
 +int xhci_handshake_check_state(struct xhci_hcd *xhci,
 +              void __iomem *ptr, u32 mask, u32 done, int usec);
  void xhci_quiesce(struct xhci_hcd *xhci);
  int xhci_halt(struct xhci_hcd *xhci);
  int xhci_reset(struct xhci_hcd *xhci);
@@@ -1846,6 -1834,7 +1847,7 @@@ int xhci_run(struct usb_hcd *hcd)
  void xhci_stop(struct usb_hcd *hcd);
  void xhci_shutdown(struct usb_hcd *hcd);
  int xhci_gen_setup(struct usb_hcd *hcd, xhci_get_quirks_t get_quirks);
+ void xhci_shutdown(struct usb_hcd *hcd);
  void xhci_init_driver(struct hc_driver *drv,
                      const struct xhci_driver_overrides *over);
  
@@@ -1972,8 -1961,4 +1974,8 @@@ struct xhci_input_control_ctx *xhci_get
  struct xhci_slot_ctx *xhci_get_slot_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx);
  struct xhci_ep_ctx *xhci_get_ep_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx, unsigned int ep_index);
  
 +/* EHSET */
 +int xhci_submit_single_step_set_feature(struct usb_hcd *hcd, struct urb *urb,
 +                                      int is_setup);
 +
  #endif /* __LINUX_XHCI_HCD_H */
diff --combined drivers/video/hdmi.c
@@@ -538,10 -538,6 +538,10 @@@ hdmi_picture_aspect_get_name(enum hdmi_
                return "4:3";
        case HDMI_PICTURE_ASPECT_16_9:
                return "16:9";
 +      case HDMI_PICTURE_ASPECT_64_27:
 +              return "64:27";
 +      case HDMI_PICTURE_ASPECT_256_135:
 +              return "256:135";
        case HDMI_PICTURE_ASPECT_RESERVED:
                return "Reserved";
        }
@@@ -1036,12 -1032,12 +1036,12 @@@ static int hdmi_avi_infoframe_unpack(st
        if (ptr[0] & 0x10)
                frame->active_aspect = ptr[1] & 0xf;
        if (ptr[0] & 0x8) {
-               frame->top_bar = (ptr[5] << 8) + ptr[6];
-               frame->bottom_bar = (ptr[7] << 8) + ptr[8];
+               frame->top_bar = (ptr[6] << 8) | ptr[5];
+               frame->bottom_bar = (ptr[8] << 8) | ptr[7];
        }
        if (ptr[0] & 0x4) {
-               frame->left_bar = (ptr[9] << 8) + ptr[10];
-               frame->right_bar = (ptr[11] << 8) + ptr[12];
+               frame->left_bar = (ptr[10] << 8) | ptr[9];
+               frame->right_bar = (ptr[12] << 8) | ptr[11];
        }
        frame->scan_mode = ptr[0] & 0x3;
  
diff --combined drivers/virtio/virtio_balloon.c
@@@ -30,7 -30,6 +30,7 @@@
  #include <linux/balloon_compaction.h>
  #include <linux/oom.h>
  #include <linux/wait.h>
 +#include <linux/mount.h>
  
  /*
   * Balloon device works in 4K page units.  So each page is pointed to by
@@@ -46,10 -45,6 +46,10 @@@ static int oom_pages = OOM_VBALLOON_DEF
  module_param(oom_pages, int, S_IRUSR | S_IWUSR);
  MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
  
 +#ifdef CONFIG_BALLOON_COMPACTION
 +static struct vfsmount *balloon_mnt;
 +#endif
 +
  struct virtio_balloon {
        struct virtio_device *vdev;
        struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@@ -401,7 -396,7 +401,7 @@@ static int init_vqs(struct virtio_ballo
  {
        struct virtqueue *vqs[3];
        vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
 -      const char *names[] = { "inflate", "deflate", "stats" };
 +      static const char * const names[] = { "inflate", "deflate", "stats" };
        int err, nvqs;
  
        /*
@@@ -473,6 -468,17 +473,17 @@@ static int virtballoon_migratepage(stru
  
        get_page(newpage); /* balloon reference */
  
+       /*
+         * When we migrate a page to a different zone and adjusted the
+         * managed page count when inflating, we have to fixup the count of
+         * both involved zones.
+         */
+       if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) &&
+           page_zone(page) != page_zone(newpage)) {
+               adjust_managed_page_count(page, 1);
+               adjust_managed_page_count(newpage, -1);
+       }
        /* balloon's page migration 1st step  -- inflate "newpage" */
        spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
        balloon_page_insert(vb_dev_info, newpage);
  
        return MIGRATEPAGE_SUCCESS;
  }
 +
 +static struct dentry *balloon_mount(struct file_system_type *fs_type,
 +              int flags, const char *dev_name, void *data)
 +{
 +      static const struct dentry_operations ops = {
 +              .d_dname = simple_dname,
 +      };
 +
 +      return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops,
 +                              BALLOON_KVM_MAGIC);
 +}
 +
 +static struct file_system_type balloon_fs = {
 +      .name           = "balloon-kvm",
 +      .mount          = balloon_mount,
 +      .kill_sb        = kill_anon_super,
 +};
 +
  #endif /* CONFIG_BALLOON_COMPACTION */
  
  static int virtballoon_probe(struct virtio_device *vdev)
        vb->need_stats_update = 0;
  
        balloon_devinfo_init(&vb->vb_dev_info);
 -#ifdef CONFIG_BALLOON_COMPACTION
 -      vb->vb_dev_info.migratepage = virtballoon_migratepage;
 -#endif
  
        err = init_vqs(vb);
        if (err)
        vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
        err = register_oom_notifier(&vb->nb);
        if (err < 0)
 -              goto out_oom_notify;
 +              goto out_del_vqs;
 +
 +#ifdef CONFIG_BALLOON_COMPACTION
 +      balloon_mnt = kern_mount(&balloon_fs);
 +      if (IS_ERR(balloon_mnt)) {
 +              err = PTR_ERR(balloon_mnt);
 +              unregister_oom_notifier(&vb->nb);
 +              goto out_del_vqs;
 +      }
 +
 +      vb->vb_dev_info.migratepage = virtballoon_migratepage;
 +      vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
 +      if (IS_ERR(vb->vb_dev_info.inode)) {
 +              err = PTR_ERR(vb->vb_dev_info.inode);
 +              kern_unmount(balloon_mnt);
 +              unregister_oom_notifier(&vb->nb);
 +              vb->vb_dev_info.inode = NULL;
 +              goto out_del_vqs;
 +      }
 +      vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
 +#endif
  
        virtio_device_ready(vdev);
  
  
  out_del_vqs:
        unregister_oom_notifier(&vb->nb);
 -out_oom_notify:
        vdev->config->del_vqs(vdev);
  out_free_vb:
        kfree(vb);
@@@ -612,8 -584,6 +623,8 @@@ static void virtballoon_remove(struct v
        unregister_oom_notifier(&vb->nb);
        kthread_stop(vb->thread);
        remove_common(vb);
 +      if (vb->vb_dev_info.inode)
 +              iput(vb->vb_dev_info.inode);
        kfree(vb);
  }
  
diff --combined fs/cifs/file.c
@@@ -312,9 -312,6 +312,6 @@@ cifs_new_fileinfo(struct cifs_fid *fid
        INIT_LIST_HEAD(&fdlocks->locks);
        fdlocks->cfile = cfile;
        cfile->llist = fdlocks;
-       cifs_down_write(&cinode->lock_sem);
-       list_add(&fdlocks->llist, &cinode->llist);
-       up_write(&cinode->lock_sem);
  
        cfile->count = 1;
        cfile->pid = current->tgid;
                oplock = 0;
        }
  
+       cifs_down_write(&cinode->lock_sem);
+       list_add(&fdlocks->llist, &cinode->llist);
+       up_write(&cinode->lock_sem);
        spin_lock(&tcon->open_file_lock);
        if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
                oplock = fid->pending_open->oplock;
@@@ -702,6 -703,13 +703,13 @@@ cifs_reopen_file(struct cifsFileInfo *c
        if (backup_cred(cifs_sb))
                create_options |= CREATE_OPEN_BACKUP_INTENT;
  
+       /* O_SYNC also has bit for O_DSYNC so following check picks up either */
+       if (cfile->f_flags & O_SYNC)
+               create_options |= CREATE_WRITE_THROUGH;
+       if (cfile->f_flags & O_DIRECT)
+               create_options |= CREATE_NO_BUFFER;
        if (server->ops->get_lease_key)
                server->ops->get_lease_key(inode, &cfile->fid);
  
@@@ -3443,13 -3451,13 +3451,13 @@@ readpages_get_pages(struct address_spac
         * should have access to this page, we're safe to simply set
         * PG_locked without checking it first.
         */
 -      __set_page_locked(page);
 +      __SetPageLocked(page);
        rc = add_to_page_cache_locked(page, mapping,
                                      page->index, gfp);
  
        /* give up if we can't stick it in the cache */
        if (rc) {
 -              __clear_page_locked(page);
 +              __ClearPageLocked(page);
                return rc;
        }
  
                if (*bytes + PAGE_CACHE_SIZE > rsize)
                        break;
  
 -              __set_page_locked(page);
 +              __SetPageLocked(page);
                if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
 -                      __clear_page_locked(page);
 +                      __ClearPageLocked(page);
                        break;
                }
                list_move_tail(&page->lru, tmplist);
diff --combined fs/fuse/dir.c
@@@ -240,7 -240,8 +240,8 @@@ static int fuse_dentry_revalidate(struc
                kfree(forget);
                if (ret == -ENOMEM)
                        goto out;
-               if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+               if (ret || fuse_invalid_attr(&outarg.attr) ||
+                   (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
                        goto invalid;
  
                fuse_change_attributes(inode, &outarg.attr,
@@@ -267,50 -268,6 +268,50 @@@ invalid
        goto out;
  }
  
 +/*
 + * Get the canonical path. Since we must translate to a path, this must be done
 + * in the context of the userspace daemon; however, the userspace daemon cannot
 + * look up paths on its own. Instead, we handle the lookup as a special case
 + * inside of the write request.
 + */
 +static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) {
 +      struct inode *inode = path->dentry->d_inode;
 +      struct fuse_conn *fc = get_fuse_conn(inode);
 +      struct fuse_req *req;
 +      int err;
 +      char *path_name;
 +
 +      req = fuse_get_req(fc, 1);
 +      err = PTR_ERR(req);
 +      if (IS_ERR(req))
 +              goto default_path;
 +
 +      path_name = (char*)__get_free_page(GFP_KERNEL);
 +      if (!path_name) {
 +              fuse_put_request(fc, req);
 +              goto default_path;
 +      }
 +
 +      req->in.h.opcode = FUSE_CANONICAL_PATH;
 +      req->in.h.nodeid = get_node_id(inode);
 +      req->in.numargs = 0;
 +      req->out.numargs = 1;
 +      req->out.args[0].size = PATH_MAX;
 +      req->out.args[0].value = path_name;
 +      req->canonical_path = canonical_path;
 +      req->out.argvar = 1;
 +      fuse_request_send(fc, req);
 +      err = req->out.h.error;
 +      fuse_put_request(fc, req);
 +      free_page((unsigned long)path_name);
 +      if (!err)
 +              return;
 +default_path:
 +      canonical_path->dentry = path->dentry;
 +      canonical_path->mnt = path->mnt;
 +      path_get(canonical_path);
 +}
 +
  static int invalid_nodeid(u64 nodeid)
  {
        return !nodeid || nodeid == FUSE_ROOT_ID;
  
  const struct dentry_operations fuse_dentry_operations = {
        .d_revalidate   = fuse_dentry_revalidate,
 +      .d_canonical_path = fuse_dentry_canonical_path,
  };
  
  int fuse_valid_type(int m)
                S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
  }
  
+ bool fuse_invalid_attr(struct fuse_attr *attr)
+ {
+       return !fuse_valid_type(attr->mode) ||
+               attr->size > LLONG_MAX;
+ }
  int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
                     struct fuse_entry_out *outarg, struct inode **inode)
  {
        err = -EIO;
        if (!outarg->nodeid)
                goto out_put_forget;
-       if (!fuse_valid_type(outarg->attr.mode))
+       if (fuse_invalid_attr(&outarg->attr))
                goto out_put_forget;
  
        *inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
@@@ -473,20 -435,18 +480,21 @@@ static int fuse_create_open(struct inod
        args.out.args[0].value = &outentry;
        args.out.args[1].size = sizeof(outopen);
        args.out.args[1].value = &outopen;
 +      args.out.passthrough_filp = NULL;
        err = fuse_simple_request(fc, &args);
        if (err)
                goto out_free_ff;
  
        err = -EIO;
-       if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
+       if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) ||
+           fuse_invalid_attr(&outentry.attr))
                goto out_free_ff;
  
        ff->fh = outopen.fh;
        ff->nodeid = outentry.nodeid;
        ff->open_flags = outopen.open_flags;
 +      if (args.out.passthrough_filp != NULL)
 +              ff->passthrough_filp = args.out.passthrough_filp;
        inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
                          &outentry.attr, entry_attr_timeout(&outentry), 0);
        if (!inode) {
@@@ -587,7 -547,7 +595,7 @@@ static int create_new_entry(struct fuse
                goto out_put_forget_req;
  
        err = -EIO;
-       if (invalid_nodeid(outarg.nodeid))
+       if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr))
                goto out_put_forget_req;
  
        if ((outarg.attr.mode ^ mode) & S_IFMT)
@@@ -860,7 -820,8 +868,8 @@@ static int fuse_link(struct dentry *ent
  
                spin_lock(&fc->lock);
                fi->attr_version = ++fc->attr_version;
-               inc_nlink(inode);
+               if (likely(inode->i_nlink < UINT_MAX))
+                       inc_nlink(inode);
                spin_unlock(&fc->lock);
                fuse_invalidate_attr(inode);
                fuse_update_ctime(inode);
@@@ -940,7 -901,8 +949,8 @@@ static int fuse_do_getattr(struct inod
        args.out.args[0].value = &outarg;
        err = fuse_simple_request(fc, &args);
        if (!err) {
-               if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+               if (fuse_invalid_attr(&outarg.attr) ||
+                   (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
                        make_bad_inode(inode);
                        err = -EIO;
                } else {
@@@ -1245,7 -1207,7 +1255,7 @@@ static int fuse_direntplus_link(struct 
  
        if (invalid_nodeid(o->nodeid))
                return -EIO;
-       if (!fuse_valid_type(o->attr.mode))
+       if (fuse_invalid_attr(&o->attr))
                return -EIO;
  
        fc = get_fuse_conn(dir);
@@@ -1717,7 -1679,8 +1727,8 @@@ int fuse_do_setattr(struct inode *inode
                goto error;
        }
  
-       if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+       if (fuse_invalid_attr(&outarg.attr) ||
+           (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
                make_bad_inode(inode);
                err = -EIO;
                goto error;
diff --combined fs/fuse/fuse_i.h
@@@ -158,10 -158,6 +158,10 @@@ struct fuse_file 
  
        /** Has flock been performed on this file? */
        bool flock:1;
 +
 +      /* backing file used for passthrough read/write I/O */
 +      struct file *passthrough_filp;
 +      bool passthrough_enabled;
  };
  
  /** One input argument of a request */
@@@ -241,7 -237,6 +241,7 @@@ struct fuse_args 
                unsigned argvar:1;
                unsigned numargs;
                struct fuse_arg args[2];
 +              struct file *passthrough_filp;
        } out;
  };
  
@@@ -378,9 -373,6 +378,9 @@@ struct fuse_req 
        /** Inode used in the request or NULL */
        struct inode *inode;
  
 +      /** Path used for completing d_canonical_path */
 +      struct path *canonical_path;
 +
        /** AIO control block */
        struct fuse_io_priv *io;
  
  
        /** Request is stolen from fuse_file->reserved_req */
        struct file *stolen_file;
 +
 +      /** fuse passthrough file  */
 +      struct file *passthrough_filp;
  };
  
  struct fuse_iqueue {
@@@ -552,9 -541,6 +552,9 @@@ struct fuse_conn 
        /** write-back cache policy (default is write-through) */
        unsigned writeback_cache:1;
  
 +      /** passthrough IO. */
 +      unsigned passthrough:1;
 +
        /*
         * The following bitfields are only for optimization purposes
         * and hence races in setting them will not cause malfunction
@@@ -901,6 -887,8 +901,8 @@@ void fuse_ctl_remove_conn(struct fuse_c
   */
  int fuse_valid_type(int m);
  
+ bool fuse_invalid_attr(struct fuse_attr *attr);
  /**
   * Is current process allowed to perform filesystem operation?
   */
diff --combined fs/proc/array.c
@@@ -172,15 -172,15 +172,15 @@@ static inline void task_state(struct se
        seq_printf(m,
                "State:\t%s\n"
                "Tgid:\t%d\n"
 -              "Ngid:\t%d\n"
                "Pid:\t%d\n"
                "PPid:\t%d\n"
                "TracerPid:\t%d\n"
                "Uid:\t%d\t%d\t%d\t%d\n"
                "Gid:\t%d\t%d\t%d\t%d\n"
 +              "Ngid:\t%d\n"
                "FDSize:\t%d\nGroups:\t",
                get_task_state(p),
 -              tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
 +              tgid, pid_nr_ns(pid, ns), ppid, tpid,
                from_kuid_munged(user_ns, cred->uid),
                from_kuid_munged(user_ns, cred->euid),
                from_kuid_munged(user_ns, cred->suid),
                from_kgid_munged(user_ns, cred->egid),
                from_kgid_munged(user_ns, cred->sgid),
                from_kgid_munged(user_ns, cred->fsgid),
 -              max_fds);
 +              ngid, max_fds);
  
        group_info = cred->group_info;
        for (g = 0; g < group_info->ngroups; g++)
@@@ -425,9 -425,21 +425,21 @@@ static int do_task_stat(struct seq_fil
        mm = get_task_mm(task);
        if (mm) {
                vsize = task_vsize(mm);
-               if (permitted) {
-                       eip = KSTK_EIP(task);
-                       esp = KSTK_ESP(task);
+               /*
+                * esp and eip are intentionally zeroed out.  There is no
+                * non-racy way to read them without freezing the task.
+                * Programs that need reliable values can use ptrace(2).
+                *
+                * The only exception is if the task is core dumping because
+                * a program is not able to use ptrace(2) in that case. It is
+                * safe because the task has stopped executing permanently.
+                */
+               if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
+                       if (try_get_task_stack(task)) {
+                               eip = KSTK_EIP(task);
+                               esp = KSTK_ESP(task);
+                               put_task_stack(task);
+                       }
                }
        }
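
  A hypothetical userspace sketch of the ptrace(2) route recommended in the
  comment above (x86-64 register names, error handling elided; not part of
  this patch):

	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>
	#include <sys/wait.h>

	static void print_ip_sp(pid_t pid)
	{
		struct user_regs_struct regs;

		/* attach, wait for the stop, then read the register set */
		ptrace(PTRACE_ATTACH, pid, NULL, NULL);
		waitpid(pid, NULL, 0);
		ptrace(PTRACE_GETREGS, pid, NULL, &regs);
		printf("ip=0x%llx sp=0x%llx\n",
		       (unsigned long long)regs.rip,
		       (unsigned long long)regs.rsp);
		ptrace(PTRACE_DETACH, pid, NULL, NULL);
	}
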
  
diff --combined include/linux/dma-mapping.h
@@@ -61,10 -61,6 +61,10 @@@ struct dma_map_ops 
        int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
        int (*dma_supported)(struct device *dev, u64 mask);
        int (*set_dma_mask)(struct device *dev, u64 mask);
 +      void *(*remap)(struct device *dev, void *cpu_addr, dma_addr_t handle,
 +                      size_t size, struct dma_attrs *attrs);
 +      void (*unremap)(struct device *dev, void *remapped_address,
 +                      size_t size);
  #ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
        u64 (*get_required_mask)(struct device *dev);
  #endif
@@@ -93,40 -89,6 +93,40 @@@ static inline int is_device_dma_capable
  #include <asm-generic/dma-mapping-broken.h>
  #endif
  
 +#ifndef CONFIG_NO_DMA
 +static inline void *dma_remap(struct device *dev, void *cpu_addr,
 +              dma_addr_t dma_handle, size_t size, struct dma_attrs *attrs)
 +{
 +      const struct dma_map_ops *ops = get_dma_ops(dev);
 +      BUG_ON(!ops);
 +
 +      if (!ops->remap) {
 +              WARN_ONCE(1, "Remap function not implemented for %pS\n",
 +                              ops->remap);
 +              return NULL;
 +      }
 +
 +      return ops->remap(dev, cpu_addr, dma_handle, size, attrs);
 +}
 +
 +
 +static inline void dma_unremap(struct device *dev, void *remapped_addr,
 +                              size_t size)
 +{
 +      const struct dma_map_ops *ops = get_dma_ops(dev);
 +      BUG_ON(!ops);
 +
 +      if (!ops->unremap) {
 +              WARN_ONCE(1, "unremap function not implemented for %pS\n",
 +                              ops->unremap);
 +              return;
 +      }
 +
 +      return ops->unremap(dev, remapped_addr, size);
 +}
 +#endif
 +
 +
  static inline u64 dma_get_mask(struct device *dev)
  {
        if (dev && dev->dma_mask && *dev->dma_mask)
@@@ -189,8 -151,7 +189,7 @@@ static inline unsigned int dma_get_max_
        return SZ_64K;
  }
  
- static inline unsigned int dma_set_max_seg_size(struct device *dev,
-                                               unsigned int size)
+ static inline int dma_set_max_seg_size(struct device *dev, unsigned int size)
  {
        if (dev->dma_parms) {
                dev->dma_parms->max_segment_size = size;
diff --combined include/linux/netdevice.h
@@@ -1617,6 -1617,11 +1617,11 @@@ struct net_device 
        unsigned char           if_port;
        unsigned char           dma;
  
+       /* Note : dev->mtu is often read without holding a lock.
+        * Writers usually hold RTNL.
+        * It is recommended to use READ_ONCE() to annotate the reads,
+        * and to use WRITE_ONCE() to annotate the writes.
+        */
        unsigned int            mtu;
        unsigned short          type;
        unsigned short          hard_header_len;
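
  A minimal sketch of the annotation recommended in the comment above
  (example_read_mtu()/example_write_mtu() are hypothetical helpers, not part
  of this patch):

	static unsigned int example_read_mtu(const struct net_device *dev)
	{
		/* lockless reader: annotate the load */
		return READ_ONCE(dev->mtu);
	}

	static void example_write_mtu(struct net_device *dev, unsigned int new_mtu)
	{
		ASSERT_RTNL();			/* writers usually hold RTNL */
		WRITE_ONCE(dev->mtu, new_mtu);	/* annotate the store */
	}
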
@@@ -2589,7 -2594,6 +2594,7 @@@ extern int netdev_flow_limit_table_len
   */
  struct softnet_data {
        struct list_head        poll_list;
 +      struct napi_struct      *current_napi;
        struct sk_buff_head     process_queue;
  
        /* stats */
        unsigned int            time_squeeze;
        unsigned int            cpu_collision;
        unsigned int            received_rps;
 +      unsigned int            gro_coalesced;
 +
  #ifdef CONFIG_RPS
        struct softnet_data     *rps_ipi_list;
  #endif
@@@ -3086,7 -3088,6 +3091,7 @@@ struct sk_buff *napi_get_frags(struct n
  gro_result_t napi_gro_frags(struct napi_struct *napi);
  struct packet_offload *gro_find_receive_by_type(__be16 type);
  struct packet_offload *gro_find_complete_by_type(__be16 type);
 +extern struct napi_struct *get_current_napi_context(void);
  
  static inline void napi_free_frags(struct napi_struct *napi)
  {
diff --combined include/linux/regulator/consumer.h
@@@ -103,7 -103,6 +103,7 @@@ struct regmap
   *                      Data passed is old voltage cast to (void *).
   * PRE_DISABLE    Regulator is about to be disabled
   * ABORT_DISABLE  Regulator disable failed for some reason
 + * ENABLE         Regulator was enabled.
   *
   * NOTE: These events can be OR'ed together when passed into handler.
   */
  #define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE  0x200
  #define REGULATOR_EVENT_PRE_DISABLE           0x400
  #define REGULATOR_EVENT_ABORT_DISABLE         0x800
 +#define REGULATOR_EVENT_ENABLE                        0x1000
  
  /**
   * struct pre_voltage_change_data - Data sent with PRE_VOLTAGE_CHANGE event
@@@ -144,10 -142,6 +144,10 @@@ struct regulator
   *            using the bulk regulator APIs.
   * @consumer: The regulator consumer for the supply.  This will be managed
   *            by the bulk API.
 + * @min_uV:   The minimum requested voltage for the regulator (in microvolts),
 + *            or 0 to not set a voltage.
 + * @max_uV:   The maximum requested voltage for the regulator (in microvolts),
 + *            or 0 to use @min_uV.
   *
   * The regulator APIs provide a series of regulator_bulk_() API calls as
   * a convenience to consumers which require multiple supplies.  This
  struct regulator_bulk_data {
        const char *supply;
        struct regulator *consumer;
 +      int min_uV;
 +      int max_uV;
  
        /* private: Internal use */
        int ret;
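
  A rough consumer-side illustration of the new min_uV/max_uV fields together
  with the regulator_bulk_set_voltage() helper declared further down in this
  header; the supply names and voltages are assumptions:

	static struct regulator_bulk_data example_supplies[] = {
		{ .supply = "vdda",  .min_uV = 1800000, .max_uV = 1800000 },
		{ .supply = "vddio", .min_uV = 0 }, /* 0: leave voltage unchanged */
	};

	static int example_power_on(struct device *dev)
	{
		int ret;

		ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(example_supplies),
					      example_supplies);
		if (ret)
			return ret;

		ret = regulator_bulk_set_voltage(ARRAY_SIZE(example_supplies),
						 example_supplies);
		if (ret)
			return ret;

		return regulator_bulk_enable(ARRAY_SIZE(example_supplies),
					     example_supplies);
	}
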
@@@ -222,8 -214,6 +222,8 @@@ int __must_check devm_regulator_bulk_ge
                                         struct regulator_bulk_data *consumers);
  int __must_check regulator_bulk_enable(int num_consumers,
                                       struct regulator_bulk_data *consumers);
 +int regulator_bulk_set_voltage(int num_consumers,
 +                        struct regulator_bulk_data *consumers);
  int regulator_bulk_disable(int num_consumers,
                           struct regulator_bulk_data *consumers);
  int regulator_bulk_force_disable(int num_consumers,
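
With the new min_uV/max_uV fields and regulator_bulk_set_voltage() declared above, a consumer driver could request its supplies roughly as sketched below. The supply names and my_power_on() are hypothetical and error handling is reduced to the minimum; regulator_bulk_set_voltage() is the extension added by this header, not an upstream API.

static struct regulator_bulk_data my_supplies[] = {
	{ .supply = "vdd",   .min_uV = 1800000, .max_uV = 1800000 },
	{ .supply = "vddio" },	/* 0/0: leave this supply's voltage alone */
};

static int my_power_on(struct device *dev)
{
	int ret;

	ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(my_supplies), my_supplies);
	if (ret)
		return ret;

	ret = regulator_bulk_set_voltage(ARRAY_SIZE(my_supplies), my_supplies);
	if (ret)
		return ret;

	return regulator_bulk_enable(ARRAY_SIZE(my_supplies), my_supplies);
}
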
@@@ -234,7 -224,6 +234,7 @@@ void regulator_bulk_free(int num_consum
  int regulator_can_change_voltage(struct regulator *regulator);
  int regulator_count_voltages(struct regulator *regulator);
  int regulator_list_voltage(struct regulator *regulator, unsigned selector);
 +int regulator_list_corner_voltage(struct regulator *regulator, int corner);
  int regulator_is_supported_voltage(struct regulator *regulator,
                                   int min_uV, int max_uV);
  unsigned int regulator_get_linear_step(struct regulator *regulator);
@@@ -496,7 -485,7 +496,7 @@@ static inline unsigned int regulator_ge
  
  static inline int regulator_set_load(struct regulator *regulator, int load_uA)
  {
-       return REGULATOR_MODE_NORMAL;
+       return 0;
  }
  
  static inline int regulator_allow_bypass(struct regulator *regulator,
@@@ -567,11 -556,6 +567,11 @@@ static inline int regulator_list_voltag
        return -EINVAL;
  }
  
 +static inline int regulator_list_corner_voltage(struct regulator *regulator,
 +      int corner)
 +{
 +      return -EINVAL;
 +}
  #endif
  
  static inline int regulator_set_voltage_triplet(struct regulator *regulator,
@@@ -66,7 -66,6 +66,7 @@@ struct uart_ops 
        void            (*set_ldisc)(struct uart_port *, struct ktermios *);
        void            (*pm)(struct uart_port *, unsigned int state,
                              unsigned int oldstate);
 +      void            (*wake_peer)(struct uart_port *);
  
        /*
         * Return a string describing the type of the port
@@@ -160,6 -159,7 +160,7 @@@ struct uart_port 
        struct console          *cons;                  /* struct console, if any */
  #if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(SUPPORT_SYSRQ)
        unsigned long           sysrq;                  /* sysrq timeout */
+       unsigned int            sysrq_ch;               /* char for sysrq */
  #endif
  
        /* flags must be updated while holding port mutex */
@@@ -342,26 -342,22 +343,26 @@@ struct earlycon_device 
  
  struct earlycon_id {
        char    name[16];
 +      char    compatible[128];
        int     (*setup)(struct earlycon_device *, const char *options);
  } __aligned(32);
  
 +extern const struct earlycon_id __earlycon_table[];
 +extern const struct earlycon_id __earlycon_table_end[];
 +
 +#define OF_EARLYCON_DECLARE(_name, compat, fn)                                \
 +      static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \
 +           __used __section(__earlycon_table)                         \
 +              = { .name = __stringify(_name),                         \
 +                  .compatible = compat,                               \
 +                  .setup = fn  }
 +
 +#define EARLYCON_DECLARE(_name, fn)   OF_EARLYCON_DECLARE(_name, "", fn)
 +
  extern int setup_earlycon(char *buf);
  extern int of_setup_earlycon(unsigned long addr,
                             int (*setup)(struct earlycon_device *, const char *));
  
 -#define EARLYCON_DECLARE(_name, func)                                 \
 -      static const struct earlycon_id __earlycon_##_name              \
 -              __used __section(__earlycon_table)                      \
 -               = { .name  = __stringify(_name),                       \
 -                   .setup = func  }
 -
 -#define OF_EARLYCON_DECLARE(name, compat, fn)                         \
 -      _OF_DECLARE(earlycon, name, compat, fn, void *)
 -
  struct uart_port *uart_get_console(struct uart_port *ports, int nr,
                                   struct console *c);
  int uart_parse_earlycon(char *p, unsigned char *iotype, unsigned long *addr,
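
For reference, a driver registers with the table-based earlycon macros above by providing a setup callback; the OF variant additionally carries a DT compatible string. The sketch below is hypothetical (device name, compatible string and register layout are invented); only the macro and the struct earlycon_device fields follow the declarations in this header.

static void myuart_early_putc(struct uart_port *port, int c)
{
	/* hypothetical: a real driver would poll a TX-ready bit first */
	writel(c, port->membase);
}

static void myuart_early_write(struct console *con, const char *s, unsigned int n)
{
	struct earlycon_device *dev = con->data;

	uart_console_write(&dev->port, s, n, myuart_early_putc);
}

static int __init myuart_early_setup(struct earlycon_device *device,
				     const char *options)
{
	if (!device->port.membase)
		return -ENODEV;

	device->con->write = myuart_early_write;
	return 0;
}
OF_EARLYCON_DECLARE(myuart, "vendor,myuart", myuart_early_setup);
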
@@@ -402,7 -398,7 +403,7 @@@ int uart_resume_port(struct uart_drive
  static inline int uart_tx_stopped(struct uart_port *port)
  {
        struct tty_struct *tty = port->state->port.tty;
 -      if (tty->stopped || port->hw_stopped)
 +      if ((tty && tty->stopped) || port->hw_stopped)
                return 1;
        return 0;
  }
@@@ -445,8 -441,42 +446,42 @@@ uart_handle_sysrq_char(struct uart_por
        }
        return 0;
  }
+ static inline int
+ uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch)
+ {
+       if (port->sysrq) {
+               if (ch && time_before(jiffies, port->sysrq)) {
+                       port->sysrq_ch = ch;
+                       port->sysrq = 0;
+                       return 1;
+               }
+               port->sysrq = 0;
+       }
+       return 0;
+ }
+ static inline void
+ uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
+ {
+       int sysrq_ch;
+       sysrq_ch = port->sysrq_ch;
+       port->sysrq_ch = 0;
+       spin_unlock_irqrestore(&port->lock, irqflags);
+       if (sysrq_ch)
+               handle_sysrq(sysrq_ch);
+ }
  #else
- #define uart_handle_sysrq_char(port,ch) ({ (void)port; 0; })
+ static inline int
+ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { return 0; }
+ static inline int
+ uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) { return 0; }
+ static inline void
+ uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
+ {
+       spin_unlock_irqrestore(&port->lock, irqflags);
+ }
  #endif
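
The pair above exists so that a driver can note a pending sysrq character while port->lock is held and only call handle_sysrq() once the lock has been dropped. A minimal sketch of an RX interrupt handler using it follows; rx_data_ready() and myuart_read_char() are hypothetical stand-ins for the driver's own FIFO accessors.

static irqreturn_t myuart_irq(int irq, void *dev_id)
{
	struct uart_port *port = dev_id;
	unsigned long flags;
	unsigned int ch;

	spin_lock_irqsave(&port->lock, flags);
	while (rx_data_ready(port)) {			/* hypothetical */
		ch = myuart_read_char(port);		/* hypothetical */
		if (!uart_prepare_sysrq_char(port, ch))
			tty_insert_flip_char(&port->state->port, ch, TTY_NORMAL);
	}
	tty_flip_buffer_push(&port->state->port);
	/* drops port->lock and, if a sysrq char was recorded, handles it */
	uart_unlock_and_check_sysrq(port, flags);
	return IRQ_HANDLED;
}
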
  
  /*
@@@ -9,17 -9,50 +9,20 @@@
  
  #include <linux/types.h>
  #include <linux/bug.h>
 +#include <linux/restart_block.h>
  
 -#ifdef CONFIG_THREAD_INFO_IN_TASK
 -#define current_thread_info() ((struct thread_info *)current)
 -#endif
 -
 +#ifdef CONFIG_THREAD_INFO_IN_TASK
+ struct timespec;
+ struct compat_timespec;
  /*
 - * System call restart block.
 + * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 + * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 + * including <asm/current.h> can cause a circular dependency on some platforms.
   */
 -struct restart_block {
 -      long (*fn)(struct restart_block *);
 -      union {
 -              /* For futex_wait and futex_wait_requeue_pi */
 -              struct {
 -                      u32 __user *uaddr;
 -                      u32 val;
 -                      u32 flags;
 -                      u32 bitset;
 -                      u64 time;
 -                      u32 __user *uaddr2;
 -              } futex;
 -              /* For nanosleep */
 -              struct {
 -                      clockid_t clockid;
 -                      struct timespec __user *rmtp;
 -#ifdef CONFIG_COMPAT
 -                      struct compat_timespec __user *compat_rmtp;
 +#include <asm/current.h>
 +#define current_thread_info() ((struct thread_info *)current)
  #endif
 -                      u64 expires;
 -              } nanosleep;
 -              /* For poll */
 -              struct {
 -                      struct pollfd __user *ufds;
 -                      int nfds;
 -                      int has_timeout;
 -                      unsigned long tv_sec;
 -                      unsigned long tv_nsec;
 -              } poll;
 -      };
 -};
 -
 -extern long do_no_restart_syscall(struct restart_block *parm);
  
  #include <linux/bitops.h>
  #include <asm/thread_info.h>
@@@ -112,31 -145,6 +115,31 @@@ static inline bool test_and_clear_resto
  #error "no set_restore_sigmask() provided and default one won't work"
  #endif
  
 +#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
 +static inline int arch_within_stack_frames(const void * const stack,
 +                                         const void * const stackend,
 +                                         const void *obj, unsigned long len)
 +{
 +      return 0;
 +}
 +#endif
 +
 +#ifdef CONFIG_HARDENED_USERCOPY
 +extern void __check_object_size(const void *ptr, unsigned long n,
 +                                      bool to_user);
 +
 +static __always_inline void check_object_size(const void *ptr, unsigned long n,
 +                                            bool to_user)
 +{
 +      if (!__builtin_constant_p(n))
 +              __check_object_size(ptr, n, to_user);
 +}
 +#else
 +static inline void check_object_size(const void *ptr, unsigned long n,
 +                                   bool to_user)
 +{ }
 +#endif /* CONFIG_HARDENED_USERCOPY */
 +
  #endif        /* __KERNEL__ */
  
  #endif /* _LINUX_THREAD_INFO_H */
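
check_object_size() above is meant to sit on the hot copy_{to,from}_user() paths: with a compile-time-constant length it compiles to nothing, otherwise it forwards to __check_object_size() for the hardened-usercopy bounds checks. Below is a sketch of how an architecture's uaccess wrapper would typically call it; the wrapper and the raw copy routine are hypothetical.

/* Hypothetical arch wrapper: validate the kernel-side object before
 * letting the raw copy routine touch it.
 */
static inline unsigned long
my_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	check_object_size(from, n, true);	/* true: data flows to userspace */
	return my_arch_raw_copy_to_user(to, from, n);	/* hypothetical */
}
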
diff --combined include/net/ip.h
@@@ -172,7 -172,6 +172,7 @@@ struct ip_reply_arg 
                                /* -1 if not needed */ 
        int         bound_dev_if;
        u8          tos;
 +      kuid_t      uid;
  }; 
  
  #define IP_REPLY_ARG_NOSRCCHECK 1
@@@ -243,8 -242,6 +243,8 @@@ static inline int inet_is_local_reserve
  }
  #endif
  
 +extern int sysctl_reserved_port_bind;
 +
  /* From inetpeer.c */
  extern int inet_peer_threshold;
  extern int inet_peer_minttl;
@@@ -599,4 -596,9 +599,9 @@@ extern int sysctl_icmp_msgs_burst
  int ip_misc_proc_init(void);
  #endif
  
+ static inline bool inetdev_valid_mtu(unsigned int mtu)
+ {
+       return likely(mtu >= IPV4_MIN_MTU);
+ }
  #endif        /* _IP_H */
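
inetdev_valid_mtu() above gives callers a single place to reject MTUs that are too small to carry IPv4. A hypothetical guard built on it (only inetdev_valid_mtu() and the net_device field come from the headers):

/* Hypothetical: refuse to shrink a device below the IPv4 minimum. */
static int my_set_dev_mtu(struct net_device *dev, unsigned int new_mtu)
{
	if (!inetdev_valid_mtu(new_mtu))
		return -EINVAL;

	WRITE_ONCE(dev->mtu, new_mtu);	/* see the dev->mtu note in netdevice.h */
	return 0;
}
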
diff --combined include/net/tcp.h
@@@ -143,9 -143,6 +143,9 @@@ void tcp_time_wait(struct sock *sk, in
                                                 * most likely due to retrans in 3WHS.
                                                 */
  
 +/* Number of full MSS to receive before Acking RFC2581 */
 +#define TCP_DELACK_SEG          1
 +
  #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
                                                         * for local resources.
                                                         */
@@@ -289,14 -286,8 +289,14 @@@ extern int sysctl_tcp_autocorking
  extern int sysctl_tcp_invalid_ratelimit;
  extern int sysctl_tcp_pacing_ss_ratio;
  extern int sysctl_tcp_pacing_ca_ratio;
 +extern int sysctl_tcp_default_init_rwnd;
  
  extern atomic_long_t tcp_memory_allocated;
 +
 +/* sysctl variables for controlling various tcp parameters */
 +extern int sysctl_tcp_delack_seg;
 +extern int sysctl_tcp_use_userconfig;
 +
  extern struct percpu_counter tcp_sockets_allocated;
  extern int tcp_memory_pressure;
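
TCP_DELACK_SEG and the sysctl_tcp_delack_seg/sysctl_tcp_use_userconfig knobs above let the receiver delay its ACK until more than a configurable number of full segments has arrived (the stock behaviour corresponds to one MSS). Below is a rough sketch of the check this enables; the helper is hypothetical and loosely modelled on the receive-path ACK decision, which in reality carries several more conditions.

/* Hypothetical: has more than N full segments' worth of data arrived
 * since the last ACK we sent, with N taken from the new sysctl?
 */
static inline bool delack_threshold_crossed(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return tp->rcv_nxt - tp->rcv_wup >
	       sysctl_tcp_delack_seg * inet_csk(sk)->icsk_ack.rcv_mss;
}
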
  
@@@ -387,14 -378,7 +387,14 @@@ ssize_t tcp_splice_read(struct socket *
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags);
  
 +/* sysctl master controller */
 +extern int tcp_use_userconfig_sysctl_handler(struct ctl_table *, int,
 +                              void __user *, size_t *, loff_t *);
 +extern int tcp_proc_delayed_ack_control(struct ctl_table *, int,
 +                              void __user *, size_t *, loff_t *);
 +
  void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
 +
  static inline void tcp_dec_quickack_mode(struct sock *sk,
                                         const unsigned int pkts)
  {
@@@ -518,19 -502,27 +518,27 @@@ struct sock *cookie_v4_check(struct soc
   */
  static inline void tcp_synq_overflow(const struct sock *sk)
  {
-       unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+       unsigned long last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
        unsigned long now = jiffies;
  
-       if (time_after(now, last_overflow + HZ))
-               tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
+       if (!time_between32(now, last_overflow, last_overflow + HZ))
+               WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
  }
  
  /* syncookies: no recent synqueue overflow on this listening socket? */
  static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
  {
-       unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-       return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID);
+       unsigned long last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
+       /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID,
+        * then we're under synflood. However, we have to use
+        * 'last_overflow - HZ' as lower bound. That's because a concurrent
+        * tcp_synq_overflow() could update .ts_recent_stamp after we read
+        * jiffies but before we store .ts_recent_stamp into last_overflow,
+        * which could lead to rejecting a valid syncookie.
+        */
+       return !time_between32(jiffies, last_overflow - HZ,
+                              last_overflow + TCP_SYNCOOKIE_VALID);
  }
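
The rewritten checks above depend on two things: READ_ONCE()/WRITE_ONCE(), because ts_recent_stamp can be updated concurrently, and time_between32() for a wraparound-safe "is t within [l, h]?" test on 32-bit jiffies. The self-contained snippet below illustrates that interval test; the macro body is an assumption written for illustration, not a quote of the kernel header.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Assumed shape: true iff l <= t <= h in modular 32-bit arithmetic,
 * so the test keeps working across a jiffies wrap.
 */
#define demo_time_between32(t, l, h) \
	((u32)(h) - (u32)(l) >= (u32)(t) - (u32)(l))

int main(void)
{
	u32 last_overflow = 0xffffff00u;	/* shortly before the 32-bit wrap */
	u32 now = 0x00000010u;			/* jiffies after wrapping */

	/* prints 1: 'now' still falls inside [last_overflow, last_overflow + 1000] */
	printf("%d\n", demo_time_between32(now, last_overflow,
					   last_overflow + 1000));
	return 0;
}
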
  
  static inline u32 tcp_cookie_time(void)
@@@ -722,14 -714,11 +730,14 @@@ u32 __tcp_select_window(struct sock *sk
  
  void tcp_send_window_probe(struct sock *sk);
  
 -/* TCP timestamps are only 32-bits, this causes a slight
 - * complication on 64-bit systems since we store a snapshot
 - * of jiffies in the buffer control blocks below.  We decided
 - * to use only the low 32-bits of jiffies and hide the ugly
 - * casts with the following macro.
 +/* TCP uses 32bit jiffies to save some space.
 + * Note that this is different from tcp_time_stamp, which
 + * historically has been the same until linux-4.13.
 + */
 +#define tcp_jiffies32 ((u32)jiffies)
 +
 +/* Generator for TCP TS option (RFC 7323)
 + * Currently tied to 'jiffies' but will soon be driven by 1 ms clock.
   */
  #define tcp_time_stamp                ((__u32)(jiffies))
  
@@@ -1192,8 -1181,6 +1200,8 @@@ void tcp_set_state(struct sock *sk, in
  
  void tcp_done(struct sock *sk);
  
 +int tcp_abort(struct sock *sk, int err);
 +
  static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
  {
        rx_opt->dsack = 0;
diff --combined kernel/module.c
@@@ -1014,6 -1014,8 +1014,8 @@@ SYSCALL_DEFINE2(delete_module, const ch
        strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
  
        free_module(mod);
+       /* someone could wait for the module in add_unformed_module() */
+       wake_up_all(&module_wq);
        return 0;
  out:
        mutex_unlock(&module_mutex);
@@@ -2505,7 -2507,7 +2507,7 @@@ static void layout_symtab(struct modul
  
        /* We'll tack temporary mod_kallsyms on the end. */
        mod->init_size = ALIGN(mod->init_size,
 -                             __alignof__(struct mod_kallsyms));
 +                                    __alignof__(struct mod_kallsyms));
        info->mod_kallsyms_init_off = mod->init_size;
        mod->init_size += sizeof(struct mod_kallsyms);
        mod->init_size = debug_align(mod->init_size);
@@@ -2585,13 -2587,7 +2587,13 @@@ void * __weak module_alloc(unsigned lon
        return vmalloc_exec(size);
  }
  
 -#ifdef CONFIG_DEBUG_KMEMLEAK
 +#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF)
 +static void kmemleak_load_module(const struct module *mod,
 +                               const struct load_info *info)
 +{
 +      kmemleak_no_scan(mod->module_core);
 +}
 +#elif defined(CONFIG_DEBUG_KMEMLEAK)
  static void kmemleak_load_module(const struct module *mod,
                                 const struct load_info *info)
  {
diff --combined kernel/sched/fair.c
  #include <linux/mempolicy.h>
  #include <linux/migrate.h>
  #include <linux/task_work.h>
 -
 -#include <trace/events/sched.h>
 +#include <linux/module.h>
  
  #include "sched.h"
 +#include <trace/events/sched.h>
 +#include "tune.h"
 +#include "walt.h"
  
  /*
   * Targeted preemption latency for CPU-bound tasks:
@@@ -52,9 -50,6 +52,9 @@@
  unsigned int sysctl_sched_latency = 6000000ULL;
  unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  
 +unsigned int sysctl_sched_sync_hint_enable = 1;
 +unsigned int sysctl_sched_cstate_aware = 1;
 +
  /*
   * The initial- and re-scaling of tunables is configurable
   * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@@ -119,12 -114,6 +119,12 @@@ unsigned int __read_mostly sysctl_sched
  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  #endif
  
 +/*
 + * The margin used when comparing utilization with CPU capacity:
 + * util * margin < capacity * 1024
 + */
 +unsigned int capacity_margin = 1280; /* ~20% */
 +
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
        lw->weight += inc;
@@@ -247,9 -236,6 +247,9 @@@ static u64 __calc_delta(u64 delta_exec
        return mul_u64_u32_shr(delta_exec, fact, shift);
  }
  
 +#ifdef CONFIG_SMP
 +static int active_load_balance_cpu_stop(void *data);
 +#endif
  
  const struct sched_class fair_sched_class;
  
@@@ -300,59 -286,19 +300,59 @@@ static inline struct cfs_rq *group_cfs_
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
        if (!cfs_rq->on_list) {
 +              struct rq *rq = rq_of(cfs_rq);
 +              int cpu = cpu_of(rq);
                /*
                 * Ensure we either appear before our parent (if already
                 * enqueued) or force our parent to appear after us when it is
 -               * enqueued.  The fact that we always enqueue bottom-up
 -               * reduces this to two cases.
 +               * enqueued. The fact that we always enqueue bottom-up
 +               * reduces this to two cases and a special case for the root
 +               * cfs_rq. Furthermore, it also means that we will always reset
 +               * tmp_alone_branch either when the branch is connected
 +               * to a tree or when we reach the beginning of the tree.
                 */
                if (cfs_rq->tg->parent &&
 -                  cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 -                      list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 -                              &rq_of(cfs_rq)->leaf_cfs_rq_list);
 -              } else {
 +                  cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
 +                      /*
 +                       * If the parent is already on the list, we add the child
 +                       * just before. Thanks to the circular linked property of
 +                       * the list, this means putting the child at the tail
 +                       * of the list that starts with the parent.
 +                       */
 +                      list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 +                              &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
 +                      /*
 +                       * The branch is now connected to its tree so we can
 +                       * reset tmp_alone_branch to the beginning of the
 +                       * list.
 +                       */
 +                      rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 +              } else if (!cfs_rq->tg->parent) {
 +                      /*
 +                       * A cfs_rq without a parent should be put
 +                       * at the tail of the list.
 +                       */
                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 -                              &rq_of(cfs_rq)->leaf_cfs_rq_list);
 +                              &rq->leaf_cfs_rq_list);
 +                      /*
 +                       * We have reached the beginning of a tree so we can reset
 +                       * tmp_alone_branch to the beginning of the list.
 +                       */
 +                      rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 +              } else {
 +                      /*
 +                       * The parent has not already been added so we want to
 +                       * make sure that it will be put after us.
 +                       * tmp_alone_branch points to the beginning of the branch
 +                       * where we will add the parent.
 +                       */
 +                      list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 +                              rq->tmp_alone_branch);
 +                      /*
 +                       * Update tmp_alone_branch to point to the new beginning
 +                       * of the branch.
 +                       */
 +                      rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
                }
  
                cfs_rq->on_list = 1;
@@@ -710,7 -656,7 +710,7 @@@ static u64 sched_vslice(struct cfs_rq *
  }
  
  #ifdef CONFIG_SMP
 -static int select_idle_sibling(struct task_struct *p, int cpu);
 +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
  
  /*
@@@ -727,112 -673,25 +727,112 @@@ void init_entity_runnable_average(struc
  {
        struct sched_avg *sa = &se->avg;
  
 -      sa->last_update_time = 0;
 +      memset(sa, 0, sizeof(*sa));
        /*
 +       * util_avg is initialized in post_init_entity_util_avg.
 +       * util_est should start from zero.
         * sched_avg's period_contrib should be strictly less than 1024, so
         * we give it 1023 to make sure it is almost a period (1024us), and
         * will definitely be updated (after enqueue).
         */
        sa->period_contrib = 1023;
 -      sa->load_avg = scale_load_down(se->load.weight);
 +      /*
 +       * Tasks are initialized with full load to be seen as heavy tasks until
 +       * they get a chance to stabilize to their real load level.
 +       * Group entities are initialized with zero load to reflect the fact that
 +       * nothing has been attached to the task group yet.
 +       */
 +      if (entity_is_task(se))
 +              sa->load_avg = scale_load_down(se->load.weight);
        sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 -      sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
 -      sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
 -#else
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
 +static void attach_entity_cfs_rq(struct sched_entity *se);
 +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
 +
 +/*
 + * With new tasks being created, their initial util_avgs are extrapolated
 + * based on the cfs_rq's current util_avg:
 + *
 + *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 + *
 + * However, in many cases, the above util_avg does not give a desired
 + * value. Moreover, the sum of the util_avgs may be divergent, such
 + * as when the series is a harmonic series.
 + *
 + * To solve this problem, we also cap the util_avg of successive tasks to
 + * only 1/2 of the left utilization budget:
 + *
 + *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
 + *
 + * where n denotes the nth task.
 + *
 + * For example, the simplest series from the beginning would look like:
 + *
 + *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 + *
 + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 + * if util_avg > util_avg_cap.
 + */
 +void post_init_entity_util_avg(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +      struct sched_avg *sa = &se->avg;
 +      long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
 +
 +      if (cap > 0) {
 +              if (cfs_rq->avg.util_avg != 0) {
 +                      sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 +                      sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 +
 +                      if (sa->util_avg > cap)
 +                              sa->util_avg = cap;
 +              } else {
 +                      sa->util_avg = cap;
 +              }
 +              /*
 +               * If we wish to restore tuning via setting initial util,
 +               * this is where we should do it.
 +               */
 +              sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 +      }
 +
 +      if (entity_is_task(se)) {
 +              struct task_struct *p = task_of(se);
 +              if (p->sched_class != &fair_sched_class) {
 +                      /*
 +                       * For !fair tasks do:
 +                       *
 +                      update_cfs_rq_load_avg(now, cfs_rq, false);
 +                      attach_entity_load_avg(cfs_rq, se);
 +                      switched_from_fair(rq, p);
 +                       *
 +                       * such that the next switched_to_fair() has the
 +                       * expected state.
 +                       */
 +                      se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
 +                      return;
 +              }
 +      }
 +
 +      attach_entity_cfs_rq(se);
 +}
 +
 +#else /* !CONFIG_SMP */
  void init_entity_runnable_average(struct sched_entity *se)
  {
  }
 -#endif
 +void post_init_entity_util_avg(struct sched_entity *se)
 +{
 +}
 +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 +{
 +}
 +#endif /* CONFIG_SMP */
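
To make the capping described in the comment above concrete, here is a small standalone sketch of the same arithmetic, detached from the kernel types. It assumes SCHED_CAPACITY_SCALE = 1024, a default task weight of 1024, and (purely as an example) a cfs_rq whose load_avg equals its util_avg; with util_avg already at 512 it reproduces the second entry (256) of the series shown in the comment.

#include <stdio.h>

int main(void)
{
	long cfs_util = 512, cfs_load = 512, weight = 1024;
	long cap  = (1024 - cfs_util) / 2;		/* remaining budget / 2 = 256 */
	long util = cfs_util * weight / (cfs_load + 1);	/* extrapolated value, ~1022 */

	if (util > cap)
		util = cap;

	printf("initial util_avg = %ld (cap = %ld)\n", util, cap);
	return 0;
}
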
  
  /*
   * Update the current task's runtime statistics.
@@@ -877,56 -736,12 +877,56 @@@ static void update_curr_fair(struct rq 
        update_curr(cfs_rq_of(&rq->curr->se));
  }
  
 +#ifdef CONFIG_SCHEDSTATS
 +static inline void
 +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 +{
 +      u64 wait_start = rq_clock(rq_of(cfs_rq));
 +
 +      if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
 +          likely(wait_start > se->statistics.wait_start))
 +              wait_start -= se->statistics.wait_start;
 +
 +      se->statistics.wait_start = wait_start;
 +}
 +
 +static void
 +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 +{
 +      struct task_struct *p;
 +      u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 +
 +      if (entity_is_task(se)) {
 +              p = task_of(se);
 +              if (task_on_rq_migrating(p)) {
 +                      /*
 +                       * Preserve migrating task's wait time so wait_start
 +                       * time stamp can be adjusted to accumulate wait time
 +                       * prior to migration.
 +                       */
 +                      se->statistics.wait_start = delta;
 +                      return;
 +              }
 +              trace_sched_stat_wait(p, delta);
 +      }
 +
 +      se->statistics.wait_max = max(se->statistics.wait_max, delta);
 +      se->statistics.wait_count++;
 +      se->statistics.wait_sum += delta;
 +      se->statistics.wait_start = 0;
 +}
 +#else
  static inline void
  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 -      schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
  }
  
 +static inline void
 +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 +{
 +}
 +#endif
 +
  /*
   * Task is being enqueued - update stats:
   */
@@@ -940,6 -755,23 +940,6 @@@ static void update_stats_enqueue(struc
                update_stats_wait_start(cfs_rq, se);
  }
  
 -static void
 -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 -{
 -      schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 -                      rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 -      schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 -      schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 -                      rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 -#ifdef CONFIG_SCHEDSTATS
 -      if (entity_is_task(se)) {
 -              trace_sched_stat_wait(task_of(se),
 -                      rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 -      }
 -#endif
 -      schedstat_set(se->statistics.wait_start, 0);
 -}
 -
  static inline void
  update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
@@@ -1556,8 -1388,7 +1556,8 @@@ balance
         * Call select_idle_sibling to maybe find a better one.
         */
        if (!cur)
 -              env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 +              env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 +                                                 env->dst_cpu);
  
  assign:
        assigned = true;
@@@ -2563,22 -2394,28 +2563,22 @@@ account_entity_dequeue(struct cfs_rq *c
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  # ifdef CONFIG_SMP
 -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  {
 -      long tg_weight;
 +      long tg_weight, load, shares;
  
        /*
 -       * Use this CPU's real-time load instead of the last load contribution
 -       * as the updating of the contribution is delayed, and we will use the
 -       * the real-time load to calc the share. See update_tg_load_avg().
 +       * This really should be: cfs_rq->avg.load_avg, but instead we use
 +       * cfs_rq->load.weight, which is its upper bound. This helps ramp up
 +       * the shares for small weight interactive tasks.
         */
 -      tg_weight = atomic_long_read(&tg->load_avg);
 -      tg_weight -= cfs_rq->tg_load_avg_contrib;
 -      tg_weight += cfs_rq->load.weight;
 -
 -      return tg_weight;
 -}
 +      load = scale_load_down(cfs_rq->load.weight);
  
 -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 -{
 -      long tg_weight, load, shares;
 +      tg_weight = atomic_long_read(&tg->load_avg);
  
 -      tg_weight = calc_tg_weight(tg, cfs_rq);
 -      load = cfs_rq->load.weight;
 +      /* Ensure tg_weight >= load */
 +      tg_weight -= cfs_rq->tg_load_avg_contrib;
 +      tg_weight += load;
  
        shares = (tg->shares * load);
        if (tg_weight)
@@@ -2597,7 -2434,6 +2597,7 @@@ static inline long calc_cfs_shares(stru
        return tg->shares;
  }
  # endif /* CONFIG_SMP */
 +
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
  {
  
  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
  
 -static void update_cfs_shares(struct cfs_rq *cfs_rq)
 +static void update_cfs_shares(struct sched_entity *se)
  {
 +      struct cfs_rq *cfs_rq = group_cfs_rq(se);
        struct task_group *tg;
 -      struct sched_entity *se;
        long shares;
  
 -      tg = cfs_rq->tg;
 -      se = tg->se[cpu_of(rq_of(cfs_rq))];
 -      if (!se || throttled_hierarchy(cfs_rq))
 +      if (!cfs_rq)
 +              return;
 +
 +      if (throttled_hierarchy(cfs_rq))
                return;
 +
 +      tg = cfs_rq->tg;
 +
  #ifndef CONFIG_SMP
        if (likely(se->load.weight == tg->shares))
                return;
  
        reweight_entity(cfs_rq_of(se), se, shares);
  }
 +
  #else /* CONFIG_FAIR_GROUP_SCHED */
 -static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 +static inline void update_cfs_shares(struct sched_entity *se)
  {
  }
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_SMP
 -/* Precomputed fixed inverse multiplies for multiplication by y^n */
 +u32 sched_get_wake_up_idle(struct task_struct *p)
 +{
 +      u32 enabled = p->flags & PF_WAKE_UP_IDLE;
 +
 +      return !!enabled;
 +}
 +EXPORT_SYMBOL(sched_get_wake_up_idle);
 +
 +int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
 +{
 +      int enable = !!wake_up_idle;
 +
 +      if (enable)
 +              p->flags |= PF_WAKE_UP_IDLE;
 +      else
 +              p->flags &= ~PF_WAKE_UP_IDLE;
 +
 +      return 0;
 +}
 +EXPORT_SYMBOL(sched_set_wake_up_idle);
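
The two exported helpers above are the whole PF_WAKE_UP_IDLE interface: other kernel code can hint that a task's wakeups should be biased towards idle CPUs. A hypothetical caller (only sched_get_wake_up_idle() and sched_set_wake_up_idle() are real):

/* Hypothetical: boost a latency-sensitive worker for a while, then
 * restore whatever hint it had before.
 */
static u32 saved_wake_up_idle;

static void my_worker_boost(struct task_struct *p)
{
	saved_wake_up_idle = sched_get_wake_up_idle(p);
	sched_set_wake_up_idle(p, 1);
}

static void my_worker_unboost(struct task_struct *p)
{
	sched_set_wake_up_idle(p, saved_wake_up_idle);
}
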
 +
  static const u32 runnable_avg_yN_inv[] = {
        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
@@@ -2746,1183 -2557,120 +2746,1183 @@@ static u32 __compute_runnable_contrib(u
        return contrib + runnable_avg_yN_sum[n];
  }
  
 -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
 -#error "load tracking assumes 2^10 as unit"
 -#endif
 +#ifdef CONFIG_SCHED_HMP
 +
 +/* CPU selection flag */
 +#define SBC_FLAG_PREV_CPU                             0x1
 +#define SBC_FLAG_BEST_CAP_CPU                         0x2
 +#define SBC_FLAG_CPU_COST                             0x4
 +#define SBC_FLAG_MIN_COST                             0x8
 +#define SBC_FLAG_IDLE_LEAST_LOADED                    0x10
 +#define SBC_FLAG_IDLE_CSTATE                          0x20
 +#define SBC_FLAG_COST_CSTATE_TIE_BREAKER              0x40
 +#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER     0x80
 +#define SBC_FLAG_CSTATE_LOAD                          0x100
 +#define SBC_FLAG_BEST_SIBLING                         0x200
 +#define SBC_FLAG_WAKER_CPU                            0x400
 +#define SBC_FLAG_PACK_TASK                            0x800
 +
 +/* Cluster selection flag */
 +#define SBC_FLAG_COLOC_CLUSTER                                0x10000
 +#define SBC_FLAG_WAKER_CLUSTER                                0x20000
 +#define SBC_FLAG_BACKUP_CLUSTER                               0x40000
 +#define SBC_FLAG_BOOST_CLUSTER                                0x80000
 +
 +struct cpu_select_env {
 +      struct task_struct *p;
 +      struct related_thread_group *rtg;
 +      u8 reason;
 +      u8 need_idle:1;
 +      u8 need_waker_cluster:1;
 +      u8 sync:1;
 +      enum sched_boost_policy boost_policy;
 +      u8 pack_task:1;
 +      int prev_cpu;
 +      DECLARE_BITMAP(candidate_list, NR_CPUS);
 +      DECLARE_BITMAP(backup_list, NR_CPUS);
 +      u64 task_load;
 +      u64 cpu_load;
 +      u32 sbc_best_flag;
 +      u32 sbc_best_cluster_flag;
 +      struct cpumask search_cpus;
 +};
  
 -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 +struct cluster_cpu_stats {
 +      int best_idle_cpu, least_loaded_cpu;
 +      int best_capacity_cpu, best_cpu, best_sibling_cpu;
 +      int min_cost, best_sibling_cpu_cost;
 +      int best_cpu_wakeup_latency;
 +      u64 min_load, best_load, best_sibling_cpu_load;
 +      s64 highest_spare_capacity;
 +};
  
  /*
 - * We can represent the historical contribution to runnable average as the
 - * coefficients of a geometric series.  To do this we sub-divide our runnable
 - * history into segments of approximately 1ms (1024us); label the segment that
 - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 - *
 - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 - *      p0            p1           p2
 - *     (now)       (~1ms ago)  (~2ms ago)
 - *
 - * Let u_i denote the fraction of p_i that the entity was runnable.
 - *
 - * We then designate the fractions u_i as our co-efficients, yielding the
 - * following representation of historical load:
 - *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 - *
 - * We choose y based on the with of a reasonably scheduling period, fixing:
 - *   y^32 = 0.5
 - *
 - * This means that the contribution to load ~32ms ago (u_32) will be weighted
 - * approximately half as much as the contribution to load within the last ms
 - * (u_0).
 + * Should the task be woken to any available idle CPU?
   *
 - * When a period "rolls over" and we have new u_0`, multiplying the previous
 - * sum again by y is sufficient to update:
 - *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 - *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 + * Waking tasks to an idle CPU has mixed implications for both performance and
 + * power. In many cases the scheduler cannot correctly estimate the impact of
 + * using idle CPUs on either performance or power. PF_WAKE_UP_IDLE allows an
 + * external kernel module to pass a strong hint to the scheduler that the task
 + * in question should be woken to an idle CPU, generally to improve performance.
   */
 -static __always_inline int
 -__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 -                unsigned long weight, int running, struct cfs_rq *cfs_rq)
 +static inline int wake_to_idle(struct task_struct *p)
  {
 -      u64 delta, scaled_delta, periods;
 -      u32 contrib;
 -      unsigned int delta_w, scaled_delta_w, decayed = 0;
 -      unsigned long scale_freq, scale_cpu;
 +      return (current->flags & PF_WAKE_UP_IDLE) ||
 +               (p->flags & PF_WAKE_UP_IDLE);
 +}
  
 -      delta = now - sa->last_update_time;
 -      /*
 -       * This should only happen when time goes backwards, which it
 -       * unfortunately does during sched clock init when we swap over to TSC.
 -       */
 -      if ((s64)delta < 0) {
 -              sa->last_update_time = now;
 -              return 0;
 -      }
 +static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
 +{
 +      u64 total_load;
  
 -      /*
 -       * Use 1024ns as the unit of measurement since it's a reasonable
 -       * approximation of 1us and fast to compute.
 -       */
 -      delta >>= 10;
 -      if (!delta)
 -              return 0;
 -      sa->last_update_time = now;
 +      total_load = env->task_load + env->cpu_load;
  
 -      scale_freq = arch_scale_freq_capacity(NULL, cpu);
 -      scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 +      if (total_load > sched_spill_load ||
 +          (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
 +              return 1;
  
 -      /* delta_w is the amount already accumulated against our next period */
 -      delta_w = sa->period_contrib;
 -      if (delta + delta_w >= 1024) {
 -              decayed = 1;
 +      return 0;
 +}
  
 -              /* how much left for next period will start over, we don't know yet */
 -              sa->period_contrib = 0;
 +static int skip_cpu(int cpu, struct cpu_select_env *env)
 +{
 +      int tcpu = task_cpu(env->p);
 +      int skip = 0;
  
 -              /*
 -               * Now that we know we're crossing a period boundary, figure
 -               * out how much from delta we need to complete the current
 -               * period and accrue it.
 -               */
 -              delta_w = 1024 - delta_w;
 -              scaled_delta_w = cap_scale(delta_w, scale_freq);
 -              if (weight) {
 -                      sa->load_sum += weight * scaled_delta_w;
 -                      if (cfs_rq) {
 -                              cfs_rq->runnable_load_sum +=
 -                                              weight * scaled_delta_w;
 -                      }
 -              }
 -              if (running)
 -                      sa->util_sum += scaled_delta_w * scale_cpu;
 +      if (!env->reason)
 +              return 0;
  
 -              delta -= delta_w;
 +      if (is_reserved(cpu))
 +              return 1;
  
 -              /* Figure out how many additional periods this update spans */
 -              periods = delta / 1024;
 -              delta %= 1024;
 +      switch (env->reason) {
 +      case UP_MIGRATION:
 +              skip = !idle_cpu(cpu);
 +              break;
 +      case IRQLOAD_MIGRATION:
 +              /* Purposely fall through */
 +      default:
 +              skip = (cpu == tcpu);
 +              break;
 +      }
  
 -              sa->load_sum = decay_load(sa->load_sum, periods + 1);
 -              if (cfs_rq) {
 -                      cfs_rq->runnable_load_sum =
 -                              decay_load(cfs_rq->runnable_load_sum, periods + 1);
 -              }
 -              sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
 +      return skip;
 +}
  
 -              /* Efficiently calculate \sum (1..n_period) 1024*y^i */
 -              contrib = __compute_runnable_contrib(periods);
 -              contrib = cap_scale(contrib, scale_freq);
 -              if (weight) {
 -                      sa->load_sum += weight * contrib;
 -                      if (cfs_rq)
 -                              cfs_rq->runnable_load_sum += weight * contrib;
 -              }
 -              if (running)
 -                      sa->util_sum += contrib * scale_cpu;
 -      }
 +static inline int
 +acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
 +{
 +      int tcpu;
 +
 +      if (!env->reason)
 +              return 1;
 +
 +      tcpu = task_cpu(env->p);
 +      switch (env->reason) {
 +      case UP_MIGRATION:
 +              return cluster->capacity > cpu_capacity(tcpu);
 +
 +      case DOWN_MIGRATION:
 +              return cluster->capacity < cpu_capacity(tcpu);
 +
 +      default:
 +              break;
 +      }
 +
 +      return 1;
 +}
 +
 +static int
 +skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
 +{
 +      if (!test_bit(cluster->id, env->candidate_list))
 +              return 1;
 +
 +      if (!acceptable_capacity(cluster, env)) {
 +              __clear_bit(cluster->id, env->candidate_list);
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +static struct sched_cluster *
 +select_least_power_cluster(struct cpu_select_env *env)
 +{
 +      struct sched_cluster *cluster;
 +
 +      if (env->rtg) {
 +              int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
 +
 +              env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
 +
 +              if (task_load_will_fit(env->p, env->task_load,
 +                                      cpu, env->boost_policy)) {
 +                      env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
 +
 +                      if (env->boost_policy == SCHED_BOOST_NONE)
 +                              return env->rtg->preferred_cluster;
 +
 +                      for_each_sched_cluster(cluster) {
 +                              if (cluster != env->rtg->preferred_cluster) {
 +                                      __set_bit(cluster->id,
 +                                              env->backup_list);
 +                                      __clear_bit(cluster->id,
 +                                              env->candidate_list);
 +                              }
 +                      }
 +
 +                      return env->rtg->preferred_cluster;
 +              }
 +
 +              /*
 +               * Since the task load does not fit on the preferred
 +               * cluster anymore, pretend that the task does not
 +               * have any preferred cluster. This allows the waking
 +               * task to get the appropriate CPU it needs as per the
 +               * non co-location placement policy without having to
 +               * wait until the preferred cluster is updated.
 +               */
 +              env->rtg = NULL;
 +      }
 +
 +      for_each_sched_cluster(cluster) {
 +              if (!skip_cluster(cluster, env)) {
 +                      int cpu = cluster_first_cpu(cluster);
 +
 +                      env->task_load = scale_load_to_cpu(task_load(env->p),
 +                                                                       cpu);
 +                      if (task_load_will_fit(env->p, env->task_load, cpu,
 +                                             env->boost_policy))
 +                              return cluster;
 +
 +                      __set_bit(cluster->id, env->backup_list);
 +                      __clear_bit(cluster->id, env->candidate_list);
 +              }
 +      }
 +
 +      return NULL;
 +}
 +
 +static struct sched_cluster *
 +next_candidate(const unsigned long *list, int start, int end)
 +{
 +      int cluster_id;
 +
 +      cluster_id = find_next_bit(list, end, start - 1 + 1);
 +      if (cluster_id >= end)
 +              return NULL;
 +
 +      return sched_cluster[cluster_id];
 +}
 +
 +static void
 +update_spare_capacity(struct cluster_cpu_stats *stats,
 +                    struct cpu_select_env *env, int cpu, int capacity,
 +                    u64 cpu_load)
 +{
 +      s64 spare_capacity = sched_ravg_window - cpu_load;
 +
 +      if (spare_capacity > 0 &&
 +          (spare_capacity > stats->highest_spare_capacity ||
 +           (spare_capacity == stats->highest_spare_capacity &&
 +            ((!env->need_waker_cluster &&
 +              capacity > cpu_capacity(stats->best_capacity_cpu)) ||
 +             (env->need_waker_cluster &&
 +              cpu_rq(cpu)->nr_running <
 +              cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
 +              /*
 +               * If the sync waker is the only runnable task on its CPU, the
 +               * CPU's cr_avg is 0, so there is a high chance of placing the
 +               * wakee on the waker's CPU, which likely causes preemption of
 +               * the waker. This can lead to migration of the preempted waker.
 +               * Place the wakee on a truly idle CPU when possible by checking
 +               * nr_running, to avoid such preemption.
 +               */
 +              stats->highest_spare_capacity = spare_capacity;
 +              stats->best_capacity_cpu = cpu;
 +      }
 +}
 +
 +static inline void find_backup_cluster(
 +struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 +{
 +      struct sched_cluster *next = NULL;
 +      int i;
 +      struct cpumask search_cpus;
 +
 +      extern int num_clusters;
 +
 +      while (!bitmap_empty(env->backup_list, num_clusters)) {
 +              next = next_candidate(env->backup_list, 0, num_clusters);
 +              __clear_bit(next->id, env->backup_list);
 +
 +              cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
 +              for_each_cpu(i, &search_cpus) {
 +                      trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
 +                      sched_irqload(i), power_cost(i, task_load(env->p) +
 +                                      cpu_cravg_sync(i, env->sync)), 0);
 +
 +                      update_spare_capacity(stats, env, i, next->capacity,
 +                                        cpu_load_sync(i, env->sync));
 +              }
 +              env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
 +      }
 +}
 +
 +struct sched_cluster *
 +next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
 +                                      struct cluster_cpu_stats *stats)
 +{
 +      struct sched_cluster *next = NULL;
 +
 +      extern int num_clusters;
 +
 +      __clear_bit(cluster->id, env->candidate_list);
 +
 +      if (env->rtg && preferred_cluster(cluster, env->p))
 +              return NULL;
 +
 +      do {
 +              if (bitmap_empty(env->candidate_list, num_clusters))
 +                      return NULL;
 +
 +              next = next_candidate(env->candidate_list, 0, num_clusters);
 +              if (next) {
 +                      if (next->min_power_cost > stats->min_cost) {
 +                              clear_bit(next->id, env->candidate_list);
 +                              next = NULL;
 +                              continue;
 +                      }
 +
 +                      if (skip_cluster(next, env))
 +                              next = NULL;
 +              }
 +      } while (!next);
 +
 +      env->task_load = scale_load_to_cpu(task_load(env->p),
 +                                      cluster_first_cpu(next));
 +      return next;
 +}
 +
 +#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
 +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
 +                                 struct cpu_select_env *env, int cpu_cost)
 +{
 +      int wakeup_latency;
 +      int prev_cpu = env->prev_cpu;
 +
 +      wakeup_latency = cpu_rq(cpu)->wakeup_latency;
 +
 +      if (env->need_idle) {
 +              stats->min_cost = cpu_cost;
 +              if (idle_cpu(cpu)) {
 +                      if (wakeup_latency < stats->best_cpu_wakeup_latency ||
 +                          (wakeup_latency == stats->best_cpu_wakeup_latency &&
 +                           cpu == prev_cpu)) {
 +                              stats->best_idle_cpu = cpu;
 +                              stats->best_cpu_wakeup_latency = wakeup_latency;
 +                      }
 +              } else {
 +                      if (env->cpu_load < stats->min_load ||
 +                              (env->cpu_load == stats->min_load &&
 +                                                      cpu == prev_cpu)) {
 +                              stats->least_loaded_cpu = cpu;
 +                              stats->min_load = env->cpu_load;
 +                      }
 +              }
 +
 +              return;
 +      }
 +
 +      if (cpu_cost < stats->min_cost)  {
 +              stats->min_cost = cpu_cost;
 +              stats->best_cpu_wakeup_latency = wakeup_latency;
 +              stats->best_load = env->cpu_load;
 +              stats->best_cpu = cpu;
 +              env->sbc_best_flag = SBC_FLAG_CPU_COST;
 +              return;
 +      }
 +
 +      /* CPU cost is the same. Start breaking the tie by C-state */
 +
 +      if (wakeup_latency > stats->best_cpu_wakeup_latency)
 +              return;
 +
 +      if (wakeup_latency < stats->best_cpu_wakeup_latency) {
 +              stats->best_cpu_wakeup_latency = wakeup_latency;
 +              stats->best_load = env->cpu_load;
 +              stats->best_cpu = cpu;
 +              env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
 +              return;
 +      }
 +
 +      /* C-state is the same. Use prev CPU to break the tie */
 +      if (cpu == prev_cpu) {
 +              stats->best_cpu = cpu;
 +              env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
 +              return;
 +      }
 +
 +      if (stats->best_cpu != prev_cpu &&
 +          ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
 +          (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
 +              stats->best_load = env->cpu_load;
 +              stats->best_cpu = cpu;
 +              env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
 +      }
 +}
 +#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
 +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
 +                                 struct cpu_select_env *env, int cpu_cost)
 +{
 +      int prev_cpu = env->prev_cpu;
 +
 +      if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
 +              if (stats->best_sibling_cpu_cost > cpu_cost ||
 +                  (stats->best_sibling_cpu_cost == cpu_cost &&
 +                   stats->best_sibling_cpu_load > env->cpu_load)) {
 +                      stats->best_sibling_cpu_cost = cpu_cost;
 +                      stats->best_sibling_cpu_load = env->cpu_load;
 +                      stats->best_sibling_cpu = cpu;
 +              }
 +      }
 +
 +      if ((cpu_cost < stats->min_cost) ||
 +          ((stats->best_cpu != prev_cpu &&
 +            stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
 +              if (env->need_idle) {
 +                      if (idle_cpu(cpu)) {
 +                              stats->min_cost = cpu_cost;
 +                              stats->best_idle_cpu = cpu;
 +                      }
 +              } else {
 +                      stats->min_cost = cpu_cost;
 +                      stats->min_load = env->cpu_load;
 +                      stats->best_cpu = cpu;
 +                      env->sbc_best_flag = SBC_FLAG_MIN_COST;
 +              }
 +      }
 +}
 +#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
 +
 +static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
 +                                       struct cpu_select_env *env)
 +{
 +      int cpu_cost;
 +
 +      /*
 +       * We try to find the least loaded *busy* CPU irrespective
 +       * of the power cost.
 +       */
 +      if (env->pack_task)
 +              cpu_cost = cpu_min_power_cost(cpu);
 +
 +      else
 +              cpu_cost = power_cost(cpu, task_load(env->p) +
 +                              cpu_cravg_sync(cpu, env->sync));
 +
 +      if (cpu_cost <= stats->min_cost)
 +              __update_cluster_stats(cpu, stats, env, cpu_cost);
 +}
 +
 +static void find_best_cpu_in_cluster(struct sched_cluster *c,
 +       struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 +{
 +      int i;
 +      struct cpumask search_cpus;
 +
 +      cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
 +
 +      env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
 +
 +      for_each_cpu(i, &search_cpus) {
 +              env->cpu_load = cpu_load_sync(i, env->sync);
 +
 +              trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
 +                      sched_irqload(i),
 +                      power_cost(i, task_load(env->p) +
 +                                      cpu_cravg_sync(i, env->sync)), 0);
 +
 +              if (skip_cpu(i, env))
 +                      continue;
 +
 +              update_spare_capacity(stats, env, i, c->capacity,
 +                                    env->cpu_load);
 +
 +              /*
 +               * need_idle takes precedence over sched boost, but when both
 +               * are set, the idlest CPU within all the clusters is selected
 +               * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in
 +               * the big cluster is selected when boost_policy = BOOST_ON_BIG.
 +               */
 +              if ((!env->need_idle &&
 +                  env->boost_policy != SCHED_BOOST_NONE) ||
 +                  env->need_waker_cluster ||
 +                  sched_cpu_high_irqload(i) ||
 +                  spill_threshold_crossed(env, cpu_rq(i)))
 +                      continue;
 +
 +              update_cluster_stats(i, stats, env);
 +      }
 +}
 +
 +static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
 +{
 +      stats->best_cpu = stats->best_idle_cpu = -1;
 +      stats->best_capacity_cpu = stats->best_sibling_cpu  = -1;
 +      stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
 +      stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
 +      stats->highest_spare_capacity = 0;
 +      stats->least_loaded_cpu = -1;
 +      stats->best_cpu_wakeup_latency = INT_MAX;
 +      /* No need to initialize stats->best_load */
 +}
 +
 +static inline bool env_has_special_flags(struct cpu_select_env *env)
 +{
 +      if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
 +          env->reason)
 +              return true;
 +
 +      return false;
 +}
 +
 +static inline bool
 +bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 +{
 +      int prev_cpu;
 +      struct task_struct *task = env->p;
 +      struct sched_cluster *cluster;
 +
 +      if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
 +              return false;
 +
 +      prev_cpu = env->prev_cpu;
 +      if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
 +              return false;
 +
 +      if (task->ravg.mark_start - task->last_cpu_selected_ts >=
 +                              sched_long_cpu_selection_threshold)
 +              return false;
 +
 +      /*
 +       * This function should be used by the task wake-up path only, as it
 +       * assumes p->last_switch_out_ts is the last sleep time.
 +       * p->last_switch_out_ts can denote the last preemption time as well
 +       * as the last sleep time.
 +       */
 +      if (task->ravg.mark_start - task->last_switch_out_ts >=
 +                                      sched_short_sleep_task_threshold)
 +              return false;
 +
 +      env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
 +      cluster = cpu_rq(prev_cpu)->cluster;
 +
 +      if (!task_load_will_fit(task, env->task_load, prev_cpu,
 +                              sched_boost_policy())) {
 +
 +              __set_bit(cluster->id, env->backup_list);
 +              __clear_bit(cluster->id, env->candidate_list);
 +              return false;
 +      }
 +
 +      env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
 +      if (sched_cpu_high_irqload(prev_cpu) ||
 +                      spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
 +              update_spare_capacity(stats, env, prev_cpu,
 +                              cluster->capacity, env->cpu_load);
 +              cpumask_clear_cpu(prev_cpu, &env->search_cpus);
 +              return false;
 +      }
 +
 +      return true;
 +}
 +
 +static inline bool
 +wake_to_waker_cluster(struct cpu_select_env *env)
 +{
 +      return env->sync &&
 +             task_load(current) > sched_big_waker_task_load &&
 +             task_load(env->p) < sched_small_wakee_task_load;
 +}
 +
 +static inline bool
 +bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
 +{
 +      return sysctl_sched_prefer_sync_wakee_to_waker &&
 +             cpu_rq(cpu)->nr_running == 1 &&
 +             cpumask_test_cpu(cpu, &env->search_cpus);
 +}
 +
 +static inline int
 +cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
 +{
 +      return cpumask_intersects(&env->search_cpus, &cluster->cpus);
 +}
 +
 +/* return cheapest cpu that can fit this task */
 +static int select_best_cpu(struct task_struct *p, int target, int reason,
 +                         int sync)
 +{
 +      struct sched_cluster *cluster, *pref_cluster = NULL;
 +      struct cluster_cpu_stats stats;
 +      struct related_thread_group *grp;
 +      unsigned int sbc_flag = 0;
 +      int cpu = raw_smp_processor_id();
 +      bool special;
 +
 +      struct cpu_select_env env = {
 +              .p                      = p,
 +              .reason                 = reason,
 +              .need_idle              = wake_to_idle(p),
 +              .need_waker_cluster     = 0,
 +              .sync                   = sync,
 +              .prev_cpu               = target,
 +              .rtg                    = NULL,
 +              .sbc_best_flag          = 0,
 +              .sbc_best_cluster_flag  = 0,
 +              .pack_task              = false,
 +      };
 +
 +      env.boost_policy = task_sched_boost(p) ?
 +                      sched_boost_policy() : SCHED_BOOST_NONE;
 +
 +      bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
 +      bitmap_zero(env.backup_list, NR_CPUS);
 +
 +      cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
 +      cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
 +
 +      init_cluster_cpu_stats(&stats);
 +      special = env_has_special_flags(&env);
 +
 +      rcu_read_lock();
 +
 +      grp = task_related_thread_group(p);
 +
 +      if (grp && grp->preferred_cluster) {
 +              pref_cluster = grp->preferred_cluster;
 +              if (!cluster_allowed(&env, pref_cluster))
 +                      clear_bit(pref_cluster->id, env.candidate_list);
 +              else
 +                      env.rtg = grp;
 +      } else if (!special) {
 +              cluster = cpu_rq(cpu)->cluster;
 +              if (wake_to_waker_cluster(&env)) {
 +                      if (bias_to_waker_cpu(&env, cpu)) {
 +                              target = cpu;
 +                              sbc_flag = SBC_FLAG_WAKER_CLUSTER |
 +                                         SBC_FLAG_WAKER_CPU;
 +                              goto out;
 +                      } else if (cluster_allowed(&env, cluster)) {
 +                              env.need_waker_cluster = 1;
 +                              bitmap_zero(env.candidate_list, NR_CPUS);
 +                              __set_bit(cluster->id, env.candidate_list);
 +                              env.sbc_best_cluster_flag =
 +                                                      SBC_FLAG_WAKER_CLUSTER;
 +                      }
 +              } else if (bias_to_prev_cpu(&env, &stats)) {
 +                      sbc_flag = SBC_FLAG_PREV_CPU;
 +                      goto out;
 +              }
 +      }
 +
 +      if (!special && is_short_burst_task(p)) {
 +              env.pack_task = true;
 +              sbc_flag = SBC_FLAG_PACK_TASK;
 +      }
 +retry:
 +      cluster = select_least_power_cluster(&env);
 +
 +      if (!cluster)
 +              goto out;
 +
 +      /*
 +       * 'cluster' now points to the minimum power cluster which can satisfy
 +       * task's perf goals. Walk down the cluster list starting with that
 +       * cluster. For non-small tasks, skip clusters that don't have
 +       * mostly_idle/idle cpus
 +       */
 +
 +      do {
 +              find_best_cpu_in_cluster(cluster, &env, &stats);
 +
 +      } while ((cluster = next_best_cluster(cluster, &env, &stats)));
 +
 +      if (env.need_idle) {
 +              if (stats.best_idle_cpu >= 0) {
 +                      target = stats.best_idle_cpu;
 +                      sbc_flag |= SBC_FLAG_IDLE_CSTATE;
 +              } else if (stats.least_loaded_cpu >= 0) {
 +                      target = stats.least_loaded_cpu;
 +                      sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
 +              }
 +      } else if (stats.best_cpu >= 0) {
 +              if (stats.best_sibling_cpu >= 0 &&
 +                              stats.best_cpu != task_cpu(p) &&
 +                              stats.min_cost == stats.best_sibling_cpu_cost) {
 +                      stats.best_cpu = stats.best_sibling_cpu;
 +                      sbc_flag |= SBC_FLAG_BEST_SIBLING;
 +              }
 +              sbc_flag |= env.sbc_best_flag;
 +              target = stats.best_cpu;
 +      } else {
 +              if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
 +                      env.rtg = NULL;
 +                      goto retry;
 +              }
 +
 +              /*
 +               * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
 +               * backup_list = little cluster, candidate_list = none and
 +               * stats->best_capacity_cpu points to the best spare-capacity
 +               * CPU among the CPUs in the big cluster.
 +               */
 +              if (env.boost_policy == SCHED_BOOST_ON_BIG &&
 +                  stats.best_capacity_cpu >= 0)
 +                      sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
 +              else
 +                      find_backup_cluster(&env, &stats);
 +
 +              if (stats.best_capacity_cpu >= 0) {
 +                      target = stats.best_capacity_cpu;
 +                      sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
 +              }
 +      }
 +      p->last_cpu_selected_ts = sched_ktime_clock();
 +out:
 +      sbc_flag |= env.sbc_best_cluster_flag;
 +      rcu_read_unlock();
 +      trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
 +              env.reason, env.sync, env.need_idle, sbc_flag, target);
 +      return target;
 +}
 +
 +#ifdef CONFIG_CFS_BANDWIDTH
 +
 +static inline struct task_group *next_task_group(struct task_group *tg)
 +{
 +      tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
 +
 +      return (&tg->list == &task_groups) ? NULL : tg;
 +}
 +
 +/* Iterate over all cfs_rq in a cpu */
 +#define for_each_cfs_rq(cfs_rq, tg, cpu)      \
 +      for (tg = container_of(&task_groups, struct task_group, list);  \
 +              ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
 +
 +void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
 +{
 +      struct task_group *tg;
 +      struct cfs_rq *cfs_rq;
 +
 +      rcu_read_lock();
 +
 +      for_each_cfs_rq(cfs_rq, tg, cpu)
 +              reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
 +
 +      rcu_read_unlock();
 +}
 +
 +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
 +
 +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra);
 +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra);
 +
 +/* Add task's contribution to a cpu's HMP statistics */
 +void _inc_hmp_sched_stats_fair(struct rq *rq,
 +                      struct task_struct *p, int change_cra)
 +{
 +      struct cfs_rq *cfs_rq;
 +      struct sched_entity *se = &p->se;
 +
 +      /*
 +       * Although the check below is not strictly required (as
 +       * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called
 +       * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit of
 +       * efficiency by short-circuiting the for_each_sched_entity() loop
 +       * when sched_disable_window_stats is set.
 +       */
 +      if (sched_disable_window_stats)
 +              return;
 +
 +      for_each_sched_entity(se) {
 +              cfs_rq = cfs_rq_of(se);
 +              inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
 +              if (cfs_rq_throttled(cfs_rq))
 +                      break;
 +      }
 +
 +      /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
 +      if (!se)
 +              inc_rq_hmp_stats(rq, p, change_cra);
 +}
 +
 +/* Remove task's contribution from a cpu's HMP statistics */
 +static void
 +_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
 +{
 +      struct cfs_rq *cfs_rq;
 +      struct sched_entity *se = &p->se;
 +
 +      /* See comment on efficiency in _inc_hmp_sched_stats_fair */
 +      if (sched_disable_window_stats)
 +              return;
 +
 +      for_each_sched_entity(se) {
 +              cfs_rq = cfs_rq_of(se);
 +              dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
 +              if (cfs_rq_throttled(cfs_rq))
 +                      break;
 +      }
 +
 +      /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
 +      if (!se)
 +              dec_rq_hmp_stats(rq, p, change_cra);
 +}
 +
 +static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
 +{
 +      _inc_hmp_sched_stats_fair(rq, p, 1);
 +}
 +
 +static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
 +{
 +      _dec_hmp_sched_stats_fair(rq, p, 1);
 +}
 +
 +static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
 +                                     u32 new_task_load, u32 new_pred_demand)
 +{
 +      struct cfs_rq *cfs_rq;
 +      struct sched_entity *se = &p->se;
 +      s64 task_load_delta = (s64)new_task_load - task_load(p);
 +      s64 pred_demand_delta = PRED_DEMAND_DELTA;
 +
 +      for_each_sched_entity(se) {
 +              cfs_rq = cfs_rq_of(se);
 +
 +              fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
 +                                            task_load_delta,
 +                                            pred_demand_delta);
 +              fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
 +              if (cfs_rq_throttled(cfs_rq))
 +                      break;
 +      }
 +
 +      /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
 +      if (!se) {
 +              fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
 +                                            task_load_delta,
 +                                            pred_demand_delta);
 +              fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
 +      }
 +}
 +
 +static int task_will_be_throttled(struct task_struct *p);
 +
 +#else /* CONFIG_CFS_BANDWIDTH */
 +
 +inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
 +
 +static void
 +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
 +{
 +      inc_nr_big_task(&rq->hmp_stats, p);
 +      inc_cumulative_runnable_avg(&rq->hmp_stats, p);
 +}
 +
 +static void
 +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
 +{
 +      dec_nr_big_task(&rq->hmp_stats, p);
 +      dec_cumulative_runnable_avg(&rq->hmp_stats, p);
 +}
 +static void
 +fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
 +                         u32 new_task_load, u32 new_pred_demand)
 +{
 +      s64 task_load_delta = (s64)new_task_load - task_load(p);
 +      s64 pred_demand_delta = PRED_DEMAND_DELTA;
 +
 +      fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
 +                                    pred_demand_delta);
 +      fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
 +}
 +
 +static inline int task_will_be_throttled(struct task_struct *p)
 +{
 +      return 0;
 +}
 +
 +void _inc_hmp_sched_stats_fair(struct rq *rq,
 +                      struct task_struct *p, int change_cra)
 +{
 +      inc_nr_big_task(&rq->hmp_stats, p);
 +}
 +
 +#endif        /* CONFIG_CFS_BANDWIDTH */
 +
 +/*
 + * Reset balance_interval at all sched_domain levels of the given cpu, so
 + * that it honors the kick.
 + */
 +static inline void reset_balance_interval(int cpu)
 +{
 +      struct sched_domain *sd;
 +
 +      if (cpu >= nr_cpu_ids)
 +              return;
 +
 +      rcu_read_lock();
 +      for_each_domain(cpu, sd)
 +              sd->balance_interval = 0;
 +      rcu_read_unlock();
 +}
 +
 +/*
 + * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the
 + * ideal cpu as per its demand or priority).
 + *
 + * Returns the reason why the task needs to be migrated.
 + */
 +static inline int migration_needed(struct task_struct *p, int cpu)
 +{
 +      int nice;
 +      struct related_thread_group *grp;
 +
 +      if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
 +              return 0;
 +
 +      /* No need to migrate task that is about to be throttled */
 +      if (task_will_be_throttled(p))
 +              return 0;
 +
 +      if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
 +               cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
 +              return UP_MIGRATION;
 +
 +      if (sched_cpu_high_irqload(cpu))
 +              return IRQLOAD_MIGRATION;
 +
 +      nice = task_nice(p);
 +      rcu_read_lock();
 +      grp = task_related_thread_group(p);
 +      /*
 +       * Don't assume higher capacity means higher power. If the task
 +       * is running on the power efficient CPU, avoid migrating it
 +       * to a lower capacity cluster.
 +       */
 +      if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
 +                      upmigrate_discouraged(p)) &&
 +                      cpu_capacity(cpu) > min_capacity &&
 +                      cpu_max_power_cost(cpu) == max_power_cost) {
 +              rcu_read_unlock();
 +              return DOWN_MIGRATION;
 +      }
 +
 +      if (!task_will_fit(p, cpu)) {
 +              rcu_read_unlock();
 +              return UP_MIGRATION;
 +      }
 +      rcu_read_unlock();
 +
 +      return 0;
 +}
 +
 +static inline int
 +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
 +{
 +      unsigned long flags;
 +      int rc = 0;
 +
 +      /* Invoke active balance to force migrate currently running task */
 +      raw_spin_lock_irqsave(&rq->lock, flags);
 +      if (!rq->active_balance) {
 +              rq->active_balance = 1;
 +              rq->push_cpu = new_cpu;
 +              get_task_struct(p);
 +              rq->push_task = p;
 +              rc = 1;
 +      }
 +      raw_spin_unlock_irqrestore(&rq->lock, flags);
 +
 +      return rc;
 +}
 +
 +static DEFINE_RAW_SPINLOCK(migration_lock);
 +
 +static bool do_migration(int reason, int new_cpu, int cpu)
 +{
 +      if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
 +                              && same_cluster(new_cpu, cpu))
 +              return false;
 +
 +      /* Inter cluster high irqload migrations are OK */
 +      return new_cpu != cpu;
 +}
 +
 +/*
 + * Check if currently running task should be migrated to a better cpu.
 + *
 + * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
 + */
 +void check_for_migration(struct rq *rq, struct task_struct *p)
 +{
 +      int cpu = cpu_of(rq), new_cpu;
 +      int active_balance = 0, reason;
 +
 +      reason = migration_needed(p, cpu);
 +      if (!reason)
 +              return;
 +
 +      raw_spin_lock(&migration_lock);
 +      new_cpu = select_best_cpu(p, cpu, reason, 0);
 +
 +      if (do_migration(reason, new_cpu, cpu)) {
 +              active_balance = kick_active_balance(rq, p, new_cpu);
 +              if (active_balance)
 +                      mark_reserved(new_cpu);
 +      }
 +
 +      raw_spin_unlock(&migration_lock);
 +
 +      if (active_balance)
 +              stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
 +                                      &rq->active_balance_work);
 +}
 +
 +#ifdef CONFIG_CFS_BANDWIDTH
 +
 +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
 +{
 +      cfs_rq->hmp_stats.nr_big_tasks = 0;
 +      cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
 +      cfs_rq->hmp_stats.pred_demands_sum = 0;
 +}
 +
 +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +               struct task_struct *p, int change_cra)
 +{
 +      inc_nr_big_task(&cfs_rq->hmp_stats, p);
 +      if (change_cra)
 +              inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
 +}
 +
 +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +               struct task_struct *p, int change_cra)
 +{
 +      dec_nr_big_task(&cfs_rq->hmp_stats, p);
 +      if (change_cra)
 +              dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
 +}
 +
 +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
 +                       struct cfs_rq *cfs_rq)
 +{
 +      stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
 +      stats->cumulative_runnable_avg +=
 +                              cfs_rq->hmp_stats.cumulative_runnable_avg;
 +      stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
 +}
 +
 +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
 +                               struct cfs_rq *cfs_rq)
 +{
 +      stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
 +      stats->cumulative_runnable_avg -=
 +                              cfs_rq->hmp_stats.cumulative_runnable_avg;
 +      stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
 +
 +      BUG_ON(stats->nr_big_tasks < 0 ||
 +              (s64)stats->cumulative_runnable_avg < 0);
 +      BUG_ON((s64)stats->pred_demands_sum < 0);
 +}
 +
 +#else /* CONFIG_CFS_BANDWIDTH */
 +
 +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra) { }
 +
 +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra) { }
 +
 +#endif        /* CONFIG_CFS_BANDWIDTH */
 +
 +#else /* CONFIG_SCHED_HMP */
 +
 +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
 +
 +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra) { }
 +
 +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra) { }
 +
 +#define dec_throttled_cfs_rq_hmp_stats(...)
 +#define inc_throttled_cfs_rq_hmp_stats(...)
 +
 +#endif        /* CONFIG_SCHED_HMP */
 +
 +#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
 +#error "load tracking assumes 2^10 as unit"
 +#endif
 +
 +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
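 +/*
 + * e.g. cap_scale(1024, 512) == (1024 * 512) >> 10 == 512, i.e. scaling by
 + * half of SCHED_CAPACITY_SCALE halves the value.
 + */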
 +
 +/*
 + * We can represent the historical contribution to runnable average as the
 + * coefficients of a geometric series.  To do this we sub-divide our runnable
 + * history into segments of approximately 1ms (1024us); label the segment that
 + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 + *
 + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 + *      p0            p1           p2
 + *     (now)       (~1ms ago)  (~2ms ago)
 + *
 + * Let u_i denote the fraction of p_i that the entity was runnable.
 + *
 + * We then designate the fractions u_i as our coefficients, yielding the
 + * following representation of historical load:
 + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 + *
 + * We choose y based on the width of a reasonable scheduling period, fixing:
 + *   y^32 = 0.5
 + *
 + * This means that the contribution to load ~32ms ago (u_32) will be weighted
 + * approximately half as much as the contribution to load within the last ms
 + * (u_0).
 + *
 + * When a period "rolls over" and we have new u_0`, multiplying the previous
 + * sum again by y is sufficient to update:
 + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 + */
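 +
 +/*
 + * Worked example: from y^32 = 0.5 we get y = 0.5^(1/32) ~= 0.97857, so a
 + * fully runnable 1024us segment contributes 1024 today, ~1002 after one
 + * period and 512 after 32 periods.
 + */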
 +static __always_inline int
 +__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 +                unsigned long weight, int running, struct cfs_rq *cfs_rq)
 +{
 +      u64 delta, scaled_delta, periods;
 +      u32 contrib;
 +      unsigned int delta_w, scaled_delta_w, decayed = 0;
 +      unsigned long scale_freq, scale_cpu;
 +
 +      delta = now - sa->last_update_time;
 +      /*
 +       * This should only happen when time goes backwards, which it
 +       * unfortunately does during sched clock init when we swap over to TSC.
 +       */
 +      if ((s64)delta < 0) {
 +              sa->last_update_time = now;
 +              return 0;
 +      }
 +
 +      /*
 +       * Use 1024ns as the unit of measurement since it's a reasonable
 +       * approximation of 1us and fast to compute.
 +       */
 +      delta >>= 10;
 +      if (!delta)
 +              return 0;
 +      sa->last_update_time = now;
 +
 +      scale_freq = arch_scale_freq_capacity(NULL, cpu);
 +      scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 +      trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
 +
 +      /* delta_w is the amount already accumulated against our next period */
 +      delta_w = sa->period_contrib;
 +      if (delta + delta_w >= 1024) {
 +              decayed = 1;
 +
 +              /* the contribution to the next period starts over; we don't know it yet */
 +              sa->period_contrib = 0;
 +
 +              /*
 +               * Now that we know we're crossing a period boundary, figure
 +               * out how much from delta we need to complete the current
 +               * period and accrue it.
 +               */
 +              delta_w = 1024 - delta_w;
 +              scaled_delta_w = cap_scale(delta_w, scale_freq);
 +              if (weight) {
 +                      sa->load_sum += weight * scaled_delta_w;
 +                      if (cfs_rq) {
 +                              cfs_rq->runnable_load_sum +=
 +                                              weight * scaled_delta_w;
 +                      }
 +              }
 +              if (running)
 +                      sa->util_sum += scaled_delta_w * scale_cpu;
 +
 +              delta -= delta_w;
 +
 +              /* Figure out how many additional periods this update spans */
 +              periods = delta / 1024;
 +              delta %= 1024;
 +
 +              sa->load_sum = decay_load(sa->load_sum, periods + 1);
 +              if (cfs_rq) {
 +                      cfs_rq->runnable_load_sum =
 +                              decay_load(cfs_rq->runnable_load_sum, periods + 1);
 +              }
 +              sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
 +
 +              /* Efficiently calculate \sum (1..n_period) 1024*y^i */
 +              contrib = __compute_runnable_contrib(periods);
 +              contrib = cap_scale(contrib, scale_freq);
 +              if (weight) {
 +                      sa->load_sum += weight * contrib;
 +                      if (cfs_rq)
 +                              cfs_rq->runnable_load_sum += weight * contrib;
 +              }
 +              if (running)
 +                      sa->util_sum += contrib * scale_cpu;
 +      }
  
        /* Remainder of delta accrued against u_0` */
        scaled_delta = cap_scale(delta, scale_freq);
                if (cfs_rq)
                        cfs_rq->runnable_load_sum += weight * scaled_delta;
        }
 +
        if (running)
                sa->util_sum += scaled_delta * scale_cpu;
  
        return decayed;
  }
  
 -#ifdef CONFIG_FAIR_GROUP_SCHED
  /*
 - * Updating tg's load_avg is necessary before update_cfs_share (which is done)
 - * and effective_load (which is not done because it is too costly).
 + * Signed add and clamp on underflow.
 + *
 + * Explicitly do a load-store to ensure the intermediate value never hits
 + * memory. This allows lockless observations without ever seeing the negative
 + * values.
 + */
 +#define add_positive(_ptr, _val) do {                           \
 +      typeof(_ptr) ptr = (_ptr);                              \
 +      typeof(_val) val = (_val);                              \
 +      typeof(*ptr) res, var = READ_ONCE(*ptr);                \
 +                                                              \
 +      res = var + val;                                        \
 +                                                              \
 +      if (val < 0 && res > var)                               \
 +              res = 0;                                        \
 +                                                              \
 +      WRITE_ONCE(*ptr, res);                                  \
 +} while (0)
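 +
 +/*
 + * e.g. for an unsigned long target with *ptr == 30 and val == -50, the sum
 + * wraps around; add_positive() detects this (val < 0 && res > var) and
 + * stores 0 instead of the wrapped value.
 + */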
 +
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +/**
 + * update_tg_load_avg - update the tg's load avg
 + * @cfs_rq: the cfs_rq whose avg changed
 + * @force: update regardless of how small the difference
 + *
 + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
 + * However, because tg->load_avg is a global value there are performance
 + * considerations.
 + *
 + * In order to avoid having to look at the other cfs_rq's, we use a
 + * differential update where we store the last value we propagated. This in
 + * turn allows skipping updates if the differential is 'small'.
 + *
 + * Updating tg's load_avg is necessary before update_cfs_share() (which is
 + * done) and effective_load() (which is not done because it is too costly).
   */
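 +/*
 + * In the function below, 'small' means the delta is at most 1/64th of the
 + * last value propagated to tg->load_avg (cfs_rq->tg_load_avg_contrib).
 + */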
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  {
        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
  
 +      /*
 +       * No need to update load_avg for root_task_group as it is not used.
 +       */
 +      if (cfs_rq->tg == &root_task_group)
 +              return;
 +
        if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }
  }
  
 +/*
 + * Called within set_task_rq() right before setting a task's cpu. The
 + * caller only guarantees p->pi_lock is held; no other assumptions,
 + * including the state of rq->lock, should be made.
 + */
 +void set_task_rq_fair(struct sched_entity *se,
 +                    struct cfs_rq *prev, struct cfs_rq *next)
 +{
 +      if (!sched_feat(ATTACH_AGE_LOAD))
 +              return;
 +
 +      /*
 +       * We are supposed to update the task to "current" time, so that it is
 +       * up to date and ready to go to the new CPU/cfs_rq. But we have
 +       * difficulty getting what the current time is, so simply throw away
 +       * the out-of-date time. This results in the wakee task being less
 +       * decayed, but giving the wakee more load is not a bad thing.
 +       */
 +      if (se->avg.last_update_time && prev) {
 +              u64 p_last_update_time;
 +              u64 n_last_update_time;
 +
 +#ifndef CONFIG_64BIT
 +              u64 p_last_update_time_copy;
 +              u64 n_last_update_time_copy;
 +
 +              do {
 +                      p_last_update_time_copy = prev->load_last_update_time_copy;
 +                      n_last_update_time_copy = next->load_last_update_time_copy;
 +
 +                      smp_rmb();
 +
 +                      p_last_update_time = prev->avg.last_update_time;
 +                      n_last_update_time = next->avg.last_update_time;
 +
 +              } while (p_last_update_time != p_last_update_time_copy ||
 +                       n_last_update_time != n_last_update_time_copy);
 +#else
 +              p_last_update_time = prev->avg.last_update_time;
 +              n_last_update_time = next->avg.last_update_time;
 +#endif
 +              __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
 +                                &se->avg, 0, 0, NULL);
 +              se->avg.last_update_time = n_last_update_time;
 +      }
 +}
 +
 +/* Take into account change of utilization of a child task group */
 +static inline void
 +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
 +{
 +      struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 +      long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 +
 +      /* Nothing to update */
 +      if (!delta)
 +              return;
 +
 +      /* Set new sched_entity's utilization */
 +      se->avg.util_avg = gcfs_rq->avg.util_avg;
 +      se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
 +
 +      /* Update parent cfs_rq utilization */
 +      add_positive(&cfs_rq->avg.util_avg, delta);
 +      cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 +}
 +
 +/* Take into account change of load of a child task group */
 +static inline void
 +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 +{
 +      struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 +      long delta, load = gcfs_rq->avg.load_avg;
 +
 +      /*
 +       * If the load of group cfs_rq is null, the load of the
 +       * sched_entity will also be null so we can skip the formula
 +       */
 +      if (load) {
 +              long tg_load;
 +
 +              /* Get tg's load and ensure tg_load > 0 */
 +              tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
 +
 +              /* Ensure tg_load >= load and is updated with the current load */
 +              tg_load -= gcfs_rq->tg_load_avg_contrib;
 +              tg_load += load;
 +
 +              /*
 +               * We need to compute a correction term in the case that the
 +               * task group is consuming more CPU than a task of equal
 +               * weight. A task with a weight equal to tg->shares will have
 +               * a load less than or equal to scale_load_down(tg->shares).
 +               * Similarly, the sched_entities that represent the task group
 +               * at the parent level can't have a load higher than
 +               * scale_load_down(tg->shares), and the sum of sched_entities'
 +               * load must be <= scale_load_down(tg->shares).
 +               */
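 +              /*
 +               * e.g. with scale_load_down(tg->shares) == 1024, load == 2048
 +               * and tg_load == 4096, the branch below scales load down to
 +               * 2048 * 1024 / 4096 == 512.
 +               */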
 +              if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
 +                      /* scale gcfs_rq's load into tg's shares */
 +                      load *= scale_load_down(gcfs_rq->tg->shares);
 +                      load /= tg_load;
 +              }
 +      }
 +
 +      delta = load - se->avg.load_avg;
 +
 +      /* Nothing to update */
 +      if (!delta)
 +              return;
 +
 +      /* Set new sched_entity's load */
 +      se->avg.load_avg = load;
 +      se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
 +
 +      /* Update parent cfs_rq load */
 +      add_positive(&cfs_rq->avg.load_avg, delta);
 +      cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
 +
 +      /*
 +       * If the sched_entity is already enqueued, we also have to update the
 +       * runnable load avg.
 +       */
 +      if (se->on_rq) {
 +              /* Update parent cfs_rq runnable_load_avg */
 +              add_positive(&cfs_rq->runnable_load_avg, delta);
 +              cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
 +      }
 +}
 +
 +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
 +{
 +      cfs_rq->propagate_avg = 1;
 +}
 +
 +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = group_cfs_rq(se);
 +
 +      if (!cfs_rq->propagate_avg)
 +              return 0;
 +
 +      cfs_rq->propagate_avg = 0;
 +      return 1;
 +}
 +
 +/* Update task and its cfs_rq load average */
 +static inline int propagate_entity_load_avg(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq;
 +
 +      if (entity_is_task(se))
 +              return 0;
 +
 +      if (!test_and_clear_tg_cfs_propagate(se))
 +              return 0;
 +
 +      cfs_rq = cfs_rq_of(se);
 +
 +      set_tg_cfs_propagate(cfs_rq);
 +
 +      update_tg_cfs_util(cfs_rq, se);
 +      update_tg_cfs_load(cfs_rq, se);
 +
 +      return 1;
 +}
 +
  #else /* CONFIG_FAIR_GROUP_SCHED */
 +
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 +
 +static inline int propagate_entity_load_avg(struct sched_entity *se)
 +{
 +      return 0;
 +}
 +
 +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 +
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
 +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 +{
 +        if (&this_rq()->cfs == cfs_rq) {
 +                /*
 +                 * There are a few boundary cases this might miss but it should
 +                 * get called often enough that that should (hopefully) not be
 +                 * a real problem -- added to that it only calls on the local
 +                 * CPU, so if we enqueue remotely we'll miss an update, but
 +                 * the next tick/schedule should update.
 +                 *
 +                 * It will not get called when we go idle, because the idle
 +                 * thread is a different class (!fair), nor will the utilization
 +                 * number include things like RT tasks.
 +                 *
 +                 * As is, the util number is not freq-invariant (we'd have to
 +                 * implement arch_scale_freq_capacity() for that).
 +                 *
 +                 * See cpu_util().
 +                 */
 +                cpufreq_update_util(rq_of(cfs_rq), 0);
 +        }
 +}
 +
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  
  /*
        WRITE_ONCE(*ptr, res);                                  \
  } while (0)
  
 -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 +/**
 + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 + * @now: current time, as per cfs_rq_clock_task()
 + * @cfs_rq: cfs_rq to update
 + * @update_freq: should we call cfs_rq_util_change() or will the caller do so
 + *
 + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 + * avg. The immediate corollary is that all (fair) tasks must be attached, see
 + * post_init_entity_util_avg().
 + *
 + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
 + *
 + * Returns true if the load decayed or we removed load.
 + *
 + * Since both these conditions indicate a changed cfs_rq->avg.load we should
 + * call update_tg_load_avg() when this function returns true.
 + */
 +static inline int
 +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  {
        struct sched_avg *sa = &cfs_rq->avg;
 -      int decayed, removed = 0;
 +      int decayed, removed = 0, removed_util = 0;
  
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
                sub_positive(&sa->load_avg, r);
                sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                removed = 1;
 +              set_tg_cfs_propagate(cfs_rq);
        }
  
        if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                sub_positive(&sa->util_avg, r);
                sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 +              removed_util = 1;
 +              set_tg_cfs_propagate(cfs_rq);
        }
  
        decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
  #endif
  
 +      /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
 +      if (cfs_rq == &rq_of(cfs_rq)->cfs)
 +              trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
 +
 +      if (update_freq && (decayed || removed_util))
 +              cfs_rq_util_change(cfs_rq);
 +
        return decayed || removed;
  }
  
 +/*
 + * Optional action to be done while updating the load average
 + */
 +#define UPDATE_TG     0x1
 +#define SKIP_AGE_LOAD 0x2
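 +/*
 + * UPDATE_TG: also call update_tg_load_avg() when the averages have changed.
 + * SKIP_AGE_LOAD: do not age the entity's own average before propagating.
 + */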
 +
  /* Update task and its cfs_rq load average */
 -static inline void update_load_avg(struct sched_entity *se, int update_tg)
 +static inline void update_load_avg(struct sched_entity *se, int flags)
  {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 now = cfs_rq_clock_task(cfs_rq);
        int cpu = cpu_of(rq_of(cfs_rq));
 +      int decayed;
 +      void *ptr = NULL;
  
        /*
         * Track task load average for carrying it to new CPU after migrated, and
         * track group sched_entity load average for task_h_load calc in migration
         */
 -      __update_load_avg(now, cpu, &se->avg,
 +      if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
 +              __update_load_avg(now, cpu, &se->avg,
                          se->on_rq * scale_load_down(se->load.weight),
                          cfs_rq->curr == se, NULL);
 +      }
 +
 +      decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
 +      decayed |= propagate_entity_load_avg(se);
  
 -      if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
 +      if (decayed && (flags & UPDATE_TG))
                update_tg_load_avg(cfs_rq, 0);
 +
 +      if (entity_is_task(se)) {
 +#ifdef CONFIG_SCHED_WALT
 +              ptr = (void *)&(task_of(se)->ravg);
 +#endif
 +              trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
 +      }
  }
  
 +/**
 + * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 + * @cfs_rq: cfs_rq to attach to
 + * @se: sched_entity to attach
 + *
 + * Must call update_cfs_rq_load_avg() before this, since we rely on
 + * cfs_rq->avg.last_update_time being current.
 + */
  static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 -      if (!sched_feat(ATTACH_AGE_LOAD))
 -              goto skip_aging;
 -
 -      /*
 -       * If we got migrated (either between CPUs or between cgroups) we'll
 -       * have aged the average right before clearing @last_update_time.
 -       */
 -      if (se->avg.last_update_time) {
 -              __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
 -                                &se->avg, 0, 0, NULL);
 -
 -              /*
 -               * XXX: we could have just aged the entire load away if we've been
 -               * absent from the fair class for too long.
 -               */
 -      }
 -
 -skip_aging:
        se->avg.last_update_time = cfs_rq->avg.last_update_time;
        cfs_rq->avg.load_avg += se->avg.load_avg;
        cfs_rq->avg.load_sum += se->avg.load_sum;
        cfs_rq->avg.util_avg += se->avg.util_avg;
        cfs_rq->avg.util_sum += se->avg.util_sum;
 +      set_tg_cfs_propagate(cfs_rq);
 +
 +      cfs_rq_util_change(cfs_rq);
  }
  
 +/**
 + * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 + * @cfs_rq: cfs_rq to detach from
 + * @se: sched_entity to detach
 + *
 + * Must call update_cfs_rq_load_avg() before this, since we rely on
 + * cfs_rq->avg.last_update_time being current.
 + */
  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 -      __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
 -                        &se->avg, se->on_rq * scale_load_down(se->load.weight),
 -                        cfs_rq->curr == se, NULL);
  
        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
        sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
        sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
        sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 +      set_tg_cfs_propagate(cfs_rq);
 +
 +      cfs_rq_util_change(cfs_rq);
  }
  
  /* Add the load generated by se into cfs_rq's load average */
@@@ -4365,20 -2827,34 +4365,20 @@@ static inline voi
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        struct sched_avg *sa = &se->avg;
 -      u64 now = cfs_rq_clock_task(cfs_rq);
 -      int migrated, decayed;
 -
 -      migrated = !sa->last_update_time;
 -      if (!migrated) {
 -              __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
 -                      se->on_rq * scale_load_down(se->load.weight),
 -                      cfs_rq->curr == se, NULL);
 -      }
 -
 -      decayed = update_cfs_rq_load_avg(now, cfs_rq);
  
        cfs_rq->runnable_load_avg += sa->load_avg;
        cfs_rq->runnable_load_sum += sa->load_sum;
  
 -      if (migrated)
 +      if (!sa->last_update_time) {
                attach_entity_load_avg(cfs_rq, se);
 -
 -      if (decayed || migrated)
                update_tg_load_avg(cfs_rq, 0);
 +      }
  }
  
  /* Remove the runnable load generated by se from cfs_rq's runnable load average */
  static inline void
  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 -      update_load_avg(se, 1);
 -
        cfs_rq->runnable_load_avg =
                max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
        cfs_rq->runnable_load_sum =
@@@ -4407,37 -2883,24 +4407,37 @@@ static inline u64 cfs_rq_last_update_ti
  #endif
  
  /*
 + * Synchronize entity load avg of dequeued entity without locking
 + * the previous rq.
 + */
 +void sync_entity_load_avg(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +      u64 last_update_time;
 +
 +      last_update_time = cfs_rq_last_update_time(cfs_rq);
 +      __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
 +}
 +
 +/*
   * Task first catches up with cfs_rq, and then subtracts
   * itself from the cfs_rq (task must be off the queue now).
   */
  void remove_entity_load_avg(struct sched_entity *se)
  {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 -      u64 last_update_time;
  
        /*
 -       * Newly created task or never used group entity should not be removed
 -       * from its (source) cfs_rq
 +       * tasks cannot exit without having gone through wake_up_new_task() ->
 +       * post_init_entity_util_avg() which will have added things to the
 +       * cfs_rq, so we can remove unconditionally.
 +       *
 +       * Similarly for groups, they will have passed through
 +       * post_init_entity_util_avg() before unregister_sched_fair_group()
 +       * calls this.
         */
 -      if (se->avg.last_update_time == 0)
 -              return;
 -
 -      last_update_time = cfs_rq_last_update_time(cfs_rq);
  
 -      __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
 +      sync_entity_load_avg(se);
        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
  }
@@@ -4474,16 -2937,7 +4474,16 @@@ static int idle_balance(struct rq *this
  
  #else /* CONFIG_SMP */
  
 -static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
 +static inline int
 +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 +{
 +      return 0;
 +}
 +
 +#define UPDATE_TG     0x0
 +#define SKIP_AGE_LOAD 0x0
 +
 +static inline void update_load_avg(struct sched_entity *se, int not_used1){}
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
@@@ -4500,12 -2954,6 +4500,12 @@@ static inline int idle_balance(struct r
        return 0;
  }
  
 +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra) { }
 +
 +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
 +       struct task_struct *p, int change_cra) { }
 +
  #endif /* CONFIG_SMP */
  
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        }
  
                        trace_sched_stat_blocked(tsk, delta);
 +                      trace_sched_blocked_reason(tsk);
  
                        /*
                         * Blocking time is in units of nanosecs, so shift by
@@@ -4632,10 -3079,9 +4632,10 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
 +      update_load_avg(se, UPDATE_TG);
        enqueue_entity_load_avg(cfs_rq, se);
 +      update_cfs_shares(se);
        account_entity_enqueue(cfs_rq, se);
 -      update_cfs_shares(cfs_rq);
  
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
@@@ -4708,16 -3154,6 +4708,16 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
 +
 +      /*
 +       * When dequeuing a sched_entity, we must:
 +       *   - Update loads to have both entity and cfs_rq synced with now.
 +       *   - Subtract its load from the cfs_rq->runnable_avg.
 +       *   - Subtract its previous weight from cfs_rq->load.weight.
 +       *   - For group entity, update its weight to reflect the new share
 +       *     of its group cfs_rq.
 +       */
 +      update_load_avg(se, UPDATE_TG);
        dequeue_entity_load_avg(cfs_rq, se);
  
        update_stats_dequeue(cfs_rq, se);
        return_cfs_rq_runtime(cfs_rq);
  
        update_min_vruntime(cfs_rq);
 -      update_cfs_shares(cfs_rq);
 +      update_cfs_shares(se);
  }
  
  /*
@@@ -4808,7 -3244,7 +4808,7 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
                 */
                update_stats_wait_end(cfs_rq, se);
                __dequeue_entity(cfs_rq, se);
 -              update_load_avg(se, 1);
 +              update_load_avg(se, UPDATE_TG);
        }
  
        update_stats_curr_start(cfs_rq, se);
@@@ -4924,8 -3360,8 +4924,8 @@@ entity_tick(struct cfs_rq *cfs_rq, stru
        /*
         * Ensure that runnable average is periodically updated.
         */
 -      update_load_avg(curr, 1);
 -      update_cfs_shares(cfs_rq);
 +      update_load_avg(curr, UPDATE_TG);
 +      update_cfs_shares(curr);
  
  #ifdef CONFIG_SCHED_HRTICK
        /*
@@@ -5132,35 -3568,6 +5132,35 @@@ static inline int cfs_rq_throttled(stru
        return cfs_bandwidth_used() && cfs_rq->throttled;
  }
  
 +#ifdef CONFIG_SCHED_HMP
 +/*
 + * Check if task is part of a hierarchy where some cfs_rq does not have any
 + * runtime left.
 + *
 + * We can't rely on throttled_hierarchy() to do this test, as
 + * cfs_rq->throttle_count will not be updated yet when this function is called
 + * from scheduler_tick()
 + */
 +static int task_will_be_throttled(struct task_struct *p)
 +{
 +      struct sched_entity *se = &p->se;
 +      struct cfs_rq *cfs_rq;
 +
 +      if (!cfs_bandwidth_used())
 +              return 0;
 +
 +      for_each_sched_entity(se) {
 +              cfs_rq = cfs_rq_of(se);
 +              if (!cfs_rq->runtime_enabled)
 +                      continue;
 +              if (cfs_rq->runtime_remaining <= 0)
 +                      return 1;
 +      }
 +
 +      return 0;
 +}
 +#endif
 +
  /* check whether cfs_rq, or any parent, is throttled */
  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  {
@@@ -5240,16 -3647,13 +5240,16 @@@ static void throttle_cfs_rq(struct cfs_
                if (dequeue)
                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
                qcfs_rq->h_nr_running -= task_delta;
 +              dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
  
                if (qcfs_rq->load.weight)
                        dequeue = 0;
        }
  
 -      if (!se)
 +      if (!se) {
                sub_nr_running(rq, task_delta);
 +              dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
 +      }
  
        cfs_rq->throttled = 1;
        cfs_rq->throttled_clock = rq_clock(rq);
                start_cfs_bandwidth(cfs_b);
  
        raw_spin_unlock(&cfs_b->lock);
 +
 +      /* Log effect on hmp stats after throttling */
 +      trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
 +                           sched_irqload(cpu_of(rq)),
 +                           power_cost(cpu_of(rq), 0),
 +                           cpu_temp(cpu_of(rq)));
  }
  
  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
        struct sched_entity *se;
        int enqueue = 1;
        long task_delta;
 +      struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
  
        se = cfs_rq->tg->se[cpu_of(rq)];
  
                if (enqueue)
                        enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
                cfs_rq->h_nr_running += task_delta;
 +              inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
  
                if (cfs_rq_throttled(cfs_rq))
                        break;
        }
  
 -      if (!se)
 +      if (!se) {
                add_nr_running(rq, task_delta);
 +              inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
 +      }
  
        /* determine whether we need to wake up potentially idle cpu */
        if (rq->curr == rq->idle && rq->cfs.nr_running)
                resched_curr(rq);
 +
 +      /* Log effect on hmp stats after un-throttling */
 +      trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
 +                           sched_irqload(cpu_of(rq)),
 +                           power_cost(cpu_of(rq), 0),
 +                           cpu_temp(cpu_of(rq)));
  }
  
  static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
@@@ -5667,20 -4055,28 +5667,28 @@@ static enum hrtimer_restart sched_cfs_p
                if (++count > 3) {
                        u64 new, old = ktime_to_ns(cfs_b->period);
  
-                       new = (old * 147) / 128; /* ~115% */
-                       new = min(new, max_cfs_quota_period);
-                       cfs_b->period = ns_to_ktime(new);
-                       /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
-                       cfs_b->quota *= new;
-                       cfs_b->quota = div64_u64(cfs_b->quota, old);
-                       pr_warn_ratelimited(
-         "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
-                               smp_processor_id(),
-                               div_u64(new, NSEC_PER_USEC),
-                                 div_u64(cfs_b->quota, NSEC_PER_USEC));
+                       /*
+                        * Grow period by a factor of 2 to avoid losing precision.
+                        * Precision loss in the quota/period ratio can cause __cfs_schedulable
+                        * to fail.
+                        */
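+                        /*
+                         * e.g. a 100ms period with a 50ms quota becomes a
+                         * 200ms period with a 100ms quota, keeping the same
+                         * quota/period ratio while gaining headroom.
+                         */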
+                       new = old * 2;
+                       if (new < max_cfs_quota_period) {
+                               cfs_b->period = ns_to_ktime(new);
+                               cfs_b->quota *= 2;
+                               pr_warn_ratelimited(
+       "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+                                       smp_processor_id(),
+                                       div_u64(new, NSEC_PER_USEC),
+                                       div_u64(cfs_b->quota, NSEC_PER_USEC));
+                       } else {
+                               pr_warn_ratelimited(
+       "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+                                       smp_processor_id(),
+                                       div_u64(old, NSEC_PER_USEC),
+                                       div_u64(cfs_b->quota, NSEC_PER_USEC));
+                       }
  
                        /* reset count so we don't come right back in here */
                        count = 0;
@@@ -5714,7 -4110,6 +5722,7 @@@ static void init_cfs_rq_runtime(struct 
  {
        cfs_rq->runtime_enabled = 0;
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
 +      init_cfs_rq_hmp_stats(cfs_rq);
  }
  
  void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@@ -5830,7 -4225,7 +5838,7 @@@ static void hrtick_start_fair(struct r
  
        WARN_ON(task_rq(p) != rq);
  
 -      if (cfs_rq->nr_running > 1) {
 +      if (rq->cfs.h_nr_running > 1) {
                u64 slice = sched_slice(cfs_rq, se);
                u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
                s64 delta = slice - ran;
  
  /*
   * called from enqueue/dequeue and updates the hrtick when the
 - * current task is from our class and nr_running is low enough
 - * to matter.
 + * current task is from our class.
   */
  static void hrtick_update(struct rq *rq)
  {
        if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
                return;
  
 -      if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
 -              hrtick_start_fair(rq, curr);
 +      hrtick_start_fair(rq, curr);
  }
  #else /* !CONFIG_SCHED_HRTICK */
  static inline void
@@@ -5868,14 -4265,6 +5876,14 @@@ static inline void hrtick_update(struc
  }
  #endif
  
 +#ifdef CONFIG_SMP
 +static bool __cpu_overutilized(int cpu, int delta);
 +static bool cpu_overutilized(int cpu);
 +unsigned long boosted_cpu_util(int cpu);
 +#else
 +#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
 +#endif
 +
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
@@@ -5886,17 -4275,6 +5894,17 @@@ enqueue_task_fair(struct rq *rq, struc
  {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
 +#ifdef CONFIG_SMP
 +      int task_new = flags & ENQUEUE_WAKEUP_NEW;
 +#endif
 +
 +      /*
 +       * If in_iowait is set, the code below may not trigger any cpufreq
 +       * utilization updates, so do it here explicitly with the IOWAIT flag
 +       * passed.
 +       */
 +      if (p->in_iowait)
 +              cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
  
        for_each_sched_entity(se) {
                if (se->on_rq)
                 *
                 * note: in the case of encountering a throttled cfs_rq we will
                 * post the final h_nr_running increment below.
 -              */
 +               */
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
 +              inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                flags = ENQUEUE_WAKEUP;
        }
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
 +              inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                if (cfs_rq_throttled(cfs_rq))
                        break;
  
 -              update_load_avg(se, 1);
 -              update_cfs_shares(cfs_rq);
 +              update_load_avg(se, UPDATE_TG);
 +              update_cfs_shares(se);
        }
  
 -      if (!se)
 +      if (!se) {
                add_nr_running(rq, 1);
 +              inc_rq_hmp_stats(rq, p, 1);
 +      }
 +
 +#ifdef CONFIG_SMP
 +
 +      /*
 +       * Update SchedTune accounting.
 +       *
 +       * We do it before updating the CPU capacity to ensure the
 +       * boost value of the current task is accounted for in the
 +       * selection of the OPP.
 +       *
 +       * We also do it in the case where we enqueue a throttled task;
 +       * we could argue that a throttled task should not boost a CPU,
 +       * however:
 +       * a) properly implementing CPU boosting while considering throttled
 +       *    tasks would greatly increase the complexity of the solution
 +       * b) it's not easy to quantify the benefits introduced by
 +       *    such a more complex solution.
 +       * Thus, for the time being we go for the simple solution and boost
 +       * also for throttled RQs.
 +       */
 +      schedtune_enqueue_task(p, cpu_of(rq));
 +
 +      if (energy_aware() && !se) {
 +              if (!task_new && !rq->rd->overutilized &&
 +                  cpu_overutilized(rq->cpu)) {
 +                      rq->rd->overutilized = true;
 +                      trace_sched_overutilized(true);
 +              }
 +      }
  
 +#endif /* CONFIG_SMP */
        hrtick_update(rq);
  }
  
@@@ -5994,7 -4338,6 +6002,7 @@@ static void dequeue_task_fair(struct r
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
 +              dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
 +              dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
  
                if (cfs_rq_throttled(cfs_rq))
                        break;
  
 -              update_load_avg(se, 1);
 -              update_cfs_shares(cfs_rq);
 +              update_load_avg(se, UPDATE_TG);
 +              update_cfs_shares(se);
 +      }
 +
 +      if (!se) {
 +              sub_nr_running(rq, 1);
 +              dec_rq_hmp_stats(rq, p, 1);
        }
  
 -      if (!se)
 -              sub_nr_running(rq, 1);
 +#ifdef CONFIG_SMP
 +
 +      /*
 +       * Update SchedTune accounting
 +       *
 +       * We do it before updating the CPU capacity to ensure the
 +       * boost value of the current task is accounted for in the
 +       * selection of the OPP.
 +       */
 +      schedtune_dequeue_task(p, cpu_of(rq));
 +
 +#endif /* CONFIG_SMP */
  
        hrtick_update(rq);
  }
@@@ -6267,6 -4594,15 +6275,6 @@@ static unsigned long target_load(int cp
        return max(rq->cpu_load[type-1], total);
  }
  
 -static unsigned long capacity_of(int cpu)
 -{
 -      return cpu_rq(cpu)->cpu_capacity;
 -}
 -
 -static unsigned long capacity_orig_of(int cpu)
 -{
 -      return cpu_rq(cpu)->cpu_capacity_orig;
 -}
  
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
@@@ -6441,520 -4777,6 +6449,520 @@@ static long effective_load(struct task_
  #endif
  
  /*
 + * Returns the current capacity of cpu after applying both
 + * cpu and freq scaling.
 + */
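 +/*
 + * e.g. with cpu_capacity_orig == 1024 and the CPU running at half of its
 + * maximum frequency (arch_scale_freq_capacity() == 512), this returns
 + * (1024 * 512) >> 10 == 512.
 + */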
 +unsigned long capacity_curr_of(int cpu)
 +{
 +      return cpu_rq(cpu)->cpu_capacity_orig *
 +             arch_scale_freq_capacity(NULL, cpu)
 +             >> SCHED_CAPACITY_SHIFT;
 +}
 +
 +struct energy_env {
 +      struct sched_group      *sg_top;
 +      struct sched_group      *sg_cap;
 +      int                     cap_idx;
 +      int                     util_delta;
 +      int                     src_cpu;
 +      int                     dst_cpu;
 +      int                     trg_cpu;
 +      int                     energy;
 +      int                     payoff;
 +      struct task_struct      *task;
 +      struct {
 +              int before;
 +              int after;
 +              int delta;
 +              int diff;
 +      } nrg;
 +      struct {
 +              int before;
 +              int after;
 +              int delta;
 +      } cap;
 +};
 +
 +static int cpu_util_wake(int cpu, struct task_struct *p);
 +
 +/*
 + * __cpu_norm_util() returns the cpu util relative to a specific capacity,
 + * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
 + * energy calculations.
 + *
 + * Since util is a scale-invariant utilization defined as:
 + *
 + *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
 + *
 + * the normalized util can be found using the specific capacity.
 + *
 + *   capacity = capacity_orig * curr_freq/max_freq
 + *
 + *   norm_util = running_time/time ~ util/capacity
 + */
 +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
 +{
 +      if (util >= capacity)
 +              return SCHED_CAPACITY_SCALE;
 +
 +      return (util << SCHED_CAPACITY_SHIFT)/capacity;
 +}
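 +
 +/*
 + * Worked example (illustrative numbers only): for util = 256 on a capacity
 + * of 512, __cpu_norm_util() returns (256 << SCHED_CAPACITY_SHIFT) / 512 = 512,
 + * i.e. a 50% busy ratio on the 1024 scale; util >= capacity saturates at
 + * SCHED_CAPACITY_SCALE.
 + */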
 +
 +static unsigned long group_max_util(struct energy_env *eenv)
 +{
 +      unsigned long max_util = 0;
 +      unsigned long util;
 +      int cpu;
 +
 +      for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
 +              util = cpu_util_wake(cpu, eenv->task);
 +
 +              /*
 +               * If we are looking at the target CPU specified by the eenv,
 +               * then we should add the (estimated) utilization of the task
 +               * assuming we will wake it up on that CPU.
 +               */
 +              if (unlikely(cpu == eenv->trg_cpu))
 +                      util += eenv->util_delta;
 +
 +              max_util = max(max_util, util);
 +      }
 +
 +      return max_util;
 +}
 +
 +/*
 + * group_norm_util() returns the approximated group util relative to its
 + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
 + * in energy calculations.
 + *
 + * Since task executions may or may not overlap in time in the group the true
 + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
 + * when iterating over all CPUs in the group.
 + * The latter estimate is used as it leads to a more pessimistic energy
 + * estimate (more busy).
 + */
 +static unsigned long
 +group_norm_util(struct energy_env *eenv, struct sched_group *sg)
 +{
 +      unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
 +      unsigned long util, util_sum = 0;
 +      int cpu;
 +
 +      for_each_cpu(cpu, sched_group_cpus(sg)) {
 +              util = cpu_util_wake(cpu, eenv->task);
 +
 +              /*
 +               * If we are looking at the target CPU specified by the eenv,
 +               * then we should add the (estimated) utilization of the task
 +               * assuming we will wake it up on that CPU.
 +               */
 +              if (unlikely(cpu == eenv->trg_cpu))
 +                      util += eenv->util_delta;
 +
 +              util_sum += __cpu_norm_util(util, capacity);
 +      }
 +
 +      return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
 +}
 +
 +static int find_new_capacity(struct energy_env *eenv,
 +      const struct sched_group_energy * const sge)
 +{
 +      int idx, max_idx = sge->nr_cap_states - 1;
 +      unsigned long util = group_max_util(eenv);
 +
 +      /* default is max_cap if we don't find a match */
 +      eenv->cap_idx = max_idx;
 +
 +      for (idx = 0; idx < sge->nr_cap_states; idx++) {
 +              if (sge->cap_states[idx].cap >= util) {
 +                      eenv->cap_idx = idx;
 +                      break;
 +              }
 +      }
 +
 +      return eenv->cap_idx;
 +}
 +
 +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
 +{
 +      int i, state = INT_MAX;
 +      int src_in_grp, dst_in_grp;
 +      long grp_util = 0;
 +
 +      /* Find the shallowest idle state in the sched group. */
 +      for_each_cpu(i, sched_group_cpus(sg))
 +              state = min(state, idle_get_state_idx(cpu_rq(i)));
 +
 +      /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
 +      state++;
 +
 +      src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
 +      dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
 +      if (src_in_grp == dst_in_grp) {
 +              /* Both CPUs under consideration are in the same group, or in
 +               * neither group; the migration should leave the idle state unchanged.
 +               */
 +              goto end;
 +      }
 +
 +      /*
 +       * Try to estimate if a deeper idle state is
 +       * achievable when we move the task.
 +       */
 +      for_each_cpu(i, sched_group_cpus(sg)) {
 +              grp_util += cpu_util_wake(i, eenv->task);
 +              if (unlikely(i == eenv->trg_cpu))
 +                      grp_util += eenv->util_delta;
 +      }
 +
 +      if (grp_util <=
 +              ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
 +              /* after moving, this group is at most partly
 +               * occupied, so it should have some idle time.
 +               */
 +              int max_idle_state_idx = sg->sge->nr_idle_states - 2;
 +              int new_state = grp_util * max_idle_state_idx;
 +              if (grp_util <= 0)
 +                      /* group will have no util, use lowest state */
 +                      new_state = max_idle_state_idx + 1;
 +              else {
 +                      /* for partially idle, linearly map util to idle
 +                       * states, excluding the lowest one. This does not
 +                       * correspond to the state we expect to enter in
 +                       * reality, but an indication of what might happen.
 +                       * reality, but is an indication of what might happen.
 +                      new_state = min(max_idle_state_idx, (int)
 +                                      (new_state / sg->sgc->max_capacity));
 +                      new_state = max_idle_state_idx - new_state;
 +              }
 +              state = new_state;
 +      } else {
 +              /* After moving, the group will be fully occupied
 +               * so assume it will not be idle at all.
 +               */
 +              state = 0;
 +      }
 +end:
 +      return state;
 +}
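 +
 +/*
 + * Worked example of the linear mapping above (hypothetical values): for a
 + * group with nr_idle_states = 3 (max_idle_state_idx = 1), two CPUs and
 + * max_capacity = 1024, a grp_util of 512 maps to min(1, 512 / 1024) = 0,
 + * giving idle index 1 - 0 = 1; grp_util <= 0 gives the deepest index (2);
 + * and a group whose util exceeds its total capacity gets state 0.
 + */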
 +
 +/*
 + * sched_group_energy(): Computes the absolute energy consumption of cpus
 + * belonging to the sched_group including shared resources shared only by
 + * members of the group. It iterates over all cpus in the hierarchy below the
 + * sched_group, starting from the bottom and working its way up, before going
 + * to the next cpu until all cpus are covered at all levels. The current
 + * implementation is likely to gather the same util statistics multiple times.
 + * This can probably be done in a faster but more complex way.
 + * Note: sched_group_energy() may fail when racing with sched_domain updates.
 + */
 +static int sched_group_energy(struct energy_env *eenv)
 +{
 +      struct cpumask visit_cpus;
 +      u64 total_energy = 0;
 +      int cpu_count;
 +
 +      WARN_ON(!eenv->sg_top->sge);
 +
 +      cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
 +      /* If a cpu is hotplugged in while we are in this function,
 +       * it does not appear in the existing visit_cpus mask
 +       * which came from the sched_group pointer of the
 +       * sched_domain pointed at by sd_ea for either the prev
 +       * or next cpu and was dereferenced in __energy_diff.
 +       * Since we will dereference sd_scs later as we iterate
 +       * through the CPUs we expect to visit, new CPUs can
 +       * be present which are not in the visit_cpus mask.
 +       * Guard this with cpu_count.
 +       */
 +      cpu_count = cpumask_weight(&visit_cpus);
 +
 +      while (!cpumask_empty(&visit_cpus)) {
 +              struct sched_group *sg_shared_cap = NULL;
 +              int cpu = cpumask_first(&visit_cpus);
 +              struct sched_domain *sd;
 +
 +              /*
 +               * Is the group utilization affected by cpus outside this
 +               * sched_group?
 +               * This sd may have groups with cpus which were not present
 +               * when we took visit_cpus.
 +               */
 +              sd = rcu_dereference(per_cpu(sd_scs, cpu));
 +
 +              if (sd && sd->parent)
 +                      sg_shared_cap = sd->parent->groups;
 +
 +              for_each_domain(cpu, sd) {
 +                      struct sched_group *sg = sd->groups;
 +
 +                      /* Has this sched_domain already been visited? */
 +                      if (sd->child && group_first_cpu(sg) != cpu)
 +                              break;
 +
 +                      do {
 +                              unsigned long group_util;
 +                              int sg_busy_energy, sg_idle_energy;
 +                              int cap_idx, idle_idx;
 +
 +                              if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
 +                                      eenv->sg_cap = sg_shared_cap;
 +                              else
 +                                      eenv->sg_cap = sg;
 +
 +                              cap_idx = find_new_capacity(eenv, sg->sge);
 +
 +                              if (sg->group_weight == 1) {
 +                                      /* Remove capacity of src CPU (before task move) */
 +                                      if (eenv->trg_cpu == eenv->src_cpu &&
 +                                          cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
 +                                              eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
 +                                              eenv->cap.delta -= eenv->cap.before;
 +                                      }
 +                                      /* Add capacity of dst CPU  (after task move) */
 +                                      if (eenv->trg_cpu == eenv->dst_cpu &&
 +                                          cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
 +                                              eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
 +                                              eenv->cap.delta += eenv->cap.after;
 +                                      }
 +                              }
 +
 +                              idle_idx = group_idle_state(eenv, sg);
 +                              group_util = group_norm_util(eenv, sg);
 +
 +                              sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
 +                              sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
 +                                                              * sg->sge->idle_states[idle_idx].power);
 +
 +                              total_energy += sg_busy_energy + sg_idle_energy;
 +
 +                              if (!sd->child) {
 +                                      /*
 +                                       * cpu_count here is the number of
 +                                       * cpus we expect to visit in this
 +                                       * calculation. If we race against
 +                                       * hotplug, we can have extra cpus
 +                                       * added to the groups we are
 +                                       * iterating which do not appear in
 +                                       * the visit_cpus mask. In that case
 +                                       * we are not able to calculate energy
 +                                       * without restarting so we will bail
 +                                       * out and use prev_cpu this time.
 +                                       */
 +                                      if (!cpu_count)
 +                                              return -EINVAL;
 +                                      cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
 +                                      cpu_count--;
 +                              }
 +
 +                              if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
 +                                      goto next_cpu;
 +
 +                      } while (sg = sg->next, sg != sd->groups);
 +              }
 +
 +              /*
 +               * If we raced with hotplug and got an sd NULL pointer,
 +               * returning a wrong energy estimation is better than
 +               * entering an infinite loop.
 +               * Specifically: If a cpu is unplugged after we took
 +               * the visit_cpus mask, it no longer has an sd_scs
 +               * pointer, so when we dereference it, we get NULL.
 +               */
 +              if (cpumask_test_cpu(cpu, &visit_cpus))
 +                      return -EINVAL;
 +next_cpu:
 +              cpumask_clear_cpu(cpu, &visit_cpus);
 +              continue;
 +      }
 +
 +      eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
 +      return 0;
 +}
 +
 +static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
 +{
 +      return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
 +}
 +
 +static inline unsigned long task_util(struct task_struct *p);
 +
 +/*
 + * energy_diff(): Estimate the energy impact of changing the utilization
 + * distribution. eenv specifies the change: utilization amount, source, and
 + * destination cpu. Source or destination cpu may be -1 in which case the
 + * utilization is removed from or added to the system (e.g. task wake-up). If
 + * both are specified, the utilization is migrated.
 + */
 +static inline int __energy_diff(struct energy_env *eenv)
 +{
 +      struct sched_domain *sd;
 +      struct sched_group *sg;
 +      int sd_cpu = -1, energy_before = 0, energy_after = 0;
 +      int diff, margin;
 +
 +      struct energy_env eenv_before = {
 +              .util_delta     = task_util(eenv->task),
 +              .src_cpu        = eenv->src_cpu,
 +              .dst_cpu        = eenv->dst_cpu,
 +              .trg_cpu        = eenv->src_cpu,
 +              .nrg            = { 0, 0, 0, 0},
 +              .cap            = { 0, 0, 0 },
 +              .task           = eenv->task,
 +      };
 +
 +      if (eenv->src_cpu == eenv->dst_cpu)
 +              return 0;
 +
 +      sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
 +      sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
 +
 +      if (!sd)
 +              return 0; /* Error */
 +
 +      sg = sd->groups;
 +
 +      do {
 +              if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
 +                      eenv_before.sg_top = eenv->sg_top = sg;
 +
 +                      if (sched_group_energy(&eenv_before))
 +                              return 0; /* Invalid result, abort */
 +                      energy_before += eenv_before.energy;
 +
 +                      /* Keep track of SRC cpu (before) capacity */
 +                      eenv->cap.before = eenv_before.cap.before;
 +                      eenv->cap.delta = eenv_before.cap.delta;
 +
 +                      if (sched_group_energy(eenv))
 +                              return 0; /* Invalid result, abort */
 +                      energy_after += eenv->energy;
 +              }
 +      } while (sg = sg->next, sg != sd->groups);
 +
 +      eenv->nrg.before = energy_before;
 +      eenv->nrg.after = energy_after;
 +      eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
 +      eenv->payoff = 0;
 +#ifndef CONFIG_SCHED_TUNE
 +      trace_sched_energy_diff(eenv->task,
 +                      eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
 +                      eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
 +                      eenv->cap.before, eenv->cap.after, eenv->cap.delta,
 +                      eenv->nrg.delta, eenv->payoff);
 +#endif
 +      /*
 +       * Dead-zone margin preventing too many migrations.
 +       */
 +
 +      margin = eenv->nrg.before >> 6; /* ~1.56% */
 +
 +      diff = eenv->nrg.after - eenv->nrg.before;
 +
 +      eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
 +
 +      return eenv->nrg.diff;
 +}
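 +
 +/*
 + * Example of the dead-zone margin (illustrative numbers): with
 + * nrg.before = 3200 the margin is 3200 >> 6 = 50, so any energy difference
 + * smaller than 50 (about 1.56% of the before value) is reported as 0 and
 + * does not trigger a migration.
 + */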
 +
 +#ifdef CONFIG_SCHED_TUNE
 +
 +struct target_nrg schedtune_target_nrg;
 +
 +#ifdef CONFIG_CGROUP_SCHEDTUNE
 +extern bool schedtune_initialized;
 +#endif /* CONFIG_CGROUP_SCHEDTUNE */
 +
 +/*
 + * System energy normalization
 + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
 + * corresponding to the specified energy variation.
 + */
 +static inline int
 +normalize_energy(int energy_diff)
 +{
 +      u32 normalized_nrg;
 +
 +#ifdef CONFIG_CGROUP_SCHEDTUNE
 +      /* during early setup, we don't know the extents */
 +      if (unlikely(!schedtune_initialized))
 +              return energy_diff < 0 ? -1 : 1;
 +#endif /* CONFIG_CGROUP_SCHEDTUNE */
 +
 +#ifdef CONFIG_SCHED_DEBUG
 +      {
 +      int max_delta;
 +
 +      /* Check for boundaries */
 +      max_delta  = schedtune_target_nrg.max_power;
 +      max_delta -= schedtune_target_nrg.min_power;
 +      WARN_ON(abs(energy_diff) >= max_delta);
 +      }
 +#endif
 +
 +      /* Do scaling using positive numbers to increase the range */
 +      normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
 +
 +      /* Scale by energy magnitude */
 +      normalized_nrg <<= SCHED_CAPACITY_SHIFT;
 +
 +      /* Normalize on max energy for target platform */
 +      normalized_nrg = reciprocal_divide(
 +                      normalized_nrg, schedtune_target_nrg.rdiv);
 +
 +      return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
 +}
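 +
 +/*
 + * Illustrative example, assuming schedtune_target_nrg.rdiv encodes a divide
 + * by (max_power - min_power): for energy_diff = -120 and a max-min delta of
 + * 4096, the scaled value is (120 << SCHED_CAPACITY_SHIFT) / 4096 = 30, and
 + * the sign is restored to return -30.
 + */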
 +
 +static inline int
 +energy_diff(struct energy_env *eenv)
 +{
 +      int boost = schedtune_task_boost(eenv->task);
 +      int nrg_delta;
 +
 +      /* Compute "absolute" energy diff */
 +      __energy_diff(eenv);
 +
 +      /* Return energy diff when boost margin is 0 */
 +      if (boost == 0) {
 +              trace_sched_energy_diff(eenv->task,
 +                              eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
 +                              eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
 +                              eenv->cap.before, eenv->cap.after, eenv->cap.delta,
 +                              0, -eenv->nrg.diff);
 +              return eenv->nrg.diff;
 +      }
 +
 +      /* Compute normalized energy diff */
 +      nrg_delta = normalize_energy(eenv->nrg.diff);
 +      eenv->nrg.delta = nrg_delta;
 +
 +      eenv->payoff = schedtune_accept_deltas(
 +                      eenv->nrg.delta,
 +                      eenv->cap.delta,
 +                      eenv->task);
 +
 +      trace_sched_energy_diff(eenv->task,
 +                      eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
 +                      eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
 +                      eenv->cap.before, eenv->cap.after, eenv->cap.delta,
 +                      eenv->nrg.delta, eenv->payoff);
 +
 +      /*
 +       * When SchedTune is enabled, the energy_diff() function will return
 +       * the computed energy payoff value. Since the energy_diff() return
 +       * value is expected to be negative by its callers, this evaluation
 +       * function returns a negative value whenever the evaluation returns a
 +       * positive payoff, which is the condition for accepting
 +       * a scheduling decision.
 +       */
 +      return -eenv->payoff;
 +}
 +#else /* CONFIG_SCHED_TUNE */
 +#define energy_diff(eenv) __energy_diff(eenv)
 +#endif
 +
 +/*
   * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
   * A waker of many should wake a different task than the one last awakened
   * at a frequency roughly N times higher than one of its wakees.  In order
   * being client/server, worker/dispatcher, interrupt source or whatever is
   * irrelevant, spread criteria is apparent partner count exceeds socket size.
   */
 -static int wake_wide(struct task_struct *p)
 +static int wake_wide(struct task_struct *p, int sibling_count_hint)
  {
        unsigned int master = current->wakee_flips;
        unsigned int slave = p->wakee_flips;
 -      int factor = this_cpu_read(sd_llc_size);
 +      int llc_size = this_cpu_read(sd_llc_size);
 +
 +      if (sibling_count_hint >= llc_size)
 +              return 1;
  
        if (master < slave)
                swap(master, slave);
 -      if (slave < factor || master < slave * factor)
 +      if (slave < llc_size || master < slave * llc_size)
                return 0;
        return 1;
  }
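 +
 +/*
 + * Example (hypothetical values): with an LLC size of 4, a wakeup carrying
 + * sibling_count_hint = 5 is immediately spread (return 1); otherwise, with
 + * master = 12 and slave = 3 flips, slave < llc_size so the wakeup stays
 + * affine (return 0).
 + */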
  
 -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 +static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 +                     int prev_cpu, int sync)
  {
        s64 this_load, load;
        s64 this_eff_load, prev_eff_load;
 -      int idx, this_cpu, prev_cpu;
 +      int idx, this_cpu;
        struct task_group *tg;
        unsigned long weight;
        int balanced;
  
        idx       = sd->wake_idx;
        this_cpu  = smp_processor_id();
 -      prev_cpu  = task_cpu(p);
        load      = source_load(prev_cpu, idx);
        this_load = target_load(this_cpu, idx);
  
        this_eff_load = 100;
        this_eff_load *= capacity_of(prev_cpu);
  
 -      prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
 -      prev_eff_load *= capacity_of(this_cpu);
 +      prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
 +      prev_eff_load *= capacity_of(this_cpu);
 +
 +      if (this_load > 0) {
 +              this_eff_load *= this_load +
 +                      effective_load(tg, this_cpu, weight, weight);
 +
 +              prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
 +      }
 +
 +      balanced = this_eff_load <= prev_eff_load;
 +
 +      schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
 +
 +      if (!balanced)
 +              return 0;
 +
 +      schedstat_inc(sd, ttwu_move_affine);
 +      schedstat_inc(p, se.statistics.nr_wakeups_affine);
 +
 +      return 1;
 +}
 +
 +static inline unsigned long task_util(struct task_struct *p)
 +{
 +      return p->se.avg.util_avg;
 +}
 +
 +static inline unsigned long boosted_task_util(struct task_struct *task);
 +
 +static inline bool __task_fits(struct task_struct *p, int cpu, int util)
 +{
 +      unsigned long capacity = capacity_of(cpu);
 +
 +      util += boosted_task_util(p);
 +
 +      return (capacity * 1024) > (util * capacity_margin);
 +}
 +
 +static inline bool task_fits_max(struct task_struct *p, int cpu)
 +{
 +      unsigned long capacity = capacity_of(cpu);
 +      unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
 +
 +      if (capacity == max_capacity)
 +              return true;
 +
 +      if (capacity * capacity_margin > max_capacity * 1024)
 +              return true;
 +
 +      return __task_fits(p, cpu, 0);
 +}
 +
 +static bool __cpu_overutilized(int cpu, int delta)
 +{
 +      return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
 +}
 +
 +static bool cpu_overutilized(int cpu)
 +{
 +      return __cpu_overutilized(cpu, 0);
 +}
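 +
 +/*
 + * Illustrative numbers for the capacity_margin check above (capacity_margin
 + * is commonly 1280, i.e. about 20% headroom, in EAS kernels): a CPU with
 + * capacity 430 is overutilized once util exceeds 430 * 1024 / 1280 = 344,
 + * since then util * 1280 > 430 * 1024.
 + */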
 +
 +#ifdef CONFIG_SCHED_TUNE
 +
 +struct reciprocal_value schedtune_spc_rdiv;
 +
 +static long
 +schedtune_margin(unsigned long signal, long boost)
 +{
 +      long long margin = 0;
 +
 +      /*
 +       * Signal proportional compensation (SPC)
 +       *
 +       * The Boost (B) value is used to compute a Margin (M) which is
 +       * proportional to the complement of the original Signal (S):
 +       *   M = B * (SCHED_CAPACITY_SCALE - S)
 +       * The obtained M could be used by the caller to "boost" S.
 +       */
 +      if (boost >= 0) {
 +              margin  = SCHED_CAPACITY_SCALE - signal;
 +              margin *= boost;
 +      } else
 +              margin = -signal * boost;
 +
 +      margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
 +
 +      if (boost < 0)
 +              margin *= -1;
 +      return margin;
 +}
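 +
 +/*
 + * Worked example of the SPC formula above (hypothetical values, assuming
 + * schedtune_spc_rdiv encodes a divide by 100 with boost expressed as a
 + * percentage): for signal = 200 and boost = 50, the margin is
 + * 50 * (1024 - 200) / 100 = 412, so the boosted signal becomes 612.
 + */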
 +
 +static inline int
 +schedtune_cpu_margin(unsigned long util, int cpu)
 +{
 +      int boost = schedtune_cpu_boost(cpu);
 +
 +      if (boost == 0)
 +              return 0;
 +
 +      return schedtune_margin(util, boost);
 +}
 +
 +static inline long
 +schedtune_task_margin(struct task_struct *task)
 +{
 +      int boost = schedtune_task_boost(task);
 +      unsigned long util;
 +      long margin;
 +
 +      if (boost == 0)
 +              return 0;
 +
 +      util = task_util(task);
 +      margin = schedtune_margin(util, boost);
 +
 +      return margin;
 +}
 +
 +#else /* CONFIG_SCHED_TUNE */
  
 -      if (this_load > 0) {
 -              this_eff_load *= this_load +
 -                      effective_load(tg, this_cpu, weight, weight);
 +static inline int
 +schedtune_cpu_margin(unsigned long util, int cpu)
 +{
 +      return 0;
 +}
  
 -              prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
 -      }
 +static inline int
 +schedtune_task_margin(struct task_struct *task)
 +{
 +      return 0;
 +}
  
 -      balanced = this_eff_load <= prev_eff_load;
 +#endif /* CONFIG_SCHED_TUNE */
  
 -      schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
 +unsigned long
 +boosted_cpu_util(int cpu)
 +{
 +      unsigned long util = cpu_util_freq(cpu);
 +      long margin = schedtune_cpu_margin(util, cpu);
  
 -      if (!balanced)
 -              return 0;
 +      trace_sched_boost_cpu(cpu, util, margin);
  
 -      schedstat_inc(sd, ttwu_move_affine);
 -      schedstat_inc(p, se.statistics.nr_wakeups_affine);
 +      return util + margin;
 +}
  
 -      return 1;
 +static inline unsigned long
 +boosted_task_util(struct task_struct *task)
 +{
 +      unsigned long util = task_util(task);
 +      long margin = schedtune_task_margin(task);
 +
 +      trace_sched_boost_task(task, util, margin);
 +
 +      return util + margin;
 +}
 +
 +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 +{
 +      return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
  }
  
  /*
   * find_idlest_group finds and returns the least busy CPU group within the
   * domain.
 + *
 + * Assumes p is allowed on at least one CPU in sd.
   */
  static struct sched_group *
  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int sd_flag)
  {
        struct sched_group *idlest = NULL, *group = sd->groups;
 -      unsigned long min_load = ULONG_MAX, this_load = 0;
 +      struct sched_group *most_spare_sg = NULL;
 +      unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
 +      unsigned long most_spare = 0, this_spare = 0;
        int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
  
                load_idx = sd->wake_idx;
  
        do {
 -              unsigned long load, avg_load;
 +              unsigned long load, avg_load, spare_cap, max_spare_cap;
                int local_group;
                int i;
  
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
  
 -              /* Tally up the load of all CPUs in the group */
 +              /*
 +               * Tally up the load of all CPUs in the group and find
 +               * the group containing the CPU with most spare capacity.
 +               */
                avg_load = 0;
 +              max_spare_cap = 0;
  
                for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                                load = target_load(i, load_idx);
  
                        avg_load += load;
 +
 +                      spare_cap = capacity_spare_wake(i, p);
 +
 +                      if (spare_cap > max_spare_cap)
 +                              max_spare_cap = spare_cap;
                }
  
                /* Adjust by relative CPU capacity of the group */
  
                if (local_group) {
                        this_load = avg_load;
 -              } else if (avg_load < min_load) {
 -                      min_load = avg_load;
 -                      idlest = group;
 +                      this_spare = max_spare_cap;
 +              } else {
 +                      if (avg_load < min_load) {
 +                              min_load = avg_load;
 +                              idlest = group;
 +                      }
 +
 +                      if (most_spare < max_spare_cap) {
 +                              most_spare = max_spare_cap;
 +                              most_spare_sg = group;
 +                      }
                }
        } while (group = group->next, group != sd->groups);
  
 +      /*
 +       * The cross-over point between using spare capacity and least load
 +       * is too conservative for high utilization tasks on partially
 +       * utilized systems if we require spare_capacity > task_util(p),
 +       * so we allow for some task stuffing by using
 +       * spare_capacity > task_util(p)/2.
 +       *
 +       * Spare capacity can't be used for fork because the utilization has
 +       * not been set yet, we must first select a rq to compute the initial
 +       * utilization.
 +       */
 +      if (sd_flag & SD_BALANCE_FORK)
 +              goto skip_spare;
 +
 +      if (this_spare > task_util(p) / 2 &&
 +          imbalance*this_spare > 100*most_spare)
 +              return NULL;
 +      else if (most_spare > task_util(p) / 2)
 +              return most_spare_sg;
 +
 +skip_spare:
        if (!idlest || 100*this_load < imbalance*min_load)
                return NULL;
        return idlest;
  }
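 +
 +/*
 + * Example of the spare-capacity cross-over (illustrative values): with a
 + * typical imbalance_pct of 125, imbalance = 112. For task_util(p) = 200,
 + * this_spare = 150 and most_spare = 160, this_spare > 100 and
 + * 112 * 150 > 100 * 160, so NULL is returned and the task stays in the
 + * local group.
 + */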
  
  /*
 - * find_idlest_cpu - find the idlest cpu among the cpus in group.
 + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
   */
  static int
 -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
        unsigned long load, min_load = ULONG_MAX;
        unsigned int min_exit_latency = UINT_MAX;
        int shallowest_idle_cpu = -1;
        int i;
  
 +      /* Check if we have any choice: */
 +      if (group->group_weight == 1)
 +              return cpumask_first(sched_group_cpus(group));
 +
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
                if (idle_cpu(i)) {
        }
  
        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 +}
 +
 +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
 +                                int cpu, int prev_cpu, int sd_flag)
 +{
 +      int new_cpu = cpu;
 +      int wu = sd_flag & SD_BALANCE_WAKE;
 +      int cas_cpu = -1;
 +
 +      if (wu) {
 +              schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
 +              schedstat_inc(this_rq(), eas_stats.cas_attempts);
 +      }
 +
 +      if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 +              return prev_cpu;
 +
 +      while (sd) {
 +              struct sched_group *group;
 +              struct sched_domain *tmp;
 +              int weight;
 +
 +              if (wu)
 +                      schedstat_inc(sd, eas_stats.cas_attempts);
 +
 +              if (!(sd->flags & sd_flag)) {
 +                      sd = sd->child;
 +                      continue;
 +              }
 +
 +              group = find_idlest_group(sd, p, cpu, sd_flag);
 +              if (!group) {
 +                      sd = sd->child;
 +                      continue;
 +              }
 +
 +              new_cpu = find_idlest_group_cpu(group, p, cpu);
 +              if (new_cpu == cpu) {
 +                      /* Now try balancing at a lower domain level of cpu */
 +                      sd = sd->child;
 +                      continue;
 +              }
 +
 +              /* Now try balancing at a lower domain level of new_cpu */
 +              cpu = cas_cpu = new_cpu;
 +              weight = sd->span_weight;
 +              sd = NULL;
 +              for_each_domain(cpu, tmp) {
 +                      if (weight <= tmp->span_weight)
 +                              break;
 +                      if (tmp->flags & sd_flag)
 +                              sd = tmp;
 +              }
 +              /* while loop will break here if sd == NULL */
 +      }
 +
 +      if (wu && (cas_cpu >= 0)) {
 +              schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
 +              schedstat_inc(this_rq(), eas_stats.cas_count);
 +      }
 +
 +      return new_cpu;
  }
  
  /*
   * Try and locate an idle CPU in the sched_domain.
   */
 -static int select_idle_sibling(struct task_struct *p, int target)
 +static int select_idle_sibling(struct task_struct *p, int prev, int target)
  {
        struct sched_domain *sd;
        struct sched_group *sg;
 -      int i = task_cpu(p);
 +      int best_idle_cpu = -1;
 +      int best_idle_cstate = INT_MAX;
 +      unsigned long best_idle_capacity = ULONG_MAX;
 +
 +      schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
 +      schedstat_inc(this_rq(), eas_stats.sis_attempts);
 +
 +      if (!sysctl_sched_cstate_aware) {
 +              if (idle_cpu(target)) {
 +                      schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
 +                      schedstat_inc(this_rq(), eas_stats.sis_idle);
 +                      return target;
 +              }
  
 -      if (idle_cpu(target))
 -              return target;
 +              /*
 +               * If the previous cpu is cache affine and idle, don't be stupid.
 +               */
 +              if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
 +                      schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
 +                      schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
 +                      return prev;
 +              }
 +      }
  
 -      /*
 -       * If the prevous cpu is cache affine and idle, don't be stupid.
 -       */
 -      if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
 -              return i;
 +      if (!(current->flags & PF_WAKE_UP_IDLE) &&
 +                      !(p->flags & PF_WAKE_UP_IDLE))
 +              return target;
  
        /*
         * Otherwise, iterate the domains and find an elegible idle cpu.
        for_each_lower_domain(sd) {
                sg = sd->groups;
                do {
 +                      int i;
                        if (!cpumask_intersects(sched_group_cpus(sg),
                                                tsk_cpus_allowed(p)))
                                goto next;
  
 -                      for_each_cpu(i, sched_group_cpus(sg)) {
 -                              if (i == target || !idle_cpu(i))
 -                                      goto next;
 -                      }
 +                      if (sysctl_sched_cstate_aware) {
 +                              for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
 +                                      int idle_idx = idle_get_state_idx(cpu_rq(i));
 +                                      unsigned long new_usage = boosted_task_util(p);
 +                                      unsigned long capacity_orig = capacity_orig_of(i);
 +
 +                                      if (new_usage > capacity_orig || !idle_cpu(i))
 +                                              goto next;
 +
 +                                      if (i == target && new_usage <= capacity_curr_of(target)) {
 +                                              schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
 +                                              schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
 +                                              schedstat_inc(sd, eas_stats.sis_suff_cap);
 +                                              return target;
 +                                      }
 +
 +                                      if (idle_idx < best_idle_cstate &&
 +                                          capacity_orig <= best_idle_capacity) {
 +                                              best_idle_cpu = i;
 +                                              best_idle_cstate = idle_idx;
 +                                              best_idle_capacity = capacity_orig;
 +                                      }
 +                              }
 +                      } else {
 +                              for_each_cpu(i, sched_group_cpus(sg)) {
 +                                      if (i == target || !idle_cpu(i))
 +                                              goto next;
 +                              }
  
 -                      target = cpumask_first_and(sched_group_cpus(sg),
 +                              target = cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
 -                      goto done;
 +                              schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
 +                              schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
 +                              schedstat_inc(sd, eas_stats.sis_idle_cpu);
 +                              goto done;
 +                      }
  next:
                        sg = sg->next;
                } while (sg != sd->groups);
        }
 +
 +      if (best_idle_cpu >= 0)
 +              target = best_idle_cpu;
 +
  done:
 +      schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
 +      schedstat_inc(this_rq(), eas_stats.sis_count);
 +
        return target;
  }
  
  /*
 - * cpu_util returns the amount of capacity of a CPU that is used by CFS
 - * tasks. The unit of the return value must be the one of capacity so we can
 - * compare the utilization with the capacity of the CPU that is available for
 - * CFS task (ie cpu_capacity).
 - *
 - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
 - * recent utilization of currently non-runnable tasks on a CPU. It represents
 - * the amount of utilization of a CPU in the range [0..capacity_orig] where
 - * capacity_orig is the cpu_capacity available at the highest frequency
 - * (arch_scale_freq_capacity()).
 - * The utilization of a CPU converges towards a sum equal to or less than the
 - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
 - * the running time on this CPU scaled by capacity_curr.
 + * cpu_util_wake: Compute cpu utilization with any contributions from
 + * the waking task p removed. check_for_migration() looks for a better CPU for
 + * rq->curr. For that case we should return the cpu util with contributions from
 + * currently running task p removed.
 + */
 +static int cpu_util_wake(int cpu, struct task_struct *p)
 +{
 +      unsigned long util, capacity;
 +
 +#ifdef CONFIG_SCHED_WALT
 +      /*
 +       * WALT does not decay idle tasks in the same manner
 +       * as PELT, so it makes little sense to subtract task
 +       * utilization from cpu utilization. Instead just use
 +       * cpu_util for this case.
 +       */
 +      if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
 +          p->state == TASK_WAKING)
 +              return cpu_util(cpu);
 +#endif
 +      /* Task has no contribution or is new */
 +      if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
 +              return cpu_util(cpu);
 +
 +      capacity = capacity_orig_of(cpu);
 +      util = max_t(long, cpu_util(cpu) - task_util(p), 0);
 +
 +      return (util >= capacity) ? capacity : util;
 +}
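 +
 +/*
 + * Example (illustrative numbers): if cpu_util(cpu) = 600 and task_util(p) =
 + * 200 for a task whose load is still accounted on this cpu, cpu_util_wake()
 + * returns 600 - 200 = 400, clamped to capacity_orig_of(cpu).
 + */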
 +
 +static int start_cpu(bool boosted)
 +{
 +      struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 +
 +      return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
 +}
 +
 +static inline int find_best_target(struct task_struct *p, int *backup_cpu,
 +                                 bool boosted, bool prefer_idle)
 +{
 +      unsigned long best_idle_min_cap_orig = ULONG_MAX;
 +      unsigned long min_util = boosted_task_util(p);
 +      unsigned long target_capacity = ULONG_MAX;
 +      unsigned long min_wake_util = ULONG_MAX;
 +      unsigned long target_max_spare_cap = 0;
 +      unsigned long best_active_util = ULONG_MAX;
 +      int best_idle_cstate = INT_MAX;
 +      struct sched_domain *sd;
 +      struct sched_group *sg;
 +      int best_active_cpu = -1;
 +      int best_idle_cpu = -1;
 +      int target_cpu = -1;
 +      int cpu, i;
 +      struct task_struct *curr_tsk;
 +
 +      *backup_cpu = -1;
 +
 +      schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
 +      schedstat_inc(this_rq(), eas_stats.fbt_attempts);
 +
 +      /* Find start CPU based on boost value */
 +      cpu = start_cpu(boosted);
 +      if (cpu < 0) {
 +              schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
 +              schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
 +              return -1;
 +      }
 +
 +      /* Find SD for the start CPU */
 +      sd = rcu_dereference(per_cpu(sd_ea, cpu));
 +      if (!sd) {
 +              schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
 +              schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
 +              return -1;
 +      }
 +
 +      /* Scan CPUs in all SDs */
 +      sg = sd->groups;
 +      do {
 +              for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
 +                      unsigned long capacity_curr = capacity_curr_of(i);
 +                      unsigned long capacity_orig = capacity_orig_of(i);
 +                      unsigned long wake_util, new_util;
 +
 +                      if (!cpu_online(i))
 +                              continue;
 +
 +                      if (walt_cpu_high_irqload(i))
 +                              continue;
 +
 +                      /*
 +                       * p's blocked utilization is still accounted for on prev_cpu
 +                       * so prev_cpu will receive a negative bias due to the double
 +                       * accounting. However, the blocked utilization may be zero.
 +                       */
 +                      wake_util = cpu_util_wake(i, p);
 +                      new_util = wake_util + task_util(p);
 +
 +                      /*
 +                       * Ensure minimum capacity to grant the required boost.
 +                       * The target CPU can be already at a capacity level higher
 +                       * than the one required to boost the task.
 +                       */
 +                      new_util = max(min_util, new_util);
 +                      if (new_util > capacity_orig)
 +                              continue;
 +
 +                      /*
 +                       * Case A) Latency sensitive tasks
 +                       *
 +                       * Unconditionally favoring tasks that prefer idle CPU to
 +                       * improve latency.
 +                       *
 +                       * Looking for:
 +                       * - an idle CPU, whatever its idle_state is, since
 +                       *   the first CPUs we explore are more likely to be
 +                       *   reserved for latency sensitive tasks.
 +                       * - a non idle CPU where the task fits in its current
 +                       *   capacity and has the maximum spare capacity.
 +                       * - a non idle CPU with lower contention from other
 +                       *   tasks and running at the lowest possible OPP.
 +                       *
 +                       * The last two goals try to favor a non-idle CPU
 +                       * where the task can run as if it is "almost alone".
 +                       * A maximum spare capacity CPU is favored since
 +                       * the task already fits into that CPU's capacity
 +                       * without waiting for an OPP chance.
 +                       *
 +                       * The following code path is the only one in the CPUs
 +                       * exploration loop which is always used by
 +                       * prefer_idle tasks. It exits the loop with either a
 +                       * best_active_cpu or a target_cpu which should
 +                       * represent an optimal choice for latency sensitive
 +                       * tasks.
 +                       */
 +                      if (prefer_idle) {
 +
 +                              /*
 +                               * Case A.1: IDLE CPU
 +                               * Return the first IDLE CPU we find.
 +                               */
 +                              if (idle_cpu(i)) {
 +                                      schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
 +                                      schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
 +
 +                                      trace_sched_find_best_target(p,
 +                                                      prefer_idle, min_util,
 +                                                      cpu, best_idle_cpu,
 +                                                      best_active_cpu, i);
 +
 +                                      return i;
 +                              }
 +
 +                              /*
 +                               * Case A.2: Target ACTIVE CPU
 +                               * Favor CPUs with max spare capacity.
 +                               */
 +                              if ((capacity_curr > new_util) &&
 +                                      (capacity_orig - new_util > target_max_spare_cap)) {
 +                                      target_max_spare_cap = capacity_orig - new_util;
 +                                      target_cpu = i;
 +                                      continue;
 +                              }
 +                              if (target_cpu != -1)
 +                                      continue;
 +
 +
 +                              /*
 +                               * Case A.3: Backup ACTIVE CPU
 +                               * Favor CPUs with:
 +                               * - lower utilization due to other tasks
 +                               * - lower utilization with the task in
 +                               */
 +                              if (wake_util > min_wake_util)
 +                                      continue;
 +                              if (new_util > best_active_util)
 +                                      continue;
 +                              min_wake_util = wake_util;
 +                              best_active_util = new_util;
 +                              best_active_cpu = i;
 +                              continue;
 +                      }
 +
 +                      /*
 +                       * Enforce EAS mode
 +                       *
 +                       * For non latency sensitive tasks, skip CPUs that
 +                       * will be overutilized by moving the task there.
 +                       *
 +                       * The goal here is to remain in EAS mode as long as
 +                       * possible at least for !prefer_idle tasks.
 +                       */
 +                      if ((new_util * capacity_margin) >
 +                          (capacity_orig * SCHED_CAPACITY_SCALE))
 +                              continue;
 +
 +                      /*
 +                       * Case B) Non latency sensitive tasks on IDLE CPUs.
 +                       *
 +                       * Find an optimal backup IDLE CPU for non latency
 +                       * sensitive tasks.
 +                       *
 +                       * Looking for:
 +                       * - minimizing the capacity_orig,
 +                       *   i.e. preferring LITTLE CPUs
 +                       * - favoring shallowest idle states
 +                       *   i.e. avoid to wakeup deep-idle CPUs
 +                       *
 +                       * The following code path is used by non latency
 +                       * sensitive tasks if IDLE CPUs are available. If at
 +                       * least one such CPU is available, it sets
 +                       * best_idle_cpu to the most suitable idle CPU to be
 +                       * selected.
 +                       *
 +                       * If idle CPUs are available, favor these CPUs to
 +                       * improve performance by spreading tasks.
 +                       * Indeed, the energy_diff() computed by the caller
 +                       * will take care of minimizing energy
 +                       * consumption without affecting performance.
 +                       */
 +                      if (idle_cpu(i)) {
 +                              int idle_idx = idle_get_state_idx(cpu_rq(i));
 +
 +                              /* Select idle CPU with lower cap_orig */
 +                              if (capacity_orig > best_idle_min_cap_orig)
 +                                      continue;
 +
 +                              /*
 +                               * Skip CPUs in deeper idle state, but only
 +                               * if they are also less energy efficient.
 +                               * IOW, prefer a deep IDLE LITTLE CPU vs a
 +                               * shallow idle big CPU.
 +                               */
 +                              if (sysctl_sched_cstate_aware &&
 +                                  best_idle_cstate <= idle_idx)
 +                                      continue;
 +
 +                              /* Keep track of best idle CPU */
 +                              best_idle_min_cap_orig = capacity_orig;
 +                              best_idle_cstate = idle_idx;
 +                              best_idle_cpu = i;
 +                              continue;
 +                      }
 +
 +                      /*
 +                       * Case C) Non latency sensitive tasks on ACTIVE CPUs.
 +                       *
 +                       * Pack tasks in the most energy efficient capacities.
 +                       *
 +                       * This task packing strategy prefers more energy
 +                       * efficient CPUs (i.e. pack on smaller maximum
 +                       * capacity CPUs) while also trying to spread tasks to
 +                       * run them all at the lower OPP.
 +                       *
 +                       * This assumes for example that it's more energy
 +                       * efficient to run two tasks on two CPUs at a lower
 +                       * OPP than packing both on a single CPU but running
 +                       * that CPU at a higher OPP.
 +                       *
 +                       * Thus, this case keeps track of the CPU with the
 +                       * smallest maximum capacity and highest spare maximum
 +                       * capacity.
 +                       */
 +
 +                      /* Favor CPUs with smaller capacity */
 +                      if (capacity_orig > target_capacity)
 +                              continue;
 +
 +                      /* Favor CPUs with maximum spare capacity */
 +                      if ((capacity_orig - new_util) < target_max_spare_cap)
 +                              continue;
 +
 +                      target_max_spare_cap = capacity_orig - new_util;
 +                      target_capacity = capacity_orig;
 +                      target_cpu = i;
 +              }
 +
 +      } while (sg = sg->next, sg != sd->groups);
 +
 +      /*
 +       * For non latency sensitive tasks, cases B and C in the previous loop,
 + * we pick the best IDLE CPU only if we were not able to find a target
 +       * ACTIVE CPU.
 +       *
 +       * Policies priorities:
 +       *
 +       * - prefer_idle tasks:
 +       *
 +       *   a) IDLE CPU available, we return immediately
 + *   b) ACTIVE CPU where task fits and has the biggest maximum spare
 +       *      capacity (i.e. target_cpu)
 +       *   c) ACTIVE CPU with less contention due to other tasks
 +       *      (i.e. best_active_cpu)
 +       *
 +       * - NON prefer_idle tasks:
 +       *
 +       *   a) ACTIVE CPU: target_cpu
 +       *   b) IDLE CPU: best_idle_cpu
 +       */
 +      if (target_cpu != -1 && !idle_cpu(target_cpu) &&
 +                      best_idle_cpu != -1) {
 +              curr_tsk = READ_ONCE(cpu_rq(target_cpu)->curr);
 +              if (curr_tsk && schedtune_task_boost_rcu_locked(curr_tsk)) {
 +                      target_cpu = best_idle_cpu;
 +              }
 +      }
 +
 +      if (target_cpu == -1)
 +              target_cpu = prefer_idle
 +                      ? best_active_cpu
 +                      : best_idle_cpu;
 +      else
 +              *backup_cpu = prefer_idle
 +              ? best_active_cpu
 +              : best_idle_cpu;
 +
 +      trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
 +                                   best_idle_cpu, best_active_cpu,
 +                                   target_cpu);
 +
 +      schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
 +      schedstat_inc(this_rq(), eas_stats.fbt_count);
 +
 +      return target_cpu;
 +}
 +
 +/*
 + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
 + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
   *
 - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
 - * higher than capacity_orig because of unfortunate rounding in
 - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
 - * the average stabilizes with the new running time. We need to check that the
 - * utilization stays within the range of [0..capacity_orig] and cap it if
 - * necessary. Without utilization capping, a group could be seen as overloaded
 - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
 - * available capacity. We allow utilization to overshoot capacity_curr (but not
 - * capacity_orig) as it useful for predicting the capacity required after task
 - * migrations (scheduler-driven DVFS).
 + * In that case WAKE_AFFINE doesn't make sense and we'll let
 + * BALANCE_WAKE sort things out.
   */
 -static int cpu_util(int cpu)
 +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 +{
 +      long min_cap, max_cap;
 +
 +      min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
 +      max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
 +
 +      /* Minimum capacity is close to max, no need to abort wake_affine */
 +      if (max_cap - min_cap < max_cap >> 3)
 +              return 0;
 +
 +      /* Bring task utilization in sync with prev_cpu */
 +      sync_entity_load_avg(&p->se);
 +
 +      return min_cap * 1024 < task_util(p) * capacity_margin;
 +}
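 +
 +/*
 + * Example (hypothetical values, capacity_margin assumed to be 1280): with
 + * capacity_orig 448 on the previous CPU and 1024 on the waker, and a root
 + * domain max capacity of 1024, the capacity spread (576) exceeds 1024 >> 3,
 + * so the task-fit check runs: a task with util 400 gives
 + * 448 * 1024 < 400 * 1280, so wake_affine is disabled for it.
 + */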
 +
 +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
  {
 -      unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
 -      unsigned long capacity = capacity_orig_of(cpu);
 +      struct sched_domain *sd;
 +      int target_cpu = prev_cpu, tmp_target, tmp_backup;
 +      bool boosted, prefer_idle;
 +
 +      schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
 +      schedstat_inc(this_rq(), eas_stats.secb_attempts);
 +
 +      if (sysctl_sched_sync_hint_enable && sync) {
 +              int cpu = smp_processor_id();
 +
 +              if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
 +                      schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
 +                      schedstat_inc(this_rq(), eas_stats.secb_sync);
 +                      return cpu;
 +              }
 +      }
 +
 +      rcu_read_lock();
 +#ifdef CONFIG_CGROUP_SCHEDTUNE
 +      boosted = schedtune_task_boost(p) > 0;
 +      prefer_idle = schedtune_prefer_idle(p) > 0;
 +#else
 +      boosted = get_sysctl_sched_cfs_boost() > 0;
 +      prefer_idle = 0;
 +#endif
 +
 +      sync_entity_load_avg(&p->se);
 +
 +      sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
 +      /* Find a cpu with sufficient capacity */
 +      tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
 +
 +      if (!sd)
 +              goto unlock;
 +      if (tmp_target >= 0) {
 +              target_cpu = tmp_target;
 +              if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
 +                      schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
 +                      schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
 +                      goto unlock;
 +              }
 +      }
 +
 +      if (target_cpu != prev_cpu) {
 +              int delta = 0;
 +              struct energy_env eenv = {
 +                      .util_delta     = task_util(p),
 +                      .src_cpu        = prev_cpu,
 +                      .dst_cpu        = target_cpu,
 +                      .task           = p,
 +                      .trg_cpu        = target_cpu,
 +              };
 +
 +#ifdef CONFIG_SCHED_WALT
 +              if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
 +                      p->state == TASK_WAKING)
 +                      delta = task_util(p);
 +#endif
 +              /* Not enough spare capacity on previous cpu */
 +              if (__cpu_overutilized(prev_cpu, delta)) {
 +                      schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
 +                      schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
 +                      goto unlock;
 +              }
 +
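 +              /*
 +               * Only move off prev_cpu when it saves energy: if the best
 +               * target shows no saving, try the backup CPU, and if that
 +               * saves nothing either, stay on prev_cpu.
 +               */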
 +              if (energy_diff(&eenv) >= 0) {
 +                      /* No energy saving for target_cpu, try backup */
 +                      target_cpu = tmp_backup;
 +                      eenv.dst_cpu = target_cpu;
 +                      eenv.trg_cpu = target_cpu;
 +                      if (tmp_backup < 0 ||
 +                          tmp_backup == prev_cpu ||
 +                          energy_diff(&eenv) >= 0) {
 +                              schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
 +                              schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
 +                              target_cpu = prev_cpu;
 +                              goto unlock;
 +                      }
 +              }
 +
 +              schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
 +              schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
 +              goto unlock;
 +      }
 +
 +      schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
 +      schedstat_inc(this_rq(), eas_stats.secb_count);
 +
 +unlock:
 +      rcu_read_unlock();
  
 -      return (util >= capacity) ? capacity : util;
 +      return target_cpu;
  }
  
  /*
   * preempt must be disabled.
   */
  static int
 -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
 +                  int sibling_count_hint)
  {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
        int want_affine = 0;
        int sync = wake_flags & WF_SYNC;
  
 -      if (sd_flag & SD_BALANCE_WAKE)
 -              want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 +#ifdef CONFIG_SCHED_HMP
 +      return select_best_cpu(p, prev_cpu, 0, sync);
 +#endif
 +
 +      if (sd_flag & SD_BALANCE_WAKE) {
 +              int _wake_cap = wake_cap(p, cpu, prev_cpu);
 +
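 +              /*
 +               * Synchronous wakeup fast path: keep the task on the waking
 +               * CPU when the waker is about to go idle and the task fits.
 +               */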
 +              if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
 +                      bool about_to_idle = (cpu_rq(cpu)->nr_running < 2);
 +
 +                      if (sysctl_sched_sync_hint_enable && sync &&
 +                          !_wake_cap && about_to_idle)
 +                              return cpu;
 +              }
 +
 +              record_wakee(p);
 +              want_affine = !wake_wide(p, sibling_count_hint) &&
 +                            !_wake_cap &&
 +                            cpumask_test_cpu(cpu, &p->cpus_allowed);
 +      }
 +
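 +      /*
 +       * Below the over-utilization tipping point, energy-aware placement
 +       * takes over from the regular wakeup balance path.
 +       */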
 +      if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
 +              return select_energy_cpu_brute(p, prev_cpu, sync);
  
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
  
        if (affine_sd) {
                sd = NULL; /* Prefer wake_affine over balance flags */
 -              if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
 +              if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
                        new_cpu = cpu;
        }
  
 +      if (sd && !(sd_flag & SD_BALANCE_FORK)) {
 +              /*
 +               * We're going to need the task's util for capacity_spare_wake
 +               * in find_idlest_group. Sync it up to prev_cpu's
 +               * last_update_time.
 +               */
 +              sync_entity_load_avg(&p->se);
 +      }
 +
        if (!sd) {
                if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 -                      new_cpu = select_idle_sibling(p, new_cpu);
 -
 -      } else while (sd) {
 -              struct sched_group *group;
 -              int weight;
 -
 -              if (!(sd->flags & sd_flag)) {
 -                      sd = sd->child;
 -                      continue;
 -              }
 -
 -              group = find_idlest_group(sd, p, cpu, sd_flag);
 -              if (!group) {
 -                      sd = sd->child;
 -                      continue;
 -              }
 -
 -              new_cpu = find_idlest_cpu(group, p, cpu);
 -              if (new_cpu == -1 || new_cpu == cpu) {
 -                      /* Now try balancing at a lower domain level of cpu */
 -                      sd = sd->child;
 -                      continue;
 -              }
 +                      new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
  
 -              /* Now try balancing at a lower domain level of new_cpu */
 -              cpu = new_cpu;
 -              weight = sd->span_weight;
 -              sd = NULL;
 -              for_each_domain(cpu, tmp) {
 -                      if (weight <= tmp->span_weight)
 -                              break;
 -                      if (tmp->flags & sd_flag)
 -                              sd = tmp;
 -              }
 -              /* while loop will break here if sd == NULL */
 +      } else {
 +              new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
        }
        rcu_read_unlock();
  
@@@ -8071,8 -5175,6 +8079,8 @@@ static void task_dead_fair(struct task_
  {
        remove_entity_load_avg(&p->se);
  }
 +#else
 +#define task_fits_max(p, cpu) true
  #endif /* CONFIG_SMP */
  
  static unsigned long
@@@ -8319,8 -5421,6 +8327,8 @@@ again
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
  
 +      rq->misfit_task = !task_fits_max(p, rq->cpu);
 +
        return p;
  simple:
        cfs_rq = &rq->cfs;
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
  
 +      rq->misfit_task = !task_fits_max(p, rq->cpu);
 +
        return p;
  
  idle:
 +      rq->misfit_task = 0;
        /*
         * This is OK, because current is on_cpu, which avoids it being picked
         * for load-balance and preemption/IRQs are still disabled avoiding
@@@ -8560,21 -5657,10 +8568,21 @@@ static unsigned long __read_mostly max_
  
  enum fbq_type { regular, remote, all };
  
 +enum group_type {
 +      group_other = 0,
 +      group_misfit_task,
 +      group_imbalanced,
 +      group_overloaded,
 +};
 +
  #define LBF_ALL_PINNED        0x01
  #define LBF_NEED_BREAK        0x02
  #define LBF_DST_PINNED  0x04
  #define LBF_SOME_PINNED       0x08
 +#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
 +#define LBF_IGNORE_BIG_TASKS 0x100
 +#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
 +#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
  
  struct lb_env {
        struct sched_domain     *sd;
        int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
 +      unsigned int            src_grp_nr_running;
        /* The set of CPUs under consideration for load-balancing */
        struct cpumask          *cpus;
 +      unsigned int            busiest_grp_capacity;
 +      unsigned int            busiest_nr_running;
  
        unsigned int            flags;
  
        unsigned int            loop_max;
  
        enum fbq_type           fbq_type;
 +      enum group_type         busiest_group_type;
        struct list_head        tasks;
 +      enum sched_boost_policy boost_policy;
  };
  
  /*
@@@ -8702,7 -5783,6 +8710,7 @@@ stati
  int can_migrate_task(struct task_struct *p, struct lb_env *env)
  {
        int tsk_cache_hot;
 +      int twf, group_cpus;
  
        lockdep_assert_held(&env->src_rq->lock);
  
        /* Record that we found at least one task that could run on dst_cpu */
        env->flags &= ~LBF_ALL_PINNED;
  
 +      if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
 +              if (nr_big_tasks(env->src_rq) && !is_big_task(p))
 +                      return 0;
 +
 +              if (env->boost_policy == SCHED_BOOST_ON_BIG &&
 +                                      !task_sched_boost(p))
 +                      return 0;
 +      }
 +
 +      twf = task_will_fit(p, env->dst_cpu);
 +
 +      /*
 +       * Attempt to not pull tasks that don't fit. We may get lucky and find
 +       * one that actually fits.
 +       */
 +      if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
 +              return 0;
 +
 +      if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
 +          !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
 +              return 0;
 +
 +      /*
 +       * Group imbalance can sometimes cause work to be pulled across groups
 +       * even though the group could have managed the imbalance on its own.
 +       * Prevent inter-cluster migrations for big tasks when the number of
 +       * tasks is lower than the capacity of the group.
 +       */
 +      group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
 +                                               SCHED_CAPACITY_SCALE);
 +      if (!twf && env->busiest_nr_running <= group_cpus)
 +              return 0;
 +
        if (task_running(env->src_rq, p)) {
                schedstat_inc(p, se.statistics.nr_failed_migrations_running);
                return 0;
  
        /*
         * Aggressive migration if:
 -       * 1) destination numa is preferred
 -       * 2) task is cache cold, or
 -       * 3) too many balance attempts have failed.
 +       * 1) IDLE or NEWLY_IDLE balance.
 +       * 2) destination numa is preferred
 +       * 3) task is cache cold, or
 +       * 4) too many balance attempts have failed.
         */
        tsk_cache_hot = migrate_degrades_locality(p, env);
        if (tsk_cache_hot == -1)
                tsk_cache_hot = task_hot(p, env);
  
 -      if (tsk_cache_hot <= 0 ||
 +      if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
            env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
                if (tsk_cache_hot == 1) {
                        schedstat_inc(env->sd, lb_hot_gained[env->idle]);
@@@ -8818,13 -5864,9 +8826,13 @@@ static void detach_task(struct task_str
  {
        lockdep_assert_held(&env->src_rq->lock);
  
 -      deactivate_task(env->src_rq, p, 0);
        p->on_rq = TASK_ON_RQ_MIGRATING;
 +      deactivate_task(env->src_rq, p, 0);
 +      double_lock_balance(env->src_rq, env->dst_rq);
        set_task_cpu(p, env->dst_cpu);
 +      if (task_in_related_thread_group(p))
 +              env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
 +      double_unlock_balance(env->src_rq, env->dst_rq);
  }
  
  /*
@@@ -8852,7 -5894,6 +8860,7 @@@ static struct task_struct *detach_one_t
                 * inside detach_tasks().
                 */
                schedstat_inc(env->sd, lb_gained[env->idle]);
 +
                return p;
        }
        return NULL;
@@@ -8872,20 -5913,12 +8880,20 @@@ static int detach_tasks(struct lb_env *
        struct task_struct *p;
        unsigned long load;
        int detached = 0;
 +      int orig_loop = env->loop;
  
        lockdep_assert_held(&env->src_rq->lock);
  
        if (env->imbalance <= 0)
                return 0;
  
 +      if (!same_cluster(env->dst_cpu, env->src_cpu))
 +              env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
 +
 +      if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
 +              env->flags |= LBF_IGNORE_BIG_TASKS;
 +
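 +      /*
 +       * The cluster/big-task filters above are best effort: if nothing
 +       * can be detached with them in place, they are cleared below and
 +       * the scan restarts from here.
 +       */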
 +redo:
        while (!list_empty(tasks)) {
                /*
                 * We don't want to steal all, otherwise we may be treated likewise,
@@@ -8947,15 -5980,6 +8955,15 @@@ next
                list_move_tail(&p->se.group_node, tasks);
        }
  
 +      if (env->flags & (LBF_IGNORE_BIG_TASKS |
 +                      LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
 +              tasks = &env->src_rq->cfs_tasks;
 +              env->flags &= ~(LBF_IGNORE_BIG_TASKS |
 +                              LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
 +              env->loop = orig_loop;
 +              goto redo;
 +      }
 +
        /*
         * Right now, this is one of only two places we collect this stat
         * so we can safely collect detach_one_task() stats here rather
@@@ -8974,8 -5998,8 +8982,8 @@@ static void attach_task(struct rq *rq, 
        lockdep_assert_held(&rq->lock);
  
        BUG_ON(task_rq(p) != rq);
 -      p->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(rq, p, 0);
 +      p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);
  }
  
@@@ -9030,13 -6054,8 +9038,13 @@@ static void update_blocked_averages(in
                if (throttled_hierarchy(cfs_rq))
                        continue;
  
 -              if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 +              if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
 +                                         true))
                        update_tg_load_avg(cfs_rq, 0);
 +
 +              /* Propagate pending load changes to the parent */
 +              if (cfs_rq->tg->se[cpu])
 +                      update_load_avg(cfs_rq->tg->se[cpu], 0);
        }
        raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
@@@ -9096,7 -6115,7 +9104,7 @@@ static inline void update_blocked_avera
  
        raw_spin_lock_irqsave(&rq->lock, flags);
        update_rq_clock(rq);
 -      update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 +      update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@@ -9108,6 -6127,12 +9116,6 @@@ static unsigned long task_h_load(struc
  
  /********** Helpers for find_busiest_group ************************/
  
 -enum group_type {
 -      group_other = 0,
 -      group_imbalanced,
 -      group_overloaded,
 -};
 -
  /*
   * sg_lb_stats - stats of a sched_group required for load_balancing
   */
@@@ -9119,15 -6144,10 +9127,15 @@@ struct sg_lb_stats 
        unsigned long group_capacity;
        unsigned long group_util; /* Total utilization of the group */
        unsigned int sum_nr_running; /* Nr tasks running in the group */
 +#ifdef CONFIG_SCHED_HMP
 +      unsigned long sum_nr_big_tasks;
 +      u64 group_cpu_load; /* Scaled load of all CPUs of the group */
 +#endif
        unsigned int idle_cpus;
        unsigned int group_weight;
        enum group_type group_type;
        int group_no_capacity;
 +      int group_misfit_task; /* A cpu has a task too big for its capacity */
  #ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running;
        unsigned int nr_preferred_running;
@@@ -9166,64 -6186,10 +9174,64 @@@ static inline void init_sd_lb_stats(str
                        .avg_load = 0UL,
                        .sum_nr_running = 0,
                        .group_type = group_other,
 +#ifdef CONFIG_SCHED_HMP
 +                      .sum_nr_big_tasks = 0UL,
 +                      .group_cpu_load = 0ULL,
 +#endif
                },
        };
  }
  
 +#ifdef CONFIG_SCHED_HMP
 +
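 +/*
 + * Skip pulling load into a local group whose CPUs have a higher power cost
 + * when cluster spill is restricted and the busiest group is still below
 + * both the load and nr_running spill thresholds (and, if the local CPUs
 + * are bigger, it has no big tasks that need them).
 + */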
 +static int
 +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
 +{
 +      int local_cpu, busiest_cpu;
 +      int local_capacity, busiest_capacity;
 +      int local_pwr_cost, busiest_pwr_cost;
 +      int nr_cpus;
 +      int boost = sched_boost();
 +
 +      if (!sysctl_sched_restrict_cluster_spill ||
 +              boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
 +              return 0;
 +
 +      local_cpu = group_first_cpu(sds->local);
 +      busiest_cpu = group_first_cpu(sds->busiest);
 +
 +      local_capacity = cpu_max_possible_capacity(local_cpu);
 +      busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
 +
 +      local_pwr_cost = cpu_max_power_cost(local_cpu);
 +      busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
 +
 +      if (local_pwr_cost <= busiest_pwr_cost)
 +              return 0;
 +
 +      if (local_capacity > busiest_capacity &&
 +                      sds->busiest_stat.sum_nr_big_tasks)
 +              return 0;
 +
 +      nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
 +      if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
 +              (sds->busiest_stat.sum_nr_running <
 +                      nr_cpus * sysctl_sched_spill_nr_run))
 +              return 1;
 +
 +      return 0;
 +}
 +
 +#else /* CONFIG_SCHED_HMP */
 +
 +static inline int
 +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
 +{
 +      return 0;
 +}
 +
 +#endif        /* CONFIG_SCHED_HMP */
 +
  /**
   * get_sd_load_idx - Obtain the load index for a given sched domain.
   * @sd: The sched_domain whose load_idx is to be obtained.
@@@ -9273,58 -6239,19 +9281,58 @@@ static unsigned long scale_rt_capacity(
  
        used = div_u64(avg, total);
  
 +      /*
 +       * Deadline bandwidth is defined at the system level, so we must
 +       * weight this bandwidth with the max capacity of the system.
 +       * As a reminder, avg_bw is 20 bits wide and
 +       * scale_cpu_capacity is 10 bits wide.
 +       */
 +      used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
 +
        if (likely(used < SCHED_CAPACITY_SCALE))
                return SCHED_CAPACITY_SCALE - used;
  
        return 1;
  }
  
 +void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
 +{
 +      raw_spin_lock_init(&mcc->lock);
 +      mcc->val = 0;
 +      mcc->cpu = -1;
 +}
 +
  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
  {
        unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
        struct sched_group *sdg = sd->groups;
 +      struct max_cpu_capacity *mcc;
 +      unsigned long max_capacity;
 +      int max_cap_cpu;
 +      unsigned long flags;
  
        cpu_rq(cpu)->cpu_capacity_orig = capacity;
  
 +      mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
 +
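 +      /*
 +       * Track the root domain's maximum CPU capacity: update the record
 +       * when this CPU now exceeds it, or when this CPU holds the record
 +       * and its capacity has dropped.
 +       */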
 +      raw_spin_lock_irqsave(&mcc->lock, flags);
 +      max_capacity = mcc->val;
 +      max_cap_cpu = mcc->cpu;
 +
 +      if ((max_capacity > capacity && max_cap_cpu == cpu) ||
 +          (max_capacity < capacity)) {
 +              mcc->val = capacity;
 +              mcc->cpu = cpu;
 +#ifdef CONFIG_SCHED_DEBUG
 +              raw_spin_unlock_irqrestore(&mcc->lock, flags);
 +              printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
 +                              cpu, capacity);
 +              goto skip_unlock;
 +#endif
 +      }
 +      raw_spin_unlock_irqrestore(&mcc->lock, flags);
 +
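 +      /*
 +       * skip_unlock is jumped to only under CONFIG_SCHED_DEBUG; the
 +       * attribute silences -Wunused-label on other configurations.
 +       */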
 +skip_unlock: __attribute__ ((unused));
        capacity *= scale_rt_capacity(cpu);
        capacity >>= SCHED_CAPACITY_SHIFT;
  
  
        cpu_rq(cpu)->cpu_capacity = capacity;
        sdg->sgc->capacity = capacity;
 +      sdg->sgc->max_capacity = capacity;
 +      sdg->sgc->min_capacity = capacity;
  }
  
  void update_group_capacity(struct sched_domain *sd, int cpu)
  {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
 -      unsigned long capacity;
 +      unsigned long capacity, max_capacity, min_capacity;
        unsigned long interval;
  
        interval = msecs_to_jiffies(sd->balance_interval);
        }
  
        capacity = 0;
 +      max_capacity = 0;
 +      min_capacity = ULONG_MAX;
  
        if (child->flags & SD_OVERLAP) {
                /*
                        struct sched_group_capacity *sgc;
                        struct rq *rq = cpu_rq(cpu);
  
 +                      if (cpumask_test_cpu(cpu, cpu_isolated_mask))
 +                              continue;
                        /*
                         * build_sched_domains() -> init_sched_groups_capacity()
                         * gets here before we've attached the domains to the
                         */
                        if (unlikely(!rq->sd)) {
                                capacity += capacity_of(cpu);
 -                              continue;
 +                      } else {
 +                              sgc = rq->sd->groups->sgc;
 +                              capacity += sgc->capacity;
                        }
  
 -                      sgc = rq->sd->groups->sgc;
 -                      capacity += sgc->capacity;
 +                      max_capacity = max(capacity, max_capacity);
 +                      min_capacity = min(capacity, min_capacity);
                }
        } else  {
                /*
  
                group = child->groups;
                do {
 -                      capacity += group->sgc->capacity;
 +                      struct sched_group_capacity *sgc = group->sgc;
 +
 +                      cpumask_t *cpus = sched_group_cpus(group);
 +
 +                      /* Revisit this later. This won't work for MT domain */
 +                      if (!cpu_isolated(cpumask_first(cpus))) {
 +                              capacity += sgc->capacity;
 +                              max_capacity = max(sgc->max_capacity, max_capacity);
 +                              min_capacity = min(sgc->min_capacity, min_capacity);
 +                      }
                        group = group->next;
                } while (group != child->groups);
        }
  
        sdg->sgc->capacity = capacity;
 +      sdg->sgc->max_capacity = max_capacity;
 +      sdg->sgc->min_capacity = min_capacity;
  }
  
  /*
@@@ -9509,21 -6417,9 +9517,21 @@@ group_is_overloaded(struct lb_env *env
        return false;
  }
  
 +
 +/*
 + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
 + * per-cpu capacity than sched_group ref.
 + */
 +static inline bool
 +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 +{
 +      return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
 +                                                      ref->sgc->max_capacity;
 +}
 +
  static inline enum
  group_type group_classify(struct sched_group *group,
 -                        struct sg_lb_stats *sgs)
 +                        struct sg_lb_stats *sgs, struct lb_env *env)
  {
        if (sgs->group_no_capacity)
                return group_overloaded;
        if (sg_imbalanced(group))
                return group_imbalanced;
  
 +      if (sgs->group_misfit_task)
 +              return group_misfit_task;
 +
        return group_other;
  }
  
 +#ifdef CONFIG_NO_HZ_COMMON
 +/*
 + * idle load balancing data
 + *  - used by nohz balancing, but we want it available here
 + *    so that we can see which CPUs have no tick.
 + */
 +static struct {
 +      cpumask_var_t idle_cpus_mask;
 +      atomic_t nr_cpus;
 +      unsigned long next_balance;     /* in jiffy units */
 +} nohz ____cacheline_aligned;
 +
 +static inline void update_cpu_stats_if_tickless(struct rq *rq)
 +{
 +      /* only called from update_sg_lb_stats when irqs are disabled */
 +      if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
 +              /* rate limit updates to once per jiffy at most */
 +              if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
 +                      return;
 +
 +              raw_spin_lock(&rq->lock);
 +              update_rq_clock(rq);
 +              update_idle_cpu_load(rq);
 +              update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
 +              raw_spin_unlock(&rq->lock);
 +      }
 +}
 +
 +#else
 +static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
 +#endif
 +
  /**
   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   * @env: The load balancing environment.
   * @local_group: Does group contain this_cpu.
   * @sgs: variable to hold the statistics for this group.
   * @overload: Indicate more than one runnable task for any CPU.
 + * @overutilized: Indicate overutilization for any CPU.
   */
  static inline void update_sg_lb_stats(struct lb_env *env,
                        struct sched_group *group, int load_idx,
                        int local_group, struct sg_lb_stats *sgs,
 -                      bool *overload)
 +                      bool *overload, bool *overutilized)
  {
        unsigned long load;
 -      int i;
 +      int i, nr_running;
  
        memset(sgs, 0, sizeof(*sgs));
  
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
  
 +              trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
 +                                   sched_irqload(i),
 +                                   power_cost(i, 0),
 +                                   cpu_temp(i));
 +
 +              if (cpu_isolated(i))
 +                      continue;
 +
 +              /*
 +               * If we are entering idle and there are CPUs with their
 +               * tick stopped, do an update for them.
 +               */
 +              if (env->idle == CPU_NEWLY_IDLE)
 +                      update_cpu_stats_if_tickless(rq);
 +
                /* Bias balancing toward cpus of our domain */
                if (local_group)
                        load = target_load(i, load_idx);
                sgs->group_util += cpu_util(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
  
 -              if (rq->nr_running > 1)
 +              nr_running = rq->nr_running;
 +              if (nr_running > 1)
                        *overload = true;
  
 +#ifdef CONFIG_SCHED_HMP
 +              sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
 +              sgs->group_cpu_load += cpu_load(i);
 +#endif
 +
  #ifdef CONFIG_NUMA_BALANCING
                sgs->nr_numa_running += rq->nr_numa_running;
                sgs->nr_preferred_running += rq->nr_preferred_running;
  #endif
                sgs->sum_weighted_load += weighted_cpuload(i);
 -              if (idle_cpu(i))
 +              /*
 +               * No need to call idle_cpu() if nr_running is not 0
 +               */
 +              if (!nr_running && idle_cpu(i))
                        sgs->idle_cpus++;
 +
 +              if (energy_aware() && cpu_overutilized(i)) {
 +                      *overutilized = true;
 +                      if (!sgs->group_misfit_task && rq->misfit_task)
 +                              sgs->group_misfit_task = capacity_of(i);
 +              }
        }
  
 -      /* Adjust by relative CPU capacity of the group */
 -      sgs->group_capacity = group->sgc->capacity;
 -      sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
 +      /* Isolated CPU has no weight */
 +      if (!group->group_weight) {
 +              sgs->group_capacity = 0;
 +              sgs->avg_load = 0;
 +              sgs->group_no_capacity = 1;
 +              sgs->group_type = group_other;
 +              sgs->group_weight = group->group_weight;
 +      } else {
 +              /* Adjust by relative CPU capacity of the group */
 +              sgs->group_capacity = group->sgc->capacity;
 +              sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
 +                                                      sgs->group_capacity;
 +
 +              sgs->group_weight = group->group_weight;
 +
 +              sgs->group_no_capacity = group_is_overloaded(env, sgs);
 +              sgs->group_type = group_classify(group, sgs, env);
 +      }
  
        if (sgs->sum_nr_running)
                sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 +}
  
 -      sgs->group_weight = group->group_weight;
 +#ifdef CONFIG_SCHED_HMP
 +static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
 +                                                struct sd_lb_stats *sds,
 +                                                struct sched_group *sg,
 +                                                struct sg_lb_stats *sgs)
 +{
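 +      /*
 +       * Pick this group and flag a big-task active balance when dst_cpu
 +       * is idle or newly idle, has more capacity than the group, and the
 +       * group carries more big tasks than the current busiest.
 +       */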
 +      if (env->idle != CPU_NOT_IDLE &&
 +          cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
 +              if (sgs->sum_nr_big_tasks >
 +                              sds->busiest_stat.sum_nr_big_tasks) {
 +                      env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
 +                      return true;
 +              }
 +      }
  
 -      sgs->group_no_capacity = group_is_overloaded(env, sgs);
 -      sgs->group_type = group_classify(group, sgs);
 +      return false;
 +}
 +#else
 +static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
 +                                                struct sd_lb_stats *sds,
 +                                                struct sched_group *sg,
 +                                                struct sg_lb_stats *sgs)
 +{
 +      return false;
  }
 +#endif
  
  /**
   * update_sd_pick_busiest - return 1 on busiest group
@@@ -9713,42 -6507,15 +9721,42 @@@ static bool update_sd_pick_busiest(stru
  {
        struct sg_lb_stats *busiest = &sds->busiest_stat;
  
 +      if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
 +              return true;
 +
        if (sgs->group_type > busiest->group_type)
                return true;
  
        if (sgs->group_type < busiest->group_type)
                return false;
  
 -      if (sgs->avg_load <= busiest->avg_load)
 -              return false;
 +      if (energy_aware()) {
 +              /*
 +               * Candidate sg doesn't face any serious load-balance problems
 +               * so don't pick it if the local sg is already filled up.
 +               */
 +              if (sgs->group_type == group_other &&
 +                  !group_has_capacity(env, &sds->local_stat))
 +                      return false;
 +
 +              if (sgs->avg_load <= busiest->avg_load)
 +                      return false;
  
 +              if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 +                      goto asym_packing;
 +
 +              /*
 +               * Candidate sg has no more than one task per CPU and
 +               * has higher per-CPU capacity. Migrating tasks to less
 +               * capable CPUs may harm throughput. Maximize throughput;
 +               * power/energy consequences are not considered.
 +               */
 +              if (sgs->sum_nr_running <= sgs->group_weight &&
 +                  group_smaller_cpu_capacity(sds->local, sg))
 +                      return false;
 +      }
 +
 +asym_packing:
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
                return true;
@@@ -9799,9 -6566,6 +9807,9 @@@ static inline enum fbq_type fbq_classif
  }
  #endif /* CONFIG_NUMA_BALANCING */
  
 +#define lb_sd_parent(sd) \
 +      (sd->parent && sd->parent->groups != sd->parent->groups->next)
 +
  /**
   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
   * @env: The load balancing environment.
@@@ -9813,7 -6577,7 +9821,7 @@@ static inline void update_sd_lb_stats(s
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats tmp_sgs;
        int load_idx, prefer_sibling = 0;
 -      bool overload = false;
 +      bool overload = false, overutilized = false;
  
        if (child && child->flags & SD_PREFER_SIBLING)
                prefer_sibling = 1;
                }
  
                update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
 -                                              &overload);
 +                                              &overload, &overutilized);
  
                if (local_group)
                        goto next_group;
                    group_has_capacity(env, &sds->local_stat) &&
                    (sgs->sum_nr_running > 1)) {
                        sgs->group_no_capacity = 1;
 -                      sgs->group_type = group_classify(sg, sgs);
 +                      sgs->group_type = group_classify(sg, sgs, env);
                }
  
 +              /*
 +               * Ignore task groups with misfit tasks if the local group has
 +               * no capacity or its per-cpu capacity isn't higher than sg's.
 +               */
 +              if (energy_aware() &&
 +                  sgs->group_type == group_misfit_task &&
 +                  (!group_has_capacity(env, &sds->local_stat) ||
 +                   !group_smaller_cpu_capacity(sg, sds->local)))
 +                      sgs->group_type = group_other;
 +
                if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                        sds->busiest = sg;
                        sds->busiest_stat = *sgs;
 +                      env->busiest_nr_running = sgs->sum_nr_running;
 +                      env->busiest_grp_capacity = sgs->group_capacity;
                }
  
  next_group:
        if (env->sd->flags & SD_NUMA)
                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
  
 -      if (!env->sd->parent) {
 +      env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
 +
 +      if (!lb_sd_parent(env->sd)) {
                /* update overload indicator if we are at root domain */
                if (env->dst_rq->rd->overload != overload)
                        env->dst_rq->rd->overload = overload;
 +
 +              /* Update over-utilization (tipping point, U >= 0) indicator */
 +              if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
 +                      env->dst_rq->rd->overutilized = overutilized;
 +                      trace_sched_overutilized(overutilized);
 +              }
 +      } else {
 +              if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
 +                      env->dst_rq->rd->overutilized = true;
 +                      trace_sched_overutilized(true);
 +              }
        }
  
  }
@@@ -10050,24 -6789,6 +10058,24 @@@ static inline void calculate_imbalance(
         */
        if (busiest->avg_load <= sds->avg_load ||
            local->avg_load >= sds->avg_load) {
 +              if (energy_aware()) {
 +                      /* Misfitting tasks should be migrated in any case */
 +                      if (busiest->group_type == group_misfit_task) {
 +                              env->imbalance = busiest->group_misfit_task;
 +                              return;
 +                      }
 +
 +                      /*
 +                       * Busiest group is overloaded and local is not; use the
 +                       * spare cycles to maximize throughput.
 +                       */
 +                      if (busiest->group_type == group_overloaded &&
 +                          local->group_type <= group_misfit_task) {
 +                              env->imbalance = busiest->load_per_task;
 +                              return;
 +                      }
 +              }
 +
                env->imbalance = 0;
                return fix_small_imbalance(env, sds);
        }
                (sds->avg_load - local->avg_load) * local->group_capacity
        ) / SCHED_CAPACITY_SCALE;
  
 +      /* Boost imbalance to allow misfit task to be balanced. */
 +      if (energy_aware() && busiest->group_type == group_misfit_task)
 +              env->imbalance = max_t(long, env->imbalance,
 +                                   busiest->group_misfit_task);
 +
        /*
         * if *imbalance is less than the average load per runnable task
         * there is no guarantee that any tasks will be moved so we'll have
@@@ -10147,10 -6863,6 +10155,10 @@@ static struct sched_group *find_busiest
         * this level.
         */
        update_sd_lb_stats(env, &sds);
 +
 +      if (energy_aware() && !env->dst_rq->rd->overutilized)
 +              goto out_balanced;
 +
        local = &sds.local_stat;
        busiest = &sds.busiest_stat;
  
        if (!sds.busiest || busiest->sum_nr_running == 0)
                goto out_balanced;
  
 +      if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
 +              goto force_balance;
 +
 +      if (bail_inter_cluster_balance(env, &sds))
 +              goto out_balanced;
 +
        sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
                                                / sds.total_capacity;
  
        if (busiest->group_type == group_imbalanced)
                goto force_balance;
  
 -      /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 -      if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
 +      /*
 +       * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
 +       * capacities from resulting in underutilization due to avg_load.
 +       */
 +      if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
            busiest->group_no_capacity)
                goto force_balance;
  
 +      /* Misfitting tasks should be dealt with regardless of the avg load */
 +      if (energy_aware() && busiest->group_type == group_misfit_task) {
 +              goto force_balance;
 +      }
 +
        /*
         * If the local group is busier than the selected busiest group
         * don't try and pull any tasks.
                 * might end up to just move the imbalance on another group
                 */
                if ((busiest->group_type != group_overloaded) &&
 -                              (local->idle_cpus <= (busiest->idle_cpus + 1)))
 +                  (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
 +                  !group_smaller_cpu_capacity(sds.busiest, sds.local))
                        goto out_balanced;
        } else {
                /*
                        goto out_balanced;
        }
  
 -force_balance:
 -      /* Looks like there is an imbalance. Compute it */
 -      calculate_imbalance(env, &sds);
 -      return sds.busiest;
 +force_balance:
 +      env->busiest_group_type = busiest->group_type;
 +      /* Looks like there is an imbalance. Compute it */
 +      calculate_imbalance(env, &sds);
 +      return sds.busiest;
 +
 +out_balanced:
 +      env->imbalance = 0;
 +      return NULL;
 +}
 +
 +#ifdef CONFIG_SCHED_HMP
 +static struct rq *find_busiest_queue_hmp(struct lb_env *env,
 +                                   struct sched_group *group)
 +{
 +      struct rq *busiest = NULL, *busiest_big = NULL;
 +      u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
 +      int max_nr_big = 0, nr_big;
 +      bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
 +      int i;
 +      cpumask_t cpus;
 +
 +      cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
 +
 +      for_each_cpu(i, &cpus) {
 +              struct rq *rq = cpu_rq(i);
 +              u64 cumulative_runnable_avg =
 +                              rq->hmp_stats.cumulative_runnable_avg;
 +
 +              if (!cpumask_test_cpu(i, env->cpus))
 +                      continue;
 +
 +              if (find_big) {
 +                      nr_big = nr_big_tasks(rq);
 +                      if (nr_big > max_nr_big ||
 +                          (nr_big > 0 && nr_big == max_nr_big &&
 +                           cumulative_runnable_avg > max_runnable_avg_big)) {
 +                              max_runnable_avg_big = cumulative_runnable_avg;
 +                              busiest_big = rq;
 +                              max_nr_big = nr_big;
 +                              continue;
 +                      }
 +              }
 +
 +              if (cumulative_runnable_avg > max_runnable_avg) {
 +                      max_runnable_avg = cumulative_runnable_avg;
 +                      busiest = rq;
 +              }
 +      }
 +
 +      if (busiest_big)
 +              return busiest_big;
  
 -out_balanced:
 -      env->imbalance = 0;
 +      env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
 +      return busiest;
 +}
 +#else
 +static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
 +                                    struct sched_group *group)
 +{
        return NULL;
  }
 +#endif
  
  /*
   * find_busiest_queue - find the busiest runqueue among the cpus in group.
@@@ -10304,10 -6946,6 +10312,10 @@@ static struct rq *find_busiest_queue(st
        unsigned long busiest_load = 0, busiest_capacity = 1;
        int i;
  
 +#ifdef CONFIG_SCHED_HMP
 +      return find_busiest_queue_hmp(env, group);
 +#endif
 +
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                unsigned long capacity, wl;
                enum fbq_type rt;
                 */
  
                if (rq->nr_running == 1 && wl > env->imbalance &&
 -                  !check_cpu_capacity(rq, env->sd))
 +                  !check_cpu_capacity(rq, env->sd) &&
 +                  env->busiest_group_type != group_misfit_task)
                        continue;
  
                /*
   * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
   * so long as it is large enough.
   */
 -#define MAX_PINNED_INTERVAL   512
 +#define MAX_PINNED_INTERVAL   16
  
  /* Working cpumask for load_balance and load_balance_newidle. */
  DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
  
 +#define NEED_ACTIVE_BALANCE_THRESHOLD 10
 +
  static int need_active_balance(struct lb_env *env)
  {
        struct sched_domain *sd = env->sd;
  
 +      if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
 +              return 1;
 +
        if (env->idle == CPU_NEWLY_IDLE) {
  
                /*
                        return 1;
        }
  
 -      return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 +      if (energy_aware() &&
 +          (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
 +          ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
 +                              env->src_rq->cfs.h_nr_running == 1 &&
 +                              cpu_overutilized(env->src_cpu) &&
 +                              !cpu_overutilized(env->dst_cpu)) {
 +                      return 1;
 +      }
 +
 +      return unlikely(sd->nr_balance_failed >
 +                      sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
  }
  
 -static int active_load_balance_cpu_stop(void *data);
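 +/*
 + * Like group_balance_cpu(), but never return an isolated CPU.
 + */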
 +static int group_balance_cpu_not_isolated(struct sched_group *sg)
 +{
 +      cpumask_t cpus;
 +
 +      cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
 +      cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
 +      return cpumask_first(&cpus);
 +}
  
  static int should_we_balance(struct lb_env *env)
  {
        sg_mask = sched_group_mask(sg);
        /* Try to find first idle cpu */
        for_each_cpu_and(cpu, sg_cpus, env->cpus) {
 -              if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
 +              if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
 +                  cpu_isolated(cpu))
                        continue;
  
                balance_cpu = cpu;
        }
  
        if (balance_cpu == -1)
 -              balance_cpu = group_balance_cpu(sg);
 +              balance_cpu = group_balance_cpu_not_isolated(sg);
  
        /*
         * First idle cpu or the first cpu(busiest) in this sched group
@@@ -10479,29 -7093,23 +10487,29 @@@ static int load_balance(int this_cpu, s
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *continue_balancing)
  {
 -      int ld_moved, cur_ld_moved, active_balance = 0;
 -      struct sched_domain *sd_parent = sd->parent;
 -      struct sched_group *group;
 -      struct rq *busiest;
 +      int ld_moved = 0, cur_ld_moved, active_balance = 0;
 +      struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
 +      struct sched_group *group = NULL;
 +      struct rq *busiest = NULL;
        unsigned long flags;
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
  
        struct lb_env env = {
 -              .sd             = sd,
 -              .dst_cpu        = this_cpu,
 -              .dst_rq         = this_rq,
 -              .dst_grpmask    = sched_group_cpus(sd->groups),
 -              .idle           = idle,
 -              .loop_break     = sched_nr_migrate_break,
 -              .cpus           = cpus,
 -              .fbq_type       = all,
 -              .tasks          = LIST_HEAD_INIT(env.tasks),
 +              .sd                     = sd,
 +              .dst_cpu                = this_cpu,
 +              .dst_rq                 = this_rq,
 +              .dst_grpmask            = sched_group_cpus(sd->groups),
 +              .idle                   = idle,
 +              .loop_break             = sched_nr_migrate_break,
 +              .cpus                   = cpus,
 +              .fbq_type               = all,
 +              .tasks                  = LIST_HEAD_INIT(env.tasks),
 +              .imbalance              = 0,
 +              .flags                  = 0,
 +              .loop                   = 0,
 +              .busiest_nr_running     = 0,
 +              .busiest_grp_capacity   = 0,
 +              .boost_policy           = sched_boost_policy(),
        };
  
        /*
                 * correctly treated as an imbalance.
                 */
                env.flags |= LBF_ALL_PINNED;
 -              env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
  
  more_balance:
                raw_spin_lock_irqsave(&busiest->lock, flags);
 +              update_rq_clock(busiest);
 +
 +              /* The world might have changed. Validate assumptions */
 +              if (busiest->nr_running <= 1) {
 +                      raw_spin_unlock_irqrestore(&busiest->lock, flags);
 +                      env.flags &= ~LBF_ALL_PINNED;
 +                      goto no_move;
 +              }
 +
 +              /*
 +               * Set loop_max when rq's lock is taken to prevent a race.
 +               */
 +              env.loop_max = min(sysctl_sched_nr_migrate,
 +                                                      busiest->nr_running);
  
                /*
                 * cur_ld_moved - load moved in current iteration
                }
        }
  
 +no_move:
        if (!ld_moved) {
 -              schedstat_inc(sd, lb_failed[idle]);
 +              if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
 +                      schedstat_inc(sd, lb_failed[idle]);
 +
                /*
                 * Increment the failure counter only on periodic balance.
                 * We do not want newidle balance, which can be very
                 * frequent, pollute the failure counter causing
                 * excessive cache_hot migrations and active balances.
                 */
 -              if (idle != CPU_NEWLY_IDLE)
 -                      sd->nr_balance_failed++;
 +              if (idle != CPU_NEWLY_IDLE &&
 +                  !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
 +                      if (env.src_grp_nr_running > 1)
 +                              sd->nr_balance_failed++;
 +              }
  
                if (need_active_balance(&env)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);
                         * ->active_balance_work.  Once set, it's cleared
                         * only after active load balance is finished.
                         */
 -                      if (!busiest->active_balance) {
 +                      if (!busiest->active_balance &&
 +                          !cpu_isolated(cpu_of(busiest))) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                                stop_one_cpu_nowait(cpu_of(busiest),
                                        active_load_balance_cpu_stop, busiest,
                                        &busiest->active_balance_work);
 +                              *continue_balancing = 0;
                        }
  
                        /*
                         * We've kicked active balancing, reset the failure
                         * counter.
                         */
 -                      sd->nr_balance_failed = sd->cache_nice_tries+1;
 +                      sd->nr_balance_failed =
 +                          sd->cache_nice_tries +
 +                          NEED_ACTIVE_BALANCE_THRESHOLD - 1;
                }
 -      } else
 +      } else {
                sd->nr_balance_failed = 0;
  
 +              /* Assumes one 'busiest' cpu that we pulled tasks from */
 +              if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
 +                      int check_groups = !!(env.flags &
 +                                       LBF_MOVED_RELATED_THREAD_GROUP_TASK);
 +
 +                      check_for_freq_change(this_rq, false, check_groups);
 +                      check_for_freq_change(busiest, false, check_groups);
 +              } else {
 +                      check_for_freq_change(this_rq, true, false);
 +              }
 +      }
        if (likely(!active_balance)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
@@@ -10785,11 -7359,6 +10793,11 @@@ out_one_pinned
                        (sd->balance_interval < sd->max_interval))
                sd->balance_interval *= 2;
  out:
 +      trace_sched_load_balance(this_cpu, idle, *continue_balancing,
 +                               group ? group->cpumask[0] : 0,
 +                               busiest ? busiest->nr_running : 0,
 +                               env.imbalance, env.flags, ld_moved,
 +                               sd->balance_interval);
        return ld_moved;
  }
  
@@@ -10832,9 -7401,6 +10840,9 @@@ static int idle_balance(struct rq *this
        int pulled_task = 0;
        u64 curr_cost = 0;
  
 +      if (cpu_isolated(this_cpu))
 +              return 0;
 +
        idle_enter_fair(this_rq);
  
        /*
         */
        this_rq->idle_stamp = rq_clock(this_rq);
  
 -      if (this_rq->avg_idle < sysctl_sched_migration_cost ||
 -          !this_rq->rd->overload) {
 +      if (!energy_aware() &&
 +          (this_rq->avg_idle < sysctl_sched_migration_cost ||
 +           !this_rq->rd->overload)) {
                rcu_read_lock();
                sd = rcu_dereference_check_sched_domain(this_rq->sd);
                if (sd)
  
                /*
                 * Stop searching for tasks to pull if there are
 -               * now runnable tasks on this rq.
 +               * now runnable tasks on the balance rq or if
 +               * continue_balancing has been unset (only possible
 +               * due to active migration).
                 */
 -              if (pulled_task || this_rq->nr_running > 0)
 +              if (pulled_task || this_rq->nr_running > 0 ||
 +                                              !continue_balancing)
                        break;
        }
        rcu_read_unlock();
@@@ -10941,24 -7503,8 +10949,24 @@@ static int active_load_balance_cpu_stop
        int busiest_cpu = cpu_of(busiest_rq);
        int target_cpu = busiest_rq->push_cpu;
        struct rq *target_rq = cpu_rq(target_cpu);
 -      struct sched_domain *sd;
 +      struct sched_domain *sd = NULL;
        struct task_struct *p = NULL;
 +      struct task_struct *push_task = NULL;
 +      int push_task_detached = 0;
 +      struct lb_env env = {
 +              .sd                     = sd,
 +              .dst_cpu                = target_cpu,
 +              .dst_rq                 = target_rq,
 +              .src_cpu                = busiest_rq->cpu,
 +              .src_rq                 = busiest_rq,
 +              .idle                   = CPU_IDLE,
 +              .busiest_nr_running     = 0,
 +              .busiest_grp_capacity   = 0,
 +              .flags                  = 0,
 +              .loop                   = 0,
 +              .boost_policy           = sched_boost_policy(),
 +      };
 +      bool moved = false;
  
        raw_spin_lock_irq(&busiest_rq->lock);
  
         */
        BUG_ON(busiest_rq == target_rq);
  
 +      push_task = busiest_rq->push_task;
 +      target_cpu = busiest_rq->push_cpu;
 +      if (push_task) {
 +              if (task_on_rq_queued(push_task) &&
 +                      push_task->state == TASK_RUNNING &&
 +                      task_cpu(push_task) == busiest_cpu &&
 +                                      cpu_online(target_cpu)) {
 +                      detach_task(push_task, &env);
 +                      push_task_detached = 1;
 +                      moved = true;
 +              }
 +              goto out_unlock;
 +      }
 +
        /* Search for an sd spanning us and the target CPU. */
        rcu_read_lock();
        for_each_domain(target_cpu, sd) {
        }
  
        if (likely(sd)) {
 -              struct lb_env env = {
 -                      .sd             = sd,
 -                      .dst_cpu        = target_cpu,
 -                      .dst_rq         = target_rq,
 -                      .src_cpu        = busiest_rq->cpu,
 -                      .src_rq         = busiest_rq,
 -                      .idle           = CPU_IDLE,
 -              };
 -
 +              env.sd = sd;
                schedstat_inc(sd, alb_count);
 +              update_rq_clock(busiest_rq);
  
                p = detach_one_task(&env);
 -              if (p)
 +              if (p) {
                        schedstat_inc(sd, alb_pushed);
 -              else
 +                      moved = true;
 +              } else {
                        schedstat_inc(sd, alb_failed);
 +              }
        }
        rcu_read_unlock();
  out_unlock:
        busiest_rq->active_balance = 0;
 +      push_task = busiest_rq->push_task;
 +      target_cpu = busiest_rq->push_cpu;
 +
 +      if (push_task)
 +              busiest_rq->push_task = NULL;
 +
        raw_spin_unlock(&busiest_rq->lock);
  
 +      if (push_task) {
 +              if (push_task_detached)
 +                      attach_one_task(target_rq, push_task);
 +              put_task_struct(push_task);
 +              clear_reserved(target_cpu);
 +      }
 +
        if (p)
                attach_one_task(target_rq, p);
  
        local_irq_enable();
  
 +      if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
 +              int check_groups = !!(env.flags &
 +                                       LBF_MOVED_RELATED_THREAD_GROUP_TASK);
 +              check_for_freq_change(busiest_rq, false, check_groups);
 +              check_for_freq_change(target_rq, false, check_groups);
 +      } else if (moved) {
 +              check_for_freq_change(target_rq, true, false);
 +      }
 +
        return 0;
  }
  
@@@ -11060,49 -7575,15 +11068,49 @@@ static inline int on_null_domain(struc
   *   needed, they will kick the idle load balancer, which then does idle
   *   load balancing for all the idle CPUs.
   */
 -static struct {
 -      cpumask_var_t idle_cpus_mask;
 -      atomic_t nr_cpus;
 -      unsigned long next_balance;     /* in jiffy units */
 -} nohz ____cacheline_aligned;
  
 -static inline int find_new_ilb(void)
 +#ifdef CONFIG_SCHED_HMP
 +static inline int find_new_hmp_ilb(int type)
 +{
 +      int call_cpu = raw_smp_processor_id();
 +      struct sched_domain *sd;
 +      int ilb;
 +
 +      rcu_read_lock();
 +
 +      /*
 +       * Pick an idle cpu "closest" to call_cpu; for a restricted kick,
 +       * only accept CPUs whose max power cost does not exceed call_cpu's.
 +       */
 +      for_each_domain(call_cpu, sd) {
 +              for_each_cpu_and(ilb, nohz.idle_cpus_mask,
 +                                              sched_domain_span(sd)) {
 +                      if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
 +                                      cpu_max_power_cost(ilb) <=
 +                                      cpu_max_power_cost(call_cpu))) {
 +                              rcu_read_unlock();
 +                              reset_balance_interval(ilb);
 +                              return ilb;
 +                      }
 +              }
 +      }
 +
 +      rcu_read_unlock();
 +      return nr_cpu_ids;
 +}
 +#else /* CONFIG_SCHED_HMP */
 +static inline int find_new_hmp_ilb(int type)
 +{
 +      return 0;
 +}
 +#endif        /* CONFIG_SCHED_HMP */
 +
 +static inline int find_new_ilb(int type)
  {
 -      int ilb = cpumask_first(nohz.idle_cpus_mask);
 +      int ilb;
 +
 +#ifdef CONFIG_SCHED_HMP
 +      return find_new_hmp_ilb(type);
 +#endif
 +
 +      ilb = cpumask_first(nohz.idle_cpus_mask);
  
        if (ilb < nr_cpu_ids && idle_cpu(ilb))
                return ilb;
   * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
   * CPU (if there is one).
   */
 -static void nohz_balancer_kick(void)
 +static void nohz_balancer_kick(int type)
  {
        int ilb_cpu;
  
        nohz.next_balance++;
  
 -      ilb_cpu = find_new_ilb();
 +      ilb_cpu = find_new_ilb(type);
  
        if (ilb_cpu >= nr_cpu_ids)
                return;
        return;
  }
  
 +void nohz_balance_clear_nohz_mask(int cpu)
 +{
 +      if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
 +              cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 +              atomic_dec(&nohz.nr_cpus);
 +      }
 +}
 +
  static inline void nohz_balance_exit_idle(int cpu)
  {
        if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
                /*
                 * Completely isolated CPUs don't ever set, so we must test.
                 */
 -              if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
 -                      cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 -                      atomic_dec(&nohz.nr_cpus);
 -              }
 +              nohz_balance_clear_nohz_mask(cpu);
                clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
        }
  }
@@@ -11209,7 -7685,7 +11217,7 @@@ void nohz_balance_enter_idle(int cpu
        /*
         * If we're a completely isolated CPU, we don't play.
         */
 -      if (on_null_domain(cpu_rq(cpu)))
 +      if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
                return;
  
        cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
@@@ -11238,13 -7714,7 +11246,13 @@@ static DEFINE_SPINLOCK(balancing)
   */
  void update_max_interval(void)
  {
 -      max_load_balance_interval = HZ*num_online_cpus()/10;
 +      cpumask_t avail_mask;
 +      unsigned int available_cpus;
 +
 +      cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
 +      available_cpus = cpumask_weight(&avail_mask);
 +
 +      max_load_balance_interval = HZ*available_cpus/10;
  }
  
  /*
@@@ -11369,15 -7839,12 +11377,15 @@@ static void nohz_idle_balance(struct r
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
 +      cpumask_t cpus;
  
        if (idle != CPU_IDLE ||
            !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
                goto end;
  
 -      for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 +      cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
 +
 +      for_each_cpu(balance_cpu, &cpus) {
                if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
                        continue;
  
        clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
  }
  
 +#ifdef CONFIG_SCHED_HMP
 +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
 +{
 +      struct sched_domain *sd;
 +      int i;
 +
 +      if (rq->nr_running < 2)
 +              return 0;
 +
 +      if (!sysctl_sched_restrict_cluster_spill ||
 +                      sched_boost_policy() == SCHED_BOOST_ON_ALL)
 +              return 1;
 +
 +      if (cpu_max_power_cost(cpu) == max_power_cost)
 +              return 1;
 +
 +      rcu_read_lock();
 +      sd = rcu_dereference_check_sched_domain(rq->sd);
 +      if (!sd) {
 +              rcu_read_unlock();
 +              return 0;
 +      }
 +
 +      for_each_cpu(i, sched_domain_span(sd)) {
 +              if (cpu_load(i) < sched_spill_load &&
 +                              cpu_rq(i)->nr_running <
 +                              sysctl_sched_spill_nr_run) {
 +                      /* Change the kick type to limit to CPUs that
 +                       * are of equal or lower capacity.
 +                       */
 +                      *type = NOHZ_KICK_RESTRICT;
 +                      break;
 +              }
 +      }
 +      rcu_read_unlock();
 +      return 1;
 +}
 +#else
 +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
 +{
 +      return 0;
 +}
 +#endif
 +
 +static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
 +{
 +      unsigned long now = jiffies;
 +
 +      /*
 +       * None are in tickless mode and hence no need for NOHZ idle load
 +       * balancing.
 +       */
 +      if (likely(!atomic_read(&nohz.nr_cpus)))
 +              return 0;
 +
 +#ifdef CONFIG_SCHED_HMP
 +      return _nohz_kick_needed_hmp(rq, cpu, type);
 +#endif
 +
 +      if (time_before(now, nohz.next_balance))
 +              return 0;
 +
 +      if (rq->nr_running >= 2 &&
 +          (!energy_aware() || cpu_overutilized(cpu)))
 +              return true;
 +
 +      /* Do idle load balance if there is a misfit task */
 +      if (energy_aware())
 +              return rq->misfit_task;
 +
 +      return (rq->nr_running >= 2);
 +}
 +
  /*
   * Current heuristic for kicking the idle load balancer in the presence
   * of an idle cpu in the system.
   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
   *     domain span are idle.
   */
 -static inline bool nohz_kick_needed(struct rq *rq)
 +static inline bool nohz_kick_needed(struct rq *rq, int *type)
  {
 -      unsigned long now = jiffies;
 +#ifndef CONFIG_SCHED_HMP
        struct sched_domain *sd;
        struct sched_group_capacity *sgc;
 -      int nr_busy, cpu = rq->cpu;
 +      int nr_busy;
 +#endif
 +      int cpu = rq->cpu;
        bool kick = false;
  
        if (unlikely(rq->idle_balance))
        set_cpu_sd_state_busy();
        nohz_balance_exit_idle(cpu);
  
 -      /*
 -       * None are in tickless mode and hence no need for NOHZ idle load
 -       * balancing.
 -       */
 -      if (likely(!atomic_read(&nohz.nr_cpus)))
 -              return false;
 -
 -      if (time_before(now, nohz.next_balance))
 -              return false;
 -
 -      if (rq->nr_running >= 2)
 +      if (_nohz_kick_needed(rq, cpu, type))
                return true;
  
 +#ifndef CONFIG_SCHED_HMP
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_busy, cpu));
        if (sd) {
  
  unlock:
        rcu_read_unlock();
 +#endif
        return kick;
  }
  #else
@@@ -11593,19 -7993,15 +11601,19 @@@ static void run_rebalance_domains(struc
   */
  void trigger_load_balance(struct rq *rq)
  {
 -      /* Don't need to rebalance while attached to NULL domain */
 -      if (unlikely(on_null_domain(rq)))
 +      int type = NOHZ_KICK_ANY;
 +
 +      /* Don't need to rebalance while attached to NULL domain or
 +       * cpu is isolated.
 +       */
 +      if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
                return;
  
        if (time_after_eq(jiffies, rq->next_balance))
                raise_softirq(SCHED_SOFTIRQ);
  #ifdef CONFIG_NO_HZ_COMMON
 -      if (nohz_kick_needed(rq))
 -              nohz_balancer_kick();
 +      if (nohz_kick_needed(rq, &type))
 +              nohz_balancer_kick(type);
  #endif
  }
  
@@@ -11641,17 -8037,6 +11649,17 @@@ static void task_tick_fair(struct rq *r
  
        if (static_branch_unlikely(&sched_numa_balancing))
                task_tick_numa(rq, curr);
 +
 +#ifdef CONFIG_SMP
 +      if (energy_aware() &&
 +          !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
 +              rq->rd->overutilized = true;
 +              trace_sched_overutilized(true);
 +      }
 +
 +      rq->misfit_task = !task_fits_max(curr, rq->cpu);
 +#endif
 +
  }
  
  /*
@@@ -11663,17 -8048,31 +11671,17 @@@ static void task_fork_fair(struct task_
  {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se, *curr;
 -      int this_cpu = smp_processor_id();
        struct rq *rq = this_rq();
 -      unsigned long flags;
 -
 -      raw_spin_lock_irqsave(&rq->lock, flags);
  
 +      raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
  
        cfs_rq = task_cfs_rq(current);
        curr = cfs_rq->curr;
 -
 -      /*
 -       * Not only the cpu but also the task_group of the parent might have
 -       * been changed after parent->se.parent,cfs_rq were copied to
 -       * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
 -       * of child point to valid ones.
 -       */
 -      rcu_read_lock();
 -      __set_task_cpu(p, this_cpu);
 -      rcu_read_unlock();
 -
 -      update_curr(cfs_rq);
 -
 -      if (curr)
 +      if (curr) {
 +              update_curr(cfs_rq);
                se->vruntime = curr->vruntime;
 +      }
        place_entity(cfs_rq, se, 1);
  
        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
        }
  
        se->vruntime -= cfs_rq->min_vruntime;
 -
 -      raw_spin_unlock_irqrestore(&rq->lock, flags);
 +      raw_spin_unlock(&rq->lock);
  }
  
  /*
@@@ -11738,61 -8138,6 +11746,61 @@@ static inline bool vruntime_normalized(
        return false;
  }
  
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +/*
 + * Propagate the changes of the sched_entity across the tg tree to make it
 + * visible to the root
 + */
 +static void propagate_entity_cfs_rq(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq;
 +
 +      /* Start to propagate at parent */
 +      se = se->parent;
 +
 +      for_each_sched_entity(se) {
 +              cfs_rq = cfs_rq_of(se);
 +
 +              if (cfs_rq_throttled(cfs_rq))
 +                      break;
 +
 +              update_load_avg(se, UPDATE_TG);
 +      }
 +}
 +#else
 +static void propagate_entity_cfs_rq(struct sched_entity *se) { }
 +#endif
 +
 +static void detach_entity_cfs_rq(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +
 +      /* Catch up with the cfs_rq and remove our load when we leave */
 +      update_load_avg(se, 0);
 +      detach_entity_load_avg(cfs_rq, se);
 +      update_tg_load_avg(cfs_rq, false);
 +      propagate_entity_cfs_rq(se);
 +}
 +
 +static void attach_entity_cfs_rq(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +      /*
 +       * Since the real-depth could have been changed (only FAIR
 +       * class maintain depth value), reset depth properly.
 +       */
 +      se->depth = se->parent ? se->parent->depth + 1 : 0;
 +#endif
 +
 +      /* Synchronize entity with its cfs_rq */
 +      update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 +      attach_entity_load_avg(cfs_rq, se);
 +      update_tg_load_avg(cfs_rq, false);
 +      propagate_entity_cfs_rq(se);
 +}
 +
  static void detach_task_cfs_rq(struct task_struct *p)
  {
        struct sched_entity *se = &p->se;
                se->vruntime -= cfs_rq->min_vruntime;
        }
  
 -      /* Catch up with the cfs_rq and remove our load when we leave */
 -      detach_entity_load_avg(cfs_rq, se);
 +      detach_entity_cfs_rq(se);
  }
  
  static void attach_task_cfs_rq(struct task_struct *p)
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
 -#ifdef CONFIG_FAIR_GROUP_SCHED
 -      /*
 -       * Since the real-depth could have been changed (only FAIR
 -       * class maintain depth value), reset depth properly.
 -       */
 -      se->depth = se->parent ? se->parent->depth + 1 : 0;
 -#endif
 -
 -      /* Synchronize task with its cfs_rq */
 -      attach_entity_load_avg(cfs_rq, se);
 +      attach_entity_cfs_rq(se);
  
        if (!vruntime_normalized(p))
                se->vruntime += cfs_rq->min_vruntime;
@@@ -11869,23 -8224,12 +11877,23 @@@ void init_cfs_rq(struct cfs_rq *cfs_rq
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  #endif
  #ifdef CONFIG_SMP
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +      cfs_rq->propagate_avg = 0;
 +#endif
        atomic_long_set(&cfs_rq->removed_load_avg, 0);
        atomic_long_set(&cfs_rq->removed_util_avg, 0);
  #endif
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 +static void task_set_group_fair(struct task_struct *p)
 +{
 +      struct sched_entity *se = &p->se;
 +
 +      set_task_rq(p, task_cpu(p));
 +      se->depth = se->parent ? se->parent->depth + 1 : 0;
 +}
 +
  static void task_move_group_fair(struct task_struct *p)
  {
        detach_task_cfs_rq(p);
        attach_task_cfs_rq(p);
  }
  
 +static void task_change_group_fair(struct task_struct *p, int type)
 +{
 +      switch (type) {
 +      case TASK_SET_GROUP:
 +              task_set_group_fair(p);
 +              break;
 +
 +      case TASK_MOVE_GROUP:
 +              task_move_group_fair(p);
 +              break;
 +      }
 +}
 +
  void free_fair_sched_group(struct task_group *tg)
  {
        int i;
  
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
 -      struct cfs_rq *cfs_rq;
        struct sched_entity *se;
 +      struct cfs_rq *cfs_rq;
 +      struct rq *rq;
        int i;
  
        tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
        init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  
        for_each_possible_cpu(i) {
 +              rq = cpu_rq(i);
 +
                cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
                                      GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                init_cfs_rq(cfs_rq);
                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
                init_entity_runnable_average(se);
 +
 +              raw_spin_lock_irq(&rq->lock);
 +              post_init_entity_util_avg(se);
 +              raw_spin_unlock_irq(&rq->lock);
        }
  
        return 1;
@@@ -12062,10 -8386,8 +12070,10 @@@ int sched_group_set_shares(struct task_
  
                /* Possible calls to update_curr() need rq clock */
                update_rq_clock(rq);
 -              for_each_sched_entity(se)
 -                      update_cfs_shares(group_cfs_rq(se));
 +              for_each_sched_entity(se) {
 +                      update_load_avg(se, UPDATE_TG);
 +                      update_cfs_shares(se);
 +              }
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
  
@@@ -12142,12 -8464,7 +12150,12 @@@ const struct sched_class fair_sched_cla
        .update_curr            = update_curr_fair,
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -      .task_move_group        = task_move_group_fair,
 +      .task_change_group      = task_change_group_fair,
 +#endif
 +#ifdef CONFIG_SCHED_HMP
 +      .inc_hmp_sched_stats    = inc_hmp_sched_stats_fair,
 +      .dec_hmp_sched_stats    = dec_hmp_sched_stats_fair,
 +      .fixup_hmp_sched_stats  = fixup_hmp_sched_stats_fair,
  #endif
  };
  
diff --combined kernel/workqueue.c
@@@ -48,8 -48,6 +48,8 @@@
  #include <linux/nodemask.h>
  #include <linux/moduleparam.h>
  #include <linux/uaccess.h>
 +#include <linux/bug.h>
 +#include <linux/delay.h>
  
  #include "workqueue_internal.h"
  
@@@ -151,8 -149,6 +151,8 @@@ struct worker_pool 
        int                     id;             /* I: pool ID */
        unsigned int            flags;          /* X: flags */
  
 +      unsigned long           watchdog_ts;    /* L: watchdog timestamp */
 +
        struct list_head        worklist;       /* L: list of pending works */
        int                     nr_workers;     /* L: total number of workers */
  
@@@ -1127,8 -1123,6 +1127,8 @@@ static void pwq_activate_delayed_work(s
        struct pool_workqueue *pwq = get_work_pwq(work);
  
        trace_workqueue_activate_work(work);
 +      if (list_empty(&pwq->pool->worklist))
 +              pwq->pool->watchdog_ts = jiffies;
        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
        pwq->nr_active++;
@@@ -1287,12 -1281,6 +1287,12 @@@ fail
        if (work_is_canceling(work))
                return -ENOENT;
        cpu_relax();
 +      /*
 +       * The queueing is in progress in another context. If we keep
 +       * taking the pool->lock in a busy loop, the other context may
 +       * never get the lock. Give 1 usec delay to avoid this contention.
 +       */
 +      udelay(1);
        return -EAGAIN;
  }
  
@@@ -1437,8 -1425,6 +1437,8 @@@ retry
                trace_workqueue_activate_work(work);
                pwq->nr_active++;
                worklist = &pwq->pool->worklist;
 +              if (list_empty(worklist))
 +                      pwq->pool->watchdog_ts = jiffies;
        } else {
                work_flags |= WORK_STRUCT_DELAYED;
                worklist = &pwq->delayed_works;
@@@ -1510,6 -1496,8 +1510,6 @@@ static void __queue_delayed_work(int cp
                return;
        }
  
 -      timer_stats_timer_set_start_info(&dwork->timer);
 -
        dwork->wq = wq;
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;
@@@ -2088,7 -2076,6 +2088,7 @@@ __acquires(&pool->lock
                       current->comm, preempt_count(), task_pid_nr(current),
                       worker->current_func);
                debug_show_held_locks(current);
 +              BUG_ON(PANIC_CORRUPTION);
                dump_stack();
        }
  
@@@ -2204,8 -2191,6 +2204,8 @@@ recheck
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);
  
 +              pool->watchdog_ts = jiffies;
 +
                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
                        /* optimization path, not strictly necessary */
                        process_one_work(worker, work);
@@@ -2289,7 -2274,6 +2289,7 @@@ repeat
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
                struct work_struct *work, *n;
 +              bool first = true;
  
                __set_current_state(TASK_RUNNING);
                list_del_init(&pwq->mayday_node);
                 * process'em.
                 */
                WARN_ON_ONCE(!list_empty(scheduled));
 -              list_for_each_entry_safe(work, n, &pool->worklist, entry)
 -                      if (get_work_pwq(work) == pwq)
 +              list_for_each_entry_safe(work, n, &pool->worklist, entry) {
 +                      if (get_work_pwq(work) == pwq) {
 +                              if (first)
 +                                      pool->watchdog_ts = jiffies;
                                move_linked_works(work, scheduled, &n);
 +                      }
 +                      first = false;
 +              }
  
                if (!list_empty(scheduled)) {
                        process_scheduled_works(rescuer);
                         */
                        if (need_to_create_worker(pool)) {
                                spin_lock(&wq_mayday_lock);
-                               get_pwq(pwq);
-                               list_move_tail(&pwq->mayday_node, &wq->maydays);
+                               /*
+                                * Queue iff we aren't racing destruction
+                                * and somebody else hasn't queued it already.
+                                */
+                               if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+                                       get_pwq(pwq);
+                                       list_add_tail(&pwq->mayday_node, &wq->maydays);
+                               }
                                spin_unlock(&wq_mayday_lock);
                        }
                }
@@@ -2919,31 -2904,6 +2925,31 @@@ bool flush_delayed_work(struct delayed_
  }
  EXPORT_SYMBOL(flush_delayed_work);
  
 +static bool __cancel_work(struct work_struct *work, bool is_dwork)
 +{
 +      unsigned long flags;
 +      int ret;
 +
 +      do {
 +              ret = try_to_grab_pending(work, is_dwork, &flags);
 +      } while (unlikely(ret == -EAGAIN));
 +
 +      if (unlikely(ret < 0))
 +              return false;
 +
 +      set_work_pool_and_clear_pending(work, get_work_pool_id(work));
 +      local_irq_restore(flags);
 +      return ret;
 +}
 +
 +/*
 + * See cancel_delayed_work()
 + */
 +bool cancel_work(struct work_struct *work)
 +{
 +      return __cancel_work(work, false);
 +}
 +
  /**
   * cancel_delayed_work - cancel a delayed work
   * @dwork: delayed_work to cancel
   */
  bool cancel_delayed_work(struct delayed_work *dwork)
  {
 -      unsigned long flags;
 -      int ret;
 -
 -      do {
 -              ret = try_to_grab_pending(&dwork->work, true, &flags);
 -      } while (unlikely(ret == -EAGAIN));
 -
 -      if (unlikely(ret < 0))
 -              return false;
 -
 -      set_work_pool_and_clear_pending(&dwork->work,
 -                                      get_work_pool_id(&dwork->work));
 -      local_irq_restore(flags);
 -      return ret;
 +      return __cancel_work(&dwork->work, true);
  }
  EXPORT_SYMBOL(cancel_delayed_work);
  
@@@ -3136,7 -3109,6 +3142,7 @@@ static int init_worker_pool(struct work
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->flags |= POOL_DISASSOCIATED;
 +      pool->watchdog_ts = jiffies;
        INIT_LIST_HEAD(&pool->worklist);
        INIT_LIST_HEAD(&pool->idle_list);
        hash_init(pool->busy_hash);
@@@ -3983,9 -3955,29 +3989,29 @@@ void destroy_workqueue(struct workqueue
        struct pool_workqueue *pwq;
        int node;
  
+       /*
+        * Remove it from sysfs first so that sanity check failure doesn't
+        * lead to sysfs name conflicts.
+        */
+       workqueue_sysfs_unregister(wq);
        /* drain it before proceeding with destruction */
        drain_workqueue(wq);
  
+       /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
+       if (wq->rescuer) {
+               struct worker *rescuer = wq->rescuer;
+               /* this prevents new queueing */
+               spin_lock_irq(&wq_mayday_lock);
+               wq->rescuer = NULL;
+               spin_unlock_irq(&wq_mayday_lock);
+               /* rescuer will empty maydays list before exiting */
+               kthread_stop(rescuer->task);
+               kfree(rescuer);
+       }
        /* sanity checks */
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq) {
        list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);
  
-       workqueue_sysfs_unregister(wq);
-       if (wq->rescuer)
-               kthread_stop(wq->rescuer->task);
        if (!(wq->flags & WQ_UNBOUND)) {
                /*
                 * The base ref is never dropped on per-cpu pwqs.  Directly
@@@ -4296,7 -4283,8 +4317,8 @@@ static void show_pwq(struct pool_workqu
        pr_info("  pwq %d:", pool->id);
        pr_cont_pool_info(pool);
  
-       pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+       pr_cont(" active=%d/%d refcnt=%d%s\n",
+               pwq->nr_active, pwq->max_active, pwq->refcnt,
                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
  
        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
@@@ -4406,9 -4394,7 +4428,9 @@@ void show_workqueue_state(void
  
                pr_info("pool %d:", pool->id);
                pr_cont_pool_info(pool);
 -              pr_cont(" workers=%d", pool->nr_workers);
 +              pr_cont(" hung=%us workers=%d",
 +                      jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
 +                      pool->nr_workers);
                if (pool->manager)
                        pr_cont(" manager: %d",
                                task_pid_nr(pool->manager->task));
@@@ -5278,154 -5264,6 +5300,154 @@@ static void workqueue_sysfs_unregister(
  static void workqueue_sysfs_unregister(struct workqueue_struct *wq)   { }
  #endif        /* CONFIG_SYSFS */
  
 +/*
 + * Workqueue watchdog.
 + *
 + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
 + * flush dependency, a concurrency managed work item which stays RUNNING
 + * indefinitely.  Workqueue stalls can be very difficult to debug as the
 + * usual warning mechanisms don't trigger and internal workqueue state is
 + * largely opaque.
 + *
 + * Workqueue watchdog monitors all worker pools periodically and dumps
 + * state if some pools failed to make forward progress for a while where
 + * forward progress is defined as the first item on ->worklist changing.
 + *
 + * This mechanism is controlled through the kernel parameter
 + * "workqueue.watchdog_thresh" which can be updated at runtime through the
 + * corresponding sysfs parameter file.
 + */
 +#ifdef CONFIG_WQ_WATCHDOG
 +
 +static void wq_watchdog_timer_fn(unsigned long data);
 +
 +static unsigned long wq_watchdog_thresh = 30;
 +static struct timer_list wq_watchdog_timer =
 +      TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
 +
 +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
 +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
 +
 +static void wq_watchdog_reset_touched(void)
 +{
 +      int cpu;
 +
 +      wq_watchdog_touched = jiffies;
 +      for_each_possible_cpu(cpu)
 +              per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
 +}
 +
 +static void wq_watchdog_timer_fn(unsigned long data)
 +{
 +      unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
 +      bool lockup_detected = false;
 +      struct worker_pool *pool;
 +      int pi;
 +
 +      if (!thresh)
 +              return;
 +
 +      rcu_read_lock();
 +
 +      for_each_pool(pool, pi) {
 +              unsigned long pool_ts, touched, ts;
 +
 +              if (list_empty(&pool->worklist))
 +                      continue;
 +
 +              /* get the latest of pool and touched timestamps */
 +              pool_ts = READ_ONCE(pool->watchdog_ts);
 +              touched = READ_ONCE(wq_watchdog_touched);
 +
 +              if (time_after(pool_ts, touched))
 +                      ts = pool_ts;
 +              else
 +                      ts = touched;
 +
 +              if (pool->cpu >= 0) {
 +                      unsigned long cpu_touched =
 +                              READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
 +                                                pool->cpu));
 +                      if (time_after(cpu_touched, ts))
 +                              ts = cpu_touched;
 +              }
 +
 +              /* did we stall? */
 +              if (time_after(jiffies, ts + thresh)) {
 +                      lockup_detected = true;
 +                      pr_emerg("BUG: workqueue lockup - pool");
 +                      pr_cont_pool_info(pool);
 +                      pr_cont(" stuck for %us!\n",
 +                              jiffies_to_msecs(jiffies - pool_ts) / 1000);
 +              }
 +      }
 +
 +      rcu_read_unlock();
 +
 +      if (lockup_detected)
 +              show_workqueue_state();
 +
 +      wq_watchdog_reset_touched();
 +      mod_timer(&wq_watchdog_timer, jiffies + thresh);
 +}
 +
 +void wq_watchdog_touch(int cpu)
 +{
 +      if (cpu >= 0)
 +              per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
 +      else
 +              wq_watchdog_touched = jiffies;
 +}
 +
 +static void wq_watchdog_set_thresh(unsigned long thresh)
 +{
 +      wq_watchdog_thresh = 0;
 +      del_timer_sync(&wq_watchdog_timer);
 +
 +      if (thresh) {
 +              wq_watchdog_thresh = thresh;
 +              wq_watchdog_reset_touched();
 +              mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
 +      }
 +}
 +
 +static int wq_watchdog_param_set_thresh(const char *val,
 +                                      const struct kernel_param *kp)
 +{
 +      unsigned long thresh;
 +      int ret;
 +
 +      ret = kstrtoul(val, 0, &thresh);
 +      if (ret)
 +              return ret;
 +
 +      if (system_wq)
 +              wq_watchdog_set_thresh(thresh);
 +      else
 +              wq_watchdog_thresh = thresh;
 +
 +      return 0;
 +}
 +
 +static const struct kernel_param_ops wq_watchdog_thresh_ops = {
 +      .set    = wq_watchdog_param_set_thresh,
 +      .get    = param_get_ulong,
 +};
 +
 +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
 +              0644);
 +
 +static void wq_watchdog_init(void)
 +{
 +      wq_watchdog_set_thresh(wq_watchdog_thresh);
 +}
 +
 +#else /* CONFIG_WQ_WATCHDOG */
 +
 +static inline void wq_watchdog_init(void) { }
 +
 +#endif        /* CONFIG_WQ_WATCHDOG */
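
The watchdog above is tuned through the "workqueue.watchdog_thresh" kernel parameter. A small userspace sketch of adjusting it at runtime follows; the sysfs path is assumed from the module_param_cb() name, since built-in parameters normally appear under /sys/module/<name>/parameters/:

#include <stdio.h>

int main(void)
{
        /* 0 disables the watchdog; any other value is a threshold in seconds. */
        FILE *f = fopen("/sys/module/workqueue/parameters/watchdog_thresh", "w");

        if (!f)
                return 1;
        fprintf(f, "60\n");
        return fclose(f) ? 1 : 0;
}
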
 +
  static void __init wq_numa_init(void)
  {
        cpumask_var_t *tbl;
@@@ -5549,9 -5387,6 +5571,9 @@@ static int __init init_workqueues(void
               !system_unbound_wq || !system_freezable_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq);
 +
 +      wq_watchdog_init();
 +
        return 0;
  }
  early_initcall(init_workqueues);
diff --combined mm/shmem.c
@@@ -1003,7 -1003,7 +1003,7 @@@ static int shmem_replace_page(struct pa
        copy_highpage(newpage, oldpage);
        flush_dcache_page(newpage);
  
 -      __set_page_locked(newpage);
 +      __SetPageLocked(newpage);
        SetPageUptodate(newpage);
        SetPageSwapBacked(newpage);
        set_page_private(newpage, swap_index);
@@@ -1195,7 -1195,7 +1195,7 @@@ repeat
                }
  
                __SetPageSwapBacked(page);
 -              __set_page_locked(page);
 +              __SetPageLocked(page);
                if (sgp == SGP_WRITE)
                        __SetPageReferenced(page);
  
@@@ -1499,7 -1499,7 +1499,7 @@@ shmem_write_begin(struct file *file, st
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
  
        /* i_mutex is held by caller */
 -      if (unlikely(info->seals)) {
 +      if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
                if (info->seals & F_SEAL_WRITE)
                        return -EPERM;
                if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
@@@ -2092,7 -2092,7 +2092,7 @@@ static long shmem_fallocate(struct fil
                }
  
                shmem_falloc.waitq = &shmem_falloc_waitq;
-               shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+               shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
                spin_lock(&inode->i_lock);
                inode->i_private = &shmem_falloc;
@@@ -3411,14 -3411,6 +3411,14 @@@ struct file *shmem_file_setup(const cha
  }
  EXPORT_SYMBOL_GPL(shmem_file_setup);
  
 +void shmem_set_file(struct vm_area_struct *vma, struct file *file)
 +{
 +      if (vma->vm_file)
 +              fput(vma->vm_file);
 +      vma->vm_file = file;
 +      vma->vm_ops = &shmem_vm_ops;
 +}
 +
  /**
   * shmem_zero_setup - setup a shared anonymous mapping
   * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@@ -3438,7 -3430,10 +3438,7 @@@ int shmem_zero_setup(struct vm_area_str
        if (IS_ERR(file))
                return PTR_ERR(file);
  
 -      if (vma->vm_file)
 -              fput(vma->vm_file);
 -      vma->vm_file = file;
 -      vma->vm_ops = &shmem_vm_ops;
 +      shmem_set_file(vma, file);
        return 0;
  }
  
diff --combined net/bridge/br_device.c
@@@ -48,17 -48,16 +48,17 @@@ netdev_tx_t br_dev_xmit(struct sk_buff 
                return NETDEV_TX_OK;
        }
  
 -      u64_stats_update_begin(&brstats->syncp);
 -      brstats->tx_packets++;
 -      brstats->tx_bytes += skb->len;
 -      u64_stats_update_end(&brstats->syncp);
 -
        BR_INPUT_SKB_CB(skb)->brdev = dev;
  
        skb_reset_mac_header(skb);
        skb_pull(skb, ETH_HLEN);
  
 +      u64_stats_update_begin(&brstats->syncp);
 +      brstats->tx_packets++;
 +      /* Exclude ETH_HLEN from byte stats for consistency with Rx chain */
 +      brstats->tx_bytes += skb->len;
 +      u64_stats_update_end(&brstats->syncp);
 +
        if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
                goto out;
  
@@@ -200,6 -199,12 +200,12 @@@ static int br_set_mac_address(struct ne
        if (!is_valid_ether_addr(addr->sa_data))
                return -EADDRNOTAVAIL;
  
+       /* dev_set_mac_addr() can be called by a master device on bridge's
+        * NETDEV_UNREGISTER, but since it's being destroyed do nothing
+        */
+       if (dev->reg_state != NETREG_REGISTERED)
+               return -EBUSY;
        spin_lock_bh(&br->lock);
        if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
                /* Mac address will be changed in br_stp_change_bridge_id(). */
diff --combined net/core/dev.c
  #include <linux/errqueue.h>
  #include <linux/hrtimer.h>
  #include <linux/netfilter_ingress.h>
 +#include <linux/tcp.h>
 +#include <net/tcp.h>
  
  #include "net-sysfs.h"
  
@@@ -185,7 -183,7 +185,7 @@@ EXPORT_SYMBOL(dev_base_lock)
  static DEFINE_SPINLOCK(napi_hash_lock);
  
  static unsigned int napi_gen_id = NR_CPUS;
 -static DEFINE_HASHTABLE(napi_hash, 8);
 +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
  
  static seqcount_t devnet_rename_seq;
  
@@@ -2838,10 -2836,6 +2838,10 @@@ static struct sk_buff *validate_xmit_sk
        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs;
  
 +              __be16 src_port = tcp_hdr(skb)->source;
 +              __be16 dest_port = tcp_hdr(skb)->dest;
 +
 +              trace_print_skb_gso(skb, src_port, dest_port);
                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
                        goto out_kfree_skb;
@@@ -2881,7 -2875,7 +2881,7 @@@ out_null
  
  struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
  {
 -      struct sk_buff *next, *head = NULL, *tail;
 +      struct sk_buff *next, *head = NULL, *tail = NULL;
  
        for (; skb != NULL; skb = next) {
                next = skb->next;
@@@ -4194,7 -4188,6 +4194,7 @@@ static int napi_gro_complete(struct sk_
        }
  
  out:
 +      __this_cpu_add(softnet_data.gro_coalesced, NAPI_GRO_CB(skb)->count > 1);
        return netif_receive_skb_internal(skb);
  }
  
@@@ -4237,7 -4230,6 +4237,7 @@@ static void gro_list_prepare(struct nap
                unsigned long diffs;
  
                NAPI_GRO_CB(p)->flush = 0;
 +              NAPI_GRO_CB(p)->flush_id = 0;
  
                if (hash != skb_get_hash_raw(p)) {
                        NAPI_GRO_CB(p)->same_flow = 0;
@@@ -4620,24 -4612,6 +4620,24 @@@ __sum16 __skb_gro_checksum_complete(str
  }
  EXPORT_SYMBOL(__skb_gro_checksum_complete);
  
 +static void net_rps_send_ipi(struct softnet_data *remsd)
 +{
 +#ifdef CONFIG_RPS
 +      while (remsd) {
 +              struct softnet_data *next = remsd->rps_ipi_next;
 +
 +              if (cpu_online(remsd->cpu)) {
 +                      smp_call_function_single_async(remsd->cpu, &remsd->csd);
 +              } else {
 +                      rps_lock(remsd);
 +                      remsd->backlog.state = 0;
 +                      rps_unlock(remsd);
 +              }
 +              remsd = next;
 +      }
 +#endif
 +}
 +
  /*
   * net_rps_action_and_irq_enable sends any pending IPI's for rps.
   * Note: called with local irq disabled, but exits with local irq enabled.
@@@ -4653,7 -4627,14 +4653,7 @@@ static void net_rps_action_and_irq_enab
                local_irq_enable();
  
                /* Send pending IPI's to kick RPS processing on remote cpus. */
 -              while (remsd) {
 -                      struct softnet_data *next = remsd->rps_ipi_next;
 -
 -                      if (cpu_online(remsd->cpu))
 -                              smp_call_function_single_async(remsd->cpu,
 -                                                         &remsd->csd);
 -                      remsd = next;
 -              }
 +              net_rps_send_ipi(remsd);
        } else
  #endif
                local_irq_enable();
@@@ -4694,7 -4675,8 +4694,7 @@@ static int process_backlog(struct napi_
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
 -                              local_irq_enable();
 -                              return work;
 +                              goto state_changed;
                        }
                }
  
                        napi->state = 0;
                        rps_unlock(sd);
  
 -                      break;
 +                      goto state_changed;
                }
  
                skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                           &sd->process_queue);
                rps_unlock(sd);
        }
 +state_changed:
        local_irq_enable();
 +      napi_gro_flush(napi, false);
 +      sd->current_napi = NULL;
  
        return work;
  }
@@@ -4757,13 -4736,10 +4757,13 @@@ EXPORT_SYMBOL(__napi_schedule_irqoff)
  
  void __napi_complete(struct napi_struct *n)
  {
 +      struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 +
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
  
        list_del_init(&n->poll_list);
        smp_mb__before_atomic();
 +      sd->current_napi = NULL;
        clear_bit(NAPI_STATE_SCHED, &n->state);
  }
  EXPORT_SYMBOL(__napi_complete);
@@@ -4913,15 -4889,6 +4913,15 @@@ void netif_napi_del(struct napi_struct 
  }
  EXPORT_SYMBOL(netif_napi_del);
  
 +
 +struct napi_struct *get_current_napi_context(void)
 +{
 +      struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 +
 +      return sd->current_napi;
 +}
 +EXPORT_SYMBOL(get_current_napi_context);
 +
  static int napi_poll(struct napi_struct *n, struct list_head *repoll)
  {
        void *have;
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 +              struct softnet_data *sd = this_cpu_ptr(&softnet_data);
 +
 +              sd->current_napi = n;
                work = n->poll(n, weight);
                trace_napi_poll(n);
        }
@@@ -6162,7 -6126,8 +6162,8 @@@ static int __dev_set_mtu(struct net_dev
        if (ops->ndo_change_mtu)
                return ops->ndo_change_mtu(dev, new_mtu);
  
-       dev->mtu = new_mtu;
+       /* Pairs with all the lockless reads of dev->mtu in the stack */
+       WRITE_ONCE(dev->mtu, new_mtu);
        return 0;
  }
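
The WRITE_ONCE() above pairs with lockless readers of dev->mtu, as the ip_setup_cork() hunk below already does with READ_ONCE(). A hypothetical reader sketch (sample_mtu() is not a real kernel helper):

#include <linux/netdevice.h>

static unsigned int sample_mtu(const struct net_device *dev)
{
        /* Pairs with WRITE_ONCE(dev->mtu, ...) in __dev_set_mtu(). */
        return READ_ONCE(dev->mtu);
}
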
  
@@@ -7569,7 -7534,7 +7570,7 @@@ static int dev_cpu_callback(struct noti
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu, oldcpu = (unsigned long)ocpu;
 -      struct softnet_data *sd, *oldsd;
 +      struct softnet_data *sd, *oldsd, *remsd;
  
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();
  
 +#ifdef CONFIG_RPS
 +      remsd = oldsd->rps_ipi_list;
 +      oldsd->rps_ipi_list = NULL;
 +#endif
 +      /* send out pending IPI's on offline CPU */
 +      net_rps_send_ipi(remsd);
 +
        /* Process offline CPU's input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx_ni(skb);
diff --combined net/ipv4/devinet.c
@@@ -1364,11 -1364,6 +1364,6 @@@ skip
        }
  }
  
- static bool inetdev_valid_mtu(unsigned int mtu)
- {
-       return mtu >= IPV4_MIN_MTU;
- }
  static void inetdev_send_gratuitous_arp(struct net_device *dev,
                                        struct in_device *in_dev)
  
@@@ -2197,8 -2192,6 +2192,8 @@@ static struct devinet_sysctl_table 
                                        "igmpv3_unsolicited_report_interval"),
                DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
                                        "ignore_routes_with_linkdown"),
 +              DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
 +                                      "drop_gratuitous_arp"),
  
                DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
                DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
                                              "promote_secondaries"),
                DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
                                              "route_localnet"),
 +              DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
 +                                            "drop_unicast_in_l2_multicast"),
 +              DEVINET_SYSCTL_RW_ENTRY(NF_IPV4_DEFRAG_SKIP,
 +                                      "nf_ipv4_defrag_skip"),
        },
  };
  
diff --combined net/ipv4/ip_output.c
@@@ -1145,13 -1145,17 +1145,17 @@@ static int ip_setup_cork(struct sock *s
        rt = *rtp;
        if (unlikely(!rt))
                return -EFAULT;
-       /*
-        * We steal reference to this route, caller should not release it
-        */
-       *rtp = NULL;
        cork->fragsize = ip_sk_use_pmtu(sk) ?
-                        dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+                        dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+       if (!inetdev_valid_mtu(cork->fragsize))
+               return -ENETUNREACH;
        cork->dst = &rt->dst;
+       /* We stole this route, caller should not release it. */
+       *rtp = NULL;
        cork->length = 0;
        cork->ttl = ipc->ttl;
        cork->tos = ipc->tos;
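
For reference, the inetdev_valid_mtu() helper that the new -ENETUNREACH check relies on is the one whose static definition was removed from devinet.c above; it was presumably moved to a shared header so ip_setup_cork() can use it. Reproduced from the removed lines:

static inline bool inetdev_valid_mtu(unsigned int mtu)
{
        return mtu >= IPV4_MIN_MTU;
}
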
@@@ -1587,8 -1591,7 +1591,8 @@@ void ip_send_unicast_reply(struct sock 
                           RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
                           ip_reply_arg_flowi_flags(arg),
                           daddr, saddr,
 -                         tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
 +                         tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
 +                         arg->uid);
        security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
        rt = ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
diff --combined net/ipv4/tcp_output.c
@@@ -196,7 -196,7 +196,7 @@@ u32 tcp_default_init_rwnd(u32 mss
         * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
         * limit when mss is larger than 1460.
         */
 -      u32 init_rwnd = TCP_INIT_CWND * 2;
 +      u32 init_rwnd = sysctl_tcp_default_init_rwnd;
  
        if (mss > 1460)
                init_rwnd = max((1460 * init_rwnd) / mss, 2U);
@@@ -710,8 -710,9 +710,9 @@@ static unsigned int tcp_established_opt
                        min_t(unsigned int, eff_sacks,
                              (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
                              TCPOLEN_SACK_PERBLOCK);
-               size += TCPOLEN_SACK_BASE_ALIGNED +
-                       opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+               if (likely(opts->num_sack_blocks))
+                       size += TCPOLEN_SACK_BASE_ALIGNED +
+                               opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
        }
  
        return size;
diff --combined net/ipv4/tcp_timer.c
@@@ -32,40 -32,6 +32,40 @@@ int sysctl_tcp_retries2 __read_mostly 
  int sysctl_tcp_orphan_retries __read_mostly;
  int sysctl_tcp_thin_linear_timeouts __read_mostly;
  
 +/* Function to reset tcp_ack related sysctl on resetting master control */
 +void set_tcp_default(void)
 +{
 +      sysctl_tcp_delack_seg   = TCP_DELACK_SEG;
 +}
 +
 +/* sysctl handler for tcp_ack related master control */
 +int tcp_proc_delayed_ack_control(struct ctl_table *table, int write,
 +                               void __user *buffer, size_t *length,
 +                               loff_t *ppos)
 +{
 +      int ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 +
 +      /* The ret value will be 0 if the input validation is successful
 +       * and the values are written to sysctl table. If not, the stack
 +       * will continue to work with currently configured values
 +       */
 +      return ret;
 +}
 +
 +/* sysctl handler for tcp_ack related master control */
 +int tcp_use_userconfig_sysctl_handler(struct ctl_table *table, int write,
 +                                    void __user *buffer, size_t *length,
 +                                    loff_t *ppos)
 +{
 +      int ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 +
 +      if (write && ret == 0) {
 +              if (!sysctl_tcp_use_userconfig)
 +                      set_tcp_default();
 +      }
 +      return ret;
 +}
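
For context, a sketch of the kind of ctl_table entry that would route writes to tcp_use_userconfig_sysctl_handler(); every field value here is an assumption, and the real registration lives elsewhere in the ipv4 sysctl tables:

#include <linux/sysctl.h>

extern int sysctl_tcp_use_userconfig;   /* defined elsewhere in the tree */
extern int tcp_use_userconfig_sysctl_handler(struct ctl_table *table, int write,
                                             void __user *buffer, size_t *length,
                                             loff_t *ppos);

static int zero;
static int one = 1;

static struct ctl_table tcp_userconfig_table[] = {
        {
                .procname       = "tcp_use_userconfig",
                .data           = &sysctl_tcp_use_userconfig,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = tcp_use_userconfig_sysctl_handler,
                .extra1         = &zero,
                .extra2         = &one,
        },
        { }
};
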
 +
  static void tcp_write_err(struct sock *sk)
  {
        sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@@ -370,7 -336,7 +370,7 @@@ static void tcp_probe_timer(struct soc
                        return;
        }
  
-       if (icsk->icsk_probes_out > max_probes) {
+       if (icsk->icsk_probes_out >= max_probes) {
  abort:                tcp_write_err(sk);
        } else {
                /* Only send another probe if we didn't close things up. */
diff --combined scripts/mod/modpost.c
@@@ -141,9 -141,6 +141,9 @@@ static struct module *new_module(const 
                p[strlen(p) - 2] = '\0';
                mod->is_dot_o = 1;
        }
 +      /* strip trailing .lto */
 +      if (strends(p, ".lto"))
 +              p[strlen(p) - 4] = '\0';
  
        /* add to list */
        mod->name = p;
@@@ -1159,6 -1156,14 +1159,14 @@@ static const struct sectioncheck *secti
   *   fromsec = text section
   *   refsymname = *.constprop.*
   *
+  * Pattern 6:
+  *   Hide section mismatch warnings for ELF local symbols.  The goal
+  *   is to eliminate false positive modpost warnings caused by
+  *   compiler-generated ELF local symbol names such as ".LANCHOR1".
+  *   Autogenerated symbol names bypass modpost's "Pattern 2"
+  *   whitelisting, which relies on pattern-matching against symbol
+  *   names to work.  (One situation where gcc can autogenerate ELF
+  *   local symbols is when "-fsection-anchors" is used.)
   **/
  static int secref_whitelist(const struct sectioncheck *mismatch,
                            const char *fromsec, const char *fromsym,
            match(fromsym, optim_symbols))
                return 0;
  
+       /* Check for pattern 6 */
+       if (strstarts(fromsym, ".L"))
+               return 0;
        return 1;
  }
  
@@@ -1924,10 -1933,6 +1936,10 @@@ static char *remove_dot(char *s
                size_t m = strspn(s + n + 1, "0123456789");
                if (m && (s[n + m] == '.' || s[n + m] == 0))
                        s[n] = 0;
 +
 +              /* strip trailing .lto */
 +              if (strends(s, ".lto"))
 +                      s[strlen(s) - 4] = '\0';
        }
        return s;
  }
diff --combined sound/core/pcm_lib.c
@@@ -41,9 -41,6 +41,9 @@@
  #define trace_hw_ptr_error(substream, reason)
  #endif
  
 +#define STRING_LENGTH_OF_INT 12
 +#define MAX_USR_CTRL_CNT 128
 +
  /*
   * fill ring buffer with silence
   * runtime->silence_start: starting pointer to silence area
@@@ -377,8 -374,7 +377,8 @@@ static int snd_pcm_update_hw_ptr0(struc
                 * the elapsed time to detect xruns.
                 */
                jdelta = curr_jiffies - runtime->hw_ptr_jiffies;
 -              if (jdelta < runtime->hw_ptr_buffer_jiffies / 2)
 +              if ((jdelta < runtime->hw_ptr_buffer_jiffies / 2) ||
 +                  (runtime->hw_ptr_buffer_jiffies <= 0))
                        goto no_delta_check;
                hdelta = jdelta - delta * HZ / runtime->rate;
                xrun_threshold = runtime->hw_ptr_buffer_jiffies / 2 + 1;
@@@ -1801,11 -1797,6 +1801,11 @@@ static int snd_pcm_lib_ioctl_channel_in
        switch (runtime->access) {
        case SNDRV_PCM_ACCESS_MMAP_INTERLEAVED:
        case SNDRV_PCM_ACCESS_RW_INTERLEAVED:
 +              if ((UINT_MAX/width) < info->channel) {
 +                      snd_printd("%s: integer overflow while multiply\n",
 +                                 __func__);
 +                      return -EINVAL;
 +              }
                info->first = info->channel * width;
                info->step = runtime->channels * width;
                break;
        case SNDRV_PCM_ACCESS_RW_NONINTERLEAVED:
        {
                size_t size = runtime->dma_bytes / runtime->channels;
 +
 +              if ((size > 0) && ((UINT_MAX/(size * 8)) < info->channel)) {
 +                      snd_printd("%s: integer overflow while multiply\n",
 +                                 __func__);
 +                      return -EINVAL;
 +              }
                info->first = info->channel * size * 8;
                info->step = width;
                break;
@@@ -1892,11 -1877,14 +1892,14 @@@ void snd_pcm_period_elapsed(struct snd_
        struct snd_pcm_runtime *runtime;
        unsigned long flags;
  
-       if (PCM_RUNTIME_CHECK(substream))
+       if (snd_BUG_ON(!substream))
                return;
-       runtime = substream->runtime;
  
        snd_pcm_stream_lock_irqsave(substream, flags);
+       if (PCM_RUNTIME_CHECK(substream))
+               goto _unlock;
+       runtime = substream->runtime;
        if (!snd_pcm_running(substream) ||
            snd_pcm_update_hw_ptr0(substream, 1) < 0)
                goto _end;
  #endif
   _end:
        kill_fasync(&runtime->fasync, SIGIO, POLL_IN);
+  _unlock:
        snd_pcm_stream_unlock_irqrestore(substream, flags);
  }
  
@@@ -2135,9 -2124,6 +2139,9 @@@ static int pcm_sanity_check(struct snd_
        struct snd_pcm_runtime *runtime;
        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
 +      /* TODO: consider returning -EINVAL here */
 +      if (substream->hw_no_buffer)
 +              snd_printd("%s: warning this PCM is host less\n", __func__);
        runtime = substream->runtime;
        if (snd_BUG_ON(!substream->ops->copy && !runtime->dma_area))
                return -EINVAL;
@@@ -2588,23 -2574,6 +2592,23 @@@ static void pcm_chmap_ctl_private_free(
        kfree(info);
  }
  
 +static int pcm_volume_ctl_info(struct snd_kcontrol *kcontrol,
 +                              struct snd_ctl_elem_info *uinfo)
 +{
 +      uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
 +      uinfo->count = 1;
 +      uinfo->value.integer.min = 0;
 +      uinfo->value.integer.max = 0x2000;
 +      return 0;
 +}
 +
 +static void pcm_volume_ctl_private_free(struct snd_kcontrol *kcontrol)
 +{
 +      struct snd_pcm_volume *info = snd_kcontrol_chip(kcontrol);
 +      info->pcm->streams[info->stream].vol_kctl = NULL;
 +      kfree(info);
 +}
 +
  /**
   * snd_pcm_add_chmap_ctls - create channel-mapping control elements
   * @pcm: the assigned PCM instance
@@@ -2664,166 -2633,3 +2668,166 @@@ int snd_pcm_add_chmap_ctls(struct snd_p
        return 0;
  }
  EXPORT_SYMBOL_GPL(snd_pcm_add_chmap_ctls);
 +
 +/**
 + * snd_pcm_add_volume_ctls - create volume control elements
 + * @pcm: the assigned PCM instance
 + * @stream: stream direction
 + * @max_length: the max length of the volume parameter of stream
 + * @private_value: the value passed to each kcontrol's private_value field
 + * @info_ret: store struct snd_pcm_volume instance if non-NULL
 + *
 + * Create volume control elements assigned to the given PCM stream(s).
 + * Returns zero on success, or a negative error value.
 + */
 +int snd_pcm_add_volume_ctls(struct snd_pcm *pcm, int stream,
 +                         const struct snd_pcm_volume_elem *volume,
 +                         int max_length,
 +                         unsigned long private_value,
 +                         struct snd_pcm_volume **info_ret)
 +{
 +      struct snd_pcm_volume *info;
 +      struct snd_kcontrol_new knew = {
 +              .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
 +              .access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |
 +                      SNDRV_CTL_ELEM_ACCESS_READWRITE,
 +              .info = pcm_volume_ctl_info,
 +      };
 +      int err;
 +      int size;
 +
 +      info = kzalloc(sizeof(*info), GFP_KERNEL);
 +      if (!info)
 +              return -ENOMEM;
 +      info->pcm = pcm;
 +      info->stream = stream;
 +      info->volume = volume;
 +      info->max_length = max_length;
 +      size = sizeof("Playback ") + sizeof(" Volume") +
 +              STRING_LENGTH_OF_INT*sizeof(char) + 1;
 +      knew.name = kzalloc(size, GFP_KERNEL);
 +      if (!knew.name) {
 +              kfree(info);
 +              return -ENOMEM;
 +      }
 +      if (stream == SNDRV_PCM_STREAM_PLAYBACK)
 +              snprintf((char *)knew.name, size, "%s %d %s",
 +                      "Playback", pcm->device, "Volume");
 +      else
 +              snprintf((char *)knew.name, size, "%s %d %s",
 +                      "Capture", pcm->device, "Volume");
 +      knew.device = pcm->device;
 +      knew.count = pcm->streams[stream].substream_count;
 +      knew.private_value = private_value;
 +      info->kctl = snd_ctl_new1(&knew, info);
 +      if (!info->kctl) {
 +              kfree(info);
 +              kfree(knew.name);
 +              return -ENOMEM;
 +      }
 +      info->kctl->private_free = pcm_volume_ctl_private_free;
 +      err = snd_ctl_add(pcm->card, info->kctl);
 +      if (err < 0) {
 +              kfree(info);
 +              kfree(knew.name);
 +              return -ENOMEM;
 +      }
 +      pcm->streams[stream].vol_kctl = info->kctl;
 +      if (info_ret)
 +              *info_ret = info;
 +      kfree(knew.name);
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(snd_pcm_add_volume_ctls);
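
A hypothetical driver-side call into the new snd_pcm_add_volume_ctls() export; the volume element table and the chosen max_length/private_value are assumptions, and the accompanying declarations are assumed to be exposed via <sound/pcm.h> — only the call signature comes from the code above:

#include <sound/pcm.h>

static int my_pcm_add_volume(struct snd_pcm *pcm,
                             const struct snd_pcm_volume_elem *my_volume_elem)
{
        struct snd_pcm_volume *vol = NULL;

        return snd_pcm_add_volume_ctls(pcm, SNDRV_PCM_STREAM_PLAYBACK,
                                       my_volume_elem, 1, 0, &vol);
}
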
 +
 +static int pcm_usr_ctl_info(struct snd_kcontrol *kcontrol,
 +                          struct snd_ctl_elem_info *uinfo)
 +{
 +      uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
 +      uinfo->count = MAX_USR_CTRL_CNT;
 +      uinfo->value.integer.min = 0;
 +      uinfo->value.integer.max = INT_MAX;
 +      return 0;
 +}
 +
 +static void pcm_usr_ctl_private_free(struct snd_kcontrol *kcontrol)
 +{
 +      struct snd_pcm_usr *info = snd_kcontrol_chip(kcontrol);
 +      info->pcm->streams[info->stream].usr_kctl = NULL;
 +      kfree(info);
 +}
 +
 +/**
 + * snd_pcm_add_usr_ctls - create user control elements
 + * @pcm: the assigned PCM instance
 + * @stream: stream direction
 + * @max_length: the max length of the user parameter of stream
 + * @private_value: the value passed to each kcontrol's private_value field
 + * @info_ret: store struct snd_pcm_usr instance if non-NULL
 + *
 + * Create usr control elements assigned to the given PCM stream(s).
 + * Returns zero on success, or a negative error value.
 + */
 +int snd_pcm_add_usr_ctls(struct snd_pcm *pcm, int stream,
 +                       const struct snd_pcm_usr_elem *usr,
 +                       int max_length, int max_kctrl_str_len,
 +                       unsigned long private_value,
 +                       struct snd_pcm_usr **info_ret)
 +{
 +      struct snd_pcm_usr *info;
 +      struct snd_kcontrol_new knew = {
 +              .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
 +              .access = SNDRV_CTL_ELEM_ACCESS_READWRITE,
 +              .info = pcm_usr_ctl_info,
 +      };
 +      int err;
 +      char *buf;
 +
 +      info = kzalloc(sizeof(*info), GFP_KERNEL);
 +      if (!info) {
 +              pr_err("%s: snd_pcm_usr alloc failed\n", __func__);
 +              return -ENOMEM;
 +      }
 +      info->pcm = pcm;
 +      info->stream = stream;
 +      info->usr = usr;
 +      info->max_length = max_length;
 +      buf = kzalloc(max_kctrl_str_len, GFP_KERNEL);
 +      if (!buf) {
 +              pr_err("%s: buffer allocation failed\n", __func__);
 +              kfree(info);
 +              return -ENOMEM;
 +      }
 +      knew.name = buf;
 +      if (stream == SNDRV_PCM_STREAM_PLAYBACK)
 +              snprintf(buf, max_kctrl_str_len, "%s %d %s",
 +                      "Playback", pcm->device, "User kcontrol");
 +      else
 +              snprintf(buf, max_kctrl_str_len, "%s %d %s",
 +                      "Capture", pcm->device, "User kcontrol");
 +      knew.device = pcm->device;
 +      knew.count = pcm->streams[stream].substream_count;
 +      knew.private_value = private_value;
 +      info->kctl = snd_ctl_new1(&knew, info);
 +      if (!info->kctl) {
 +              kfree(info);
 +              kfree(knew.name);
 +              pr_err("%s: snd_ctl_new failed\n", __func__);
 +              return -ENOMEM;
 +      }
 +      info->kctl->private_free = pcm_usr_ctl_private_free;
 +      err = snd_ctl_add(pcm->card, info->kctl);
 +      if (err < 0) {
 +              kfree(info);
 +              kfree(knew.name);
 +              pr_err("%s: snd_ctl_add failed:%d\n", __func__,
 +                      err);
 +              return -ENOMEM;
 +      }
 +      pcm->streams[stream].usr_kctl = info->kctl;
 +      if (info_ret)
 +              *info_ret = info;
 +      kfree(knew.name);
 +      return 0;
 +}
 +EXPORT_SYMBOL(snd_pcm_add_usr_ctls);