VERSION = 4
PATCHLEVEL = 4
- SUBLEVEL = 206
- EXTRAVERSION = -rc1
+ SUBLEVEL = 207
+ EXTRAVERSION =
NAME = Blurry Fish Butt
# *DOCUMENTATION*
# Most importantly: sub-Makefiles should only ever modify files in
# their own directory. If in some directory we have a dependency on
# a file in another dir (which doesn't happen often, but it's often
-# unavoidable when linking the built-in.o targets which finally
+# unavoidable when linking the built-in.a targets which finally
# turn into vmlinux), we will call a sub make in that other dir, and
# after that we are sure that everything which is in that other dir
# is now up to date.
$(filter-out _all sub-make $(CURDIR)/Makefile, $(MAKECMDGOALS)) _all: sub-make
@:
-sub-make: FORCE
+sub-make:
$(Q)$(MAKE) -C $(KBUILD_OUTPUT) KBUILD_SRC=$(CURDIR) \
-f $(CURDIR)/Makefile $(filter-out _all sub-make,$(MAKECMDGOALS))
HOSTCC = gcc
HOSTCXX = g++
-HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89
+HOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -pipe
HOSTCXXFLAGS = -O2
# Decide whether to build built-in, modular, or both.
# Make variables (CC, etc...)
AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
+LDLLD = ld.lld
CC = $(CROSS_COMPILE)gcc
CPP = $(CC) -E
AR = $(CROSS_COMPILE)ar
CFLAGS_KERNEL =
AFLAGS_KERNEL =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage -fno-tree-loop-im
+CFLAGS_KCOV = -fsanitize-coverage=trace-pc
# Use USERINCLUDE when you must reference the UAPI directories only.
KBUILD_CPPFLAGS := -D__KERNEL__
-KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
+KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs -pipe \
-fno-strict-aliasing -fno-common \
-Werror-implicit-function-declaration \
-Wno-format-security \
-std=gnu89 $(call cc-option,-fno-PIE)
-
+ifeq ($(TARGET_BOARD_TYPE),auto)
+KBUILD_CFLAGS += -DCONFIG_PLATFORM_AUTO
+endif
KBUILD_AFLAGS_KERNEL :=
KBUILD_CFLAGS_KERNEL :=
KBUILD_AFLAGS := -D__ASSEMBLY__ $(call cc-option,-fno-PIE)
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
-export CFLAGS_KASAN CFLAGS_KASAN_NOSANITIZE
+export CFLAGS_KASAN CFLAGS_UBSAN CFLAGS_KASAN_NOSANITIZE
+export CFLAGS_KCOV
export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
ifeq ($(cc-name),clang)
ifneq ($(CROSS_COMPILE),)
-CLANG_TARGET := --target=$(notdir $(CROSS_COMPILE:%-=%))
+CLANG_TRIPLE ?= $(CROSS_COMPILE)
+CLANG_TARGET := --target=$(notdir $(CLANG_TRIPLE:%-=%))
+ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_TARGET)), y)
+$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
+endif
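+# CLANG_TRIPLE should resolve to a non-Android GNU triple so the check above
+# passes; an illustrative (not mandated) invocation is:
+#   make CC=clang CLANG_TRIPLE=aarch64-linux-gnu- CROSS_COMPILE=aarch64-linux-android-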
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
CLANG_PREFIX := --prefix=$(GCC_TOOLCHAIN_DIR)
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
endif
+# Make toolchain changes before including arch/$(SRCARCH)/Makefile to ensure
+# ar/cc/ld-* macros return correct values.
+ifdef CONFIG_LTO_CLANG
+# use LLVM linker LLD for LTO linking and vmlinux_link
+LD := $(LDLLD)
+# use llvm-ar for building symbol tables from IR files, and llvm-nm instead
+# of objdump for processing symbol versions and exports
+LLVM_AR := llvm-ar
+LLVM_NM := llvm-nm
+export LLVM_AR LLVM_NM
+endif
+
+ifeq ($(cc-name),clang)
+ifeq ($(ld-name),lld)
+KBUILD_CFLAGS += -fuse-ld=lld
+LDFLAGS += -O2
+endif
+KBUILD_CPPFLAGS += -Qunused-arguments
+endif
+
# The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default
# values of the respective KBUILD_* variables
ARCH_CPPFLAGS :=
ARCH_CFLAGS :=
include arch/$(SRCARCH)/Makefile
+ifeq ($(cc-name),clang)
+KBUILD_CFLAGS += -O3
+KBUILD_CFLAGS += $(call cc-option, -mllvm -polly) \
+ $(call cc-option, -mllvm -polly-run-dce) \
+ $(call cc-option, -mllvm -polly-run-inliner) \
+ $(call cc-option, -mllvm -polly-opt-fusion=max) \
+ $(call cc-option, -mllvm -polly-ast-use-context) \
+ $(call cc-option, -mllvm -polly-detect-keep-going) \
+ $(call cc-option, -mllvm -polly-vectorizer=stripmine) \
+ $(call cc-option, -mllvm -polly-invariant-load-hoisting)
+else
+KBUILD_CFLAGS += -O2
+endif
+
+ifeq ($(cc-name),gcc)
+KBUILD_CFLAGS += -mcpu=cortex-a73.cortex-a53
+KBUILD_AFLAGS += -mcpu=cortex-a73.cortex-a53
+endif
+ifeq ($(cc-name),clang)
+KBUILD_CFLAGS += -mcpu=cortex-a53
+KBUILD_AFLAGS += -mcpu=cortex-a53
+endif
+
KBUILD_CFLAGS += $(call cc-option,-fno-delete-null-pointer-checks,)
KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,)
KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,)
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias)
-ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
-KBUILD_CFLAGS += -Os
-else
-ifdef CONFIG_PROFILE_ALL_BRANCHES
-KBUILD_CFLAGS += -O2
-else
-KBUILD_CFLAGS += -O2
-endif
+ifdef CONFIG_CC_WERROR
+KBUILD_CFLAGS += -Werror
endif
# Tell gcc to never replace conditional load with a non-conditional one
endif
KBUILD_CFLAGS += $(stackp-flag)
+ifdef CONFIG_KCOV
+ ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+ $(warning Cannot use CONFIG_KCOV: \
+ -fsanitize-coverage=trace-pc is not supported by compiler)
+ CFLAGS_KCOV =
+ endif
+endif
+
ifeq ($(cc-name),clang)
-KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+KBUILD_CFLAGS += $(call cc-disable-warning, duplicate-decl-specifier)
# Quiet clang warning: comparison of unsigned expression < 0 is always false
KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
# Clang uses _MergedGlobals as an optimization, but this breaks modpost, as the
# source of a reference will be _MergedGlobals and not one of the whitelisted names.
# See modpost pattern 2
KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
-KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
else
# These warnings generated too much noise in a regular build.
endif
endif
+# Initialize all stack variables with a pattern, if desired.
+ifdef CONFIG_INIT_STACK_ALL
+KBUILD_CFLAGS += $(call cc-option, -ftrivial-auto-var-init=pattern)
+endif
+
KBUILD_CFLAGS += $(call cc-option, -fno-var-tracking-assignments)
ifdef CONFIG_DEBUG_INFO
KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
endif
+ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
+KBUILD_CFLAGS_KERNEL += $(call cc-option,-ffunction-sections,)
+KBUILD_CFLAGS_KERNEL += $(call cc-option,-fdata-sections,)
+endif
+
+ifdef CONFIG_LTO_CLANG
+ifdef CONFIG_THINLTO
+lto-clang-flags := -flto=thin
+LDFLAGS += --thinlto-cache-dir=.thinlto-cache
+else
+lto-clang-flags := -flto
+endif
+lto-clang-flags += -fvisibility=hidden
+
+# allow disabling only clang LTO where needed
+DISABLE_LTO_CLANG := -fno-lto -fvisibility=default
+export DISABLE_LTO_CLANG
+endif
+
+ifdef CONFIG_LTO
+LTO_CFLAGS := $(lto-clang-flags)
+KBUILD_CFLAGS += $(LTO_CFLAGS)
+
+DISABLE_LTO := $(DISABLE_LTO_CLANG)
+export LTO_CFLAGS DISABLE_LTO
+endif
+
# arch Makefile may override CC so keep this after arch Makefile is included
NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
CHECKFLAGS += $(NOSTDINC_FLAGS)
include scripts/Makefile.kasan
include scripts/Makefile.extrawarn
+include scripts/Makefile.ubsan
# Add any arch overrides and user supplied CPPFLAGS, AFLAGS and CFLAGS as the
# last assignments
KBUILD_LDFLAGS_MODULE += $(LDFLAGS_BUILD_ID)
LDFLAGS_vmlinux += $(LDFLAGS_BUILD_ID)
+ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
+LDFLAGS_vmlinux += $(call ld-option, --gc-sections,)
+endif
+
ifeq ($(CONFIG_STRIP_ASM_SYMS),y)
LDFLAGS_vmlinux += $(call ld-option, -X,)
endif
vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
$(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
-init-y := $(patsubst %/, %/built-in.o, $(init-y))
-core-y := $(patsubst %/, %/built-in.o, $(core-y))
-drivers-y := $(patsubst %/, %/built-in.o, $(drivers-y))
-net-y := $(patsubst %/, %/built-in.o, $(net-y))
+init-y := $(patsubst %/, %/built-in.a, $(init-y))
+core-y := $(patsubst %/, %/built-in.a, $(core-y))
+drivers-y := $(patsubst %/, %/built-in.a, $(drivers-y))
+net-y := $(patsubst %/, %/built-in.a, $(net-y))
libs-y1 := $(patsubst %/, %/lib.a, $(libs-y))
-libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y))
-libs-y := $(libs-y1) $(libs-y2)
-virt-y := $(patsubst %/, %/built-in.o, $(virt-y))
+libs-y2 := $(patsubst %/, %/built-in.a, $(filter-out %.a, $(libs-y)))
+virt-y := $(patsubst %/, %/built-in.a, $(virt-y))
# Externally visible symbols (used by link-vmlinux.sh)
export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
-export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
+export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y2) $(drivers-y) $(net-y) $(virt-y)
+export KBUILD_VMLINUX_LIBS := $(libs-y1)
export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds
export LDFLAGS_vmlinux
# used by scripts/package/Makefile
export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
-vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
+vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) $(KBUILD_VMLINUX_LIBS)
# Final link of vmlinux
cmd_link-vmlinux = $(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux)
PHONY += $(vmlinux-dirs)
$(vmlinux-dirs): prepare scripts
- $(Q)$(MAKE) $(build)=$@
+ $(Q)$(MAKE) $(build)=$@ need-builtin=1
define filechk_kernel.release
echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))"
archprepare: archheaders archscripts prepare1 scripts_basic
-prepare0: archprepare FORCE
+prepare0: archprepare
$(Q)$(MAKE) $(build)=.
# All the preparing..
prepare: prepare0
+# Make sure we're using a supported toolchain with LTO_CLANG
+ifdef CONFIG_LTO_CLANG
+ ifneq ($(call clang-ifversion, -ge, 0800, y), y)
+ @echo Cannot use CONFIG_LTO_CLANG: requires clang 8.0 or later >&2 && exit 1
+ endif
+ ifneq ($(ld-name),lld)
+ @echo Cannot use CONFIG_LTO_CLANG: requires LLD >&2 && exit 1
+ endif
+endif
+# Make sure compiler supports LTO flags
+ifdef LTO_CFLAGS
+  ifeq ($(call cc-option, $(LTO_CFLAGS)),)
+	@echo Cannot use CONFIG_LTO: $(LTO_CFLAGS) not supported by compiler \
+ >&2 && exit 1
+ endif
+endif
+
# Generate some files
# ---------------------------------------------------------------------------
export INSTALL_FW_PATH
PHONY += firmware_install
-firmware_install: FORCE
+firmware_install:
@mkdir -p $(objtree)/firmware
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.fwinst obj=firmware __fw_install
archscripts:
PHONY += __headers
-__headers: $(version_h) scripts_basic asm-generic archheaders archscripts FORCE
+__headers: $(version_h) scripts_basic asm-generic archheaders archscripts
$(Q)$(MAKE) $(build)=scripts build_unifdef
PHONY += headers_install_all
# We are always building modules
KBUILD_MODULES := 1
- PHONY += crmodverdir
- crmodverdir:
- $(cmd_crmodverdir)
PHONY += $(objtree)/Module.symvers
$(objtree)/Module.symvers:
module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD))
PHONY += $(module-dirs) modules
- $(module-dirs): crmodverdir $(objtree)/Module.symvers
+ $(module-dirs): prepare $(objtree)/Module.symvers
$(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@)
modules: $(module-dirs)
# Dummies...
PHONY += prepare scripts
- prepare: ;
+ prepare:
+ $(cmd_crmodverdir)
scripts: ;
endif # KBUILD_EXTMOD
-o -name '*.symtypes' -o -name 'modules.order' \
-o -name modules.builtin -o -name '.tmp_*.o.*' \
-o -name '*.ll' \
- -o -name '*.gcno' \) -type f -print | xargs rm -f
+ -o -name '*.gcno' \
+ -o -name '*.*.symversions' \) -type f -print | xargs rm -f
# Generate tags for editors
# ---------------------------------------------------------------------------
# Modules
/: prepare scripts FORCE
- $(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir)
# Make sure the latest headers are built for Documentation
Documentation/: headers_install
%/: prepare scripts FORCE
- $(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir)
%.ko: prepare scripts FORCE
- $(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir) $(@:.ko=.o)
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modpost
#define __get_user_asm_byte(x, addr, err) \
__get_user_asm(x, addr, err, ldrb)
+ #if __LINUX_ARM_ARCH__ >= 6
+
+ #define __get_user_asm_half(x, addr, err) \
+ __get_user_asm(x, addr, err, ldrh)
+
+ #else
+
#ifndef __ARMEB__
#define __get_user_asm_half(x, __gu_addr, err) \
({ \
})
#endif
+ #endif /* __LINUX_ARM_ARCH__ >= 6 */
+
#define __get_user_asm_word(x, addr, err) \
__get_user_asm(x, addr, err, ldr)
#endif
#define __put_user_asm_byte(x, __pu_addr, err) \
__put_user_asm(x, __pu_addr, err, strb)
+ #if __LINUX_ARM_ARCH__ >= 6
+
+ #define __put_user_asm_half(x, __pu_addr, err) \
+ __put_user_asm(x, __pu_addr, err, strh)
+
+ #else
+
#ifndef __ARMEB__
#define __put_user_asm_half(x, __pu_addr, err) \
({ \
})
#endif
+ #endif /* __LINUX_ARM_ARCH__ >= 6 */
+
#define __put_user_asm_word(x, __pu_addr, err) \
__put_user_asm(x, __pu_addr, err, str)
static inline unsigned long __must_check
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
- unsigned int __ua_flags = uaccess_save_and_enable();
+ unsigned int __ua_flags;
+
+ check_object_size(to, n, false);
+ __ua_flags = uaccess_save_and_enable();
n = arm_copy_from_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
#ifndef CONFIG_UACCESS_WITH_MEMCPY
- unsigned int __ua_flags = uaccess_save_and_enable();
+ unsigned int __ua_flags;
+
+ check_object_size(from, n, true);
+ __ua_flags = uaccess_save_and_enable();
n = arm_copy_to_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
#else
+ check_object_size(from, n, true);
return arm_copy_to_user(to, from, n);
#endif
}
select HAVE_IRQ_TIME_ACCOUNTING
select GENERIC_TIME_VSYSCALL
select ARCH_CLOCKSOURCE_DATA
+ select HANDLE_DOMAIN_IRQ
+ select HAVE_EXIT_THREAD
menu "Machine selection"
prompt "System type"
default SGI_IP22
+config MIPS_GENERIC
+ bool "Generic board-agnostic MIPS kernel"
+ select BOOT_RAW
+ select BUILTIN_DTB
+ select CEVT_R4K
+ select CLKSRC_MIPS_GIC
+ select COMMON_CLK
+ select CPU_MIPSR2_IRQ_VI
+ select CPU_MIPSR2_IRQ_EI
+ select CSRC_R4K
+ select DMA_PERDEV_COHERENT
+ select HW_HAS_PCI
+ select IRQ_MIPS_CPU
+ select LIBFDT
+ select MIPS_CPU_SCACHE
+ select MIPS_GIC
+ select MIPS_L1_CACHE_SHIFT_7
+ select NO_EXCEPT_FILL
+ select PCI_DRIVERS_GENERIC
+ select PINCTRL
+ select SMP_UP if SMP
+ select SYS_HAS_CPU_MIPS32_R1
+ select SYS_HAS_CPU_MIPS32_R2
+ select SYS_HAS_CPU_MIPS32_R6
+ select SYS_HAS_CPU_MIPS64_R1
+ select SYS_HAS_CPU_MIPS64_R2
+ select SYS_HAS_CPU_MIPS64_R6
+ select SYS_SUPPORTS_32BIT_KERNEL
+ select SYS_SUPPORTS_64BIT_KERNEL
+ select SYS_SUPPORTS_BIG_ENDIAN
+ select SYS_SUPPORTS_HIGHMEM
+ select SYS_SUPPORTS_LITTLE_ENDIAN
+ select SYS_SUPPORTS_MICROMIPS
+ select SYS_SUPPORTS_MIPS_CPS
+ select SYS_SUPPORTS_MIPS16
+ select SYS_SUPPORTS_MULTITHREADING
+ select SYS_SUPPORTS_RELOCATABLE
+ select SYS_SUPPORTS_SMARTMIPS
+ select USB_EHCI_BIG_ENDIAN_DESC if BIG_ENDIAN
+ select USB_EHCI_BIG_ENDIAN_MMIO if BIG_ENDIAN
+ select USB_OHCI_BIG_ENDIAN_DESC if BIG_ENDIAN
+ select USB_OHCI_BIG_ENDIAN_MMIO if BIG_ENDIAN
+ select USB_UHCI_BIG_ENDIAN_DESC if BIG_ENDIAN
+ select USB_UHCI_BIG_ENDIAN_MMIO if BIG_ENDIAN
+ select USE_OF
+ help
+ Select this to build a kernel which aims to support multiple boards,
+ generally using a flattened device tree passed from the bootloader
+ using the boot protocol defined in the UHI (Unified Hosting
+ Interface) specification.
+
config MIPS_ALCHEMY
bool "Alchemy processor based machines"
select ARCH_PHYS_ADDR_T_64BIT
select SYS_SUPPORTS_BIG_ENDIAN
select SYS_SUPPORTS_HIGHMEM
select SYS_SUPPORTS_LITTLE_ENDIAN
+ select ZONE_DMA32 if 64BIT
config SIBYTE_SENTOSA
bool "Sibyte BCM91250E-Sentosa"
source "arch/mips/bcm47xx/Kconfig"
source "arch/mips/bcm63xx/Kconfig"
source "arch/mips/bmips/Kconfig"
+source "arch/mips/generic/Kconfig"
source "arch/mips/jazz/Kconfig"
source "arch/mips/jz4740/Kconfig"
source "arch/mips/lantiq/Kconfig"
select DMA_NONCOHERENT
bool
+config DMA_PERDEV_COHERENT
+ bool
+ select DMA_MAYBE_COHERENT
+
config DMA_COHERENT
bool
bool
config MIPS_PGD_C0_CONTEXT
bool
- default y if 64BIT && CPU_MIPSR2 && !CPU_XLP
+ default y if 64BIT && (CPU_MIPSR2 || CPU_MIPSR6) && !CPU_XLP
#
# Set to y for ptrace access to watch registers.
config MIPS_CPS
bool "MIPS Coherent Processing System support"
- depends on SYS_SUPPORTS_MIPS_CPS && !CPU_MIPSR6
+ depends on SYS_SUPPORTS_MIPS_CPS
select MIPS_CM
select MIPS_CPC
select MIPS_CPS_PM if HOTPLUG_CPU
select SMP
select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
select SYS_SUPPORTS_HOTPLUG_CPU
+ select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6
select SYS_SUPPORTS_SMP
select WEAK_ORDERING
help
if (flags & AR5523_CMD_FLAG_MAGIC)
hdr->magic = cpu_to_be32(1 << 24);
- memcpy(hdr + 1, idata, ilen);
+ if (ilen)
+ memcpy(hdr + 1, idata, ilen);
cmd->odata = odata;
cmd->olen = olen;
memcpy(ar->channels, ar5523_channels, sizeof(ar5523_channels));
memcpy(ar->rates, ar5523_rates, sizeof(ar5523_rates));
- ar->band.band = IEEE80211_BAND_2GHZ;
+ ar->band.band = NL80211_BAND_2GHZ;
ar->band.channels = ar->channels;
ar->band.n_channels = ARRAY_SIZE(ar5523_channels);
ar->band.bitrates = ar->rates;
ar->band.n_bitrates = ARRAY_SIZE(ar5523_rates);
- ar->hw->wiphy->bands[IEEE80211_BAND_2GHZ] = &ar->band;
+ ar->hw->wiphy->bands[NL80211_BAND_2GHZ] = &ar->band;
return 0;
}
else
mvm->max_scans = IWL_MVM_MAX_LMAC_SCANS;
- if (mvm->nvm_data->bands[IEEE80211_BAND_2GHZ].n_channels)
- hw->wiphy->bands[IEEE80211_BAND_2GHZ] =
- &mvm->nvm_data->bands[IEEE80211_BAND_2GHZ];
- if (mvm->nvm_data->bands[IEEE80211_BAND_5GHZ].n_channels) {
- hw->wiphy->bands[IEEE80211_BAND_5GHZ] =
- &mvm->nvm_data->bands[IEEE80211_BAND_5GHZ];
+ if (mvm->nvm_data->bands[NL80211_BAND_2GHZ].n_channels)
+ hw->wiphy->bands[NL80211_BAND_2GHZ] =
+ &mvm->nvm_data->bands[NL80211_BAND_2GHZ];
+ if (mvm->nvm_data->bands[NL80211_BAND_5GHZ].n_channels) {
+ hw->wiphy->bands[NL80211_BAND_5GHZ] =
+ &mvm->nvm_data->bands[NL80211_BAND_5GHZ];
if (fw_has_capa(&mvm->fw->ucode_capa,
IWL_UCODE_TLV_CAPA_BEAMFORMER) &&
fw_has_api(&mvm->fw->ucode_capa,
IWL_UCODE_TLV_API_LQ_SS_PARAMS))
- hw->wiphy->bands[IEEE80211_BAND_5GHZ]->vht_cap.cap |=
+ hw->wiphy->bands[NL80211_BAND_5GHZ]->vht_cap.cap |=
IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE;
}
!ieee80211_is_action(hdr->frame_control)))
sta = NULL;
+	/* If there is no sta and it's not offchannel, send it via the AP station */
+ if (info->control.vif->type == NL80211_IFTYPE_STATION &&
+ info->hw_queue != IWL_MVM_OFFCHANNEL_QUEUE && !sta) {
+ struct iwl_mvm_vif *mvmvif =
+ iwl_mvm_vif_from_mac80211(info->control.vif);
+ u8 ap_sta_id = READ_ONCE(mvmvif->ap_sta_id);
+
+ if (ap_sta_id < IWL_MVM_STATION_COUNT) {
+ /* mac80211 holds rcu read lock */
+ sta = rcu_dereference(mvm->fw_id_to_mac_id[ap_sta_id]);
+ if (IS_ERR_OR_NULL(sta))
+ goto drop;
+ }
+ }
+
if (sta) {
if (iwl_mvm_defer_tx(mvm, sta, skb))
return;
cpu_to_le32(FW_CMD_ID_AND_COLOR(MAC_INDEX_AUX, 0)),
.sta_id_and_color = cpu_to_le32(mvm->aux_sta.sta_id),
/* Set the channel info data */
- .channel_info.band = (channel->band == IEEE80211_BAND_2GHZ) ?
+ .channel_info.band = (channel->band == NL80211_BAND_2GHZ) ?
PHY_BAND_24 : PHY_BAND_5,
.channel_info.channel = channel->hw_value,
.channel_info.width = PHY_VHT_CHANNEL_MODE20,
/*
* Copyright (c) 2015, Sony Mobile Communications AB.
- * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2013, 2018 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
#include <linux/gpio.h>
#include <linux/interrupt.h>
#include <linux/of_device.h>
+#include <linux/of_irq.h>
#include <dt-bindings/pinctrl/qcom,pmic-gpio.h>
banks |= BIT(0);
break;
case PM8XXX_QCOM_DRIVE_STRENGH:
- if (arg > PMIC_GPIO_STRENGTH_LOW) {
+ if (arg > PM8921_GPIO_STRENGTH_LOW) {
dev_err(pctrl->dev, "invalid drive strength\n");
return -EINVAL;
}
}
static const struct of_device_id pm8xxx_gpio_of_match[] = {
- { .compatible = "qcom,pm8018-gpio", .data = (void *)6 },
- { .compatible = "qcom,pm8038-gpio", .data = (void *)12 },
- { .compatible = "qcom,pm8058-gpio", .data = (void *)40 },
- { .compatible = "qcom,pm8917-gpio", .data = (void *)38 },
- { .compatible = "qcom,pm8921-gpio", .data = (void *)44 },
+ { .compatible = "qcom,pm8018-gpio" },
+ { .compatible = "qcom,pm8038-gpio" },
+ { .compatible = "qcom,pm8058-gpio" },
+ { .compatible = "qcom,pm8917-gpio" },
+ { .compatible = "qcom,pm8921-gpio" },
+ { .compatible = "qcom,ssbi-gpio" },
{ },
};
MODULE_DEVICE_TABLE(of, pm8xxx_gpio_of_match);
struct pinctrl_pin_desc *pins;
struct pm8xxx_gpio *pctrl;
int ret;
- int i;
+ int i, npins;
pctrl = devm_kzalloc(&pdev->dev, sizeof(*pctrl), GFP_KERNEL);
if (!pctrl)
return -ENOMEM;
pctrl->dev = &pdev->dev;
- pctrl->npins = (unsigned long)of_device_get_match_data(&pdev->dev);
+ npins = platform_irq_count(pdev);
+ if (!npins)
+ return -EINVAL;
+ if (npins < 0)
+ return npins;
+ pctrl->npins = npins;
pctrl->regmap = dev_get_regmap(pdev->dev.parent, NULL);
if (!pctrl->regmap) {
goto unregister_pinctrl;
}
- ret = gpiochip_add_pin_range(&pctrl->chip,
- dev_name(pctrl->dev),
- 0, 0, pctrl->chip.ngpio);
- if (ret) {
- dev_err(pctrl->dev, "failed to add pin range\n");
- goto unregister_gpiochip;
+ /*
+ * For DeviceTree-supported systems, the gpio core checks the
+ * pinctrl's device node for the "gpio-ranges" property.
+ * If it is present, it takes care of adding the pin ranges
+ * for the driver. In this case the driver can skip ahead.
+ *
+ * In order to remain compatible with older, existing DeviceTree
+ * files which don't set the "gpio-ranges" property or systems that
+ * utilize ACPI the driver has to call gpiochip_add_pin_range().
+ */
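+	/*
+	 * Illustrative (not normative) device tree fragment that satisfies
+	 * the check below:
+	 *   gpio-ranges = <&pm8921_gpio 0 0 44>;
+	 */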
+ if (!of_property_read_bool(pctrl->dev->of_node, "gpio-ranges")) {
+ ret = gpiochip_add_pin_range(&pctrl->chip, dev_name(pctrl->dev),
+ 0, 0, pctrl->chip.ngpio);
+ if (ret) {
+ dev_err(pctrl->dev, "failed to add pin range\n");
+ goto unregister_gpiochip;
+ }
}
platform_set_drvdata(pdev, pctrl);
* Copyright (C) 2008 Intel Corp
* Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com>
* Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com>
+ * Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/of.h>
+#include <linux/kthread.h>
#include <net/netlink.h>
#include <net/genetlink.h>
-#include <linux/suspend.h>
#define CREATE_TRACE_POINTS
#include <trace/events/thermal.h>
#include "thermal_core.h"
#include "thermal_hwmon.h"
+#define THERMAL_UEVENT_DATA "type"
+
MODULE_AUTHOR("Zhang Rui");
MODULE_DESCRIPTION("Generic thermal management sysfs support");
MODULE_LICENSE("GPL v2");
+#define THERMAL_MAX_ACTIVE 16
+
static DEFINE_IDR(thermal_tz_idr);
static DEFINE_IDR(thermal_cdev_idr);
static DEFINE_MUTEX(thermal_idr_lock);
static DEFINE_MUTEX(thermal_list_lock);
static DEFINE_MUTEX(thermal_governor_lock);
-static atomic_t in_suspend;
-
static struct thermal_governor *def_governor;
+static struct workqueue_struct *thermal_passive_wq;
+
static struct thermal_governor *__find_governor(const char *name)
{
struct thermal_governor *pos;
return;
}
+static LIST_HEAD(sensor_info_list);
+static DEFINE_MUTEX(sensor_list_lock);
+
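+/* Look up the sensor_info registered for @sensor_id (RCU-protected walk). */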
+static struct sensor_info *get_sensor(uint32_t sensor_id)
+{
+ struct sensor_info *pos = NULL, *matching_sensor = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &sensor_info_list, sensor_list) {
+ if (pos->sensor_id == sensor_id) {
+ matching_sensor = pos;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return matching_sensor;
+}
+
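+/*
+ * Return the sensor id registered for the thermal zone whose type matches
+ * @name, or -ENODEV if no such zone has been registered.
+ */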
+int sensor_get_id(char *name)
+{
+ struct sensor_info *pos = NULL;
+ int matching_id = -ENODEV;
+
+ if (!name)
+ return matching_id;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &sensor_info_list, sensor_list) {
+ if (!strcmp(pos->tz->type, name)) {
+ matching_id = pos->sensor_id;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return matching_id;
+}
+EXPORT_SYMBOL(sensor_get_id);
+
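+/*
+ * Locate the CONFIGURABLE_HI/CONFIGURABLE_LOW trip indices for this sensor
+ * and cache their current trip temperatures as the initial thresholds.
+ */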
+static void init_sensor_trip(struct sensor_info *sensor)
+{
+ int ret = 0, i = 0;
+ enum thermal_trip_type type;
+
+ for (i = 0; ((sensor->max_idx == -1) ||
+ (sensor->min_idx == -1)) &&
+ (sensor->tz->ops->get_trip_type) &&
+ (i < sensor->tz->trips); i++) {
+
+ sensor->tz->ops->get_trip_type(sensor->tz, i, &type);
+ if (type == THERMAL_TRIP_CONFIGURABLE_HI)
+ sensor->max_idx = i;
+ if (type == THERMAL_TRIP_CONFIGURABLE_LOW)
+ sensor->min_idx = i;
+ type = 0;
+ }
+
+ ret = sensor->tz->ops->get_trip_temp(sensor->tz,
+ sensor->min_idx, &sensor->threshold_min);
+ if (ret)
+ pr_err("Unable to get MIN trip temp. sensor:%d err:%d\n",
+ sensor->sensor_id, ret);
+
+ ret = sensor->tz->ops->get_trip_temp(sensor->tz,
+ sensor->max_idx, &sensor->threshold_max);
+ if (ret)
+ pr_err("Unable to get MAX trip temp. sensor:%d err:%d\n",
+ sensor->sensor_id, ret);
+}
+
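+/*
+ * Program the hardware trips from the registered thresholds: the high trip
+ * is set to the lowest active high threshold and the low trip to the highest
+ * active low threshold. Caller must hold sensor->lock.
+ */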
+static int __update_sensor_thresholds(struct sensor_info *sensor)
+{
+ long max_of_low_thresh = LONG_MIN;
+ long min_of_high_thresh = LONG_MAX;
+ struct sensor_threshold *pos = NULL;
+ int ret = 0;
+
+ if (!sensor->tz->ops->set_trip_temp ||
+ !sensor->tz->ops->activate_trip_type ||
+ !sensor->tz->ops->get_trip_type ||
+ !sensor->tz->ops->get_trip_temp) {
+ ret = -ENODEV;
+ goto update_done;
+ }
+
+ if ((sensor->max_idx == -1) || (sensor->min_idx == -1))
+ init_sensor_trip(sensor);
+
+ list_for_each_entry(pos, &sensor->threshold_list, list) {
+ if (!pos->active)
+ continue;
+ if (pos->trip == THERMAL_TRIP_CONFIGURABLE_LOW) {
+ if (pos->temp > max_of_low_thresh)
+ max_of_low_thresh = pos->temp;
+ }
+ if (pos->trip == THERMAL_TRIP_CONFIGURABLE_HI) {
+ if (pos->temp < min_of_high_thresh)
+ min_of_high_thresh = pos->temp;
+ }
+ }
+
+ pr_debug("sensor %d: Thresholds: max of low: %ld min of high: %ld\n",
+ sensor->sensor_id, max_of_low_thresh,
+ min_of_high_thresh);
+
+ if (min_of_high_thresh != LONG_MAX) {
+ ret = sensor->tz->ops->set_trip_temp(sensor->tz,
+ sensor->max_idx, min_of_high_thresh);
+ if (ret) {
+ pr_err("sensor %d: Unable to set high threshold %d",
+ sensor->sensor_id, ret);
+ goto update_done;
+ }
+ sensor->threshold_max = min_of_high_thresh;
+ }
+ ret = sensor->tz->ops->activate_trip_type(sensor->tz,
+ sensor->max_idx,
+ (min_of_high_thresh == LONG_MAX) ?
+ THERMAL_TRIP_ACTIVATION_DISABLED :
+ THERMAL_TRIP_ACTIVATION_ENABLED);
+ if (ret) {
+ pr_err("sensor %d: Unable to activate high threshold %d",
+ sensor->sensor_id, ret);
+ goto update_done;
+ }
+
+ if (max_of_low_thresh != LONG_MIN) {
+ ret = sensor->tz->ops->set_trip_temp(sensor->tz,
+ sensor->min_idx, max_of_low_thresh);
+ if (ret) {
+ pr_err("sensor %d: Unable to set low threshold %d",
+ sensor->sensor_id, ret);
+ goto update_done;
+ }
+ sensor->threshold_min = max_of_low_thresh;
+ }
+ ret = sensor->tz->ops->activate_trip_type(sensor->tz,
+ sensor->min_idx,
+ (max_of_low_thresh == LONG_MIN) ?
+ THERMAL_TRIP_ACTIVATION_DISABLED :
+ THERMAL_TRIP_ACTIVATION_ENABLED);
+ if (ret) {
+ pr_err("sensor %d: Unable to activate low threshold %d",
+ sensor->sensor_id, ret);
+ goto update_done;
+ }
+
+ pr_debug("sensor %d: low: %d high: %d\n",
+ sensor->sensor_id,
+ sensor->threshold_min, sensor->threshold_max);
+
+update_done:
+ return ret;
+}
+
+static void sensor_update_work(struct work_struct *work)
+{
+ struct sensor_info *sensor = container_of(work, struct sensor_info,
+ work);
+ int ret = 0;
+ mutex_lock(&sensor->lock);
+ ret = __update_sensor_thresholds(sensor);
+ if (ret)
+ pr_err("sensor %d: Error %d setting threshold\n",
+ sensor->sensor_id, ret);
+ mutex_unlock(&sensor->lock);
+}
+
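+/*
+ * Per-zone kthread: waits on sysfs_notify_complete and then emits a sysfs
+ * notification on the zone's "type" attribute so userspace pollers see the
+ * trip event.
+ */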
+static __ref int sensor_sysfs_notify(void *data)
+{
+ int ret = 0;
+ struct sensor_info *sensor = (struct sensor_info *)data;
+
+ while (!kthread_should_stop()) {
+ if (wait_for_completion_interruptible(
+ &sensor->sysfs_notify_complete) != 0)
+ continue;
+ if (sensor->deregister_active)
+ return ret;
+ reinit_completion(&sensor->sysfs_notify_complete);
+ sysfs_notify(&sensor->tz->device.kobj, NULL,
+ THERMAL_UEVENT_DATA);
+ }
+ return ret;
+}
+
+/*
+ * May be called in an interrupt context.
+ * Do NOT call sensor_set_trip from this function.
+ */
+int thermal_sensor_trip(struct thermal_zone_device *tz,
+ enum thermal_trip_type trip, long temp)
+{
+ struct sensor_threshold *pos = NULL;
+ int ret = -ENODEV;
+
+ if (trip != THERMAL_TRIP_CONFIGURABLE_HI &&
+ trip != THERMAL_TRIP_CONFIGURABLE_LOW)
+ return 0;
+
+ if (list_empty(&tz->sensor.threshold_list))
+ return 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &tz->sensor.threshold_list, list) {
+ if ((pos->trip != trip) || (!pos->active))
+ continue;
+ if (((trip == THERMAL_TRIP_CONFIGURABLE_LOW) &&
+ (pos->temp <= tz->sensor.threshold_min) &&
+ (pos->temp >= temp)) ||
+ ((trip == THERMAL_TRIP_CONFIGURABLE_HI) &&
+ (pos->temp >= tz->sensor.threshold_max) &&
+ (pos->temp <= temp))) {
+ if ((pos == &tz->tz_threshold[0])
+ || (pos == &tz->tz_threshold[1]))
+ complete(&tz->sensor.sysfs_notify_complete);
+ pos->active = 0;
+ pos->notify(trip, temp, pos->data);
+ }
+ }
+ rcu_read_unlock();
+
+ schedule_work(&tz->sensor.work);
+
+ return ret;
+}
+EXPORT_SYMBOL(thermal_sensor_trip);
+
+int sensor_get_temp(uint32_t sensor_id, int *temp)
+{
+ struct sensor_info *sensor = get_sensor(sensor_id);
+ int ret = 0;
+
+ if (!sensor)
+ return -ENODEV;
+
+ ret = sensor->tz->ops->get_temp(sensor->tz, temp);
+
+ return ret;
+}
+EXPORT_SYMBOL(sensor_get_temp);
+
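+/*
+ * Enable or disable a previously registered threshold and reprogram the
+ * hardware trip points accordingly.
+ */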
+int sensor_activate_trip(uint32_t sensor_id,
+ struct sensor_threshold *threshold, bool enable)
+{
+ struct sensor_info *sensor = get_sensor(sensor_id);
+ int ret = 0;
+
+ if (!sensor || !threshold) {
+ pr_err("%s: uninitialized data\n",
+ KBUILD_MODNAME);
+ ret = -ENODEV;
+ goto activate_trip_exit;
+ }
+
+ mutex_lock(&sensor->lock);
+ threshold->active = (enable) ? 1 : 0;
+ ret = __update_sensor_thresholds(sensor);
+ mutex_unlock(&sensor->lock);
+
+activate_trip_exit:
+ return ret;
+}
+EXPORT_SYMBOL(sensor_activate_trip);
+
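+/*
+ * Register @threshold on the sensor's threshold list. The threshold is added
+ * inactive; callers enable it with sensor_activate_trip().
+ */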
+int sensor_set_trip(uint32_t sensor_id, struct sensor_threshold *threshold)
+{
+ struct sensor_threshold *pos = NULL;
+ struct sensor_info *sensor = get_sensor(sensor_id);
+
+ if (!sensor)
+ return -ENODEV;
+
+ if (!threshold || !threshold->notify)
+ return -EFAULT;
+
+ mutex_lock(&sensor->lock);
+ list_for_each_entry(pos, &sensor->threshold_list, list) {
+ if (pos == threshold)
+ break;
+ }
+
+ if (pos != threshold) {
+ INIT_LIST_HEAD(&threshold->list);
+ list_add_rcu(&threshold->list, &sensor->threshold_list);
+ }
+ threshold->active = 0; /* Do not allow active threshold right away */
+
+ mutex_unlock(&sensor->lock);
+
+ return 0;
+
+}
+EXPORT_SYMBOL(sensor_set_trip);
+
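+/*
+ * Remove a registered threshold from the sensor and reprogram the hardware
+ * trip points without it.
+ */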
+int sensor_cancel_trip(uint32_t sensor_id, struct sensor_threshold *threshold)
+{
+ struct sensor_threshold *pos = NULL, *var = NULL;
+ struct sensor_info *sensor = get_sensor(sensor_id);
+ int ret = 0;
+
+ if (!sensor)
+ return -ENODEV;
+
+ mutex_lock(&sensor->lock);
+ list_for_each_entry_safe(pos, var, &sensor->threshold_list, list) {
+ if (pos == threshold) {
+ pos->active = 0;
+ list_del_rcu(&pos->list);
+ break;
+ }
+ }
+
+ ret = __update_sensor_thresholds(sensor);
+ mutex_unlock(&sensor->lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(sensor_cancel_trip);
+
+static int tz_notify_trip(enum thermal_trip_type type, int temp, void *data)
+{
+ struct thermal_zone_device *tz = (struct thermal_zone_device *)data;
+
+ pr_debug("sensor %d tripped: type %d temp %d\n",
+ tz->sensor.sensor_id, type, temp);
+
+ return 0;
+}
+
+static void get_trip_threshold(struct thermal_zone_device *tz, int trip,
+ struct sensor_threshold **threshold)
+{
+ enum thermal_trip_type type;
+
+ tz->ops->get_trip_type(tz, trip, &type);
+
+ if (type == THERMAL_TRIP_CONFIGURABLE_HI)
+ *threshold = &tz->tz_threshold[0];
+ else if (type == THERMAL_TRIP_CONFIGURABLE_LOW)
+ *threshold = &tz->tz_threshold[1];
+ else
+ *threshold = NULL;
+}
+
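+/*
+ * Route trip temperature updates for configurable trips through the sensor
+ * framework; all other trips fall back to the zone's set_trip_temp callback.
+ */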
+int sensor_set_trip_temp(struct thermal_zone_device *tz,
+ int trip, long temp)
+{
+ int ret = 0;
+ struct sensor_threshold *threshold = NULL;
+
+ if (!tz->ops->get_trip_type)
+ return -EPERM;
+
+ get_trip_threshold(tz, trip, &threshold);
+ if (threshold) {
+ threshold->temp = temp;
+ ret = sensor_set_trip(tz->sensor.sensor_id, threshold);
+ } else {
+ ret = tz->ops->set_trip_temp(tz, trip, temp);
+ }
+
+ return ret;
+}
+
+int sensor_init(struct thermal_zone_device *tz)
+{
+ struct sensor_info *sensor = &tz->sensor;
+
+ sensor->sensor_id = tz->id;
+ sensor->tz = tz;
+ sensor->threshold_min = INT_MIN;
+ sensor->threshold_max = INT_MAX;
+ sensor->max_idx = -1;
+ sensor->min_idx = -1;
+ sensor->deregister_active = false;
+ mutex_init(&sensor->lock);
+ INIT_LIST_HEAD_RCU(&sensor->sensor_list);
+ INIT_LIST_HEAD_RCU(&sensor->threshold_list);
+ INIT_LIST_HEAD(&tz->tz_threshold[0].list);
+ INIT_LIST_HEAD(&tz->tz_threshold[1].list);
+ tz->tz_threshold[0].notify = tz_notify_trip;
+ tz->tz_threshold[0].data = tz;
+ tz->tz_threshold[0].trip = THERMAL_TRIP_CONFIGURABLE_HI;
+ tz->tz_threshold[1].notify = tz_notify_trip;
+ tz->tz_threshold[1].data = tz;
+ tz->tz_threshold[1].trip = THERMAL_TRIP_CONFIGURABLE_LOW;
+ list_add_rcu(&sensor->sensor_list, &sensor_info_list);
+ INIT_WORK(&sensor->work, sensor_update_work);
+ init_completion(&sensor->sysfs_notify_complete);
+ sensor->sysfs_notify_thread = kthread_run(sensor_sysfs_notify,
+ &tz->sensor,
+ "therm_core:notify%d",
+ tz->id);
+ if (IS_ERR(sensor->sysfs_notify_thread))
+ pr_err("Failed to create notify thread %d", tz->id);
+
+
+ return 0;
+}
+
static int get_idr(struct idr *idr, struct mutex *lock, int *id)
{
int ret;
mutex_unlock(&thermal_list_lock);
}
-static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
+static void thermal_zone_device_set_polling(struct workqueue_struct *queue,
+ struct thermal_zone_device *tz,
int delay)
{
if (delay > 1000)
- mod_delayed_work(system_freezable_wq, &tz->poll_queue,
+ mod_delayed_work(queue, &tz->poll_queue,
round_jiffies(msecs_to_jiffies(delay)));
else if (delay)
- mod_delayed_work(system_freezable_wq, &tz->poll_queue,
+ mod_delayed_work(queue, &tz->poll_queue,
msecs_to_jiffies(delay));
else
- cancel_delayed_work_sync(&tz->poll_queue);
+ cancel_delayed_work(&tz->poll_queue);
}
static void monitor_thermal_zone(struct thermal_zone_device *tz)
mutex_lock(&tz->lock);
if (tz->passive)
- thermal_zone_device_set_polling(tz, tz->passive_delay);
+ thermal_zone_device_set_polling(thermal_passive_wq,
+ tz, tz->passive_delay);
else if (tz->polling_delay)
- thermal_zone_device_set_polling(tz, tz->polling_delay);
+ thermal_zone_device_set_polling(
+ system_freezable_power_efficient_wq,
+ tz, tz->polling_delay);
else
- thermal_zone_device_set_polling(tz, 0);
+ thermal_zone_device_set_polling(NULL, tz, 0);
mutex_unlock(&tz->lock);
}
tz->ops->get_trip_temp(tz, trip, &trip_temp);
/* If we have not crossed the trip_temp, we do not care. */
- if (trip_temp <= 0 || tz->temperature < trip_temp)
- return;
-
- trace_thermal_zone_trip(tz, trip, trip_type);
+ if (trip_type != THERMAL_TRIP_CRITICAL_LOW &&
+ trip_type != THERMAL_TRIP_CONFIGURABLE_LOW) {
+ if (tz->temperature < trip_temp)
+ return;
+	} else {
+		if (tz->temperature >= trip_temp)
+			return;
+	}
if (tz->ops->notify)
tz->ops->notify(tz, trip, trip_type);
- if (trip_type == THERMAL_TRIP_CRITICAL) {
+ if (trip_type == THERMAL_TRIP_CRITICAL ||
+ trip_type == THERMAL_TRIP_CRITICAL_LOW) {
dev_emerg(&tz->device,
"critical temperature reached(%d C),shutting down\n",
tz->temperature / 1000);
tz->ops->get_trip_type(tz, trip, &type);
- if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
+ if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT ||
+ type == THERMAL_TRIP_CONFIGURABLE_HI ||
+ type == THERMAL_TRIP_CONFIGURABLE_LOW ||
+ type == THERMAL_TRIP_CRITICAL_LOW)
handle_critical_trips(tz, trip, type);
else
handle_non_critical_trips(tz, trip, type);
{
int count;
- if (atomic_read(&in_suspend))
- return;
-
if (!tz->ops->get_temp)
return;
return sprintf(buf, "critical\n");
case THERMAL_TRIP_HOT:
return sprintf(buf, "hot\n");
+ case THERMAL_TRIP_CONFIGURABLE_HI:
+ return sprintf(buf, "configurable_hi\n");
+ case THERMAL_TRIP_CONFIGURABLE_LOW:
+ return sprintf(buf, "configurable_low\n");
+ case THERMAL_TRIP_CRITICAL_LOW:
+ return sprintf(buf, "critical_low\n");
case THERMAL_TRIP_PASSIVE:
return sprintf(buf, "passive\n");
case THERMAL_TRIP_ACTIVE:
}
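+/*
+ * sysfs store for trip_point_N_type: writing "enabled" or "disabled"
+ * activates or deactivates the corresponding configurable trip.
+ */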
static ssize_t
+trip_point_type_activate(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct thermal_zone_device *tz = to_thermal_zone(dev);
+ int trip, result = 0;
+ bool activate;
+ struct sensor_threshold *threshold = NULL;
+
+ if (!tz->ops->get_trip_type ||
+ !tz->ops->activate_trip_type) {
+ result = -EPERM;
+ goto trip_activate_exit;
+ }
+
+ if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip)) {
+ result = -EINVAL;
+ goto trip_activate_exit;
+ }
+
+ if (!strcmp(buf, "enabled")) {
+ activate = true;
+ } else if (!strcmp(buf, "disabled")) {
+ activate = false;
+ } else {
+ result = -EINVAL;
+ goto trip_activate_exit;
+ }
+
+ get_trip_threshold(tz, trip, &threshold);
+ if (threshold)
+ result = sensor_activate_trip(tz->sensor.sensor_id,
+ threshold, activate);
+ else
+ result = tz->ops->activate_trip_type(tz, trip,
+ activate ? THERMAL_TRIP_ACTIVATION_ENABLED :
+ THERMAL_TRIP_ACTIVATION_DISABLED);
+
+trip_activate_exit:
+ if (result)
+ return result;
+
+ return count;
+}
+
+static ssize_t
trip_point_temp_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct thermal_zone_device *tz = to_thermal_zone(dev);
int trip, ret;
- unsigned long temperature;
+ long temperature;
if (!tz->ops->set_trip_temp)
return -EPERM;
if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip))
return -EINVAL;
- if (kstrtoul(buf, 10, &temperature))
+ if (kstrtol(buf, 10, &temperature))
return -EINVAL;
- ret = tz->ops->set_trip_temp(tz, trip, temperature);
+ ret = sensor_set_trip_temp(tz, trip, temperature);
return ret ? ret : count;
}
return -EINVAL;
ret = tz->ops->get_trip_temp(tz, trip, &temperature);
-
if (ret)
return ret;
sysfs_attr_init(&tz->trip_type_attrs[indx].attr.attr);
tz->trip_type_attrs[indx].attr.attr.name =
tz->trip_type_attrs[indx].name;
- tz->trip_type_attrs[indx].attr.attr.mode = S_IRUGO;
+ tz->trip_type_attrs[indx].attr.attr.mode = S_IRUGO | S_IWUSR;
tz->trip_type_attrs[indx].attr.show = trip_point_type_show;
+ tz->trip_type_attrs[indx].attr.store = trip_point_type_activate;
device_create_file(&tz->device,
&tz->trip_type_attrs[indx].attr);
}
mutex_lock(&thermal_list_lock);
- list_add_tail(&tz->node, &thermal_tz_list);
+ list_add_tail_rcu(&tz->node, &thermal_tz_list);
+ sensor_init(tz);
mutex_unlock(&thermal_list_lock);
/* Bind cooling devices for this zone */
bind_tz(tz);
- INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
+ INIT_DEFERRABLE_WORK(&(tz->poll_queue), thermal_zone_device_check);
thermal_zone_device_reset(tz);
/* Update the new thermal zone and mark it as already updated. */
mutex_unlock(&thermal_list_lock);
return;
}
- list_del(&tz->node);
+ list_del_rcu(&tz->node);
/* Unbind all cdevs associated with 'this' thermal zone */
list_for_each_entry(cdev, &thermal_cdev_list, node) {
mutex_unlock(&thermal_list_lock);
- thermal_zone_device_set_polling(NULL, tz, 0);
+ cancel_delayed_work_sync(&tz->poll_queue);
if (tz->type[0])
device_remove_file(&tz->device, &dev_attr_type);
thermal_set_governor(tz, NULL);
thermal_remove_hwmon_sysfs(tz);
+ flush_work(&tz->sensor.work);
+ tz->sensor.deregister_active = true;
+ complete(&tz->sensor.sysfs_notify_complete);
+ kthread_stop(tz->sensor.sysfs_notify_thread);
+ mutex_lock(&thermal_list_lock);
+ list_del_rcu(&tz->sensor.sensor_list);
+ mutex_unlock(&thermal_list_lock);
release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
idr_destroy(&tz->idr);
mutex_destroy(&tz->lock);
if (!name)
goto exit;
- mutex_lock(&thermal_list_lock);
- list_for_each_entry(pos, &thermal_tz_list, node)
+ rcu_read_lock();
+ list_for_each_entry_rcu(pos, &thermal_tz_list, node)
if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) {
found++;
ref = pos;
}
- mutex_unlock(&thermal_list_lock);
+ rcu_read_unlock();
/* nothing has been found, thus an error code for it */
if (found == 0)
thermal_gov_power_allocator_unregister();
}
-static int thermal_pm_notify(struct notifier_block *nb,
- unsigned long mode, void *_unused)
-{
- struct thermal_zone_device *tz;
-
- switch (mode) {
- case PM_HIBERNATION_PREPARE:
- case PM_RESTORE_PREPARE:
- case PM_SUSPEND_PREPARE:
- atomic_set(&in_suspend, 1);
- break;
- case PM_POST_HIBERNATION:
- case PM_POST_RESTORE:
- case PM_POST_SUSPEND:
- atomic_set(&in_suspend, 0);
- list_for_each_entry(tz, &thermal_tz_list, node) {
- thermal_zone_device_reset(tz);
- thermal_zone_device_update(tz);
- }
- break;
- default:
- break;
- }
- return 0;
-}
-
-static struct notifier_block thermal_pm_nb = {
- .notifier_call = thermal_pm_notify,
-};
-
static int __init thermal_init(void)
{
int result;
+ thermal_passive_wq = alloc_workqueue("thermal_passive_wq",
+ WQ_HIGHPRI | WQ_UNBOUND
+ | WQ_FREEZABLE,
+ THERMAL_MAX_ACTIVE);
+ if (!thermal_passive_wq) {
+ result = -ENOMEM;
+ goto error;
+ }
+
result = thermal_register_governors();
if (result)
- goto error;
+ goto destroy_wq;
result = class_register(&thermal_class);
if (result)
if (result)
goto exit_netlink;
- result = register_pm_notifier(&thermal_pm_nb);
- if (result)
- pr_warn("Thermal: Can not register suspend notifier, return %d\n",
- result);
-
return 0;
exit_netlink:
class_unregister(&thermal_class);
unregister_governors:
thermal_unregister_governors();
+destroy_wq:
+ destroy_workqueue(thermal_passive_wq);
error:
idr_destroy(&thermal_tz_idr);
idr_destroy(&thermal_cdev_idr);
static void __exit thermal_exit(void)
{
- unregister_pm_notifier(&thermal_pm_nb);
of_thermal_destroy_zones();
+ destroy_workqueue(thermal_passive_wq);
genetlink_exit();
class_unregister(&thermal_class);
thermal_unregister_governors();
# define SUPPORT_SYSRQ
#endif
+#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/dma-mapping.h>
#include <linux/dmaengine.h>
-#include <linux/hrtimer.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/ioport.h>
-#include <linux/irq.h>
+#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/tty.h>
#include <linux/tty_flip.h>
#include <linux/serial_core.h>
-#include <linux/serial.h>
#include <linux/slab.h>
#include <linux/clk.h>
#include <linux/platform_device.h>
#include <linux/delay.h>
#include <linux/of.h>
#include <linux/of_device.h>
-
-#include "msm_serial.h"
-
-#define UARTDM_BURST_SIZE 16 /* in bytes */
-#define UARTDM_TX_AIGN(x) ((x) & ~0x3) /* valid for > 1p3 */
-#define UARTDM_TX_MAX 256 /* in bytes, valid for <= 1p3 */
-#define UARTDM_RX_SIZE (UART_XMIT_SIZE / 4)
+#include <linux/wait.h>
+
+#define UART_MR1 0x0000
+
+#define UART_MR1_AUTO_RFR_LEVEL0 0x3F
+#define UART_MR1_AUTO_RFR_LEVEL1 0x3FF00
+#define UART_DM_MR1_AUTO_RFR_LEVEL1 0xFFFFFF00
+#define UART_MR1_RX_RDY_CTL BIT(7)
+#define UART_MR1_CTS_CTL BIT(6)
+
+#define UART_MR2 0x0004
+#define UART_MR2_ERROR_MODE BIT(6)
+#define UART_MR2_BITS_PER_CHAR 0x30
+#define UART_MR2_BITS_PER_CHAR_5 (0x0 << 4)
+#define UART_MR2_BITS_PER_CHAR_6 (0x1 << 4)
+#define UART_MR2_BITS_PER_CHAR_7 (0x2 << 4)
+#define UART_MR2_BITS_PER_CHAR_8 (0x3 << 4)
+#define UART_MR2_STOP_BIT_LEN_ONE (0x1 << 2)
+#define UART_MR2_STOP_BIT_LEN_TWO (0x3 << 2)
+#define UART_MR2_PARITY_MODE_NONE 0x0
+#define UART_MR2_PARITY_MODE_ODD 0x1
+#define UART_MR2_PARITY_MODE_EVEN 0x2
+#define UART_MR2_PARITY_MODE_SPACE 0x3
+#define UART_MR2_PARITY_MODE 0x3
+
+#define UART_CSR 0x0008
+
+#define UART_TF 0x000C
+#define UARTDM_TF 0x0070
+
+#define UART_CR 0x0010
+#define UART_CR_CMD_NULL (0 << 4)
+#define UART_CR_CMD_RESET_RX (1 << 4)
+#define UART_CR_CMD_RESET_TX (2 << 4)
+#define UART_CR_CMD_RESET_ERR (3 << 4)
+#define UART_CR_CMD_RESET_BREAK_INT (4 << 4)
+#define UART_CR_CMD_START_BREAK (5 << 4)
+#define UART_CR_CMD_STOP_BREAK (6 << 4)
+#define UART_CR_CMD_RESET_CTS (7 << 4)
+#define UART_CR_CMD_RESET_STALE_INT (8 << 4)
+#define UART_CR_CMD_PACKET_MODE (9 << 4)
+#define UART_CR_CMD_MODE_RESET (12 << 4)
+#define UART_CR_CMD_SET_RFR (13 << 4)
+#define UART_CR_CMD_RESET_RFR (14 << 4)
+#define UART_CR_CMD_PROTECTION_EN (16 << 4)
+#define UART_CR_CMD_STALE_EVENT_DISABLE (6 << 8)
+#define UART_CR_CMD_STALE_EVENT_ENABLE (80 << 4)
+#define UART_CR_CMD_FORCE_STALE (4 << 8)
+#define UART_CR_CMD_RESET_TX_READY (3 << 8)
+#define UART_CR_TX_DISABLE BIT(3)
+#define UART_CR_TX_ENABLE BIT(2)
+#define UART_CR_RX_DISABLE BIT(1)
+#define UART_CR_RX_ENABLE BIT(0)
+#define UART_CR_CMD_RESET_RXBREAK_START ((1 << 11) | (2 << 4))
+
+#define UART_IMR 0x0014
+#define UART_IMR_TXLEV BIT(0)
+#define UART_IMR_RXSTALE BIT(3)
+#define UART_IMR_RXLEV BIT(4)
+#define UART_IMR_DELTA_CTS BIT(5)
+#define UART_IMR_CURRENT_CTS BIT(6)
+#define UART_IMR_RXBREAK_START BIT(10)
+
+#define UART_IPR_RXSTALE_LAST 0x20
+#define UART_IPR_STALE_LSB 0x1F
+#define UART_IPR_STALE_TIMEOUT_MSB 0x3FF80
+#define UART_DM_IPR_STALE_TIMEOUT_MSB 0xFFFFFF80
+
+#define UART_IPR 0x0018
+#define UART_TFWR 0x001C
+#define UART_RFWR 0x0020
+#define UART_HCR 0x0024
+
+#define UART_MREG 0x0028
+#define UART_NREG 0x002C
+#define UART_DREG 0x0030
+#define UART_MNDREG 0x0034
+#define UART_IRDA 0x0038
+#define UART_MISR_MODE 0x0040
+#define UART_MISR_RESET 0x0044
+#define UART_MISR_EXPORT 0x0048
+#define UART_MISR_VAL 0x004C
+#define UART_TEST_CTRL 0x0050
+
+#define UART_SR 0x0008
+#define UART_SR_HUNT_CHAR BIT(7)
+#define UART_SR_RX_BREAK BIT(6)
+#define UART_SR_PAR_FRAME_ERR BIT(5)
+#define UART_SR_OVERRUN BIT(4)
+#define UART_SR_TX_EMPTY BIT(3)
+#define UART_SR_TX_READY BIT(2)
+#define UART_SR_RX_FULL BIT(1)
+#define UART_SR_RX_READY BIT(0)
+
+#define UART_RF 0x000C
+#define UARTDM_RF 0x0070
+#define UART_MISR 0x0010
+#define UART_ISR 0x0014
+#define UART_ISR_TX_READY BIT(7)
+
+#define UARTDM_RXFS 0x50
+#define UARTDM_RXFS_BUF_SHIFT 0x7
+#define UARTDM_RXFS_BUF_MASK 0x7
+
+#define UARTDM_DMEN 0x3C
+#define UARTDM_DMEN_RX_SC_ENABLE BIT(5)
+#define UARTDM_DMEN_TX_SC_ENABLE BIT(4)
+
+#define UARTDM_DMEN_TX_BAM_ENABLE BIT(2) /* UARTDM_1P4 */
+#define UARTDM_DMEN_TX_DM_ENABLE BIT(0) /* < UARTDM_1P4 */
+
+#define UARTDM_DMEN_RX_BAM_ENABLE BIT(3) /* UARTDM_1P4 */
+#define UARTDM_DMEN_RX_DM_ENABLE BIT(1) /* < UARTDM_1P4 */
+
+#define UARTDM_DMRX 0x34
+#define UARTDM_NCF_TX 0x40
+#define UARTDM_RX_TOTAL_SNAP 0x38
+
+#define UARTDM_BURST_SIZE 16 /* in bytes */
+#define UARTDM_TX_AIGN(x) ((x) & ~0x3) /* valid for > 1p3 */
+#define UARTDM_TX_MAX 256 /* in bytes, valid for <= 1p3 */
+#define UARTDM_RX_SIZE (UART_XMIT_SIZE / 4)
enum {
UARTDM_1P1 = 1,
struct msm_dma rx_dma;
};
+#define UART_TO_MSM(uart_port) container_of(uart_port, struct msm_port, uart)
+
+static
+void msm_write(struct uart_port *port, unsigned int val, unsigned int off)
+{
+ writel_relaxed_no_log(val, port->membase + off);
+}
+
+static
+unsigned int msm_read(struct uart_port *port, unsigned int off)
+{
+ return readl_relaxed_no_log(port->membase + off);
+}
+
+/*
+ * Setup the MND registers to use the TCXO clock.
+ */
+static void msm_serial_set_mnd_regs_tcxo(struct uart_port *port)
+{
+ msm_write(port, 0x06, UART_MREG);
+ msm_write(port, 0xF1, UART_NREG);
+ msm_write(port, 0x0F, UART_DREG);
+ msm_write(port, 0x1A, UART_MNDREG);
+ port->uartclk = 1843200;
+}
+
+/*
+ * Setup the MND registers to use the TCXO clock divided by 4.
+ */
+static void msm_serial_set_mnd_regs_tcxoby4(struct uart_port *port)
+{
+ msm_write(port, 0x18, UART_MREG);
+ msm_write(port, 0xF6, UART_NREG);
+ msm_write(port, 0x0F, UART_DREG);
+ msm_write(port, 0x0A, UART_MNDREG);
+ port->uartclk = 1843200;
+}
+
+static void msm_serial_set_mnd_regs(struct uart_port *port)
+{
+ struct msm_port *msm_port = UART_TO_MSM(port);
+
+ /*
+ * These registers don't exist so we change the clk input rate
+ * on uartdm hardware instead
+ */
+ if (msm_port->is_uartdm)
+ return;
+
+ if (port->uartclk == 19200000)
+ msm_serial_set_mnd_regs_tcxo(port);
+ else if (port->uartclk == 4800000)
+ msm_serial_set_mnd_regs_tcxoby4(port);
+}
+
static void msm_handle_tx(struct uart_port *port);
static void msm_start_rx_dma(struct msm_port *msm_port);
-void msm_stop_dma(struct uart_port *port, struct msm_dma *dma)
+static void msm_stop_dma(struct uart_port *port, struct msm_dma *dma)
{
struct device *dev = port->dev;
unsigned int mapped;
struct device *dev = msm_port->uart.dev;
struct dma_slave_config conf;
struct msm_dma *dma;
+ struct dma_chan *dma_chan;
u32 crci = 0;
int ret;
dma = &msm_port->tx_dma;
/* allocate DMA resources, if available */
- dma->chan = dma_request_slave_channel_reason(dev, "tx");
- if (IS_ERR(dma->chan))
+ dma_chan = dma_request_slave_channel_reason(dev, "tx");
+ if (IS_ERR(dma_chan))
goto no_tx;
+ dma->chan = dma_chan;
of_property_read_u32(dev->of_node, "qcom,tx-crci", &crci);
struct device *dev = msm_port->uart.dev;
struct dma_slave_config conf;
struct msm_dma *dma;
+ struct dma_chan *dma_chan;
u32 crci = 0;
int ret;
dma = &msm_port->rx_dma;
/* allocate DMA resources, if available */
- dma->chan = dma_request_slave_channel_reason(dev, "rx");
- if (IS_ERR(dma->chan))
+ dma_chan = dma_request_slave_channel_reason(dev, "rx");
+ if (IS_ERR(dma_chan))
goto no_rx;
+ dma->chan = dma_chan;
of_property_read_u32(dev->of_node, "qcom,rx-crci", &crci);
val &= ~dma->enable_bit;
msm_write(port, val, UARTDM_DMEN);
- /* Restore interrupts */
- msm_port->imr |= UART_IMR_RXLEV | UART_IMR_RXSTALE;
- msm_write(port, msm_port->imr, UART_IMR);
-
if (msm_read(port, UART_SR) & UART_SR_OVERRUN) {
port->icount.overrun++;
tty_insert_flip_char(tport, 0, TTY_OVERRUN);
static void msm_reset(struct uart_port *port)
{
struct msm_port *msm_port = UART_TO_MSM(port);
+ unsigned int mr;
/* reset everything */
msm_write(port, UART_CR_CMD_RESET_RX, UART_CR);
msm_write(port, UART_CR_CMD_RESET_ERR, UART_CR);
msm_write(port, UART_CR_CMD_RESET_BREAK_INT, UART_CR);
msm_write(port, UART_CR_CMD_RESET_CTS, UART_CR);
- msm_write(port, UART_CR_CMD_SET_RFR, UART_CR);
+ msm_write(port, UART_CR_CMD_RESET_RFR, UART_CR);
+ mr = msm_read(port, UART_MR1);
+ mr &= ~UART_MR1_RX_RDY_CTL;
+ msm_write(port, mr, UART_MR1);
/* Disable DM modes */
if (msm_port->is_uartdm)
};
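+/*
+ * Walk the divisor table against successively higher clk_round_rate()
+ * candidates and pick the entry whose resulting baud rate is closest to the
+ * request; the chosen source clock rate is returned through *rate.
+ */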
static const struct msm_baud_map *
-msm_find_best_baud(struct uart_port *port, unsigned int baud)
+msm_find_best_baud(struct uart_port *port, unsigned int baud,
+ unsigned long *rate)
{
- unsigned int i, divisor;
- const struct msm_baud_map *entry;
+ struct msm_port *msm_port = UART_TO_MSM(port);
+ unsigned int divisor, result;
+ unsigned long target, old, best_rate = 0, diff, best_diff = ULONG_MAX;
+ const struct msm_baud_map *entry, *end, *best;
static const struct msm_baud_map table[] = {
- { 1536, 0x00, 1 },
- { 768, 0x11, 1 },
- { 384, 0x22, 1 },
- { 192, 0x33, 1 },
- { 96, 0x44, 1 },
- { 48, 0x55, 1 },
- { 32, 0x66, 1 },
- { 24, 0x77, 1 },
- { 16, 0x88, 1 },
- { 12, 0x99, 6 },
- { 8, 0xaa, 6 },
- { 6, 0xbb, 6 },
- { 4, 0xcc, 6 },
- { 3, 0xdd, 8 },
- { 2, 0xee, 16 },
{ 1, 0xff, 31 },
- { 0, 0xff, 31 },
+ { 2, 0xee, 16 },
+ { 3, 0xdd, 8 },
+ { 4, 0xcc, 6 },
+ { 6, 0xbb, 6 },
+ { 8, 0xaa, 6 },
+ { 12, 0x99, 6 },
+ { 16, 0x88, 1 },
+ { 24, 0x77, 1 },
+ { 32, 0x66, 1 },
+ { 48, 0x55, 1 },
+ { 96, 0x44, 1 },
+ { 192, 0x33, 1 },
+ { 384, 0x22, 1 },
+ { 768, 0x11, 1 },
+ { 1536, 0x00, 1 },
};
- divisor = uart_get_divisor(port, baud);
+ best = table; /* Default to smallest divider */
+ target = clk_round_rate(msm_port->clk, 16 * baud);
+ divisor = DIV_ROUND_CLOSEST(target, 16 * baud);
+
+ end = table + ARRAY_SIZE(table);
+ entry = table;
+ while (entry < end) {
+ if (entry->divisor <= divisor) {
+ result = target / entry->divisor / 16;
+ diff = abs(result - baud);
+
+ /* Keep track of best entry */
+ if (diff < best_diff) {
+ best_diff = diff;
+ best = entry;
+ best_rate = target;
+ }
- for (i = 0, entry = table; i < ARRAY_SIZE(table); i++, entry++)
- if (entry->divisor <= divisor)
- break;
+ if (result == baud)
+ break;
+ } else if (entry->divisor > divisor) {
+ old = target;
+ target = clk_round_rate(msm_port->clk, old + 1);
+ /*
+ * The rate didn't get any faster so we can't do
+ * better at dividing it down
+ */
+ if (target == old)
+ break;
+
+ /* Start the divisor search over at this new rate */
+ entry = table;
+ divisor = DIV_ROUND_CLOSEST(target, 16 * baud);
+ continue;
+ }
+ entry++;
+ }
- return entry; /* Default to smallest divider */
+ *rate = best_rate;
+ return best;
}
static int msm_set_baud_rate(struct uart_port *port, unsigned int baud,
unsigned int rxstale, watermark, mask;
struct msm_port *msm_port = UART_TO_MSM(port);
const struct msm_baud_map *entry;
- unsigned long flags;
-
- entry = msm_find_best_baud(port, baud);
-
- msm_write(port, entry->code, UART_CSR);
-
- if (baud > 460800)
- port->uartclk = baud * 16;
+ unsigned long flags, rate;
flags = *saved_flags;
spin_unlock_irqrestore(&port->lock, flags);
- clk_set_rate(msm_port->clk, port->uartclk);
+ entry = msm_find_best_baud(port, baud, &rate);
+ clk_set_rate(msm_port->clk, rate);
+ baud = rate / 16 / entry->divisor;
spin_lock_irqsave(&port->lock, flags);
*saved_flags = flags;
+ port->uartclk = rate;
+
+ msm_write(port, entry->code, UART_CSR);
/* RX stale watermark */
rxstale = entry->rxstale;
return baud;
}
-static void msm_init_clock(struct uart_port *port)
-{
- struct msm_port *msm_port = UART_TO_MSM(port);
-
- clk_prepare_enable(msm_port->clk);
- clk_prepare_enable(msm_port->pclk);
- msm_serial_set_mnd_regs(port);
-}
-
static int msm_startup(struct uart_port *port)
{
struct msm_port *msm_port = UART_TO_MSM(port);
snprintf(msm_port->name, sizeof(msm_port->name),
"msm_serial%d", port->line);
- ret = request_irq(port->irq, msm_uart_irq, IRQF_TRIGGER_HIGH,
- msm_port->name, port);
- if (unlikely(ret))
+ /*
+ * UART clk must be kept enabled to
+	 * avoid losing received characters
+ */
+ ret = clk_prepare_enable(msm_port->clk);
+ if (ret)
return ret;
- msm_init_clock(port);
+ ret = clk_prepare_enable(msm_port->pclk);
+ if (ret)
+ goto err_pclk;
+
+ msm_serial_set_mnd_regs(port);
if (likely(port->fifosize > 12))
rfr_level = port->fifosize - 12;
msm_request_rx_dma(msm_port, msm_port->uart.mapbase);
}
+ ret = request_irq(port->irq, msm_uart_irq, IRQF_TRIGGER_HIGH,
+ msm_port->name, port);
+ if (unlikely(ret))
+ goto err_irq;
+
return 0;
+
+err_irq:
+ if (msm_port->is_uartdm)
+ msm_release_dma(msm_port);
+
+ clk_disable_unprepare(msm_port->pclk);
+
+err_pclk:
+ clk_disable_unprepare(msm_port->clk);
+
+ return ret;
}
static void msm_shutdown(struct uart_port *port)
if (msm_port->is_uartdm)
msm_release_dma(msm_port);
+ clk_disable_unprepare(msm_port->pclk);
clk_disable_unprepare(msm_port->clk);
free_irq(port->irq, port);
switch (state) {
case 0:
- clk_prepare_enable(msm_port->clk);
- clk_prepare_enable(msm_port->pclk);
+ /*
+ * UART clk must be kept enabled to
+	 * avoid losing received characters
+ */
+ if (clk_prepare_enable(msm_port->clk))
+ return;
+ if (clk_prepare_enable(msm_port->pclk)) {
+ clk_disable_unprepare(msm_port->clk);
+ return;
+ }
break;
case 3:
clk_disable_unprepare(msm_port->clk);
int j;
unsigned int num_chars;
char buf[4] = { 0 };
+ const u32 *buffer;
if (is_uartdm)
num_chars = min(count - i, (unsigned int)sizeof(buf));
while (!(msm_read(port, UART_SR) & UART_SR_TX_READY))
cpu_relax();
- iowrite32_rep(tf, buf, 1);
+ buffer = (const u32 *)buf;
+ writel_relaxed_no_log(*buffer, tf);
i += num_chars;
}
spin_unlock(&port->lock);
if (unlikely(!port->membase))
return -ENXIO;
- msm_init_clock(port);
+ msm_serial_set_mnd_regs(port);
if (options)
uart_parse_options(options, &baud, &parity, &bits, &flow);
device->con->write = msm_serial_early_write;
return 0;
}
-EARLYCON_DECLARE(msm_serial, msm_serial_early_console_setup);
OF_EARLYCON_DECLARE(msm_serial, "qcom,msm-uart",
msm_serial_early_console_setup);
device->con->write = msm_serial_early_write_dm;
return 0;
}
-EARLYCON_DECLARE(msm_serial_dm, msm_serial_early_console_setup_dm);
OF_EARLYCON_DECLARE(msm_serial_dm, "qcom,msm-uartdm",
msm_serial_early_console_setup_dm);
msm_port->pclk = devm_clk_get(&pdev->dev, "iface");
if (IS_ERR(msm_port->pclk))
return PTR_ERR(msm_port->pclk);
-
- clk_set_rate(msm_port->clk, 1843200);
}
port->uartclk = clk_get_rate(msm_port->clk);
};
MODULE_DEVICE_TABLE(of, msm_match_table);
+#ifdef CONFIG_PM_SLEEP
+static int msm_serial_suspend(struct device *dev)
+{
+ struct uart_port *port = dev_get_drvdata(dev);
+
+ uart_suspend_port(&msm_uart_driver, port);
+
+ return 0;
+}
+
+static int msm_serial_resume(struct device *dev)
+{
+ struct uart_port *port = dev_get_drvdata(dev);
+
+ uart_resume_port(&msm_uart_driver, port);
+
+ return 0;
+}
+#endif
+
+static const struct dev_pm_ops msm_serial_pm_ops = {
+ SET_SYSTEM_SLEEP_PM_OPS(msm_serial_suspend, msm_serial_resume)
+};
+
static struct platform_driver msm_platform_driver = {
.remove = msm_serial_remove,
.probe = msm_serial_probe,
.driver = {
.name = "msm_serial",
.of_match_table = msm_match_table,
+ .pm = &msm_serial_pm_ops,
},
};
struct uart_state *state = tty->driver_data;
struct uart_port *port = state->uart_port;
+ if (port->ops->wake_peer)
+ port->ops->wake_peer(port);
+
if (!uart_tx_stopped(port))
port->ops->start_tx(port);
}
mutex_lock(&port->mutex);
- if (uport->type != PORT_UNKNOWN)
+ if (uport->type != PORT_UNKNOWN && uport->ops->break_ctl)
uport->ops->break_ctl(uport, break_state);
mutex_unlock(&port->mutex);
#define USB_VENDOR_GENESYS_LOGIC 0x05e3
#define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND 0x01
+extern int deny_new_usb;
+
/* Protect struct usb_device->state and ->children members
* Note: Both are also protected by ->dev.sem, except that ->state can
* change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */
/* synchronize hub-port add/remove and peering operations */
DEFINE_MUTEX(usb_port_peer_mutex);
+static bool skip_extended_resume_delay = true;
+module_param(skip_extended_resume_delay, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(skip_extended_resume_delay,
+ "removes extra delay added to finish bus resume");
+
/* cycle leds on hubs that aren't blinking for attention */
static bool blinkenlights = 0;
module_param(blinkenlights, bool, S_IRUGO);
static bool hub_port_warm_reset_required(struct usb_hub *hub, int port1,
u16 portstatus);
+#define USB_VENDOR_XIAOMI 0x2717
+#define USB_PRODUCT_XIAOMI_HEADSET 0x3801
+
+bool is_xiaomi_headset = false;
+
static inline char *portspeed(struct usb_hub *hub, int portstatus)
{
if (hub_is_superspeed(hub->hdev))
kick_hub_wq(hub);
}
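+/*
+ * Flush any pending hub events; exported so other drivers can wait for
+ * port status changes queued on hub_wq to be processed.
+ */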
+void usb_flush_hub_wq(void)
+{
+ flush_workqueue(hub_wq);
+}
+EXPORT_SYMBOL(usb_flush_hub_wq);
+
/*
* Let the USB core know that a USB 3.0 device has sent a Function Wake Device
* Notification, which indicates it had initiated remote wakeup.
hdev = interface_to_usbdev(intf);
/*
- * Set default autosuspend delay as 0 to speedup bus suspend,
- * based on the below considerations:
- *
- * - Unlike other drivers, the hub driver does not rely on the
- * autosuspend delay to provide enough time to handle a wakeup
- * event, and the submitted status URB is just to check future
- * change on hub downstream ports, so it is safe to do it.
- *
- * - The patch might cause one or more auto supend/resume for
- * below very rare devices when they are plugged into hub
- * first time:
- *
- * devices having trouble initializing, and disconnect
- * themselves from the bus and then reconnect a second
- * or so later
- *
- * devices just for downloading firmware, and disconnects
- * themselves after completing it
- *
- * For these quite rare devices, their drivers may change the
- * autosuspend delay of their parent hub in the probe() to one
- * appropriate value to avoid the subtle problem if someone
- * does care it.
- *
- * - The patch may cause one or more auto suspend/resume on
- * hub during running 'lsusb', but it is probably too
- * infrequent to worry about.
- *
- * - Change autosuspend delay of hub can avoid unnecessary auto
- * suspend timer for hub, also may decrease power consumption
- * of USB bus.
- *
- * - If user has indicated to prevent autosuspend by passing
- * usbcore.autosuspend = -1 then keep autosuspend disabled.
- */
-#ifdef CONFIG_PM
- if (hdev->dev.power.autosuspend_delay >= 0)
- pm_runtime_set_autosuspend_delay(&hdev->dev, 0);
-#endif
-
- /*
* Hubs have proper suspend/resume support, except for root hubs
* where the controller driver doesn't have bus_suspend and
* bus_resume methods.
dev_info(&udev->dev, "USB disconnect, device number %d\n",
udev->devnum);
+ if (is_xiaomi_headset) {
+ dev_info(&udev->dev, "xiaomi headset removed, devnum %d\n", udev->devnum);
+ is_xiaomi_headset = false;
+ }
+
/*
* Ensure that the pm runtime code knows that the USB device
* is in the process of being disconnected.
udev->dev.devt = MKDEV(USB_DEVICE_MAJOR,
(((udev->bus->busnum-1) * 128) + (udev->devnum-1)));
+ if (le16_to_cpu(udev->descriptor.idVendor) == USB_VENDOR_XIAOMI &&
+     le16_to_cpu(udev->descriptor.idProduct) == USB_PRODUCT_XIAOMI_HEADSET) {
+ dev_info(&udev->dev, "xiaomi headset identified, devnum %d\n", udev->devnum);
+ is_xiaomi_headset = true;
+ }
+
/* Tell the world! */
announce_device(udev);
/* drive resume for USB_RESUME_TIMEOUT msec */
dev_dbg(&udev->dev, "usb %sresume\n",
(PMSG_IS_AUTO(msg) ? "auto-" : ""));
- msleep(USB_RESUME_TIMEOUT);
+ if (!skip_extended_resume_delay)
+ usleep_range(USB_RESUME_TIMEOUT * 1000,
+ (USB_RESUME_TIMEOUT + 1) * 1000);
/* Virtual root hubs can trigger on GET_PORT_STATUS to
* stop resume signaling. Then finish the resume
status = hub_port_status(hub, port1, &portstatus, &portchange);
/* TRSMRCY = 10 msec */
- msleep(10);
+ usleep_range(10000, 10500);
}
SuspendCleared:
enum usb_device_speed oldspeed = udev->speed;
const char *speed;
int devnum = udev->devnum;
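+ /* uevent payload sent to the parent hub when a descriptor read fails */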
+ char *error_event[] = {
+ "USB_DEVICE_ERROR=Device_No_Response", NULL };
/* root hub ports have a slightly longer reset period
* (from USB 2.0 spec, section 7.1.7.5)
if (r != -ENODEV)
dev_err(&udev->dev, "device descriptor read/64, error %d\n",
r);
+ kobject_uevent_env(&udev->parent->dev.kobj,
+ KOBJ_CHANGE, error_event);
retval = -EMSGSIZE;
continue;
}
dev_err(&udev->dev,
"device descriptor read/8, error %d\n",
retval);
+ kobject_uevent_env(&udev->parent->dev.kobj,
+ KOBJ_CHANGE, error_event);
if (retval >= 0)
retval = -EMSGSIZE;
} else {
goto done;
return;
}
+
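+ /* Honor the global deny_new_usb toggle: refuse to enumerate newly attached devices while it is set. */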
+ if (deny_new_usb) {
+ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1);
+ goto done;
+ }
+
if (hub_is_superspeed(hub->hdev))
unit_load = 150;
else
/**
* usb_reset_device - warn interface drivers and perform a USB port reset
- * @udev: device to reset (not in SUSPENDED or NOTATTACHED state)
+ * @udev: device to reset (not in NOTATTACHED state)
*
* Warns all drivers bound to registered interfaces (using their pre_reset
* method), performs the port reset, and then lets the drivers know that
struct usb_host_config *config = udev->actconfig;
struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent);
- if (udev->state == USB_STATE_NOTATTACHED ||
- udev->state == USB_STATE_SUSPENDED) {
+ if (udev->state == USB_STATE_NOTATTACHED) {
dev_dbg(&udev->dev, "device reset not allowed in state %d\n",
udev->state);
return -EINVAL;
#include "configfs.h"
#include "u_f.h"
#include "u_os_desc.h"
+#include "debug.h"
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+#include <linux/platform_device.h>
+#include <linux/kdev_t.h>
+#include <linux/usb/ch9.h>
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+extern int acc_ctrlrequest(struct usb_composite_dev *cdev,
+ const struct usb_ctrlrequest *ctrl);
+void acc_disconnect(void);
+#endif
+static struct class *android_class;
+static struct device *android_device;
+static int index;
+static int gadget_index;
+
+struct device *create_function_device(char *name)
+{
+ if (android_device && !IS_ERR(android_device))
+ return device_create(android_class, android_device,
+ MKDEV(0, index++), NULL, name);
+ else
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(create_function_device);
+#endif
int check_user_usb_string(const char *name,
struct usb_gadget_strings *stringtab_dev)
struct usb_composite_driver composite;
struct usb_composite_dev cdev;
bool use_os_desc;
+ bool unbinding;
char b_vendor_code;
char qw_sign[OS_STRING_QW_SIGN_LEN];
spinlock_t spinlock;
bool unbind;
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ bool connected;
+ bool sw_connected;
+ struct work_struct work;
+ struct device *dev;
+#endif
};
static inline struct gadget_info *to_gadget_info(struct config_item *item)
struct list_head list;
};
+#define MAX_USB_STRING_LEN 126
+#define MAX_USB_STRING_WITH_NULL_LEN (MAX_USB_STRING_LEN+1)
+
static int usb_string_copy(const char *s, char **s_copy)
{
int ret;
char *str;
char *copy = *s_copy;
ret = strlen(s);
- if (ret > 126)
+ if (ret > MAX_USB_STRING_LEN)
return -EOVERFLOW;
- str = kstrdup(s, GFP_KERNEL);
- if (!str)
- return -ENOMEM;
+ if (copy) {
+ str = copy;
+ } else {
+ str = kmalloc(MAX_USB_STRING_WITH_NULL_LEN, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
+ }
+ strlcpy(str, s, MAX_USB_STRING_WITH_NULL_LEN);
if (str[ret - 1] == '\n')
str[ret - 1] = '\0';
- kfree(copy);
*s_copy = str;
return 0;
}
if (!gi->udc_name)
return -ENODEV;
+ gi->unbinding = true;
ret = usb_gadget_unregister_driver(&gi->composite.gadget_driver);
if (ret)
return ret;
+ gi->unbinding = false;
kfree(gi->udc_name);
gi->udc_name = NULL;
return 0;
mutex_lock(&gi->lock);
- if (!strlen(name)) {
+ if (!strlen(name) || strcmp(name, "none") == 0) {
ret = unregister_gadget(gi);
if (ret)
goto err;
cfg = container_of(c, struct config_usb_cfg, c);
- list_for_each_entry_safe(f, tmp, &c->functions, list) {
+ list_for_each_entry_safe_reverse(f, tmp, &c->functions, list) {
- list_move_tail(&f->list, &cfg->func_list);
+ list_move(&f->list, &cfg->func_list);
if (f->unbind) {
dev_err(&gi->cdev.gadget->dev, "unbind function"
- " '%s'/%p\n", f->name, f);
+ " '%s'/%pK\n", f->name, f);
f->unbind(c, f);
}
}
int ret;
/* the gi->lock is hold by the caller */
- gi->unbind = 0;
cdev->gadget = gadget;
set_gadget_data(gadget, cdev);
ret = composite_dev_prepare(composite, cdev);
return ret;
}
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
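+/*
+ * Deferred uevent worker: compare the connect/configure state captured under
+ * cdev->lock with what was last reported and send the matching
+ * USB_STATE=CONNECTED/CONFIGURED/DISCONNECTED change uevents.
+ */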
+static void android_work(struct work_struct *data)
+{
+ struct gadget_info *gi = container_of(data, struct gadget_info, work);
+ struct usb_composite_dev *cdev = &gi->cdev;
+ char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
+ char *connected[2] = { "USB_STATE=CONNECTED", NULL };
+ char *configured[2] = { "USB_STATE=CONFIGURED", NULL };
+ /* 0-connected 1-configured 2-disconnected */
+ bool status[3] = { false, false, false };
+ unsigned long flags;
+ bool uevent_sent = false;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ status[1] = true;
+
+ if (gi->connected != gi->sw_connected) {
+ if (gi->connected)
+ status[0] = true;
+ else
+ status[2] = true;
+ gi->sw_connected = gi->connected;
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+
+ if (status[0]) {
+ kobject_uevent_env(&gi->dev->kobj,
+ KOBJ_CHANGE, connected);
+ pr_info("%s: sent uevent %s\n", __func__, connected[0]);
+ uevent_sent = true;
+ }
+
+ if (status[1]) {
+ kobject_uevent_env(&gi->dev->kobj,
+ KOBJ_CHANGE, configured);
+ pr_info("%s: sent uevent %s\n", __func__, configured[0]);
+ uevent_sent = true;
+ }
+
+ if (status[2]) {
+ kobject_uevent_env(&gi->dev->kobj,
+ KOBJ_CHANGE, disconnected);
+ pr_info("%s: sent uevent %s\n", __func__, disconnected[0]);
+ uevent_sent = true;
+ }
+
+ if (!uevent_sent) {
+ pr_info("%s: did not send uevent (%d %d %pK)\n", __func__,
+ gi->connected, gi->sw_connected, cdev->config);
+ }
+}
+#endif
+
static void configfs_composite_unbind(struct usb_gadget *gadget)
{
struct usb_composite_dev *cdev;
struct gadget_info *gi;
- unsigned long flags;
/* the gi->lock is hold by the caller */
cdev = get_gadget_data(gadget);
gi = container_of(cdev, struct gadget_info, cdev);
- spin_lock_irqsave(&gi->spinlock, flags);
- gi->unbind = 1;
- spin_unlock_irqrestore(&gi->spinlock, flags);
kfree(otg_desc[0]);
otg_desc[0] = NULL;
purge_configs_funcs(gi);
composite_dev_cleanup(cdev);
usb_ep_autoconfig_reset(cdev->gadget);
- spin_lock_irqsave(&gi->spinlock, flags);
cdev->gadget = NULL;
set_gadget_data(gadget, NULL);
- spin_unlock_irqrestore(&gi->spinlock, flags);
}
-static int configfs_composite_setup(struct usb_gadget *gadget,
- const struct usb_ctrlrequest *ctrl)
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
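+/*
+ * Control request dispatcher used when Android uevents are enabled: let each
+ * function instance try to handle the request first, then the accessory
+ * handler, then fall back to composite_setup(); schedule state uevents on
+ * connect and on SET_CONFIGURATION.
+ */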
+static int android_setup(struct usb_gadget *gadget,
+ const struct usb_ctrlrequest *c)
{
- struct usb_composite_dev *cdev;
- struct gadget_info *gi;
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
unsigned long flags;
- int ret;
+ struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+ int value = -EOPNOTSUPP;
+ struct usb_function_instance *fi;
- cdev = get_gadget_data(gadget);
- if (!cdev)
- return 0;
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (!gi->connected) {
+ gi->connected = 1;
+ schedule_work(&gi->work);
+ }
+ spin_unlock_irqrestore(&cdev->lock, flags);
+ list_for_each_entry(fi, &gi->available_func, cfs_list) {
+ if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) {
+ value = fi->f->setup(fi->f, c);
+ if (value >= 0)
+ break;
+ }
+ }
- gi = container_of(cdev, struct gadget_info, cdev);
- spin_lock_irqsave(&gi->spinlock, flags);
- cdev = get_gadget_data(gadget);
- if (!cdev || gi->unbind) {
- spin_unlock_irqrestore(&gi->spinlock, flags);
- return 0;
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ if (value < 0)
+ value = acc_ctrlrequest(cdev, c);
+#endif
+
+ if (value < 0)
+ value = composite_setup(gadget, c);
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
+ cdev->config) {
+ schedule_work(&gi->work);
}
+ spin_unlock_irqrestore(&cdev->lock, flags);
- ret = composite_setup(gadget, ctrl);
- spin_unlock_irqrestore(&gi->spinlock, flags);
- return ret;
+ return value;
}
-static void configfs_composite_disconnect(struct usb_gadget *gadget)
+static void android_disconnect(struct usb_gadget *gadget)
{
- struct usb_composite_dev *cdev;
+ struct usb_composite_dev *cdev = get_gadget_data(gadget);
struct gadget_info *gi;
- unsigned long flags;
- cdev = get_gadget_data(gadget);
- if (!cdev)
+ if (!cdev) {
+ pr_err("%s: gadget is not connected\n", __func__);
return;
+ }
gi = container_of(cdev, struct gadget_info, cdev);
- spin_lock_irqsave(&gi->spinlock, flags);
- cdev = get_gadget_data(gadget);
- if (!cdev || gi->unbind) {
- spin_unlock_irqrestore(&gi->spinlock, flags);
+
+ /* FIXME: There's a race between usb_gadget_udc_stop(), which is likely
+ * to set the gadget driver to NULL in the UDC driver, and this driver's
+ * gadget disconnect fn, which likely checks whether the gadget driver
+ * is a null ptr. It happens that unbind (doing set_gadget_data(NULL))
+ * is called before the gadget driver is set to NULL and the UDC driver
+ * calls the disconnect fn, which results in cdev being a null ptr.
+ */
+ if (cdev == NULL) {
+ WARN(1, "%s: gadget driver already disconnected\n", __func__);
return;
}
+ /*
+ * Accessory HID support can be active while the accessory function is
+ * not actually enabled, so we need to inform it when we are
+ * disconnected.
+ */
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+ acc_disconnect();
+#endif
+ gi->connected = 0;
+ if (!gi->unbinding)
+ schedule_work(&gi->work);
composite_disconnect(gadget);
- spin_unlock_irqrestore(&gi->spinlock, flags);
}
+#endif
+
+static const struct usb_gadget_driver configfs_driver_template = {
+ .bind = configfs_composite_bind,
+ .unbind = configfs_composite_unbind,
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ .setup = android_setup,
+ .reset = android_disconnect,
+ .disconnect = android_disconnect,
+#else
+ .setup = composite_setup,
+ .reset = composite_disconnect,
+ .disconnect = composite_disconnect,
+#endif
+ .suspend = composite_suspend,
+ .resume = composite_resume,
+
+ .max_speed = USB_SPEED_SUPER,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "configfs-gadget",
+ },
+};
-static void configfs_composite_suspend(struct usb_gadget *gadget)
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
+ char *buf)
{
+ struct gadget_info *dev = dev_get_drvdata(pdev);
struct usb_composite_dev *cdev;
- struct gadget_info *gi;
+ char *state = "DISCONNECTED";
unsigned long flags;
- cdev = get_gadget_data(gadget);
- if (!cdev)
- return;
+ if (!dev)
+ goto out;
- gi = container_of(cdev, struct gadget_info, cdev);
- spin_lock_irqsave(&gi->spinlock, flags);
- cdev = get_gadget_data(gadget);
- if (!cdev || gi->unbind) {
- spin_unlock_irqrestore(&gi->spinlock, flags);
- return;
- }
+ cdev = &dev->cdev;
- composite_suspend(gadget);
- spin_unlock_irqrestore(&gi->spinlock, flags);
+ if (!cdev)
+ goto out;
+
+ spin_lock_irqsave(&cdev->lock, flags);
+ if (cdev->config)
+ state = "CONFIGURED";
+ else if (dev->connected)
+ state = "CONNECTED";
+ spin_unlock_irqrestore(&cdev->lock, flags);
+out:
+ return sprintf(buf, "%s\n", state);
}
-static void configfs_composite_resume(struct usb_gadget *gadget)
-{
- struct usb_composite_dev *cdev;
- struct gadget_info *gi;
- unsigned long flags;
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
- cdev = get_gadget_data(gadget);
- if (!cdev)
- return;
+static struct device_attribute *android_usb_attributes[] = {
+ &dev_attr_state,
+ NULL
+};
- gi = container_of(cdev, struct gadget_info, cdev);
- spin_lock_irqsave(&gi->spinlock, flags);
- cdev = get_gadget_data(gadget);
- if (!cdev || gi->unbind) {
- spin_unlock_irqrestore(&gi->spinlock, flags);
- return;
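+/*
+ * Create the per-gadget "android%d" device with its "state" attribute so
+ * userspace can query whether the gadget is connected or configured.
+ */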
+static int android_device_create(struct gadget_info *gi)
+{
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
+ char str[10];
+
+ INIT_WORK(&gi->work, android_work);
+ snprintf(str, sizeof(str), "android%d", gadget_index - 1);
+ pr_debug("Creating android device %s\n", str);
+ gi->dev = device_create(android_class, NULL,
+ MKDEV(0, 0), NULL, str);
+ if (IS_ERR(gi->dev))
+ return PTR_ERR(gi->dev);
+
+ dev_set_drvdata(gi->dev, gi);
+ if (gadget_index == 1)
+ android_device = gi->dev;
+
+ attrs = android_usb_attributes;
+ while ((attr = *attrs++)) {
+ int err;
+
+ err = device_create_file(gi->dev, attr);
+ if (err) {
+ device_destroy(gi->dev->class,
+ gi->dev->devt);
+ return err;
+ }
}
- composite_resume(gadget);
- spin_unlock_irqrestore(&gi->spinlock, flags);
+ return 0;
}
-static const struct usb_gadget_driver configfs_driver_template = {
- .bind = configfs_composite_bind,
- .unbind = configfs_composite_unbind,
-
- .setup = configfs_composite_setup,
- .reset = configfs_composite_disconnect,
- .disconnect = configfs_composite_disconnect,
+static void android_device_destroy(struct device *dev)
+{
+ struct device_attribute **attrs;
+ struct device_attribute *attr;
- .suspend = configfs_composite_suspend,
- .resume = configfs_composite_resume,
+ attrs = android_usb_attributes;
+ while ((attr = *attrs++))
+ device_remove_file(dev, attr);
+ device_destroy(dev->class, dev->devt);
+}
+#else
+static inline int android_device_create(struct gadget_info *gi)
+{
+ return 0;
+}
- .max_speed = USB_SPEED_SUPER,
- .driver = {
- .owner = THIS_MODULE,
- .name = "configfs-gadget",
- },
-};
+static inline void android_device_destroy(struct device *dev)
+{
+}
+#endif
static struct config_group *gadgets_make(
struct config_group *group,
gi = kzalloc(sizeof(*gi), GFP_KERNEL);
if (!gi)
return ERR_PTR(-ENOMEM);
-
gi->group.default_groups = gi->default_groups;
gi->group.default_groups[0] = &gi->functions_group;
gi->group.default_groups[1] = &gi->configs_group;
gi->composite.resume = NULL;
gi->composite.max_speed = USB_SPEED_SUPER;
+ spin_lock_init(&gi->spinlock);
mutex_init(&gi->lock);
INIT_LIST_HEAD(&gi->string_list);
INIT_LIST_HEAD(&gi->available_func);
if (!gi->composite.gadget_driver.function)
goto err;
+ gadget_index++;
+ pr_debug("Creating gadget index %d\n", gadget_index);
+ if (android_device_create(gi) < 0)
+ goto err;
+
config_group_init_type_name(&gi->group, name,
&gadget_root_type);
return &gi->group;
+
err:
kfree(gi);
return ERR_PTR(-ENOMEM);
static void gadgets_drop(struct config_group *group, struct config_item *item)
{
+ struct gadget_info *gi;
+
+ gi = container_of(to_config_group(item), struct gadget_info, group);
config_item_put(item);
+ if (gi->dev) {
+ android_device_destroy(gi->dev);
+ gi->dev = NULL;
+ }
}
static struct configfs_group_operations gadgets_ops = {
{
struct gadget_info *gi = to_gadget_info(item);
+ /* protect against a race with gadget_dev_desc_UDC_store() */
mutex_lock(&gi->lock);
unregister_gadget(gi);
mutex_unlock(&gi->lock);
config_group_init(&gadget_subsys.su_group);
+ debug_debugfs_init();
+
ret = configfs_register_subsystem(&gadget_subsys);
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ android_class = class_create(THIS_MODULE, "android_usb");
+ if (IS_ERR(android_class))
+ return PTR_ERR(android_class);
+#endif
+
return ret;
}
module_init(gadget_cfs_init);
static void __exit gadget_cfs_exit(void)
{
+ debug_debugfs_exit();
configfs_unregister_subsystem(&gadget_subsys);
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+ if (!IS_ERR(android_class))
+ class_destroy(android_class);
+#endif
+
}
module_exit(gadget_cfs_exit);
* Copyright (C) 2003 Al Borchers (alborchers@steinerpoint.com)
* Copyright (C) 2008 David Brownell
* Copyright (C) 2008 by Nokia Corporation
+ * Copyright (c) 2013-2017 The Linux Foundation. All rights reserved.
*
* This code also borrows from usbserial.c, which is
* Copyright (C) 1999 - 2002 Greg Kroah-Hartman (greg@kroah.com)
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/workqueue.h>
#include "u_serial.h"
* next layer of buffering. For TX that's a circular buffer; for RX
* consider it a NOP. A third layer is provided by the TTY code.
*/
-#define QUEUE_SIZE 16
+#define TX_QUEUE_SIZE 8
+#define TX_BUF_SIZE 4096
#define WRITE_BUF_SIZE 8192 /* TX only */
+#define RX_QUEUE_SIZE 8
+#define RX_BUF_SIZE 4096
+
/* circular buffer */
struct gs_buf {
unsigned buf_size;
int read_allocated;
struct list_head read_queue;
unsigned n_read;
- struct tasklet_struct push;
+ struct work_struct push;
struct list_head write_pool;
int write_started;
/* REVISIT this state ... */
struct usb_cdc_line_coding port_line_coding; /* 8-N-1 etc */
+ unsigned long nbytes_from_host;
+ unsigned long nbytes_to_tty;
+ unsigned long nbytes_from_tty;
+ unsigned long nbytes_to_host;
};
static struct portmaster {
struct gs_port *port;
} ports[MAX_U_SERIAL_PORTS];
+static struct workqueue_struct *gserial_wq;
#define GS_CLOSE_TIMEOUT 15 /* seconds */
__acquires(&port->port_lock)
*/
{
- struct list_head *pool = &port->write_pool;
+ struct list_head *pool;
struct usb_ep *in;
int status = 0;
+ static long prev_len;
bool do_tty_wake = false;
- if (!port->port_usb)
- return status;
+ if (!port || !port->port_usb) {
+ pr_err("Error - port or port->usb is NULL.");
+ return -EIO;
+ }
- in = port->port_usb->in;
+ pool = &port->write_pool;
+ in = port->port_usb->in;
while (!port->write_busy && !list_empty(pool)) {
struct usb_request *req;
int len;
- if (port->write_started >= QUEUE_SIZE)
+ if (port->write_started >= TX_QUEUE_SIZE)
break;
req = list_entry(pool->next, struct usb_request, list);
- len = gs_send_packet(port, req->buf, in->maxpacket);
+ len = gs_send_packet(port, req->buf, TX_BUF_SIZE);
if (len == 0) {
+ /* Queue zero length packet explicitly to make it
+ * work with UDCs which don't support req->zero flag
+ */
+ if (prev_len && (prev_len % in->maxpacket == 0)) {
+ req->length = 0;
+ list_del(&req->list);
+ spin_unlock(&port->port_lock);
+ status = usb_ep_queue(in, req, GFP_ATOMIC);
+ spin_lock(&port->port_lock);
+ if (!port->port_usb) {
+ gs_free_req(in, req);
+ break;
+ }
+ if (status) {
+ printk(KERN_ERR "%s: %s err %d\n",
+ __func__, "queue", status);
+ list_add(&req->list, pool);
+ }
+ prev_len = 0;
+ }
wake_up_interruptible(&port->drain_wait);
break;
}
req->length = len;
list_del(&req->list);
- req->zero = (gs_buf_data_avail(&port->port_write_buf) == 0);
pr_vdebug("ttyGS%d: tx len=%d, 0x%02x 0x%02x 0x%02x ...\n",
port->port_num, len, *((u8 *)req->buf),
status = usb_ep_queue(in, req, GFP_ATOMIC);
spin_lock(&port->port_lock);
port->write_busy = false;
+ /*
+ * If port_usb is NULL, gserial disconnect is called
+ * while the spinlock is dropped and all requests are
+ * freed. Free the current request here.
+ */
+ if (!port->port_usb) {
+ do_tty_wake = false;
+ gs_free_req(in, req);
+ break;
+ }
if (status) {
pr_debug("%s: %s %s err %d\n",
break;
}
- port->write_started++;
+ prev_len = req->length;
+ port->nbytes_from_tty += req->length;
- /* abort immediately after disconnect */
- if (!port->port_usb)
- break;
+ port->write_started++;
}
if (do_tty_wake && port->port.tty)
__acquires(&port->port_lock)
*/
{
- struct list_head *pool = &port->read_pool;
- struct usb_ep *out = port->port_usb->out;
+ struct list_head *pool;
+ struct usb_ep *out;
+ unsigned started = 0;
+
+ if (!port || !port->port_usb) {
+ pr_err("Error - port or port->usb is NULL.");
+ return -EIO;
+ }
+
+ pool = &port->read_pool;
+ out = port->port_usb->out;
while (!list_empty(pool)) {
struct usb_request *req;
if (!tty)
break;
- if (port->read_started >= QUEUE_SIZE)
+ if (port->read_started >= RX_QUEUE_SIZE)
break;
req = list_entry(pool->next, struct usb_request, list);
list_del(&req->list);
- req->length = out->maxpacket;
+ req->length = RX_BUF_SIZE;
/* drop lock while we call out; the controller driver
* may need to call us back (e.g. for disconnect)
status = usb_ep_queue(out, req, GFP_ATOMIC);
spin_lock(&port->port_lock);
+ /*
+ * If port_usb is NULL, gserial disconnect is called
+ * while the spinlock is dropped and all requests are
+ * freed. Free the current request here.
+ */
+ if (!port->port_usb) {
+ started = 0;
+ gs_free_req(out, req);
+ break;
+ }
+
if (status) {
pr_debug("%s: %s %s err %d\n",
__func__, "queue", out->name, status);
break;
}
port->read_started++;
-
- /* abort immediately after disconnect */
- if (!port->port_usb)
- break;
}
return port->read_started;
}
* So QUEUE_SIZE packets plus however many the FIFO holds (usually two)
* can be buffered before the TTY layer's buffers (currently 64 KB).
*/
-static void gs_rx_push(unsigned long _port)
+static void gs_rx_push(struct work_struct *w)
{
- struct gs_port *port = (void *)_port;
+ struct gs_port *port = container_of(w, struct gs_port, push);
struct tty_struct *tty;
struct list_head *queue = &port->read_queue;
bool disconnect = false;
count = tty_insert_flip_string(&port->port, packet,
size);
+ port->nbytes_to_tty += count;
if (count)
do_push = true;
if (count != size) {
* this time around, there may be trouble unless there's an
* implicit tty_unthrottle() call on its way...
*
- * REVISIT we should probably add a timer to keep the tasklet
+ * REVISIT we should probably add a timer to keep the work queue
* from starving ... but it's not clear that case ever happens.
*/
if (!list_empty(queue) && tty) {
if (!test_bit(TTY_THROTTLED, &tty->flags)) {
if (do_push)
- tasklet_schedule(&port->push);
+ queue_work(gserial_wq, &port->push);
else
pr_warn("ttyGS%d: RX not scheduled?\n",
port->port_num);
static void gs_read_complete(struct usb_ep *ep, struct usb_request *req)
{
struct gs_port *port = ep->driver_data;
+ unsigned long flags;
/* Queue all received data until the tty layer is ready for it. */
- spin_lock(&port->port_lock);
+ spin_lock_irqsave(&port->port_lock, flags);
+ port->nbytes_from_host += req->actual;
list_add_tail(&req->list, &port->read_queue);
- tasklet_schedule(&port->push);
- spin_unlock(&port->port_lock);
+ queue_work(gserial_wq, &port->push);
+ spin_unlock_irqrestore(&port->port_lock, flags);
}
static void gs_write_complete(struct usb_ep *ep, struct usb_request *req)
{
struct gs_port *port = ep->driver_data;
+ unsigned long flags;
- spin_lock(&port->port_lock);
+ spin_lock_irqsave(&port->port_lock, flags);
+ port->nbytes_to_host += req->actual;
list_add(&req->list, &port->write_pool);
port->write_started--;
/* FALL THROUGH */
case 0:
/* normal completion */
- gs_start_tx(port);
+ if (port->port_usb)
+ gs_start_tx(port);
break;
case -ESHUTDOWN:
break;
}
- spin_unlock(&port->port_lock);
+ spin_unlock_irqrestore(&port->port_lock, flags);
}
static void gs_free_requests(struct usb_ep *ep, struct list_head *head,
}
static int gs_alloc_requests(struct usb_ep *ep, struct list_head *head,
+ int queue_size, int req_size,
void (*fn)(struct usb_ep *, struct usb_request *),
int *allocated)
{
int i;
struct usb_request *req;
- int n = allocated ? QUEUE_SIZE - *allocated : QUEUE_SIZE;
+ int n = allocated ? queue_size - *allocated : queue_size;
/* Pre-allocate up to QUEUE_SIZE transfers, but if we can't
* do quite that many this time, don't fail ... we just won't
* be as speedy as we might otherwise be.
*/
for (i = 0; i < n; i++) {
- req = gs_alloc_req(ep, ep->maxpacket, GFP_ATOMIC);
+ req = gs_alloc_req(ep, req_size, GFP_ATOMIC);
if (!req)
return list_empty(head) ? -ENOMEM : 0;
req->complete = fn;
*/
static int gs_start_io(struct gs_port *port)
{
- struct list_head *head = &port->read_pool;
- struct usb_ep *ep = port->port_usb->out;
+ struct list_head *head;
+ struct usb_ep *ep;
int status;
unsigned started;
+ if (!port || !port->port_usb) {
+ pr_err("Error - port or port->usb is NULL.");
+ return -EIO;
+ }
+
+ head = &port->read_pool;
+ ep = port->port_usb->out;
+
/* Allocate RX and TX I/O buffers. We can't easily do this much
* earlier (with GFP_KERNEL) because the requests are coupled to
* endpoints, as are the packet sizes we'll be using. Different
* configurations may use different endpoints with a given port;
* and high speed vs full speed changes packet sizes too.
*/
- status = gs_alloc_requests(ep, head, gs_read_complete,
- &port->read_allocated);
+ status = gs_alloc_requests(ep, head, RX_QUEUE_SIZE, RX_BUF_SIZE,
+ gs_read_complete, &port->read_allocated);
if (status)
return status;
status = gs_alloc_requests(port->port_usb->in, &port->write_pool,
+ TX_QUEUE_SIZE, TX_BUF_SIZE,
gs_write_complete, &port->write_allocated);
if (status) {
gs_free_requests(ep, head, &port->read_allocated);
port->n_read = 0;
started = gs_start_rx(port);
+ if (!port->port_usb)
+ return -EIO;
+
/* unblock any pending writes into our circular buffer */
if (started) {
tty_wakeup(port->port.tty);
spin_lock_irq(&port->port_lock);
if (status) {
- pr_debug("gs_open: ttyGS%d (%p,%p) no buffer\n",
+ pr_debug("gs_open: ttyGS%d (%pK,%pK) no buffer\n",
port->port_num, tty, file);
port->openclose = false;
goto exit_unlock_port;
gser->connect(gser);
}
- pr_debug("gs_open: ttyGS%d (%p,%p)\n", port->port_num, tty, file);
+ pr_debug("gs_open: ttyGS%d (%pK,%pK)\n", port->port_num, tty, file);
status = 0;
goto exit;
}
- pr_debug("gs_close: ttyGS%d (%p,%p) ...\n", port->port_num, tty, file);
+ pr_debug("gs_close: ttyGS%d (%pK,%pK) ...\n",
+ port->port_num, tty, file);
/* mark port as closing but in use; we can drop port lock
* and sleep if necessary
/* Iff we're disconnected, there can be no I/O in flight so it's
* ok to free the circular buffer; else just scrub it. And don't
- * let the push tasklet fire again until we're re-opened.
+ * let the push work queue fire again until we're re-opened.
*/
if (gser == NULL)
gs_buf_free(&port->port_write_buf);
port->openclose = false;
- pr_debug("gs_close: ttyGS%d (%p,%p) done!\n",
+ pr_debug("gs_close: ttyGS%d (%pK,%pK) done!\n",
port->port_num, tty, file);
wake_up(&port->close_wait);
unsigned long flags;
int status;
- pr_vdebug("gs_write: ttyGS%d (%p) writing %d bytes\n",
+ if (!port)
+ return 0;
+
+ pr_vdebug("gs_write: ttyGS%d (%pK) writing %d bytes\n",
port->port_num, tty, count);
spin_lock_irqsave(&port->port_lock, flags);
unsigned long flags;
int status;
- pr_vdebug("gs_put_char: (%d,%p) char=0x%x, called from %ps\n",
+ if (!port)
+ return 0;
+ pr_vdebug("gs_put_char: (%d,%pK) char=0x%x, called from %pKs\n",
port->port_num, tty, ch, __builtin_return_address(0));
spin_lock_irqsave(&port->port_lock, flags);
struct gs_port *port = tty->driver_data;
unsigned long flags;
- pr_vdebug("gs_flush_chars: (%d,%p)\n", port->port_num, tty);
+ if (!port)
+ return;
+ pr_vdebug("gs_flush_chars: (%d,%pK)\n", port->port_num, tty);
spin_lock_irqsave(&port->port_lock, flags);
if (port->port_usb)
unsigned long flags;
int room = 0;
+ if (!port)
+ return 0;
spin_lock_irqsave(&port->port_lock, flags);
if (port->port_usb)
room = gs_buf_space_avail(&port->port_write_buf);
spin_unlock_irqrestore(&port->port_lock, flags);
- pr_vdebug("gs_write_room: (%d,%p) room=%d\n",
+ pr_vdebug("gs_write_room: (%d,%pK) room=%d\n",
port->port_num, tty, room);
return room;
chars = gs_buf_data_avail(&port->port_write_buf);
spin_unlock_irqrestore(&port->port_lock, flags);
- pr_vdebug("gs_chars_in_buffer: (%d,%p) chars=%d\n",
+ pr_vdebug("gs_chars_in_buffer: (%d,%pK) chars=%d\n",
port->port_num, tty, chars);
return chars;
struct gs_port *port = tty->driver_data;
unsigned long flags;
+ /*
+ * tty's driver data is set to NULL during port close. Nothing
+ * to do here.
+ */
+ if (!port)
+ return;
+
spin_lock_irqsave(&port->port_lock, flags);
if (port->port_usb) {
/* Kickstart read queue processing. We don't do xon/xoff,
* rts/cts, or other handshaking with the host, but if the
* read queue backs up enough we'll be NAKing OUT packets.
*/
- tasklet_schedule(&port->push);
+ queue_work(gserial_wq, &port->push);
pr_vdebug("ttyGS%d: unthrottle\n", port->port_num);
}
spin_unlock_irqrestore(&port->port_lock, flags);
int status = 0;
struct gserial *gser;
+ if (!port)
+ return 0;
pr_vdebug("gs_break_ctl: ttyGS%d, send break (%d) \n",
port->port_num, duration);
return status;
}
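+/*
+ * Report modem control lines: DTR/RTS are read back from the function driver
+ * when it provides get_dtr()/get_rts(); CD and RI are mirrored from the
+ * locally cached serial_state bits.
+ */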
+static int gs_tiocmget(struct tty_struct *tty)
+{
+ struct gs_port *port = tty->driver_data;
+ struct gserial *gser;
+ unsigned int result = 0;
+
+ spin_lock_irq(&port->port_lock);
+ gser = port->port_usb;
+ if (!gser) {
+ result = -ENODEV;
+ goto fail;
+ }
+
+ if (gser->get_dtr)
+ result |= (gser->get_dtr(gser) ? TIOCM_DTR : 0);
+
+ if (gser->get_rts)
+ result |= (gser->get_rts(gser) ? TIOCM_RTS : 0);
+
+ if (gser->serial_state & TIOCM_CD)
+ result |= TIOCM_CD;
+
+ if (gser->serial_state & TIOCM_RI)
+ result |= TIOCM_RI;
+
+fail:
+ spin_unlock_irq(&port->port_lock);
+ return result;
+}
+
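+/*
+ * Assert or clear RI/CD toward the host: update the cached serial_state and
+ * call the function driver's send_ring_indicator()/send_carrier_detect()
+ * hooks when they exist.
+ */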
+static int gs_tiocmset(struct tty_struct *tty,
+ unsigned int set, unsigned int clear)
+{
+ struct gs_port *port = tty->driver_data;
+ struct gserial *gser;
+ int status = 0;
+
+ spin_lock_irq(&port->port_lock);
+ gser = port->port_usb;
+
+ if (!gser) {
+ status = -ENODEV;
+ goto fail;
+ }
+
+ if (set & TIOCM_RI) {
+ if (gser->send_ring_indicator) {
+ gser->serial_state |= TIOCM_RI;
+ status = gser->send_ring_indicator(gser, 1);
+ }
+ }
+
+ if (clear & TIOCM_RI) {
+ if (gser->send_ring_indicator) {
+ gser->serial_state &= ~TIOCM_RI;
+ status = gser->send_ring_indicator(gser, 0);
+ }
+ }
+
+ if (set & TIOCM_CD) {
+ if (gser->send_carrier_detect) {
+ gser->serial_state |= TIOCM_CD;
+ status = gser->send_carrier_detect(gser, 1);
+ }
+ }
+
+ if (clear & TIOCM_CD) {
+ if (gser->send_carrier_detect) {
+ gser->serial_state &= ~TIOCM_CD;
+ status = gser->send_carrier_detect(gser, 0);
+ }
+ }
+fail:
+ spin_unlock_irq(&port->port_lock);
+ return status;
+}
+
static const struct tty_operations gs_tty_ops = {
.open = gs_open,
.close = gs_close,
.chars_in_buffer = gs_chars_in_buffer,
.unthrottle = gs_unthrottle,
.break_ctl = gs_break_ctl,
+ .tiocmget = gs_tiocmget,
+ .tiocmset = gs_tiocmset,
};
/*-------------------------------------------------------------------------*/
init_waitqueue_head(&port->drain_wait);
init_waitqueue_head(&port->close_wait);
- tasklet_init(&port->push, gs_rx_push, (unsigned long) port);
+ INIT_WORK(&port->push, gs_rx_push);
INIT_LIST_HEAD(&port->read_pool);
INIT_LIST_HEAD(&port->read_queue);
return ret;
}
+#if defined(CONFIG_DEBUG_FS)
+
+#define BUF_SIZE 512
+
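+/*
+ * debugfs interface: usb_serial<N>/readstatus dumps the per-port byte
+ * counters (and DTR state when available); usb_serial<N>/reset clears them.
+ */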
+static ssize_t debug_read_status(struct file *file, char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct gs_port *ui_dev = file->private_data;
+ struct tty_struct *tty;
+ struct gserial *gser;
+ char *buf;
+ unsigned long flags;
+ int i = 0;
+ int ret;
+ int result = 0;
+
+ if (!ui_dev)
+ return -EINVAL;
+
+ tty = ui_dev->port.tty;
+ gser = ui_dev->port_usb;
+
+ buf = kzalloc(BUF_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&ui_dev->port_lock, flags);
+
+ i += scnprintf(buf + i, BUF_SIZE - i,
+ "nbytes_from_host: %lu\n", ui_dev->nbytes_from_host);
+
+ i += scnprintf(buf + i, BUF_SIZE - i,
+ "nbytes_to_tty: %lu\n", ui_dev->nbytes_to_tty);
+
+ i += scnprintf(buf + i, BUF_SIZE - i, "nbytes_with_usb_OUT_txr: %lu\n",
+ (ui_dev->nbytes_from_host - ui_dev->nbytes_to_tty));
+
+ i += scnprintf(buf + i, BUF_SIZE - i,
+ "nbytes_from_tty: %lu\n", ui_dev->nbytes_from_tty);
+
+ i += scnprintf(buf + i, BUF_SIZE - i,
+ "nbytes_to_host: %lu\n", ui_dev->nbytes_to_host);
+
+ i += scnprintf(buf + i, BUF_SIZE - i, "nbytes_with_usb_IN_txr: %lu\n",
+ (ui_dev->nbytes_from_tty - ui_dev->nbytes_to_host));
+
+ if (tty)
+ i += scnprintf(buf + i, BUF_SIZE - i,
+ "tty_flags: %lu\n", tty->flags);
+
+ if (gser && gser->get_dtr) {
+ result |= (gser->get_dtr(gser) ? TIOCM_DTR : 0);
+ i += scnprintf(buf + i, BUF_SIZE - i,
+ "DTR_status: %d\n", result);
+ }
+
+ spin_unlock_irqrestore(&ui_dev->port_lock, flags);
+ ret = simple_read_from_buffer(ubuf, count, ppos, buf, i);
+ kfree(buf);
+ return ret;
+}
+
+static ssize_t debug_write_reset(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct gs_port *ui_dev = file->private_data;
+ unsigned long flags;
+
+ if (!ui_dev)
+ return -EINVAL;
+
+ spin_lock_irqsave(&ui_dev->port_lock, flags);
+ ui_dev->nbytes_from_host = ui_dev->nbytes_to_tty =
+ ui_dev->nbytes_from_tty = ui_dev->nbytes_to_host = 0;
+ spin_unlock_irqrestore(&ui_dev->port_lock, flags);
+
+ return count;
+}
+
+static int serial_debug_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static const struct file_operations debug_rst_ops = {
+ .open = serial_debug_open,
+ .write = debug_write_reset,
+};
+
+static const struct file_operations debug_adb_ops = {
+ .open = serial_debug_open,
+ .read = debug_read_status,
+};
+
+static struct dentry *gs_dent;
+static void usb_debugfs_init(struct gs_port *ui_dev, int port_num)
+{
+ char buf[48];
+
+ if (!ui_dev)
+ return;
+
+ snprintf(buf, sizeof(buf), "usb_serial%d", port_num);
+ gs_dent = debugfs_create_dir(buf, 0);
+ if (!gs_dent || IS_ERR(gs_dent))
+ return;
+
+ debugfs_create_file("readstatus", 0444, gs_dent, ui_dev,
+ &debug_adb_ops);
+ debugfs_create_file("reset", S_IRUGO | S_IWUSR,
+ gs_dent, ui_dev, &debug_rst_ops);
+}
+
+static void usb_debugfs_remove(void)
+{
+ debugfs_remove_recursive(gs_dent);
+}
+#else
+static inline void usb_debugfs_init(struct gs_port *ui_dev, int port_num) {}
+static inline void usb_debugfs_remove(void) {}
+#endif
+
static int gs_closed(struct gs_port *port)
{
int cond;
static void gserial_free_port(struct gs_port *port)
{
- tasklet_kill(&port->push);
+ cancel_work_sync(&port->push);
/* wait for old opens to finish */
wait_event(port->close_wait, gs_closed(port));
WARN_ON(port->port_usb != NULL);
__func__, port_num, PTR_ERR(tty_dev));
ret = PTR_ERR(tty_dev);
+ mutex_lock(&ports[port_num].lock);
port = ports[port_num].port;
ports[port_num].port = NULL;
+ mutex_unlock(&ports[port_num].lock);
gserial_free_port(port);
goto err;
}
port->read_allocated = port->read_started =
port->write_allocated = port->write_started = 0;
+ port->nbytes_from_host = port->nbytes_to_tty =
+ port->nbytes_from_tty = port->nbytes_to_host = 0;
+
spin_unlock_irqrestore(&port->port_lock, flags);
}
EXPORT_SYMBOL_GPL(gserial_disconnect);
gs_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
gs_tty_driver->subtype = SERIAL_TYPE_NORMAL;
- gs_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
+ gs_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV
+ | TTY_DRIVER_RESET_TERMIOS;
gs_tty_driver->init_termios = tty_std_termios;
/* 9600-8-N-1 ... matches defaults expected by "usbser.sys" on
for (i = 0; i < MAX_U_SERIAL_PORTS; i++)
mutex_init(&ports[i].lock);
+ gserial_wq = create_singlethread_workqueue("k_gserial");
+ if (!gserial_wq) {
+ status = -ENOMEM;
+ goto fail;
+ }
+
/* export the driver ... */
status = tty_register_driver(gs_tty_driver);
if (status) {
goto fail;
}
+ for (i = 0; i < MAX_U_SERIAL_PORTS; i++)
+ usb_debugfs_init(ports[i].port, i);
+
pr_debug("%s: registered %d ttyGS* device%s\n", __func__,
MAX_U_SERIAL_PORTS,
(MAX_U_SERIAL_PORTS == 1) ? "" : "s");
return status;
fail:
put_tty_driver(gs_tty_driver);
+ if (gserial_wq)
+ destroy_workqueue(gserial_wq);
gs_tty_driver = NULL;
return status;
}
static void userial_cleanup(void)
{
+ usb_debugfs_remove();
+ destroy_workqueue(gserial_wq);
tty_unregister_driver(gs_tty_driver);
put_tty_driver(gs_tty_driver);
gs_tty_driver = NULL;
* Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-
+#include <linux/gfp.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
int i;
ret = 0;
- virt_dev = xhci->devs[slot_id];
- if (!virt_dev)
- return -ENODEV;
-
cmd = xhci_alloc_command(xhci, false, true, GFP_NOIO);
if (!cmd) {
xhci_dbg(xhci, "Couldn't allocate command structure.\n");
}
spin_lock_irqsave(&xhci->lock, flags);
+ virt_dev = xhci->devs[slot_id];
+ if (!virt_dev) {
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ xhci_free_command(xhci, cmd);
+ return -ENODEV;
+ }
+
for (i = LAST_EP_INDEX; i > 0; i--) {
if (virt_dev->eps[i].ring && virt_dev->eps[i].ring->dequeue) {
struct xhci_command *command;
struct xhci_bus_state *bus_state,
__le32 __iomem **port_array,
u16 wIndex, u32 raw_port_status,
- unsigned long flags)
+ unsigned long *flags)
__releases(&xhci->lock)
__acquires(&xhci->lock)
{
status |= USB_PORT_STAT_C_BH_RESET << 16;
if ((raw_port_status & PORT_CEC))
status |= USB_PORT_STAT_C_CONFIG_ERROR << 16;
+
+ /* USB3 remote wake resume signaling completed */
+ if (bus_state->port_remote_wakeup & (1 << wIndex) &&
+ (raw_port_status & PORT_PLS_MASK) != XDEV_RESUME &&
+ (raw_port_status & PORT_PLS_MASK) != XDEV_RECOVERY) {
+ bus_state->port_remote_wakeup &= ~(1 << wIndex);
+ usb_hcd_end_port_resume(&hcd->self, wIndex);
+ }
}
if (hcd->speed < HCD_USB3) {
xhci_set_link_state(xhci, port_array, wIndex,
XDEV_U0);
- spin_unlock_irqrestore(&xhci->lock, flags);
+ spin_unlock_irqrestore(&xhci->lock, *flags);
time_left = wait_for_completion_timeout(
&bus_state->rexit_done[wIndex],
msecs_to_jiffies(
XHCI_MAX_REXIT_TIMEOUT_MS));
- spin_lock_irqsave(&xhci->lock, flags);
+ spin_lock_irqsave(&xhci->lock, *flags);
if (time_left) {
slot_id = xhci_find_slot_id_by_port(hcd,
return status;
}
+static void xhci_single_step_completion(struct urb *urb)
+{
+ struct completion *done = urb->context;
+
+ complete(done);
+}
+
+/*
+ * Allocate a URB and initialize its various fields.
+ * This helper is used by the single_step_set_feature test of
+ * EHSET, where the IN packet of the GetDescriptor request is
+ * sent 15 seconds after the SETUP packet.
+ * Returns NULL on failure.
+ */
+static struct urb *xhci_request_single_step_set_feature_urb(
+ struct usb_device *udev,
+ void *dr,
+ void *buf,
+ struct completion *done)
+{
+ struct urb *urb;
+ struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+ struct usb_host_endpoint *ep;
+
+ urb = usb_alloc_urb(0, GFP_KERNEL);
+ if (!urb)
+ return NULL;
+
+ urb->pipe = usb_rcvctrlpipe(udev, 0);
+ ep = udev->ep_in[usb_pipeendpoint(urb->pipe)];
+ if (!ep) {
+ usb_free_urb(urb);
+ return NULL;
+ }
+
+ /*
+ * Initialize the various URB fields, as these are used by the HCD
+ * driver both to queue the URB and when completion happens.
+ */
+ urb->ep = ep;
+ urb->dev = udev;
+ urb->setup_packet = dr;
+ urb->transfer_buffer = buf;
+ urb->transfer_buffer_length = USB_DT_DEVICE_SIZE;
+ urb->complete = xhci_single_step_completion;
+ urb->status = -EINPROGRESS;
+ urb->actual_length = 0;
+ urb->transfer_flags = URB_DIR_IN;
+ usb_get_urb(urb);
+ atomic_inc(&urb->use_count);
+ atomic_inc(&urb->dev->urbnum);
+ usb_hcd_map_urb_for_dma(hcd, urb, GFP_KERNEL);
+ urb->context = done;
+ return urb;
+}
+
+/*
+ * This function implements the USB_PORT_FEAT_TEST handling of the
+ * SINGLE_STEP_SET_FEATURE test mode as defined in the Embedded
+ * High-Speed Electrical Test (EHSET) specification. This simply
+ * issues a GetDescriptor control transfer, with an inserted 15-second
+ * delay after the end of the SETUP stage and before the IN token of
+ * the DATA stage is set. The idea is that this gives the test operator
+ * enough time to configure the oscilloscope to perform a measurement
+ * of the response time between the DATA and ACK packets that follow.
+ */
+static int xhci_ehset_single_step_set_feature(struct usb_hcd *hcd, int port)
+{
+ int retval;
+ struct usb_ctrlrequest *dr;
+ struct urb *urb;
+ struct usb_device *udev;
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+ struct usb_device_descriptor *buf;
+ unsigned long flags;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ /* Obtain udev of the rhub's child port */
+ udev = usb_hub_find_child(hcd->self.root_hub, port);
+ if (!udev) {
+ xhci_err(xhci, "No device attached to the RootHub\n");
+ return -ENODEV;
+ }
+ buf = kmalloc(USB_DT_DEVICE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL);
+ if (!dr) {
+ kfree(buf);
+ return -ENOMEM;
+ }
+
+ /* Fill Setup packet for GetDescriptor */
+ dr->bRequestType = USB_DIR_IN;
+ dr->bRequest = USB_REQ_GET_DESCRIPTOR;
+ dr->wValue = cpu_to_le16(USB_DT_DEVICE << 8);
+ dr->wIndex = 0;
+ dr->wLength = cpu_to_le16(USB_DT_DEVICE_SIZE);
+ urb = xhci_request_single_step_set_feature_urb(udev, dr, buf, &done);
+ if (!urb) {
+ retval = -ENOMEM;
+ goto cleanup;
+ }
+
+ /* Now complete just the SETUP stage */
+ spin_lock_irqsave(&xhci->lock, flags);
+ retval = xhci_submit_single_step_set_feature(hcd, urb, 1);
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ if (retval)
+ goto out1;
+
+ if (!wait_for_completion_timeout(&done, msecs_to_jiffies(2000))) {
+ usb_kill_urb(urb);
+ retval = -ETIMEDOUT;
+ xhci_err(xhci, "%s SETUP stage timed out on ep0\n", __func__);
+ goto out1;
+ }
+
+ /* Sleep for 15 seconds; HC will send SOFs during this period */
+ msleep(15 * 1000);
+
+ /* Complete remaining DATA and status stages. Re-use same URB */
+ urb->status = -EINPROGRESS;
+ usb_get_urb(urb);
+ atomic_inc(&urb->use_count);
+ atomic_inc(&urb->dev->urbnum);
+
+ spin_lock_irqsave(&xhci->lock, flags);
+ retval = xhci_submit_single_step_set_feature(hcd, urb, 0);
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ if (!retval && !wait_for_completion_timeout(&done,
+ msecs_to_jiffies(2000))) {
+ usb_kill_urb(urb);
+ retval = -ETIMEDOUT;
+ xhci_err(xhci, "%s IN stage timed out on ep0\n", __func__);
+ }
+out1:
+ usb_free_urb(urb);
+cleanup:
+ kfree(dr);
+ kfree(buf);
+ return retval;
+}
+
int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
u16 wIndex, char *buf, u16 wLength)
{
u16 link_state = 0;
u16 wake_mask = 0;
u16 timeout = 0;
+ u16 test_mode = 0;
max_ports = xhci_get_ports(hcd, &port_array);
bus_state = &xhci->bus_state[hcd_index(hcd)];
break;
}
status = xhci_get_port_status(hcd, bus_state, port_array,
- wIndex, temp, flags);
+ wIndex, temp, &flags);
if (status == 0xffffffff)
goto error;
link_state = (wIndex & 0xff00) >> 3;
if (wValue == USB_PORT_FEAT_REMOTE_WAKE_MASK)
wake_mask = wIndex & 0xff00;
- /* The MSB of wIndex is the U1/U2 timeout */
- timeout = (wIndex & 0xff00) >> 8;
+ /* The MSB of wIndex is the U1/U2 timeout or the test mode */
+ test_mode = timeout = (wIndex & 0xff00) >> 8;
wIndex &= 0xff;
if (!wIndex || wIndex > max_ports)
goto error;
temp = readl(port_array[wIndex]);
break;
}
+
+ /*
+ * For xHCI 1.1 according to section 4.19.1.2.4.1 a
+ * root hub port's transition to compliance mode upon
+ * detecting LFPS timeout may be controlled by a
+ * Compliance Transition Enabled (CTE) flag (not
+ * software visible). This flag is set by writing 0xA
+ * to PORTSC PLS field which will allow transition to
+ * compliance mode the next time LFPS timeout is
+ * encountered. A warm reset will clear it.
+ *
+ * The CTE flag is only supported if the HCCPARAMS2 CTC
+ * flag is set, otherwise, the compliance substate is
+ * automatically entered as on 1.0 and prior.
+ */
+ if (link_state == USB_SS_PORT_LS_COMP_MOD) {
+ if (!HCC2_CTC(xhci->hcc_params2)) {
+ xhci_dbg(xhci, "CTC flag is 0, port already supports entering compliance mode\n");
+ break;
+ }
+
+ if ((temp & PORT_CONNECT)) {
+ xhci_warn(xhci, "Can't set compliance mode when port is connected\n");
+ goto error;
+ }
+
+ xhci_dbg(xhci, "Enable compliance mode transition for port %d\n",
+ wIndex);
+ xhci_set_link_state(xhci, port_array, wIndex,
+ link_state);
+ temp = readl(port_array[wIndex]);
+ break;
+ }
+
/* Port must be enabled */
if (!(temp & PORT_PE)) {
retval = -ENODEV;
temp |= PORT_U2_TIMEOUT(timeout);
writel(temp, port_array[wIndex] + PORTPMSC);
break;
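+ /*
+ * Test modes 1-5 are written straight into the PORTPMSC test field
+ * after stopping the device and halting the controller; mode 6 runs
+ * the EHSET SINGLE_STEP_SET_FEATURE sequence instead.
+ */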
+ case USB_PORT_FEAT_TEST:
+ slot_id = xhci_find_slot_id_by_port(hcd, xhci,
+ wIndex + 1);
+ if (test_mode && test_mode <= 5) {
+ /* unlock to execute stop endpoint commands */
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ xhci_stop_device(xhci, slot_id, 1);
+ spin_lock_irqsave(&xhci->lock, flags);
+ xhci_halt(xhci);
+
+ temp = readl_relaxed(port_array[wIndex] +
+ PORTPMSC);
+ temp |= test_mode << 28;
+ writel_relaxed(temp, port_array[wIndex] +
+ PORTPMSC);
+ /* to make sure above write goes through */
+ mb();
+ } else if (test_mode == 6) {
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ retval = xhci_ehset_single_step_set_feature(hcd,
+ wIndex);
+ spin_lock_irqsave(&xhci->lock, flags);
+ } else {
+ goto error;
+ }
+ break;
default:
goto error;
}
xhci_set_link_state(xhci, port_array, wIndex,
XDEV_RESUME);
spin_unlock_irqrestore(&xhci->lock, flags);
- msleep(USB_RESUME_TIMEOUT);
+ usleep_range(21000, 21500);
spin_lock_irqsave(&xhci->lock, flags);
xhci_set_link_state(xhci, port_array, wIndex,
XDEV_U0);
if (need_usb2_u3_exit) {
spin_unlock_irqrestore(&xhci->lock, flags);
- msleep(USB_RESUME_TIMEOUT);
+ usleep_range(21000, 21500);
spin_lock_irqsave(&xhci->lock, flags);
}
/* Point to output device context in dcbaa. */
xhci->dcbaa->dev_context_ptrs[slot_id] = cpu_to_le64(dev->out_ctx->dma);
- xhci_dbg(xhci, "Set slot id %d dcbaa entry %p to 0x%llx\n",
+ xhci_dbg(xhci, "Set slot id %d dcbaa entry %pK to 0x%llx\n",
slot_id,
&xhci->dcbaa->dev_context_ptrs[slot_id],
le64_to_cpu(xhci->dcbaa->dev_context_ptrs[slot_id]));
if (udev->tt->multi)
slot_ctx->dev_info |= cpu_to_le32(DEV_MTT);
}
- xhci_dbg(xhci, "udev->tt = %p\n", udev->tt);
+ xhci_dbg(xhci, "udev->tt = %pK\n", udev->tt);
xhci_dbg(xhci, "udev->ttport = 0x%x\n", udev->ttport);
/* Step 4 - ring already allocated */
}
break;
case USB_SPEED_FULL:
+ if (usb_endpoint_xfer_bulk(&ep->desc) && max_packet < 8)
+ max_packet = 8;
case USB_SPEED_LOW:
break;
default:
kfree(command);
}
-void xhci_mem_cleanup(struct xhci_hcd *xhci)
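+/*
+ * Quiesce a secondary interrupter before its ring is freed: mask and ack the
+ * interrupter, walk forward from the last acknowledged event TRB until the
+ * cycle bit flips, then write the new dequeue pointer back with EHB set so
+ * the controller sees an empty ring.
+ */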
+void xhci_handle_sec_intr_events(struct xhci_hcd *xhci, int intr_num)
{
+ union xhci_trb *erdp_trb, *current_trb;
+ struct xhci_segment *seg;
+ u64 erdp_reg;
+ u32 iman_reg;
+ dma_addr_t deq;
+ unsigned long segment_offset;
+
+ /* disable irq, ack pending interrupt and ack all pending events */
+
+ iman_reg =
+ readl_relaxed(&xhci->sec_ir_set[intr_num]->irq_pending);
+ iman_reg &= ~IMAN_IE;
+ writel_relaxed(iman_reg,
+ &xhci->sec_ir_set[intr_num]->irq_pending);
+ iman_reg =
+ readl_relaxed(&xhci->sec_ir_set[intr_num]->irq_pending);
+ if (iman_reg & IMAN_IP)
+ writel_relaxed(iman_reg,
+ &xhci->sec_ir_set[intr_num]->irq_pending);
+
+ /* last acked event trb is in erdp reg */
+ erdp_reg =
+ xhci_read_64(xhci, &xhci->sec_ir_set[intr_num]->erst_dequeue);
+ deq = (dma_addr_t)(erdp_reg & ~ERST_PTR_MASK);
+ if (!deq) {
+ pr_debug("%s: event ring handling not required\n", __func__);
+ return;
+ }
+
+ seg = xhci->sec_event_ring[intr_num]->first_seg;
+ segment_offset = deq - seg->dma;
+
+ /* find out virtual address of the last acked event trb */
+ erdp_trb = current_trb = &seg->trbs[0] +
+ (segment_offset/sizeof(*current_trb));
+
+ /* read cycle state of the last acked trb to find out CCS */
+ xhci->sec_event_ring[intr_num]->cycle_state =
+ (le32_to_cpu(current_trb->event_cmd.flags) & TRB_CYCLE);
+
+ while (1) {
+ /* last trb of the event ring: toggle cycle state */
+ if (current_trb == &seg->trbs[TRBS_PER_SEGMENT - 1]) {
+ xhci->sec_event_ring[intr_num]->cycle_state ^= 1;
+ current_trb = &seg->trbs[0];
+ } else {
+ current_trb++;
+ }
+
+ /* cycle state transition */
+ if ((le32_to_cpu(current_trb->event_cmd.flags) & TRB_CYCLE) !=
+ xhci->sec_event_ring[intr_num]->cycle_state)
+ break;
+ }
+
+ if (erdp_trb != current_trb) {
+ deq =
+ xhci_trb_virt_to_dma(xhci->sec_event_ring[intr_num]->deq_seg,
+ current_trb);
+ if (deq == 0)
+ xhci_warn(xhci,
+ "WARN ivalid SW event ring dequeue ptr.\n");
+ /* Update HC event ring dequeue pointer */
+ erdp_reg &= ERST_PTR_MASK;
+ erdp_reg |= ((u64) deq & (u64) ~ERST_PTR_MASK);
+ }
+
+ /* Clear the event handler busy flag (RW1C); event ring is empty. */
+ erdp_reg |= ERST_EHB;
+ xhci_write_64(xhci, erdp_reg,
+ &xhci->sec_ir_set[intr_num]->erst_dequeue);
+}
+
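+/* Tear down one secondary event ring and its ERST after draining pending events. */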
+int xhci_sec_event_ring_cleanup(struct usb_hcd *hcd, unsigned intr_num)
+{
+ int size;
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
struct device *dev = xhci_to_hcd(xhci)->self.controller;
+
+ if (intr_num >= xhci->max_interrupters) {
+ xhci_err(xhci, "invalid secondary interrupter num %d\n",
+ intr_num);
+ return -EINVAL;
+ }
+
+ size =
+ sizeof(struct xhci_erst_entry)*(xhci->sec_erst[intr_num].num_entries);
+ if (xhci->sec_erst[intr_num].entries) {
+ xhci_handle_sec_intr_events(xhci, intr_num);
+ dma_free_coherent(dev, size, xhci->sec_erst[intr_num].entries,
+ xhci->sec_erst[intr_num].erst_dma_addr);
+ xhci->sec_erst[intr_num].entries = NULL;
+ }
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed SEC ERST#%d",
+ intr_num);
+ if (xhci->sec_event_ring[intr_num])
+ xhci_ring_free(xhci, xhci->sec_event_ring[intr_num]);
+
+ xhci->sec_event_ring[intr_num] = NULL;
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "Freed sec event ring");
+
+ return 0;
+}
+
+void xhci_event_ring_cleanup(struct xhci_hcd *xhci)
+{
int size;
- int i, j, num_ports;
+ unsigned int i;
+ struct device *dev = xhci_to_hcd(xhci)->self.controller;
- cancel_delayed_work_sync(&xhci->cmd_timer);
+ /* sec event ring clean up */
+ for (i = 1; i < xhci->max_interrupters; i++)
+ xhci_sec_event_ring_cleanup(xhci_to_hcd(xhci), i);
- /* Free the Event Ring Segment Table and the actual Event Ring */
+ kfree(xhci->sec_ir_set);
+ xhci->sec_ir_set = NULL;
+ kfree(xhci->sec_erst);
+ xhci->sec_erst = NULL;
+ kfree(xhci->sec_event_ring);
+ xhci->sec_event_ring = NULL;
+
+ /* primary event ring clean up */
size = sizeof(struct xhci_erst_entry)*(xhci->erst.num_entries);
if (xhci->erst.entries)
dma_free_coherent(dev, size,
xhci->erst.entries, xhci->erst.erst_dma_addr);
xhci->erst.entries = NULL;
- xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed ERST");
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed primary ERST");
if (xhci->event_ring)
xhci_ring_free(xhci, xhci->event_ring);
xhci->event_ring = NULL;
- xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed event ring");
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Freed priamry event ring");
+}
+
+void xhci_mem_cleanup(struct xhci_hcd *xhci)
+{
+ struct device *dev = xhci_to_hcd(xhci)->self.controller;
+ int i, j, num_ports;
+
+ cancel_delayed_work_sync(&xhci->cmd_timer);
+
+ xhci_event_ring_cleanup(xhci);
if (xhci->lpm_command)
xhci_free_command(xhci, xhci->lpm_command);
kfree(xhci->port_array);
kfree(xhci->rh_bw);
kfree(xhci->ext_caps);
+ kfree(xhci->usb2_rhub.psi);
+ kfree(xhci->usb3_rhub.psi);
xhci->usb2_ports = NULL;
xhci->usb3_ports = NULL;
xhci->port_array = NULL;
+ xhci->usb2_rhub.psi = NULL;
+ xhci->usb3_rhub.psi = NULL;
xhci->rh_bw = NULL;
xhci->ext_caps = NULL;
if (seg != result_seg) {
xhci_warn(xhci, "WARN: %s TRB math test %d failed!\n",
test_name, test_number);
- xhci_warn(xhci, "Tested TRB math w/ seg %p and "
+ xhci_warn(xhci, "Tested TRB math w/ seg %pK and "
"input DMA 0x%llx\n",
input_seg,
(unsigned long long) input_dma);
- xhci_warn(xhci, "starting TRB %p (0x%llx DMA), "
- "ending TRB %p (0x%llx DMA)\n",
+ xhci_warn(xhci, "starting TRB %pK (0x%llx DMA), "
+ "ending TRB %pK (0x%llx DMA)\n",
start_trb, start_dma,
end_trb, end_dma);
- xhci_warn(xhci, "Expected seg %p, got seg %p\n",
+ xhci_warn(xhci, "Expected seg %pK, got seg %pK\n",
result_seg, seg);
trb_in_td(xhci, input_seg, start_trb, end_trb, input_dma,
true);
return 0;
}
-static void xhci_set_hc_event_deq(struct xhci_hcd *xhci)
-{
- u64 temp;
- dma_addr_t deq;
-
- deq = xhci_trb_virt_to_dma(xhci->event_ring->deq_seg,
- xhci->event_ring->dequeue);
- if (deq == 0 && !in_interrupt())
- xhci_warn(xhci, "WARN something wrong with SW event ring "
- "dequeue ptr.\n");
- /* Update HC event ring dequeue pointer */
- temp = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue);
- temp &= ERST_PTR_MASK;
- /* Don't clear the EHB bit (which is RW1C) because
- * there might be more events to service.
- */
- temp &= ~ERST_EHB;
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Write event ring dequeue pointer, "
- "preserving EHB bit");
- xhci_write_64(xhci, ((u64) deq & (u64) ~ERST_PTR_MASK) | temp,
- &xhci->ir_set->erst_dequeue);
-}
-
static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports,
__le32 __iomem *addr, u8 major_revision, int max_caps)
{
rhub = &xhci->usb2_rhub;
} else {
xhci_warn(xhci, "Ignoring unknown port speed, "
- "Ext Cap %p, revision = 0x%x\n",
+ "Ext Cap %pK, revision = 0x%x\n",
addr, major_revision);
/* Ignoring port protocol we can't understand. FIXME */
return;
port_offset = XHCI_EXT_PORT_OFF(temp);
port_count = XHCI_EXT_PORT_COUNT(temp);
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "Ext Cap %p, port offset = %u, "
+ "Ext Cap %pK, port offset = %u, "
"count = %u, revision = 0x%x",
addr, port_offset, port_count, major_revision);
/* Port count includes the current port offset */
for (i = port_offset; i < (port_offset + port_count); i++) {
/* Duplicate entry. Ignore the port if the revisions differ. */
if (xhci->port_array[i] != 0) {
- xhci_warn(xhci, "Duplicate port entry, Ext Cap %p,"
+ xhci_warn(xhci, "Duplicate port entry, Ext Cap %pK,"
" port %u\n", addr, i);
xhci_warn(xhci, "Port was marked as USB %u, "
"duplicated as USB %u\n",
NUM_PORT_REGS*i;
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
"USB 2.0 port at index %u, "
- "addr = %p", i,
+ "addr = %pK", i,
xhci->usb2_ports[port_index]);
port_index++;
if (port_index == xhci->num_usb2_ports)
NUM_PORT_REGS*i;
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
"USB 3.0 port at index %u, "
- "addr = %p", i,
+ "addr = %pK", i,
xhci->usb3_ports[port_index]);
port_index++;
if (port_index == xhci->num_usb3_ports)
return 0;
}
+int xhci_event_ring_setup(struct xhci_hcd *xhci, struct xhci_ring **er,
+ struct xhci_intr_reg __iomem *ir_set, struct xhci_erst *erst,
+ unsigned int intr_num, gfp_t flags)
+{
+ dma_addr_t dma, deq;
+ u64 val_64;
+ unsigned int val;
+ struct xhci_segment *seg;
+ struct device *dev = xhci_to_hcd(xhci)->self.controller;
+
+ *er = xhci_ring_alloc(xhci, ERST_NUM_SEGS, 1,
+ TYPE_EVENT, flags);
+ if (!*er)
+ return -ENOMEM;
+
+ erst->entries = dma_alloc_coherent(dev,
+ sizeof(struct xhci_erst_entry) * ERST_NUM_SEGS, &dma,
+ flags);
+ if (!erst->entries) {
+ xhci_ring_free(xhci, *er);
+ return -ENOMEM;
+ }
+
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "intr# %d: Allocated event ring segment table at 0x%llx",
+ intr_num, (unsigned long long)dma);
+
+ memset(erst->entries, 0, sizeof(struct xhci_erst_entry)*ERST_NUM_SEGS);
+ erst->num_entries = ERST_NUM_SEGS;
+ erst->erst_dma_addr = dma;
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "intr# %d: num segs = %i, virt addr = %pK, dma addr = 0x%llx",
+ intr_num,
+ erst->num_entries,
+ erst->entries,
+ (unsigned long long)erst->erst_dma_addr);
+
+ /* set ring base address and size for each segment table entry */
+ for (val = 0, seg = (*er)->first_seg; val < ERST_NUM_SEGS; val++) {
+ struct xhci_erst_entry *entry = &erst->entries[val];
+
+ entry->seg_addr = cpu_to_le64(seg->dma);
+ entry->seg_size = cpu_to_le32(TRBS_PER_SEGMENT);
+ entry->rsvd = 0;
+ seg = seg->next;
+ }
+
+ /* set ERST count with the number of entries in the segment table */
+ val = readl_relaxed(&ir_set->erst_size);
+ val &= ERST_SIZE_MASK;
+ val |= ERST_NUM_SEGS;
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "Write ERST size = %i to ir_set %d (some bits preserved)", val,
+ intr_num);
+ writel_relaxed(val, &ir_set->erst_size);
+
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "intr# %d: Set ERST entries to point to event ring.",
+ intr_num);
+ /* set the segment table base address */
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "Set ERST base address for ir_set %d = 0x%llx",
+ intr_num,
+ (unsigned long long)erst->erst_dma_addr);
+ val_64 = xhci_read_64(xhci, &ir_set->erst_base);
+ val_64 &= ERST_PTR_MASK;
+ val_64 |= (erst->erst_dma_addr & (u64) ~ERST_PTR_MASK);
+ xhci_write_64(xhci, val_64, &ir_set->erst_base);
+
+ /* Set the event ring dequeue address */
+ deq = xhci_trb_virt_to_dma((*er)->deq_seg, (*er)->dequeue);
+ if (deq == 0 && !in_interrupt())
+ xhci_warn(xhci,
+ "intr# %d:WARN something wrong with SW event ring deq ptr.\n",
+ intr_num);
+ /* Update HC event ring dequeue pointer */
+ val_64 = xhci_read_64(xhci, &ir_set->erst_dequeue);
+ val_64 &= ERST_PTR_MASK;
+ /* Don't clear the EHB bit (which is RW1C) because
+ * there might be more events to service.
+ */
+ val_64 &= ~ERST_EHB;
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "intr# %d:Write event ring dequeue pointer, preserving EHB bit",
+ intr_num);
+ xhci_write_64(xhci, ((u64) deq & (u64) ~ERST_PTR_MASK) | val_64,
+ &ir_set->erst_dequeue);
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "Wrote ERST address to ir_set %d.", intr_num);
+ xhci_print_ir_set(xhci, intr_num);
+
+ return 0;
+}
+
+int xhci_sec_event_ring_setup(struct usb_hcd *hcd, unsigned intr_num)
+{
+ int ret;
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+
+ if ((xhci->xhc_state & XHCI_STATE_HALTED) || !xhci->sec_ir_set
+ || !xhci->sec_event_ring || !xhci->sec_erst ||
+ intr_num >= xhci->max_interrupters) {
+ xhci_err(xhci,
+ "%s:state %x ir_set %pK evt_ring %pK erst %pK intr# %d\n",
+ __func__, xhci->xhc_state, xhci->sec_ir_set,
+ xhci->sec_event_ring, xhci->sec_erst, intr_num);
+ return -EINVAL;
+ }
+
+ if (xhci->sec_event_ring && xhci->sec_event_ring[intr_num]
+ && xhci->sec_event_ring[intr_num]->first_seg)
+ goto done;
+
+ xhci->sec_ir_set[intr_num] = &xhci->run_regs->ir_set[intr_num];
+ ret = xhci_event_ring_setup(xhci,
+ &xhci->sec_event_ring[intr_num],
+ xhci->sec_ir_set[intr_num],
+ &xhci->sec_erst[intr_num],
+ intr_num, GFP_KERNEL);
+ if (ret) {
+ xhci_err(xhci, "sec event ring setup failed inter#%d\n",
+ intr_num);
+ return ret;
+ }
+done:
+ return 0;
+}
+
+int xhci_event_ring_init(struct xhci_hcd *xhci, gfp_t flags)
+{
+ int ret = 0;
+
+ /* primary + secondary */
+ xhci->max_interrupters = HCS_MAX_INTRS(xhci->hcs_params1);
+
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "// Allocating primary event ring");
+
+ /* Set ir_set to interrupt register set 0 */
+ xhci->ir_set = &xhci->run_regs->ir_set[0];
+ ret = xhci_event_ring_setup(xhci, &xhci->event_ring, xhci->ir_set,
+ &xhci->erst, 0, flags);
+ if (ret) {
+ xhci_err(xhci, "failed to setup primary event ring\n");
+ goto fail;
+ }
+
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "// Allocating sec event ring related pointers");
+
+ xhci->sec_ir_set = kcalloc(xhci->max_interrupters,
+ sizeof(*xhci->sec_ir_set), flags);
+ if (!xhci->sec_ir_set) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ xhci->sec_event_ring = kcalloc(xhci->max_interrupters,
+ sizeof(*xhci->sec_event_ring), flags);
+ if (!xhci->sec_event_ring) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ xhci->sec_erst = kcalloc(xhci->max_interrupters,
+ sizeof(*xhci->sec_erst), flags);
+ if (!xhci->sec_erst)
+ ret = -ENOMEM;
+fail:
+ return ret;
+}
+
int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags)
{
dma_addr_t dma;
struct device *dev = xhci_to_hcd(xhci)->self.controller;
unsigned int val, val2;
u64 val_64;
- struct xhci_segment *seg;
u32 page_size, temp;
int i;
memset(xhci->dcbaa, 0, sizeof *(xhci->dcbaa));
xhci->dcbaa->dma = dma;
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Device context base array address = 0x%llx (DMA), %p (virt)",
+ "// Device context base array address = 0x%llx (DMA), %pK (virt)",
(unsigned long long)xhci->dcbaa->dma, xhci->dcbaa);
xhci_write_64(xhci, dma, &xhci->op_regs->dcbaa_ptr);
if (!xhci->cmd_ring)
goto fail;
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "Allocated command ring at %p", xhci->cmd_ring);
+ "Allocated command ring at %pK", xhci->cmd_ring);
xhci_dbg_trace(xhci, trace_xhci_dbg_init, "First segment DMA is 0x%llx",
(unsigned long long)xhci->cmd_ring->first_seg->dma);
xhci->dba = (void __iomem *) xhci->cap_regs + val;
xhci_dbg_regs(xhci);
xhci_print_run_regs(xhci);
- /* Set ir_set to interrupt register set 0 */
- xhci->ir_set = &xhci->run_regs->ir_set[0];
/*
* Event ring setup: Allocate a normal ring, but also setup
* the event ring segment table (ERST). Section 4.9.3.
*/
- xhci_dbg_trace(xhci, trace_xhci_dbg_init, "// Allocating event ring");
- xhci->event_ring = xhci_ring_alloc(xhci, ERST_NUM_SEGS, 1, TYPE_EVENT,
- flags);
- if (!xhci->event_ring)
- goto fail;
- if (xhci_check_trb_in_td_math(xhci) < 0)
+ if (xhci_event_ring_init(xhci, GFP_KERNEL))
goto fail;
- xhci->erst.entries = dma_alloc_coherent(dev,
- sizeof(struct xhci_erst_entry) * ERST_NUM_SEGS, &dma,
- flags);
- if (!xhci->erst.entries)
+ if (xhci_check_trb_in_td_math(xhci) < 0)
goto fail;
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Allocated event ring segment table at 0x%llx",
- (unsigned long long)dma);
-
- memset(xhci->erst.entries, 0, sizeof(struct xhci_erst_entry)*ERST_NUM_SEGS);
- xhci->erst.num_entries = ERST_NUM_SEGS;
- xhci->erst.erst_dma_addr = dma;
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "Set ERST to 0; private num segs = %i, virt addr = %p, dma addr = 0x%llx",
- xhci->erst.num_entries,
- xhci->erst.entries,
- (unsigned long long)xhci->erst.erst_dma_addr);
-
- /* set ring base address and size for each segment table entry */
- for (val = 0, seg = xhci->event_ring->first_seg; val < ERST_NUM_SEGS; val++) {
- struct xhci_erst_entry *entry = &xhci->erst.entries[val];
- entry->seg_addr = cpu_to_le64(seg->dma);
- entry->seg_size = cpu_to_le32(TRBS_PER_SEGMENT);
- entry->rsvd = 0;
- seg = seg->next;
- }
-
- /* set ERST count with the number of entries in the segment table */
- val = readl(&xhci->ir_set->erst_size);
- val &= ERST_SIZE_MASK;
- val |= ERST_NUM_SEGS;
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Write ERST size = %i to ir_set 0 (some bits preserved)",
- val);
- writel(val, &xhci->ir_set->erst_size);
-
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Set ERST entries to point to event ring.");
- /* set the segment table base address */
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Set ERST base address for ir_set 0 = 0x%llx",
- (unsigned long long)xhci->erst.erst_dma_addr);
- val_64 = xhci_read_64(xhci, &xhci->ir_set->erst_base);
- val_64 &= ERST_PTR_MASK;
- val_64 |= (xhci->erst.erst_dma_addr & (u64) ~ERST_PTR_MASK);
- xhci_write_64(xhci, val_64, &xhci->ir_set->erst_base);
-
- /* Set the event ring dequeue address */
- xhci_set_hc_event_deq(xhci);
- xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "Wrote ERST address to ir_set 0.");
- xhci_print_ir_set(xhci, 0);
/*
* XXX: Might need to set the Interrupter Moderation Register to
#include <linux/slab.h>
#include "xhci.h"
#include "xhci-trace.h"
+extern void kick_usbpd_vbus_sm(void);
+extern bool is_xiaomi_headset;
/*
* Returns zero if the TRB isn't in this segment, otherwise it returns the DMA
static bool xhci_mod_cmd_timer(struct xhci_hcd *xhci, unsigned long delay)
{
+ if (is_xiaomi_headset)
+ delay = msecs_to_jiffies(1000);
+
return mod_delayed_work(system_wq, &xhci->cmd_timer, delay);
}
i_cmd->status = COMP_CMD_STOP;
- xhci_dbg(xhci, "Turn aborted command %p to no-op\n",
+ xhci_dbg(xhci, "Turn aborted command %pK to no-op\n",
i_cmd->command_trb);
/* get cycle state from the original cmd trb */
cycle_state = le32_to_cpu(
{
u64 temp_64;
int ret;
+ int delay;
xhci_dbg(xhci, "Abort command ring\n");
&xhci->op_regs->cmd_ring);
/* Section 4.6.1.2 of xHCI 1.0 spec says software should
- * time the completion od all xHCI commands, including
+ * time the completion of all xHCI commands, including
* the Command Abort operation. If software doesn't see
- * CRR negated in a timely manner (e.g. longer than 5
- * seconds), then it should assume that the there are
- * larger problems with the xHC and assert HCRST.
+ * CRR negated in a timely manner, then it should assume
+ * that there are larger problems with the xHC and assert HCRST.
*/
- ret = xhci_handshake(&xhci->op_regs->cmd_ring,
- CMD_RING_RUNNING, 0, 5 * 1000 * 1000);
+ if (is_xiaomi_headset) {
+ delay = 500 * 1000;
+ } else {
+ delay = 5000 * 1000;
+ }
+
+ ret = xhci_handshake_check_state(xhci, &xhci->op_regs->cmd_ring,
+ CMD_RING_RUNNING, 0, delay);
if (ret < 0) {
- /* we are about to kill xhci, give it one more chance */
- xhci_write_64(xhci, temp_64 | CMD_RING_ABORT,
- &xhci->op_regs->cmd_ring);
- udelay(1000);
- ret = xhci_handshake(&xhci->op_regs->cmd_ring,
- CMD_RING_RUNNING, 0, 3 * 1000 * 1000);
- if (ret < 0) {
- xhci_err(xhci, "Stopped the command ring failed, "
- "maybe the host is dead\n");
- xhci->xhc_state |= XHCI_STATE_DYING;
- xhci_quiesce(xhci);
- xhci_halt(xhci);
- return -ESHUTDOWN;
- }
+ if (is_xiaomi_headset)
+ return -EPERM;
+ xhci_err(xhci,
+ "Stop command ring failed, maybe the host is dead\n");
+ xhci->xhc_state |= XHCI_STATE_DYING;
+ xhci_quiesce(xhci);
+ xhci_halt(xhci);
+ return -ESHUTDOWN;
}
/*
* Writing the CMD_RING_ABORT bit should cause a cmd completion event,
"Cycle state = 0x%x", state->new_cycle_state);
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
- "New dequeue segment = %p (virtual)",
+ "New dequeue segment = %pK (virtual)",
state->new_deq_seg);
addr = xhci_trb_virt_to_dma(state->new_deq_seg, state->new_deq_ptr);
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
"Cancel (unchain) link TRB");
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
- "Address = %p (0x%llx dma); "
- "in seg %p (0x%llx dma)",
+ "Address = %pK (0x%llx dma); "
+ "in seg %pK (0x%llx dma)",
cur_trb,
(unsigned long long)xhci_trb_virt_to_dma(cur_seg, cur_trb),
cur_seg,
* short, don't muck with the stream ID after
* submission.
*/
- xhci_warn(xhci, "WARN Cancelled URB %p "
+ xhci_warn(xhci, "WARN Cancelled URB %pK "
"has invalid stream ID %u.\n",
cur_td->urb,
cur_td->urb->stream_id);
ep_ring, ep_index);
} else {
xhci_warn(xhci, "Mismatch between completed Set TR Deq Ptr command & xHCI internal state.\n");
- xhci_warn(xhci, "ep deq seg = %p, deq ptr = %p\n",
+ xhci_warn(xhci, "ep deq seg = %pK, deq ptr = %pK\n",
ep->queued_deq_seg, ep->queued_deq_ptr);
}
}
xhci->cmd_ring_state = CMD_RING_STATE_ABORTED;
xhci_dbg(xhci, "Command timeout\n");
ret = xhci_abort_cmd_ring(xhci, flags);
+ if (ret == -EPERM) {
+ xhci_err(xhci, "Abort command ring failed reset usb device\n");
+ xhci_cleanup_command_queue(xhci);
+ spin_unlock_irqrestore(&xhci->lock, flags);
+ kick_usbpd_vbus_sm();
+ return;
+ }
+
if (unlikely(ret == -ESHUTDOWN)) {
xhci_err(xhci, "Abort command ring failed\n");
xhci_cleanup_command_queue(xhci);
usb_hcd_resume_root_hub(hcd);
}
- if (hcd->speed >= HCD_USB3 && (temp & PORT_PLS_MASK) == XDEV_INACTIVE)
- bus_state->port_remote_wakeup &= ~(1 << faked_port_index);
-
if ((temp & PORT_PLC) && (temp & PORT_PLS_MASK) == XDEV_RESUME) {
xhci_dbg(xhci, "port resume event for port %d\n", port_id);
bus_state->port_remote_wakeup |= 1 << faked_port_index;
xhci_test_and_clear_bit(xhci, port_array,
faked_port_index, PORT_PLC);
+ usb_hcd_start_port_resume(&hcd->self, faked_port_index);
xhci_set_link_state(xhci, port_array, faked_port_index,
XDEV_U0);
/* Need to wait until the next link state change
if (slot_id && xhci->devs[slot_id])
xhci_ring_device(xhci, slot_id);
if (bus_state->port_remote_wakeup & (1 << faked_port_index)) {
- bus_state->port_remote_wakeup &=
- ~(1 << faked_port_index);
xhci_test_and_clear_bit(xhci, port_array,
faked_port_index, PORT_PLC);
usb_wakeup_notification(hcd->self.root_hub,
URB_SHORT_NOT_OK)) ||
(status != 0 &&
!usb_endpoint_xfer_isoc(&urb->ep->desc)))
- xhci_dbg(xhci, "Giveback URB %p, len = %d, "
+ xhci_dbg(xhci, "Giveback URB %pK, len = %d, "
"expected = %d, status = %d\n",
urb, urb->actual_length,
urb->transfer_buffer_length,
return 0;
}
+/*
+ * Variant of xhci_queue_ctrl_tx() used to implement EHSET
+ * SINGLE_STEP_SET_FEATURE test mode. It differs in that the control
+ * transfer is broken up so that the SETUP stage can happen and call
+ * the URB's completion handler before the DATA/STATUS stages are
+ * executed by the xHC hardware. This assumes the control transfer is a
+ * GetDescriptor, with a DATA stage in the IN direction, and an OUT
+ * STATUS stage.
+ *
+ * This function is called twice, usually with a 15-second delay in between.
+ * - with is_setup==true, the SETUP stage for the control request
+ * (GetDescriptor) is queued in the TRB ring and sent to HW immediately
+ * - with is_setup==false, the DATA and STATUS TRBs are queued and executed
+ *
+ * Caller must have locked xhci->lock
+ */
+int xhci_submit_single_step_set_feature(struct usb_hcd *hcd, struct urb *urb,
+ int is_setup)
+{
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+ struct xhci_ring *ep_ring;
+ int num_trbs;
+ int ret;
+ unsigned int slot_id, ep_index;
+ struct usb_ctrlrequest *setup;
+ struct xhci_generic_trb *start_trb;
+ int start_cycle;
+ u32 field, length_field, remainder;
+ struct urb_priv *urb_priv;
+ struct xhci_td *td;
+
+ ep_ring = xhci_urb_to_transfer_ring(xhci, urb);
+ if (!ep_ring)
+ return -EINVAL;
+
+ /* Need buffer for data stage */
+ if (urb->transfer_buffer_length <= 0)
+ return -EINVAL;
+
+ /*
+ * Need to copy setup packet into setup TRB, so we can't use the setup
+ * DMA address.
+ */
+ if (!urb->setup_packet)
+ return -EINVAL;
+ setup = (struct usb_ctrlrequest *) urb->setup_packet;
+
+ slot_id = urb->dev->slot_id;
+ ep_index = xhci_get_endpoint_index(&urb->ep->desc);
+
+ urb_priv = kzalloc(sizeof(struct urb_priv) +
+ sizeof(struct xhci_td *), GFP_ATOMIC);
+ if (!urb_priv)
+ return -ENOMEM;
+
+ td = urb_priv->td[0] = kzalloc(sizeof(struct xhci_td), GFP_ATOMIC);
+ if (!td) {
+ kfree(urb_priv);
+ return -ENOMEM;
+ }
+
+ urb_priv->length = 1;
+ urb_priv->td_cnt = 0;
+ urb->hcpriv = urb_priv;
+
+ num_trbs = is_setup ? 1 : 2;
+
+ ret = prepare_transfer(xhci, xhci->devs[slot_id],
+ ep_index, urb->stream_id,
+ num_trbs, urb, 0, GFP_ATOMIC);
+ if (ret < 0) {
+ kfree(td);
+ kfree(urb_priv);
+ return ret;
+ }
+
+ /*
+ * Don't give the first TRB to the hardware (by toggling the cycle bit)
+ * until we've finished creating all the other TRBs. The ring's cycle
+ * state may change as we enqueue the other TRBs, so save it too.
+ */
+ start_trb = &ep_ring->enqueue->generic;
+ start_cycle = ep_ring->cycle_state;
+
+ if (is_setup) {
+ /* Queue only the setup TRB */
+ field = TRB_IDT | TRB_IOC | TRB_TYPE(TRB_SETUP);
+ if (start_cycle == 0)
+ field |= 0x1;
+
+ /* xHCI 1.0 6.4.1.2.1: Transfer Type field */
+ if (xhci->hci_version == 0x100) {
+ if (setup->bRequestType & USB_DIR_IN)
+ field |= TRB_TX_TYPE(TRB_DATA_IN);
+ else
+ field |= TRB_TX_TYPE(TRB_DATA_OUT);
+ }
+
+ /* Save the DMA address of the last TRB in the TD */
+ td->last_trb = ep_ring->enqueue;
+
+ queue_trb(xhci, ep_ring, false,
+ setup->bRequestType | setup->bRequest << 8 |
+ le16_to_cpu(setup->wValue) << 16,
+ le16_to_cpu(setup->wIndex) |
+ le16_to_cpu(setup->wLength) << 16,
+ TRB_LEN(8) | TRB_INTR_TARGET(0),
+ field);
+ } else {
+ /* Queue data TRB */
+ field = TRB_ISP | TRB_TYPE(TRB_DATA);
+ if (start_cycle == 0)
+ field |= 0x1;
+ if (setup->bRequestType & USB_DIR_IN)
+ field |= TRB_DIR_IN;
+
+ remainder = xhci_td_remainder(xhci, 0,
+ urb->transfer_buffer_length,
+ urb->transfer_buffer_length,
+ urb, 1);
+
+ length_field = TRB_LEN(urb->transfer_buffer_length) |
+ TRB_TD_SIZE(remainder) |
+ TRB_INTR_TARGET(0);
+
+ queue_trb(xhci, ep_ring, true,
+ lower_32_bits(urb->transfer_dma),
+ upper_32_bits(urb->transfer_dma),
+ length_field,
+ field);
+
+ /* Save the DMA address of the last TRB in the TD */
+ td->last_trb = ep_ring->enqueue;
+
+ /* Queue status TRB */
+ field = TRB_IOC | TRB_TYPE(TRB_STATUS);
+ if (!(setup->bRequestType & USB_DIR_IN))
+ field |= TRB_DIR_IN;
+
+ queue_trb(xhci, ep_ring, false,
+ 0,
+ 0,
+ TRB_INTR_TARGET(0),
+ field | ep_ring->cycle_state);
+ }
+
+ giveback_first_trb(xhci, slot_id, ep_index, 0, start_cycle, start_trb);
+ return 0;
+}
+
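/*
 * Illustrative sketch (editor's example, not part of this patch): the EHSET
 * test-mode glue is expected to drive the helper above in two phases, each
 * under xhci->lock. "hcd", "urb", "flags" and the 15 second wait are
 * assumptions for the example only.
 *
 *	spin_lock_irqsave(&xhci->lock, flags);
 *	retval = xhci_submit_single_step_set_feature(hcd, urb, 1); // SETUP only
 *	spin_unlock_irqrestore(&xhci->lock, flags);
 *
 *	msleep(15 * 1000); // let the analyzer observe SOFs between the stages
 *
 *	spin_lock_irqsave(&xhci->lock, flags);
 *	retval = xhci_submit_single_step_set_feature(hcd, urb, 0); // DATA + STATUS
 *	spin_unlock_irqrestore(&xhci->lock, flags);
 */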
static int count_isoc_trbs_needed(struct xhci_hcd *xhci,
struct urb *urb, int i)
{
int ret;
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
- "Set TR Deq Ptr cmd, new deq seg = %p (0x%llx dma), new deq ptr = %p (0x%llx dma), new cycle = %u",
+ "Set TR Deq Ptr cmd, new deq seg = %pK (0x%llx dma), new deq ptr = %pK (0x%llx dma), new cycle = %u",
deq_state->new_deq_seg,
(unsigned long long)deq_state->new_deq_seg->dma,
deq_state->new_deq_ptr,
deq_state->new_deq_ptr);
if (addr == 0) {
xhci_warn(xhci, "WARN Cannot submit Set TR Deq Ptr\n");
- xhci_warn(xhci, "WARN deq seg = %p, deq pt = %p\n",
+ xhci_warn(xhci, "WARN deq seg = %pK, deq pt = %pK\n",
deq_state->new_deq_seg, deq_state->new_deq_ptr);
return;
}
return ret;
}
+int xhci_handshake_check_state(struct xhci_hcd *xhci,
+ void __iomem *ptr, u32 mask, u32 done, int usec)
+{
+ u32 result;
+
+ do {
+ result = readl_relaxed(ptr);
+ if (result == ~(u32)0) /* card removed */
+ return -ENODEV;
+ /* host removed. Bail out */
+ if (xhci->xhc_state & XHCI_STATE_REMOVING)
+ return -ENODEV;
+ result &= mask;
+ if (result == done)
+ return 0;
+ udelay(1);
+ usec--;
+ } while (usec > 0);
+ return -ETIMEDOUT;
+}
+
/*
* Disable interrupts and begin the xHCI halting process.
*/
STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC);
if (!ret) {
xhci->xhc_state |= XHCI_STATE_HALTED;
- xhci->cmd_ring_state = CMD_RING_STATE_STOPPED;
- } else
+ } else {
xhci_warn(xhci, "Host not halted after %u microseconds.\n",
XHCI_MAX_HALT_USEC);
+ }
+
+ xhci->cmd_ring_state = CMD_RING_STATE_STOPPED;
+
+ if (delayed_work_pending(&xhci->cmd_timer)) {
+ xhci_dbg_trace(xhci, trace_xhci_dbg_init,
+ "Cleanup command queue");
+ cancel_delayed_work(&xhci->cmd_timer);
+ xhci_cleanup_command_queue(xhci);
+ }
+
return ret;
}
{
u32 temp;
int ret;
+ struct usb_hcd *hcd = xhci_to_hcd(xhci);
+ /*
+ * disable irq to avoid xhci_irq flooding due to unhandled port
+ * change events in halt state, as soon as xhci_start clears the halt bit
+ */
+ disable_irq(hcd->irq);
temp = readl(&xhci->op_regs->command);
temp |= (CMD_RUN);
xhci_dbg_trace(xhci, trace_xhci_dbg_init, "// Turn on HC, cmd = 0x%x.",
/* clear state flags. Including dying, halted or removing */
xhci->xhc_state = 0;
+ enable_irq(hcd->irq);
+
return ret;
}
temp = readl(&xhci->ir_set->irq_pending);
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
- "// Enabling event ring interrupter %p by writing 0x%x to irq_pending",
+ "// Enabling event ring interrupter %pK by writing 0x%x to irq_pending",
xhci->ir_set, (unsigned int) ER_IRQ_ENABLE(temp));
writel(ER_IRQ_ENABLE(temp), &xhci->ir_set->irq_pending);
xhci_print_ir_set(xhci, 0);
usb_disable_xhci_ports(to_pci_dev(hcd->self.controller));
spin_lock_irq(&xhci->lock);
+ if (!HCD_HW_ACCESSIBLE(hcd)) {
+ spin_unlock_irq(&xhci->lock);
+ return;
+ }
xhci_halt(xhci);
/* Workaround for spurious wakeups at shutdown with HSW */
if (xhci->quirks & XHCI_SPURIOUS_WAKEUP)
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
"xhci_shutdown completed - status = %x",
readl(&xhci->op_regs->status));
-
- /* Yet another workaround for spurious wakeups at shutdown with HSW */
- if (xhci->quirks & XHCI_SPURIOUS_WAKEUP)
- pci_set_power_state(to_pci_dev(hcd->self.controller), PCI_D3hot);
}
+ EXPORT_SYMBOL_GPL(xhci_shutdown);
#ifdef CONFIG_PM
static void xhci_save_registers(struct xhci_hcd *xhci)
int xhci_suspend(struct xhci_hcd *xhci, bool do_wakeup)
{
int rc = 0;
- unsigned int delay = XHCI_MAX_HALT_USEC;
+ unsigned int delay = XHCI_MAX_HALT_USEC * 2;
struct usb_hcd *hcd = xhci_to_hcd(xhci);
u32 command;
- if (!hcd->state)
+ if (!hcd->state || xhci->suspended)
return 0;
if (hcd->state != HC_STATE_SUSPENDED ||
/* step 5: remove core well power */
/* synchronize irq when using MSI-X */
xhci_msix_sync_irqs(xhci);
+ xhci->suspended = true;
return rc;
}
int retval = 0;
bool comp_timer_running = false;
- if (!hcd->state)
+ if (!hcd->state || !xhci->suspended)
return 0;
/* Wait a bit if either of the roothubs need to settle from the
/* Re-enable port polling. */
xhci_dbg(xhci, "%s: starting port polling.\n", __func__);
+ xhci->suspended = false;
set_bit(HCD_FLAG_POLL_RH, &xhci->shared_hcd->flags);
usb_hcd_poll_rh_status(xhci->shared_hcd);
set_bit(HCD_FLAG_POLL_RH, &hcd->flags);
exit:
return ret;
dying:
- xhci_dbg(xhci, "Ep 0x%x: URB %p submitted for "
+ xhci_dbg(xhci, "Ep 0x%x: URB %pK submitted for "
"non-responsive xHCI host.\n",
urb->ep->desc.bEndpointAddress, urb);
ret = -ESHUTDOWN;
i = urb_priv->td_cnt;
if (i < urb_priv->length)
xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb,
- "Cancel URB %p, dev %s, ep 0x%x, "
+ "Cancel URB %pK, dev %s, ep 0x%x, "
"starting at offset 0x%llx",
urb, urb->dev->devpath,
urb->ep->desc.bEndpointAddress,
if (xhci->xhc_state & XHCI_STATE_DYING)
return -ENODEV;
- xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
+ xhci_dbg(xhci, "%s called for udev %pK\n", __func__, udev);
drop_flag = xhci_get_endpoint_flag(&ep->desc);
if (drop_flag == SLOT_FLAG || drop_flag == EP0_FLAG) {
xhci_dbg(xhci, "xHCI %s - can't drop slot or ep 0 %#x\n",
xhci_get_endpoint_flag(&ep->desc)) {
/* Do not warn when called after a usb_device_reset */
if (xhci->devs[udev->slot_id]->eps[ep_index].ring != NULL)
- xhci_warn(xhci, "xHCI %s called with disabled ep %p\n",
+ xhci_warn(xhci, "xHCI %s called with disabled ep %pK\n",
__func__, ep);
return 0;
}
* ignore this request.
*/
if (le32_to_cpu(ctrl_ctx->add_flags) & added_ctxs) {
- xhci_warn(xhci, "xHCI %s called with enabled ep %p\n",
+ xhci_warn(xhci, "xHCI %s called with enabled ep %pK\n",
__func__, ep);
return 0;
}
(xhci->xhc_state & XHCI_STATE_REMOVING))
return -ENODEV;
- xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
+ xhci_dbg(xhci, "%s called for udev %pK\n", __func__, udev);
virt_dev = xhci->devs[udev->slot_id];
command = xhci_alloc_command(xhci, false, true, GFP_KERNEL);
return;
xhci = hcd_to_xhci(hcd);
- xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
+ xhci_dbg(xhci, "%s called for udev %pK\n", __func__, udev);
virt_dev = xhci->devs[udev->slot_id];
/* Free any rings allocated for added endpoints */
for (i = 0; i < 31; ++i) {
if (addr == 0) {
xhci_warn(xhci, "WARN Cannot submit config ep after "
"reset ep command\n");
- xhci_warn(xhci, "WARN deq seg = %p, deq ptr = %p\n",
+ xhci_warn(xhci, "WARN deq seg = %pK, deq ptr = %pK\n",
deq_state->new_deq_seg,
deq_state->new_deq_ptr);
return;
del_timer_sync(&virt_dev->eps[i].stop_cmd_timer);
}
+ virt_dev->udev = NULL;
spin_lock_irqsave(&xhci->lock, flags);
virt_dev->udev = NULL;
xhci_dbg_trace(xhci, trace_xhci_dbg_address,
"Op regs DCBAA ptr = %#016llx", temp_64);
xhci_dbg_trace(xhci, trace_xhci_dbg_address,
- "Slot ID %d dcbaa entry @%p = %#016llx",
+ "Slot ID %d dcbaa entry @%pK = %#016llx",
udev->slot_id,
&xhci->dcbaa->dev_context_ptrs[udev->slot_id],
(unsigned long long)
}
EXPORT_SYMBOL_GPL(xhci_gen_setup);
+dma_addr_t xhci_get_sec_event_ring_dma_addr(struct usb_hcd *hcd,
+ unsigned intr_num)
+{
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+
+ if (intr_num >= xhci->max_interrupters) {
+ xhci_err(xhci, "intr num %d >= max intrs %d\n", intr_num,
+ xhci->max_interrupters);
+ return 0;
+ }
+
+ if (!(xhci->xhc_state & XHCI_STATE_HALTED) &&
+ xhci->sec_event_ring && xhci->sec_event_ring[intr_num]
+ && xhci->sec_event_ring[intr_num]->first_seg)
+ return xhci->sec_event_ring[intr_num]->first_seg->dma;
+
+ return 0;
+}
+
+static dma_addr_t xhci_get_dcba_dma_addr(struct usb_hcd *hcd,
+ struct usb_device *udev)
+{
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+
+ if (!(xhci->xhc_state & XHCI_STATE_HALTED) && xhci->dcbaa)
+ return xhci->dcbaa->dev_context_ptrs[udev->slot_id];
+
+ return 0;
+}
+
+dma_addr_t xhci_get_xfer_ring_dma_addr(struct usb_hcd *hcd,
+ struct usb_device *udev, struct usb_host_endpoint *ep)
+{
+ int ret;
+ unsigned int ep_index;
+ struct xhci_virt_device *virt_dev;
+
+ struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+
+ ret = xhci_check_args(hcd, udev, ep, 1, true, __func__);
+ if (ret <= 0) {
+ xhci_err(xhci, "%s: invalid args\n", __func__);
+ return 0;
+ }
+
+ virt_dev = xhci->devs[udev->slot_id];
+ ep_index = xhci_get_endpoint_index(&ep->desc);
+
+ if (virt_dev->eps[ep_index].ring &&
+ virt_dev->eps[ep_index].ring->first_seg)
+ return virt_dev->eps[ep_index].ring->first_seg->dma;
+
+ return 0;
+}
+
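/*
 * Illustrative sketch (editor's example, not part of this patch): a
 * controller glue driver holding a usb_hcd pointer could consume the new
 * hc_driver hooks roughly as below; "hcd" and "intr_num" are assumptions
 * for the example.
 *
 *	if (!hcd->driver->sec_event_ring_setup(hcd, intr_num)) {
 *		dma_addr_t er_dma =
 *			hcd->driver->get_sec_event_ring_dma_addr(hcd, intr_num);
 *		// hand er_dma to the offload hardware; later, tear down:
 *		hcd->driver->sec_event_ring_cleanup(hcd, intr_num);
 *	}
 */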
static const struct hc_driver xhci_hc_driver = {
.description = "xhci-hcd",
.product_desc = "xHCI Host Controller",
.enable_usb3_lpm_timeout = xhci_enable_usb3_lpm_timeout,
.disable_usb3_lpm_timeout = xhci_disable_usb3_lpm_timeout,
.find_raw_port_number = xhci_find_raw_port_number,
+ .sec_event_ring_setup = xhci_sec_event_ring_setup,
+ .sec_event_ring_cleanup = xhci_sec_event_ring_cleanup,
+ .get_sec_event_ring_dma_addr = xhci_get_sec_event_ring_dma_addr,
+ .get_xfer_ring_dma_addr = xhci_get_xfer_ring_dma_addr,
+ .get_dcba_dma_addr = xhci_get_dcba_dma_addr,
};
void xhci_init_driver(struct hc_driver *drv,
#define XDEV_U3 (0x3 << 5)
#define XDEV_INACTIVE (0x6 << 5)
#define XDEV_POLLING (0x7 << 5)
+ #define XDEV_RECOVERY (0x8 << 5)
#define XDEV_COMP_MODE (0xa << 5)
#define XDEV_RESUME (0xf << 5)
/* true: port has power (see HCC_PPC) */
/* Our HCD's current interrupter register set */
struct xhci_intr_reg __iomem *ir_set;
+ /* secondary interrupter */
+ struct xhci_intr_reg __iomem **sec_ir_set;
+
/* Cached register copies of read-only HC data */
__u32 hcs_params1;
__u32 hcs_params2;
struct xhci_command *current_cmd;
struct xhci_ring *event_ring;
struct xhci_erst erst;
+
+ /* secondary event ring and erst */
+ struct xhci_ring **sec_event_ring;
+ struct xhci_erst *sec_erst;
+
/* Scratchpad */
struct xhci_scratchpad *scratchpad;
/* Store LPM test failed devices' information */
/* Compliance Mode Recovery Data */
struct timer_list comp_mode_recovery_timer;
u32 port_status_u0;
+ bool suspended;
/* Compliance Mode Timer Triggered every 2 seconds */
#define COMP_MODE_RCVRY_MSECS 2000
};
void xhci_urb_free_priv(struct urb_priv *urb_priv);
void xhci_free_command(struct xhci_hcd *xhci,
struct xhci_command *command);
+int xhci_sec_event_ring_setup(struct usb_hcd *hcd, unsigned intr_num);
+int xhci_sec_event_ring_cleanup(struct usb_hcd *hcd, unsigned intr_num);
/* xHCI host controller glue */
typedef void (*xhci_get_quirks_t)(struct device *, struct xhci_hcd *);
int xhci_handshake(void __iomem *ptr, u32 mask, u32 done, int usec);
+int xhci_handshake_check_state(struct xhci_hcd *xhci,
+ void __iomem *ptr, u32 mask, u32 done, int usec);
void xhci_quiesce(struct xhci_hcd *xhci);
int xhci_halt(struct xhci_hcd *xhci);
int xhci_reset(struct xhci_hcd *xhci);
void xhci_stop(struct usb_hcd *hcd);
void xhci_shutdown(struct usb_hcd *hcd);
int xhci_gen_setup(struct usb_hcd *hcd, xhci_get_quirks_t get_quirks);
void xhci_init_driver(struct hc_driver *drv,
const struct xhci_driver_overrides *over);
struct xhci_slot_ctx *xhci_get_slot_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx);
struct xhci_ep_ctx *xhci_get_ep_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx, unsigned int ep_index);
+/* EHSET */
+int xhci_submit_single_step_set_feature(struct usb_hcd *hcd, struct urb *urb,
+ int is_setup);
+
#endif /* __LINUX_XHCI_HCD_H */
return "4:3";
case HDMI_PICTURE_ASPECT_16_9:
return "16:9";
+ case HDMI_PICTURE_ASPECT_64_27:
+ return "64:27";
+ case HDMI_PICTURE_ASPECT_256_135:
+ return "256:135";
case HDMI_PICTURE_ASPECT_RESERVED:
return "Reserved";
}
if (ptr[0] & 0x10)
frame->active_aspect = ptr[1] & 0xf;
if (ptr[0] & 0x8) {
- frame->top_bar = (ptr[5] << 8) + ptr[6];
- frame->bottom_bar = (ptr[7] << 8) + ptr[8];
+ frame->top_bar = (ptr[6] << 8) | ptr[5];
+ frame->bottom_bar = (ptr[8] << 8) | ptr[7];
}
if (ptr[0] & 0x4) {
- frame->left_bar = (ptr[9] << 8) + ptr[10];
- frame->right_bar = (ptr[11] << 8) + ptr[12];
+ frame->left_bar = (ptr[10] << 8) | ptr[9];
+ frame->right_bar = (ptr[12] << 8) | ptr[11];
}
frame->scan_mode = ptr[0] & 0x3;
#include <linux/balloon_compaction.h>
#include <linux/oom.h>
#include <linux/wait.h>
+#include <linux/mount.h>
/*
* Balloon device works in 4K page units. So each page is pointed to by
module_param(oom_pages, int, S_IRUSR | S_IWUSR);
MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
+#ifdef CONFIG_BALLOON_COMPACTION
+static struct vfsmount *balloon_mnt;
+#endif
+
struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
{
struct virtqueue *vqs[3];
vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
- const char *names[] = { "inflate", "deflate", "stats" };
+ static const char * const names[] = { "inflate", "deflate", "stats" };
int err, nvqs;
/*
get_page(newpage); /* balloon reference */
+ /*
+ * When we migrate a page to a different zone, and we adjusted the
+ * managed page count when inflating, we have to fix up the count of
+ * both involved zones.
+ */
+ if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) &&
+ page_zone(page) != page_zone(newpage)) {
+ adjust_managed_page_count(page, 1);
+ adjust_managed_page_count(newpage, -1);
+ }
+
/* balloon's page migration 1st step -- inflate "newpage" */
spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
balloon_page_insert(vb_dev_info, newpage);
return MIGRATEPAGE_SUCCESS;
}
+
+static struct dentry *balloon_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+
+ return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops,
+ BALLOON_KVM_MAGIC);
+}
+
+static struct file_system_type balloon_fs = {
+ .name = "balloon-kvm",
+ .mount = balloon_mount,
+ .kill_sb = kill_anon_super,
+};
+
#endif /* CONFIG_BALLOON_COMPACTION */
static int virtballoon_probe(struct virtio_device *vdev)
vb->need_stats_update = 0;
balloon_devinfo_init(&vb->vb_dev_info);
-#ifdef CONFIG_BALLOON_COMPACTION
- vb->vb_dev_info.migratepage = virtballoon_migratepage;
-#endif
err = init_vqs(vb);
if (err)
vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
err = register_oom_notifier(&vb->nb);
if (err < 0)
- goto out_oom_notify;
+ goto out_del_vqs;
+
+#ifdef CONFIG_BALLOON_COMPACTION
+ balloon_mnt = kern_mount(&balloon_fs);
+ if (IS_ERR(balloon_mnt)) {
+ err = PTR_ERR(balloon_mnt);
+ unregister_oom_notifier(&vb->nb);
+ goto out_del_vqs;
+ }
+
+ vb->vb_dev_info.migratepage = virtballoon_migratepage;
+ vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
+ if (IS_ERR(vb->vb_dev_info.inode)) {
+ err = PTR_ERR(vb->vb_dev_info.inode);
+ kern_unmount(balloon_mnt);
+ unregister_oom_notifier(&vb->nb);
+ vb->vb_dev_info.inode = NULL;
+ goto out_del_vqs;
+ }
+ vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
+#endif
virtio_device_ready(vdev);
out_del_vqs:
unregister_oom_notifier(&vb->nb);
-out_oom_notify:
vdev->config->del_vqs(vdev);
out_free_vb:
kfree(vb);
unregister_oom_notifier(&vb->nb);
kthread_stop(vb->thread);
remove_common(vb);
+ if (vb->vb_dev_info.inode)
+ iput(vb->vb_dev_info.inode);
kfree(vb);
}
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
- cifs_down_write(&cinode->lock_sem);
- list_add(&fdlocks->llist, &cinode->llist);
- up_write(&cinode->lock_sem);
cfile->count = 1;
cfile->pid = current->tgid;
oplock = 0;
}
+ cifs_down_write(&cinode->lock_sem);
+ list_add(&fdlocks->llist, &cinode->llist);
+ up_write(&cinode->lock_sem);
+
spin_lock(&tcon->open_file_lock);
if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
oplock = fid->pending_open->oplock;
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
+ /* O_SYNC also has bit for O_DSYNC so following check picks up either */
+ if (cfile->f_flags & O_SYNC)
+ create_options |= CREATE_WRITE_THROUGH;
+
+ if (cfile->f_flags & O_DIRECT)
+ create_options |= CREATE_NO_BUFFER;
+
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &cfile->fid);
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
kfree(forget);
if (ret == -ENOMEM)
goto out;
- if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ if (ret || fuse_invalid_attr(&outarg.attr) ||
+ (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;
fuse_change_attributes(inode, &outarg.attr,
goto out;
}
+/*
+ * Get the canonical path. Since we must translate to a path, this must be done
+ * in the context of the userspace daemon, however, the userspace daemon cannot
+ * look up paths on its own. Instead, we handle the lookup as a special case
+ * inside of the write request.
+ */
+static void fuse_dentry_canonical_path(const struct path *path,
+ struct path *canonical_path)
+{
+ struct inode *inode = path->dentry->d_inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ int err;
+ char *path_name;
+
+ req = fuse_get_req(fc, 1);
+ err = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto default_path;
+
+ path_name = (char*)__get_free_page(GFP_KERNEL);
+ if (!path_name) {
+ fuse_put_request(fc, req);
+ goto default_path;
+ }
+
+ req->in.h.opcode = FUSE_CANONICAL_PATH;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 0;
+ req->out.numargs = 1;
+ req->out.args[0].size = PATH_MAX;
+ req->out.args[0].value = path_name;
+ req->canonical_path = canonical_path;
+ req->out.argvar = 1;
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ free_page((unsigned long)path_name);
+ if (!err)
+ return;
+default_path:
+ canonical_path->dentry = path->dentry;
+ canonical_path->mnt = path->mnt;
+ path_get(canonical_path);
+}
+
static int invalid_nodeid(u64 nodeid)
{
return !nodeid || nodeid == FUSE_ROOT_ID;
const struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
+ .d_canonical_path = fuse_dentry_canonical_path,
};
int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
+ bool fuse_invalid_attr(struct fuse_attr *attr)
+ {
+ return !fuse_valid_type(attr->mode) ||
+ attr->size > LLONG_MAX;
+ }
+
int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
err = -EIO;
if (!outarg->nodeid)
goto out_put_forget;
- if (!fuse_valid_type(outarg->attr.mode))
+ if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
args.out.args[0].value = &outentry;
args.out.args[1].size = sizeof(outopen);
args.out.args[1].value = &outopen;
+ args.out.passthrough_filp = NULL;
err = fuse_simple_request(fc, &args);
if (err)
goto out_free_ff;
err = -EIO;
- if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
+ if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) ||
+ fuse_invalid_attr(&outentry.attr))
goto out_free_ff;
ff->fh = outopen.fh;
ff->nodeid = outentry.nodeid;
ff->open_flags = outopen.open_flags;
+ if (args.out.passthrough_filp != NULL)
+ ff->passthrough_filp = args.out.passthrough_filp;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
&outentry.attr, entry_attr_timeout(&outentry), 0);
if (!inode) {
goto out_put_forget_req;
err = -EIO;
- if (invalid_nodeid(outarg.nodeid))
+ if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr))
goto out_put_forget_req;
if ((outarg.attr.mode ^ mode) & S_IFMT)
spin_lock(&fc->lock);
fi->attr_version = ++fc->attr_version;
- inc_nlink(inode);
+ if (likely(inode->i_nlink < UINT_MAX))
+ inc_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
fuse_update_ctime(inode);
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err) {
- if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
err = -EIO;
} else {
if (invalid_nodeid(o->nodeid))
return -EIO;
- if (!fuse_valid_type(o->attr.mode))
+ if (fuse_invalid_attr(&o->attr))
return -EIO;
fc = get_fuse_conn(dir);
goto error;
}
- if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
err = -EIO;
goto error;
/** Has flock been performed on this file? */
bool flock:1;
+
+ /* backing file used for read/write passthrough */
+ struct file *passthrough_filp;
+ bool passthrough_enabled;
};
/** One input argument of a request */
unsigned argvar:1;
unsigned numargs;
struct fuse_arg args[2];
+ struct file *passthrough_filp;
} out;
};
/** Inode used in the request or NULL */
struct inode *inode;
+ /** Path used for completing d_canonical_path */
+ struct path *canonical_path;
+
/** AIO control block */
struct fuse_io_priv *io;
/** Request is stolen from fuse_file->reserved_req */
struct file *stolen_file;
+
+ /** fuse passthrough file */
+ struct file *passthrough_filp;
};
struct fuse_iqueue {
/** write-back cache policy (default is write-through) */
unsigned writeback_cache:1;
+ /** passthrough IO. */
+ unsigned passthrough:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
*/
int fuse_valid_type(int m);
+ bool fuse_invalid_attr(struct fuse_attr *attr);
+
/**
* Is current process allowed to perform filesystem operation?
*/
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
- "Ngid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n"
+ "Ngid:\t%d\n"
"FDSize:\t%d\nGroups:\t",
get_task_state(p),
- tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid,
+ tgid, pid_nr_ns(pid, ns), ppid, tpid,
from_kuid_munged(user_ns, cred->uid),
from_kuid_munged(user_ns, cred->euid),
from_kuid_munged(user_ns, cred->suid),
from_kgid_munged(user_ns, cred->egid),
from_kgid_munged(user_ns, cred->sgid),
from_kgid_munged(user_ns, cred->fsgid),
- max_fds);
+ ngid, max_fds);
group_info = cred->group_info;
for (g = 0; g < group_info->ngroups; g++)
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
- if (permitted) {
- eip = KSTK_EIP(task);
- esp = KSTK_ESP(task);
+ /*
+ * esp and eip are intentionally zeroed out. There is no
+ * non-racy way to read them without freezing the task.
+ * Programs that need reliable values can use ptrace(2).
+ *
+ * The only exception is if the task is core dumping because
+ * a program is not able to use ptrace(2) in that case. It is
+ * safe because the task has stopped executing permanently.
+ */
+ if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
+ if (try_get_task_stack(task)) {
+ eip = KSTK_EIP(task);
+ esp = KSTK_ESP(task);
+ put_task_stack(task);
+ }
}
}
int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
int (*dma_supported)(struct device *dev, u64 mask);
int (*set_dma_mask)(struct device *dev, u64 mask);
+ void *(*remap)(struct device *dev, void *cpu_addr, dma_addr_t handle,
+ size_t size, struct dma_attrs *attrs);
+ void (*unremap)(struct device *dev, void *remapped_address,
+ size_t size);
#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
u64 (*get_required_mask)(struct device *dev);
#endif
#include <asm-generic/dma-mapping-broken.h>
#endif
+#ifndef CONFIG_NO_DMA
+static inline void *dma_remap(struct device *dev, void *cpu_addr,
+ dma_addr_t dma_handle, size_t size, struct dma_attrs *attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ BUG_ON(!ops);
+
+ if (!ops->remap) {
+ WARN_ONCE(1, "Remap function not implemented for %pS\n",
+ ops->remap);
+ return NULL;
+ }
+
+ return ops->remap(dev, cpu_addr, dma_handle, size, attrs);
+}
+
+
+static inline void dma_unremap(struct device *dev, void *remapped_addr,
+ size_t size)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ BUG_ON(!ops);
+
+ if (!ops->unremap) {
+ WARN_ONCE(1, "unremap function not implemented for %pS\n",
+ ops->unremap);
+ return;
+ }
+
+ return ops->unremap(dev, remapped_addr, size);
+}
+#endif
+
+
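/*
 * Illustrative sketch (editor's example, not part of this patch): a driver
 * that already owns a coherent buffer could use the new hooks roughly as
 * below; "dev", "vaddr", "handle" and "size" are assumptions for the example.
 *
 *	void *remapped = dma_remap(dev, vaddr, handle, size, NULL);
 *
 *	if (remapped) {
 *		// ... access the buffer through the alternate mapping ...
 *		dma_unremap(dev, remapped, size);
 *	}
 */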
static inline u64 dma_get_mask(struct device *dev)
{
if (dev && dev->dma_mask && *dev->dma_mask)
return SZ_64K;
}
- static inline unsigned int dma_set_max_seg_size(struct device *dev,
- unsigned int size)
+ static inline int dma_set_max_seg_size(struct device *dev, unsigned int size)
{
if (dev->dma_parms) {
dev->dma_parms->max_segment_size = size;
unsigned char if_port;
unsigned char dma;
+ /* Note : dev->mtu is often read without holding a lock.
+ * Writers usually hold RTNL.
+ * It is recommended to use READ_ONCE() to annotate the reads,
+ * and to use WRITE_ONCE() to annotate the writes.
+ */
unsigned int mtu;
unsigned short type;
unsigned short hard_header_len;
*/
struct softnet_data {
struct list_head poll_list;
+ struct napi_struct *current_napi;
struct sk_buff_head process_queue;
/* stats */
unsigned int time_squeeze;
unsigned int cpu_collision;
unsigned int received_rps;
+ unsigned int gro_coalesced;
+
#ifdef CONFIG_RPS
struct softnet_data *rps_ipi_list;
#endif
gro_result_t napi_gro_frags(struct napi_struct *napi);
struct packet_offload *gro_find_receive_by_type(__be16 type);
struct packet_offload *gro_find_complete_by_type(__be16 type);
+extern struct napi_struct *get_current_napi_context(void);
static inline void napi_free_frags(struct napi_struct *napi)
{
* Data passed is old voltage cast to (void *).
* PRE_DISABLE Regulator is about to be disabled
* ABORT_DISABLE Regulator disable failed for some reason
+ * ENABLE Regulator was enabled.
*
* NOTE: These events can be OR'ed together when passed into handler.
*/
#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200
#define REGULATOR_EVENT_PRE_DISABLE 0x400
#define REGULATOR_EVENT_ABORT_DISABLE 0x800
+#define REGULATOR_EVENT_ENABLE 0x1000
/**
* struct pre_voltage_change_data - Data sent with PRE_VOLTAGE_CHANGE event
* using the bulk regulator APIs.
* @consumer: The regulator consumer for the supply. This will be managed
* by the bulk API.
+ * @min_uV: The minimum requested voltage for the regulator (in microvolts),
+ * or 0 to not set a voltage.
+ * @max_uV: The maximum requested voltage for the regulator (in microvolts),
+ * or 0 to use @min_uV.
*
* The regulator APIs provide a series of regulator_bulk_() API calls as
* a convenience to consumers which require multiple supplies. This
struct regulator_bulk_data {
const char *supply;
struct regulator *consumer;
+ int min_uV;
+ int max_uV;
/* private: Internal use */
int ret;
struct regulator_bulk_data *consumers);
int __must_check regulator_bulk_enable(int num_consumers,
struct regulator_bulk_data *consumers);
+int regulator_bulk_set_voltage(int num_consumers,
+ struct regulator_bulk_data *consumers);
int regulator_bulk_disable(int num_consumers,
struct regulator_bulk_data *consumers);
int regulator_bulk_force_disable(int num_consumers,
int regulator_can_change_voltage(struct regulator *regulator);
int regulator_count_voltages(struct regulator *regulator);
int regulator_list_voltage(struct regulator *regulator, unsigned selector);
+int regulator_list_corner_voltage(struct regulator *regulator, int corner);
int regulator_is_supported_voltage(struct regulator *regulator,
int min_uV, int max_uV);
unsigned int regulator_get_linear_step(struct regulator *regulator);
static inline int regulator_set_load(struct regulator *regulator, int load_uA)
{
- return REGULATOR_MODE_NORMAL;
+ return 0;
}
static inline int regulator_allow_bypass(struct regulator *regulator,
return -EINVAL;
}
+static inline int regulator_list_corner_voltage(struct regulator *regulator,
+ int corner)
+{
+ return -EINVAL;
+}
#endif
static inline int regulator_set_voltage_triplet(struct regulator *regulator,
void (*set_ldisc)(struct uart_port *, struct ktermios *);
void (*pm)(struct uart_port *, unsigned int state,
unsigned int oldstate);
+ void (*wake_peer)(struct uart_port *);
/*
* Return a string describing the type of the port
struct console *cons; /* struct console, if any */
#if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(SUPPORT_SYSRQ)
unsigned long sysrq; /* sysrq timeout */
+ unsigned int sysrq_ch; /* char for sysrq */
#endif
/* flags must be updated while holding port mutex */
struct earlycon_id {
char name[16];
+ char compatible[128];
int (*setup)(struct earlycon_device *, const char *options);
} __aligned(32);
+extern const struct earlycon_id __earlycon_table[];
+extern const struct earlycon_id __earlycon_table_end[];
+
+#define OF_EARLYCON_DECLARE(_name, compat, fn) \
+ static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \
+ __used __section(__earlycon_table) \
+ = { .name = __stringify(_name), \
+ .compatible = compat, \
+ .setup = fn }
+
+#define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn)
+
extern int setup_earlycon(char *buf);
extern int of_setup_earlycon(unsigned long addr,
int (*setup)(struct earlycon_device *, const char *));
-#define EARLYCON_DECLARE(_name, func) \
- static const struct earlycon_id __earlycon_##_name \
- __used __section(__earlycon_table) \
- = { .name = __stringify(_name), \
- .setup = func }
-
-#define OF_EARLYCON_DECLARE(name, compat, fn) \
- _OF_DECLARE(earlycon, name, compat, fn, void *)
-
struct uart_port *uart_get_console(struct uart_port *ports, int nr,
struct console *c);
int uart_parse_earlycon(char *p, unsigned char *iotype, unsigned long *addr,
static inline int uart_tx_stopped(struct uart_port *port)
{
struct tty_struct *tty = port->state->port.tty;
- if (tty->stopped || port->hw_stopped)
+ if ((tty && tty->stopped) || port->hw_stopped)
return 1;
return 0;
}
}
return 0;
}
+ static inline int
+ uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch)
+ {
+ if (port->sysrq) {
+ if (ch && time_before(jiffies, port->sysrq)) {
+ port->sysrq_ch = ch;
+ port->sysrq = 0;
+ return 1;
+ }
+ port->sysrq = 0;
+ }
+ return 0;
+ }
+ static inline void
+ uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
+ {
+ int sysrq_ch;
+
+ sysrq_ch = port->sysrq_ch;
+ port->sysrq_ch = 0;
+
+ spin_unlock_irqrestore(&port->lock, irqflags);
+
+ if (sysrq_ch)
+ handle_sysrq(sysrq_ch);
+ }
#else
- #define uart_handle_sysrq_char(port,ch) ({ (void)port; 0; })
+ static inline int
+ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { return 0; }
+ static inline int
+ uart_prepare_sysrq_char(struct uart_port *port, unsigned int ch) { return 0; }
+ static inline void
+ uart_unlock_and_check_sysrq(struct uart_port *port, unsigned long irqflags)
+ {
+ spin_unlock_irqrestore(&port->lock, irqflags);
+ }
#endif
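/*
 * Illustrative sketch (editor's example, not part of this patch): a UART
 * interrupt handler adopting the new helpers defers handle_sysrq() until the
 * port lock has been dropped, roughly as below; "rx_ready", "read_rx_fifo",
 * "lsr", "overrun_mask", "ch" and "flag" are driver-specific assumptions.
 *
 *	spin_lock_irqsave(&port->lock, flags);
 *	while (rx_ready(port)) {
 *		ch = read_rx_fifo(port);
 *		if (uart_prepare_sysrq_char(port, ch))
 *			continue;
 *		uart_insert_char(port, lsr, overrun_mask, ch, flag);
 *	}
 *	tty_flip_buffer_push(&port->state->port);
 *	uart_unlock_and_check_sysrq(port, flags);
 */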
/*
#include <linux/types.h>
#include <linux/bug.h>
+#include <linux/restart_block.h>
-#ifdef CONFIG_THREAD_INFO_IN_TASK
-#define current_thread_info() ((struct thread_info *)current)
-#endif
-
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ struct timespec;
+ struct compat_timespec;
+
/*
- * System call restart block.
+ * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
+ * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
+ * including <asm/current.h> can cause a circular dependency on some platforms.
*/
-struct restart_block {
- long (*fn)(struct restart_block *);
- union {
- /* For futex_wait and futex_wait_requeue_pi */
- struct {
- u32 __user *uaddr;
- u32 val;
- u32 flags;
- u32 bitset;
- u64 time;
- u32 __user *uaddr2;
- } futex;
- /* For nanosleep */
- struct {
- clockid_t clockid;
- struct timespec __user *rmtp;
-#ifdef CONFIG_COMPAT
- struct compat_timespec __user *compat_rmtp;
+#include <asm/current.h>
+#define current_thread_info() ((struct thread_info *)current)
#endif
- u64 expires;
- } nanosleep;
- /* For poll */
- struct {
- struct pollfd __user *ufds;
- int nfds;
- int has_timeout;
- unsigned long tv_sec;
- unsigned long tv_nsec;
- } poll;
- };
-};
-
-extern long do_no_restart_syscall(struct restart_block *parm);
#include <linux/bitops.h>
#include <asm/thread_info.h>
#error "no set_restore_sigmask() provided and default one won't work"
#endif
+#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
+static inline int arch_within_stack_frames(const void * const stack,
+ const void * const stackend,
+ const void *obj, unsigned long len)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_HARDENED_USERCOPY
+extern void __check_object_size(const void *ptr, unsigned long n,
+ bool to_user);
+
+static __always_inline void check_object_size(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ if (!__builtin_constant_p(n))
+ __check_object_size(ptr, n, to_user);
+}
+#else
+static inline void check_object_size(const void *ptr, unsigned long n,
+ bool to_user)
+{ }
+#endif /* CONFIG_HARDENED_USERCOPY */
+
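/*
 * Illustrative sketch (editor's example, not part of this patch):
 * architecture uaccess code is expected to gate copies through
 * check_object_size() before doing the raw copy, roughly as below; the
 * __arch_copy_from_user() helper name is an assumption for the example.
 *
 *	static inline unsigned long
 *	example_copy_from_user(void *to, const void __user *from, unsigned long n)
 *	{
 *		check_object_size(to, n, false);
 *		return __arch_copy_from_user(to, from, n);
 *	}
 */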
#endif /* __KERNEL__ */
#endif /* _LINUX_THREAD_INFO_H */
/* -1 if not needed */
int bound_dev_if;
u8 tos;
+ kuid_t uid;
};
#define IP_REPLY_ARG_NOSRCCHECK 1
}
#endif
+extern int sysctl_reserved_port_bind;
+
/* From inetpeer.c */
extern int inet_peer_threshold;
extern int inet_peer_minttl;
int ip_misc_proc_init(void);
#endif
+ static inline bool inetdev_valid_mtu(unsigned int mtu)
+ {
+ return likely(mtu >= IPV4_MIN_MTU);
+ }
+
#endif /* _IP_H */
* most likely due to retrans in 3WHS.
*/
+/* Number of full MSS to receive before Acking RFC2581 */
+#define TCP_DELACK_SEG 1
+
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources.
*/
extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio;
extern int sysctl_tcp_pacing_ca_ratio;
+extern int sysctl_tcp_default_init_rwnd;
extern atomic_long_t tcp_memory_allocated;
+
+/* sysctl variables for controlling various tcp parameters */
+extern int sysctl_tcp_delack_seg;
+extern int sysctl_tcp_use_userconfig;
+
extern struct percpu_counter tcp_sockets_allocated;
extern int tcp_memory_pressure;
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
+/* sysctl master controller */
+extern int tcp_use_userconfig_sysctl_handler(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
+extern int tcp_proc_delayed_ack_control(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
+
void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
+
static inline void tcp_dec_quickack_mode(struct sock *sk,
const unsigned int pkts)
{
*/
static inline void tcp_synq_overflow(const struct sock *sk)
{
- unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
+ unsigned long last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
unsigned long now = jiffies;
- if (time_after(now, last_overflow + HZ))
- tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
+ if (!time_between32(now, last_overflow, last_overflow + HZ))
+ WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
}
/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
- unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-
- return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID);
+ unsigned long last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
+
+ /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID,
+ * then we're under synflood. However, we have to use
+ * 'last_overflow - HZ' as lower bound. That's because a concurrent
+ * tcp_synq_overflow() could update .ts_recent_stamp after we read
+ * jiffies but before we store .ts_recent_stamp into last_overflow,
+ * which could lead to rejecting a valid syncookie.
+ */
+ return !time_between32(jiffies, last_overflow - HZ,
+ last_overflow + TCP_SYNCOOKIE_VALID);
}
static inline u32 tcp_cookie_time(void)
void tcp_send_window_probe(struct sock *sk);
-/* TCP timestamps are only 32-bits, this causes a slight
- * complication on 64-bit systems since we store a snapshot
- * of jiffies in the buffer control blocks below. We decided
- * to use only the low 32-bits of jiffies and hide the ugly
- * casts with the following macro.
+/* TCP uses 32bit jiffies to save some space.
+ * Note that this is different from tcp_time_stamp, which
+ * historically has been the same until linux-4.13.
+ */
+#define tcp_jiffies32 ((u32)jiffies)
+
+/* Generator for TCP TS option (RFC 7323)
+ * Currently tied to 'jiffies' but will soon be driven by 1 ms clock.
*/
#define tcp_time_stamp ((__u32)(jiffies))
void tcp_done(struct sock *sk);
+int tcp_abort(struct sock *sk, int err);
+
static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
{
rx_opt->dsack = 0;
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
return 0;
out:
mutex_unlock(&module_mutex);
/* We'll tack temporary mod_kallsyms on the end. */
mod->init_size = ALIGN(mod->init_size,
- __alignof__(struct mod_kallsyms));
+ __alignof__(struct mod_kallsyms));
info->mod_kallsyms_init_off = mod->init_size;
mod->init_size += sizeof(struct mod_kallsyms);
mod->init_size = debug_align(mod->init_size);
return vmalloc_exec(size);
}
-#ifdef CONFIG_DEBUG_KMEMLEAK
+#if defined(CONFIG_DEBUG_KMEMLEAK) && defined(CONFIG_DEBUG_MODULE_SCAN_OFF)
+static void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ kmemleak_no_scan(mod->module_core);
+}
+#elif defined(CONFIG_DEBUG_KMEMLEAK)
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
{
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
-
-#include <trace/events/sched.h>
+#include <linux/module.h>
#include "sched.h"
+#include <trace/events/sched.h>
+#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_cstate_aware = 1;
+
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
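/*
 * Worked example of the margin check: with capacity_margin = 1280, a task
 * utilization of 800 fits a CPU of capacity 1024 because
 * 800 * 1280 = 1024000 < 1024 * 1024 = 1048576; in other words, utilization
 * must stay below roughly 80% of the CPU's capacity (1024 * 1024 / 1280 = 819.2).
 */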
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+#ifdef CONFIG_SMP
+static int active_load_balance_cpu_stop(void *data);
+#endif
const struct sched_class fair_sched_class;
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beginning of the tree.
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If the parent is already on the list, we add the child
+ * just before it. Thanks to the circular property of
+ * the list, this means putting the child at the tail
+ * of the list that starts with the parent.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * A cfs_rq without a parent should be put
+ * at the tail of the list.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the beginning of a tree, so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not been added yet, so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch
+ * where we will add the parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * Update tmp_alone_branch to point to the new beginning
+ * of the branch.
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
{
struct sched_avg *sa = &se->avg;
- sa->last_update_time = 0;
+ memset(sa, 0, sizeof(*sa));
/*
+ * util_avg is initialized in post_init_entity_util_avg.
+ * util_est should start from zero.
* sched_avg's period_contrib should be strictly less then 1024, so
* we give it 1023 to make sure it is almost a period (1024us), and
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are initialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are initialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-#else
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, a simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ /*
+ * If we wish to restore tuning via setting initial util,
+ * this is where we should do it.
+ */
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ return;
+ }
+ }
+
+ attach_entity_cfs_rq(se);
+}
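/*
 * Worked example of the capping above (illustrative numbers): with
 * cfs_rq->avg.util_avg = 512 and cfs_rq->avg.load_avg = 1023, a new task of
 * weight 1024 would be extrapolated to util_avg = 512 * 1024 / 1024 = 512,
 * but cap = (1024 - 512) / 2 = 256, so the task actually starts with
 * util_avg = 256 and util_sum = 256 * LOAD_AVG_MAX.
 */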
+
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
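/*
 * Example of the migration handling above (illustrative numbers): a task that
 * has already waited 3ms on CPU0 is migrated; update_stats_wait_end() stores
 * delta = 3ms into wait_start instead of reporting it. On CPU1,
 * update_stats_wait_start() sees a migrating task and sets
 * wait_start = rq_clock(CPU1) - 3ms, so if the task runs after a further 2ms
 * the wait accounted on CPU1 is the full 5ms rather than just 2ms.
 */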
+#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
* Call select_idle_sibling to maybe find a better one.
*/
if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
assign:
assigned = true;
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
-
- return tg_weight;
-}
+ load = scale_load_down(cfs_rq->load.weight);
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
return tg->shares;
}
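/*
 * Sketch of the resulting shares (illustrative numbers; the division by
 * tg_weight and the clamping happen in the remainder of calc_cfs_shares()):
 * with tg->shares = 1024, tg->load_avg = 3072, this cfs_rq's
 * tg_load_avg_contrib = 1024 and a local load.weight of 2048,
 * tg_weight = 3072 - 1024 + 2048 = 4096 and
 * shares = 1024 * 2048 / 4096 = 512, i.e. this CPU's group entity carries
 * half of the group's shares.
 */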
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
+u32 sched_get_wake_up_idle(struct task_struct *p)
+{
+ u32 enabled = p->flags & PF_WAKE_UP_IDLE;
+
+ return !!enabled;
+}
+EXPORT_SYMBOL(sched_get_wake_up_idle);
+
+int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
+{
+ int enable = !!wake_up_idle;
+
+ if (enable)
+ p->flags |= PF_WAKE_UP_IDLE;
+ else
+ p->flags &= ~PF_WAKE_UP_IDLE;
+
+ return 0;
+}
+EXPORT_SYMBOL(sched_set_wake_up_idle);
+
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
return contrib + runnable_avg_yN_sum[n];
}
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
+#ifdef CONFIG_SCHED_HMP
+
+/* CPU selection flag */
+#define SBC_FLAG_PREV_CPU 0x1
+#define SBC_FLAG_BEST_CAP_CPU 0x2
+#define SBC_FLAG_CPU_COST 0x4
+#define SBC_FLAG_MIN_COST 0x8
+#define SBC_FLAG_IDLE_LEAST_LOADED 0x10
+#define SBC_FLAG_IDLE_CSTATE 0x20
+#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
+#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
+#define SBC_FLAG_CSTATE_LOAD 0x100
+#define SBC_FLAG_BEST_SIBLING 0x200
+#define SBC_FLAG_WAKER_CPU 0x400
+#define SBC_FLAG_PACK_TASK 0x800
+
+/* Cluster selection flag */
+#define SBC_FLAG_COLOC_CLUSTER 0x10000
+#define SBC_FLAG_WAKER_CLUSTER 0x20000
+#define SBC_FLAG_BACKUP_CLUSTER 0x40000
+#define SBC_FLAG_BOOST_CLUSTER 0x80000
+
+struct cpu_select_env {
+ struct task_struct *p;
+ struct related_thread_group *rtg;
+ u8 reason;
+ u8 need_idle:1;
+ u8 need_waker_cluster:1;
+ u8 sync:1;
+ enum sched_boost_policy boost_policy;
+ u8 pack_task:1;
+ int prev_cpu;
+ DECLARE_BITMAP(candidate_list, NR_CPUS);
+ DECLARE_BITMAP(backup_list, NR_CPUS);
+ u64 task_load;
+ u64 cpu_load;
+ u32 sbc_best_flag;
+ u32 sbc_best_cluster_flag;
+ struct cpumask search_cpus;
+};
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+struct cluster_cpu_stats {
+ int best_idle_cpu, least_loaded_cpu;
+ int best_capacity_cpu, best_cpu, best_sibling_cpu;
+ int min_cost, best_sibling_cpu_cost;
+ int best_cpu_wakeup_latency;
+ u64 min_load, best_load, best_sibling_cpu_load;
+ s64 highest_spare_capacity;
+};
/*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series. To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
- *
- * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
- * p0 p1 p2
- * (now) (~1ms ago) (~2ms ago)
- *
- * Let u_i denote the fraction of p_i that the entity was runnable.
- *
- * We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
- * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
- *
- * We choose y based on the with of a reasonably scheduling period, fixing:
- * y^32 = 0.5
- *
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
+ * Should task be woken to any available idle cpu?
*
- * When a period "rolls over" and we have new u_0`, multiplying the previous
- * sum again by y is sufficient to update:
- * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ * Waking tasks to an idle cpu has mixed implications for both performance and
+ * power. In many cases, the scheduler can't correctly estimate the impact of
+ * using idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an
+ * external kernel module to pass a strong hint to the scheduler that the task
+ * in question should be woken to an idle cpu, generally to improve performance.
*/
-static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+static inline int wake_to_idle(struct task_struct *p)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ return (current->flags & PF_WAKE_UP_IDLE) ||
+ (p->flags & PF_WAKE_UP_IDLE);
+}
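/*
 * Usage sketch (illustrative): a driver that knows a particular kthread is
 * latency critical could call
 *
 *   sched_set_wake_up_idle(task, 1);
 *
 * after which wake_to_idle() returns true for wakeups of (or by) that task
 * and select_best_cpu() below biases its placement towards idle CPUs.
 */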
- delta = now - sa->last_update_time;
- /*
- * This should only happen when time goes backwards, which it
- * unfortunately does during sched clock init when we swap over to TSC.
- */
- if ((s64)delta < 0) {
- sa->last_update_time = now;
- return 0;
- }
+static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
+{
+ u64 total_load;
- /*
- * Use 1024ns as the unit of measurement since it's a reasonable
- * approximation of 1us and fast to compute.
- */
- delta >>= 10;
- if (!delta)
- return 0;
- sa->last_update_time = now;
+ total_load = env->task_load + env->cpu_load;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
+ return 0;
+}
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+ int tcpu = task_cpu(env->p);
+ int skip = 0;
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
+ if (!env->reason)
+ return 0;
- delta -= delta_w;
+ if (is_reserved(cpu))
+ return 1;
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
+ switch (env->reason) {
+ case UP_MIGRATION:
+ skip = !idle_cpu(cpu);
+ break;
+ case IRQLOAD_MIGRATION:
+ /* Purposely fall through */
+ default:
+ skip = (cpu == tcpu);
+ break;
+ }
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+ return skip;
+}
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
- }
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ int tcpu;
+
+ if (!env->reason)
+ return 1;
+
+ tcpu = task_cpu(env->p);
+ switch (env->reason) {
+ case UP_MIGRATION:
+ return cluster->capacity > cpu_capacity(tcpu);
+
+ case DOWN_MIGRATION:
+ return cluster->capacity < cpu_capacity(tcpu);
+
+ default:
+ break;
+ }
+
+ return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ if (!test_bit(cluster->id, env->candidate_list))
+ return 1;
+
+ if (!acceptable_capacity(cluster, env)) {
+ __clear_bit(cluster->id, env->candidate_list);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+ struct sched_cluster *cluster;
+
+ if (env->rtg) {
+ int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
+
+ if (task_load_will_fit(env->p, env->task_load,
+ cpu, env->boost_policy)) {
+ env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
+
+ if (env->boost_policy == SCHED_BOOST_NONE)
+ return env->rtg->preferred_cluster;
+
+ for_each_sched_cluster(cluster) {
+ if (cluster != env->rtg->preferred_cluster) {
+ __set_bit(cluster->id,
+ env->backup_list);
+ __clear_bit(cluster->id,
+ env->candidate_list);
+ }
+ }
+
+ return env->rtg->preferred_cluster;
+ }
+
+ /*
+ * Since the task load does not fit on the preferred
+ * cluster anymore, pretend that the task does not
+ * have any preferred cluster. This allows the waking
+ * task to get the appropriate CPU it needs as per the
+ * non co-location placement policy without having to
+ * wait until the preferred cluster is updated.
+ */
+ env->rtg = NULL;
+ }
+
+ for_each_sched_cluster(cluster) {
+ if (!skip_cluster(cluster, env)) {
+ int cpu = cluster_first_cpu(cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cpu);
+ if (task_load_will_fit(env->p, env->task_load, cpu,
+ env->boost_policy))
+ return cluster;
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ }
+ }
+
+ return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+ int cluster_id;
+
+ cluster_id = find_next_bit(list, end, start - 1 + 1);
+ if (cluster_id >= end)
+ return NULL;
+
+ return sched_cluster[cluster_id];
+}
+
+static void
+update_spare_capacity(struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu, int capacity,
+ u64 cpu_load)
+{
+ s64 spare_capacity = sched_ravg_window - cpu_load;
+
+ if (spare_capacity > 0 &&
+ (spare_capacity > stats->highest_spare_capacity ||
+ (spare_capacity == stats->highest_spare_capacity &&
+ ((!env->need_waker_cluster &&
+ capacity > cpu_capacity(stats->best_capacity_cpu)) ||
+ (env->need_waker_cluster &&
+ cpu_rq(cpu)->nr_running <
+ cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
+ /*
+ * If the sync waker is the only runnable task on its CPU, the
+ * CPU's cr_avg is 0, so there is a high chance of placing the
+ * wakee on the waker's CPU, which would likely preempt the waker.
+ * That in turn can cause the preempted waker to be migrated. Place
+ * the wakee on a genuinely idle CPU when possible, by checking
+ * nr_running, to avoid such preemption.
+ */
+ stats->highest_spare_capacity = spare_capacity;
+ stats->best_capacity_cpu = cpu;
+ }
+}
+
+static inline void find_backup_cluster(
+struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+ int i;
+ struct cpumask search_cpus;
+
+ extern int num_clusters;
+
+ while (!bitmap_empty(env->backup_list, num_clusters)) {
+ next = next_candidate(env->backup_list, 0, num_clusters);
+ __clear_bit(next->id, env->backup_list);
+
+ cpumask_and(&search_cpus, &env->search_cpus, &next->cpus);
+ for_each_cpu(i, &search_cpus) {
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i), power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ update_spare_capacity(stats, env, i, next->capacity,
+ cpu_load_sync(i, env->sync));
+ }
+ env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
+ }
+}
+
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
+ struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+
+ extern int num_clusters;
+
+ __clear_bit(cluster->id, env->candidate_list);
+
+ if (env->rtg && preferred_cluster(cluster, env->p))
+ return NULL;
+
+ do {
+ if (bitmap_empty(env->candidate_list, num_clusters))
+ return NULL;
+
+ next = next_candidate(env->candidate_list, 0, num_clusters);
+ if (next) {
+ if (next->min_power_cost > stats->min_cost) {
+ clear_bit(next->id, env->candidate_list);
+ next = NULL;
+ continue;
+ }
+
+ if (skip_cluster(next, env))
+ next = NULL;
+ }
+ } while (!next);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(next));
+ return next;
+}
+
+#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int wakeup_latency;
+ int prev_cpu = env->prev_cpu;
+
+ wakeup_latency = cpu_rq(cpu)->wakeup_latency;
+
+ if (env->need_idle) {
+ stats->min_cost = cpu_cost;
+ if (idle_cpu(cpu)) {
+ if (wakeup_latency < stats->best_cpu_wakeup_latency ||
+ (wakeup_latency == stats->best_cpu_wakeup_latency &&
+ cpu == prev_cpu)) {
+ stats->best_idle_cpu = cpu;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ }
+ } else {
+ if (env->cpu_load < stats->min_load ||
+ (env->cpu_load == stats->min_load &&
+ cpu == prev_cpu)) {
+ stats->least_loaded_cpu = cpu;
+ stats->min_load = env->cpu_load;
+ }
+ }
+
+ return;
+ }
+
+ if (cpu_cost < stats->min_cost) {
+ stats->min_cost = cpu_cost;
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CPU_COST;
+ return;
+ }
+
+ /* CPU cost is the same. Start breaking the tie by C-state */
+
+ if (wakeup_latency > stats->best_cpu_wakeup_latency)
+ return;
+
+ if (wakeup_latency < stats->best_cpu_wakeup_latency) {
+ stats->best_cpu_wakeup_latency = wakeup_latency;
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
+ return;
+ }
+
+ /* C-state is the same. Use prev CPU to break the tie */
+ if (cpu == prev_cpu) {
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
+ return;
+ }
+
+ if (stats->best_cpu != prev_cpu &&
+ ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
+ (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
+ stats->best_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
+ }
+}
+#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env, int cpu_cost)
+{
+ int prev_cpu = env->prev_cpu;
+
+ if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+ if (stats->best_sibling_cpu_cost > cpu_cost ||
+ (stats->best_sibling_cpu_cost == cpu_cost &&
+ stats->best_sibling_cpu_load > env->cpu_load)) {
+ stats->best_sibling_cpu_cost = cpu_cost;
+ stats->best_sibling_cpu_load = env->cpu_load;
+ stats->best_sibling_cpu = cpu;
+ }
+ }
+
+ if ((cpu_cost < stats->min_cost) ||
+ ((stats->best_cpu != prev_cpu &&
+ stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
+ if (env->need_idle) {
+ if (idle_cpu(cpu)) {
+ stats->min_cost = cpu_cost;
+ stats->best_idle_cpu = cpu;
+ }
+ } else {
+ stats->min_cost = cpu_cost;
+ stats->min_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ env->sbc_best_flag = SBC_FLAG_MIN_COST;
+ }
+ }
+}
+#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env)
+{
+ int cpu_cost;
+
+ /*
+ * We try to find the least loaded *busy* CPU irrespective
+ * of the power cost.
+ */
+ if (env->pack_task)
+ cpu_cost = cpu_min_power_cost(cpu);
+
+ else
+ cpu_cost = power_cost(cpu, task_load(env->p) +
+ cpu_cravg_sync(cpu, env->sync));
+
+ if (cpu_cost <= stats->min_cost)
+ __update_cluster_stats(cpu, stats, env, cpu_cost);
+}
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int i;
+ struct cpumask search_cpus;
+
+ cpumask_and(&search_cpus, &env->search_cpus, &c->cpus);
+
+ env->need_idle = wake_to_idle(env->p) || c->wake_up_idle;
+
+ for_each_cpu(i, &search_cpus) {
+ env->cpu_load = cpu_load_sync(i, env->sync);
+
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ if (skip_cpu(i, env))
+ continue;
+
+ update_spare_capacity(stats, env, i, c->capacity,
+ env->cpu_load);
+
+ /*
+ * need_idle takes precedence over sched boost, but when both
+ * are set, the idlest CPU among all the clusters is selected
+ * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the
+ * big cluster is selected when boost_policy = BOOST_ON_BIG.
+ */
+ if ((!env->need_idle &&
+ env->boost_policy != SCHED_BOOST_NONE) ||
+ env->need_waker_cluster ||
+ sched_cpu_high_irqload(i) ||
+ spill_threshold_crossed(env, cpu_rq(i)))
+ continue;
+
+ update_cluster_stats(i, stats, env);
+ }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+ stats->best_cpu = stats->best_idle_cpu = -1;
+ stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
+ stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+ stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+ stats->highest_spare_capacity = 0;
+ stats->least_loaded_cpu = -1;
+ stats->best_cpu_wakeup_latency = INT_MAX;
+ /* No need to initialize stats->best_load */
+}
+
+static inline bool env_has_special_flags(struct cpu_select_env *env)
+{
+ if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
+ env->reason)
+ return true;
+
+ return false;
+}
+
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int prev_cpu;
+ struct task_struct *task = env->p;
+ struct sched_cluster *cluster;
+
+ if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
+ return false;
+
+ prev_cpu = env->prev_cpu;
+ if (!cpumask_test_cpu(prev_cpu, &env->search_cpus))
+ return false;
+
+ if (task->ravg.mark_start - task->last_cpu_selected_ts >=
+ sched_long_cpu_selection_threshold)
+ return false;
+
+ /*
+ * This function should be used by the task wake up path only, as it
+ * assumes p->last_switch_out_ts is the last sleep time.
+ * p->last_switch_out_ts can denote last preemption time as well as
+ * last sleep time.
+ */
+ if (task->ravg.mark_start - task->last_switch_out_ts >=
+ sched_short_sleep_task_threshold)
+ return false;
+
+ env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+ cluster = cpu_rq(prev_cpu)->cluster;
+
+ if (!task_load_will_fit(task, env->task_load, prev_cpu,
+ sched_boost_policy())) {
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ return false;
+ }
+
+ env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+ if (sched_cpu_high_irqload(prev_cpu) ||
+ spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+ update_spare_capacity(stats, env, prev_cpu,
+ cluster->capacity, env->cpu_load);
+ cpumask_clear_cpu(prev_cpu, &env->search_cpus);
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool
+wake_to_waker_cluster(struct cpu_select_env *env)
+{
+ return env->sync &&
+ task_load(current) > sched_big_waker_task_load &&
+ task_load(env->p) < sched_small_wakee_task_load;
+}
+
+static inline bool
+bias_to_waker_cpu(struct cpu_select_env *env, int cpu)
+{
+ return sysctl_sched_prefer_sync_wakee_to_waker &&
+ cpu_rq(cpu)->nr_running == 1 &&
+ cpumask_test_cpu(cpu, &env->search_cpus);
+}
+
+static inline int
+cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster)
+{
+ return cpumask_intersects(&env->search_cpus, &cluster->cpus);
+}
+
+/* return cheapest cpu that can fit this task */
+static int select_best_cpu(struct task_struct *p, int target, int reason,
+ int sync)
+{
+ struct sched_cluster *cluster, *pref_cluster = NULL;
+ struct cluster_cpu_stats stats;
+ struct related_thread_group *grp;
+ unsigned int sbc_flag = 0;
+ int cpu = raw_smp_processor_id();
+ bool special;
+
+ struct cpu_select_env env = {
+ .p = p,
+ .reason = reason,
+ .need_idle = wake_to_idle(p),
+ .need_waker_cluster = 0,
+ .sync = sync,
+ .prev_cpu = target,
+ .rtg = NULL,
+ .sbc_best_flag = 0,
+ .sbc_best_cluster_flag = 0,
+ .pack_task = false,
+ };
+
+ env.boost_policy = task_sched_boost(p) ?
+ sched_boost_policy() : SCHED_BOOST_NONE;
+
+ bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+ bitmap_zero(env.backup_list, NR_CPUS);
+
+ cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);
+ cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);
+
+ init_cluster_cpu_stats(&stats);
+ special = env_has_special_flags(&env);
+
+ rcu_read_lock();
+
+ grp = task_related_thread_group(p);
+
+ if (grp && grp->preferred_cluster) {
+ pref_cluster = grp->preferred_cluster;
+ if (!cluster_allowed(&env, pref_cluster))
+ clear_bit(pref_cluster->id, env.candidate_list);
+ else
+ env.rtg = grp;
+ } else if (!special) {
+ cluster = cpu_rq(cpu)->cluster;
+ if (wake_to_waker_cluster(&env)) {
+ if (bias_to_waker_cpu(&env, cpu)) {
+ target = cpu;
+ sbc_flag = SBC_FLAG_WAKER_CLUSTER |
+ SBC_FLAG_WAKER_CPU;
+ goto out;
+ } else if (cluster_allowed(&env, cluster)) {
+ env.need_waker_cluster = 1;
+ bitmap_zero(env.candidate_list, NR_CPUS);
+ __set_bit(cluster->id, env.candidate_list);
+ env.sbc_best_cluster_flag =
+ SBC_FLAG_WAKER_CLUSTER;
+ }
+ } else if (bias_to_prev_cpu(&env, &stats)) {
+ sbc_flag = SBC_FLAG_PREV_CPU;
+ goto out;
+ }
+ }
+
+ if (!special && is_short_burst_task(p)) {
+ env.pack_task = true;
+ sbc_flag = SBC_FLAG_PACK_TASK;
+ }
+retry:
+ cluster = select_least_power_cluster(&env);
+
+ if (!cluster)
+ goto out;
+
+ /*
+ * 'cluster' now points to the minimum power cluster which can satisfy
+ * the task's perf goals. Walk down the cluster list starting with that
+ * cluster. For non-small tasks, skip clusters that don't have
+ * mostly_idle/idle cpus.
+ */
+
+ do {
+ find_best_cpu_in_cluster(cluster, &env, &stats);
+
+ } while ((cluster = next_best_cluster(cluster, &env, &stats)));
+
+ if (env.need_idle) {
+ if (stats.best_idle_cpu >= 0) {
+ target = stats.best_idle_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_CSTATE;
+ } else if (stats.least_loaded_cpu >= 0) {
+ target = stats.least_loaded_cpu;
+ sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
+ }
+ } else if (stats.best_cpu >= 0) {
+ if (stats.best_sibling_cpu >= 0 &&
+ stats.best_cpu != task_cpu(p) &&
+ stats.min_cost == stats.best_sibling_cpu_cost) {
+ stats.best_cpu = stats.best_sibling_cpu;
+ sbc_flag |= SBC_FLAG_BEST_SIBLING;
+ }
+ sbc_flag |= env.sbc_best_flag;
+ target = stats.best_cpu;
+ } else {
+ if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
+ env.rtg = NULL;
+ goto retry;
+ }
+
+ /*
+ * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
+ * backup_list = little cluster, candidate_list = none and
+ * stats->best_capacity_cpu points the best spare capacity
+ * CPU among the CPUs in the big cluster.
+ */
+ if (env.boost_policy == SCHED_BOOST_ON_BIG &&
+ stats.best_capacity_cpu >= 0)
+ sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
+ else
+ find_backup_cluster(&env, &stats);
+
+ if (stats.best_capacity_cpu >= 0) {
+ target = stats.best_capacity_cpu;
+ sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
+ }
+ }
+ p->last_cpu_selected_ts = sched_ktime_clock();
+out:
+ sbc_flag |= env.sbc_best_cluster_flag;
+ rcu_read_unlock();
+ trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
+ env.reason, env.sync, env.need_idle, sbc_flag, target);
+ return target;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+ tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
+
+ return (&tg->list == &task_groups) ? NULL : tg;
+}
+
+/* Iterate over all cfs_rq in a cpu */
+#define for_each_cfs_rq(cfs_rq, tg, cpu) \
+ for (tg = container_of(&task_groups, struct task_group, list); \
+ ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
+
+void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
+{
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq;
+
+ rcu_read_lock();
+
+ for_each_cfs_rq(cfs_rq, tg, cpu)
+ reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
+
+ rcu_read_unlock();
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra);
+
+/* Add a task's contribution to a cpu's HMP statistics */
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /*
+ * Although the check below is not strictly required (as
+ * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg, called
+ * from inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
+ * efficiency by short-circuiting the for_each_sched_entity() loop when
+ * sched_disable_window_stats is set.
+ */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ inc_rq_hmp_stats(rq, p, change_cra);
+}
+
+/* Remove a task's contribution from a cpu's HMP statistics */
+static void
+_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+
+ /* See comment on efficiency in _inc_hmp_sched_stats_fair */
+ if (sched_disable_window_stats)
+ return;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se)
+ dec_rq_hmp_stats(rq, p, change_cra);
+}
+
+static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _inc_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ _dec_hmp_sched_stats_fair(rq, p, 1);
+}
+
+static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
+ if (!se) {
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
+ task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+ }
+}
+
+static int task_will_be_throttled(struct task_struct *p);
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
+
+static void
+inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+ inc_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+
+static void
+dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p)
+{
+ dec_nr_big_task(&rq->hmp_stats, p);
+ dec_cumulative_runnable_avg(&rq->hmp_stats, p);
+}
+static void
+fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
+ u32 new_task_load, u32 new_pred_demand)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+ s64 pred_demand_delta = PRED_DEMAND_DELTA;
+
+ fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
+ pred_demand_delta);
+ fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
+}
+
+static inline int task_will_be_throttled(struct task_struct *p)
+{
+ return 0;
+}
+
+void _inc_hmp_sched_stats_fair(struct rq *rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&rq->hmp_stats, p);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Reset balance_interval at all sched_domain levels of given cpu, so that it
+ * honors kick.
+ */
+static inline void reset_balance_interval(int cpu)
+{
+ struct sched_domain *sd;
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ sd->balance_interval = 0;
+ rcu_read_unlock();
+}
+
+/*
+ * Check if a task is on the "wrong" cpu (i.e. its current cpu is not the ideal
+ * cpu as per its demand or priority).
+ *
+ * Returns reason why task needs to be migrated
+ */
+static inline int migration_needed(struct task_struct *p, int cpu)
+{
+ int nice;
+ struct related_thread_group *grp;
+
+ if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
+ return 0;
+
+ /* No need to migrate task that is about to be throttled */
+ if (task_will_be_throttled(p))
+ return 0;
+
+ if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
+ cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
+ return UP_MIGRATION;
+
+ if (sched_cpu_high_irqload(cpu))
+ return IRQLOAD_MIGRATION;
+
+ nice = task_nice(p);
+ rcu_read_lock();
+ grp = task_related_thread_group(p);
+ /*
+ * Don't assume higher capacity means higher power. If the task
+ * is running on the power efficient CPU, avoid migrating it
+ * to a lower capacity cluster.
+ */
+ if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
+ upmigrate_discouraged(p)) &&
+ cpu_capacity(cpu) > min_capacity &&
+ cpu_max_power_cost(cpu) == max_power_cost) {
+ rcu_read_unlock();
+ return DOWN_MIGRATION;
+ }
+
+ if (!task_will_fit(p, cpu)) {
+ rcu_read_unlock();
+ return UP_MIGRATION;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (do_migration(reason, new_cpu, cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->hmp_stats.nr_big_tasks = 0;
+ cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
+ cfs_rq->hmp_stats.pred_demands_sum = 0;
+}
+
+static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ inc_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra)
+{
+ dec_nr_big_task(&cfs_rq->hmp_stats, p);
+ if (change_cra)
+ dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
+}
+
+static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg +=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
+}
+
+static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
+ struct cfs_rq *cfs_rq)
+{
+ stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
+ stats->cumulative_runnable_avg -=
+ cfs_rq->hmp_stats.cumulative_runnable_avg;
+ stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
+
+ BUG_ON(stats->nr_big_tasks < 0 ||
+ (s64)stats->cumulative_runnable_avg < 0);
+ BUG_ON((s64)stats->pred_demands_sum < 0);
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
+
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+#define dec_throttled_cfs_rq_hmp_stats(...)
+#define inc_throttled_cfs_rq_hmp_stats(...)
+
+#endif /* CONFIG_SCHED_HMP */
+
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+ u64 delta, scaled_delta, periods;
+ u32 contrib;
+ unsigned int delta_w, scaled_delta_w, decayed = 0;
+ unsigned long scale_freq, scale_cpu;
+
+ delta = now - sa->last_update_time;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_update_time = now;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->period_contrib;
+ if (delta + delta_w >= 1024) {
+ decayed = 1;
+
+ /* how much left for next period will start over, we don't know yet */
+ sa->period_contrib = 0;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ scaled_delta_w = cap_scale(delta_w, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta_w;
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum +=
+ weight * scaled_delta_w;
+ }
+ }
+ if (running)
+ sa->util_sum += scaled_delta_w * scale_cpu;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ contrib = __compute_runnable_contrib(periods);
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+ }
/* Remainder of delta accrued against u_0` */
scaled_delta = cap_scale(delta, scale_freq);
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * scaled_delta;
}
+
if (running)
sa->util_sum += scaled_delta * scale_cpu;
return decayed;
}
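/*
 * Numerically, y is chosen so that y^32 = 0.5: load contributed ~32ms ago
 * counts half as much as current load, ~64ms ago a quarter, and so on.
 * As a rough example, decay_load(1024, 32) ~= 512 and
 * decay_load(1024, 64) ~= 256; the exact values come from the
 * runnable_avg_yN_inv[] table above.
 */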
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
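/*
 * Example of the clamp above: for an unsigned long field currently holding
 * 100, add_positive(&field, -150) computes 100 + (-150), which wraps to a
 * value larger than 100; because the increment was negative, the result is
 * clamped to 0 rather than leaving a huge bogus average behind.
 */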
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
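/*
 * Example of the differential filter above: with tg_load_avg_contrib = 1024,
 * the update is propagated only when the new cfs_rq load differs by more than
 * 1024 / 64 = 16, so a change from 1024 to 1030 is skipped, while a change to
 * 1100 adds delta = 76 to tg->load_avg and records 1100 as the new
 * contribution.
 */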
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, so that it is up to
+ * date and ready to go to its new CPU/cfs_rq. But we have difficulty in
+ * getting what the current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task being less decayed, but giving
+ * the wakee more load does not sound bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of the group cfs_rq is zero, the load of the
+ * sched_entity will also be zero, so we can skip the formula.
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ * Ensure tg_load >= load and is updated with the current load */
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/*
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+ removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
+ /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+ if (cfs_rq == &rq_of(cfs_rq)->cfs)
+ trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
return decayed || removed;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int cpu = cpu_of(rq_of(cfs_rq));
+ int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
+
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
-
- last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1) {}
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
return 0;
}
+static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
+static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
+ struct task_struct *p, int change_cra) { }
+
#endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
trace_sched_stat_blocked(tsk, delta);
+ trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
/*
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Check if task is part of a hierarchy where some cfs_rq does not have any
+ * runtime left.
+ *
+ * We can't rely on throttled_hierarchy() to do this test, as
+ * cfs_rq->throttle_count will not be updated yet when this function is called
+ * from scheduler_tick()
+ */
+static int task_will_be_throttled(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return 0;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ if (!cfs_rq->runtime_enabled)
+ continue;
+ if (cfs_rq->runtime_remaining <= 0)
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
if (qcfs_rq->load.weight)
dequeue = 0;
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, task_delta);
+ dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
+ }
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
+
+ /* Log effect on hmp stats after throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct sched_entity *se;
int enqueue = 1;
long task_delta;
+ struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
se = cfs_rq->tg->se[cpu_of(rq)];
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
if (cfs_rq_throttled(cfs_rq))
break;
}
- if (!se)
+ if (!se) {
add_nr_running(rq, task_delta);
+ inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
+ }
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
resched_curr(rq);
+
+ /* Log effect on hmp stats after un-throttling */
+ trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
+ sched_irqload(cpu_of(rq)),
+ power_cost(cpu_of(rq), 0),
+ cpu_temp(cpu_of(rq)));
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
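+
+ /*
+ * For example (hypothetical numbers): a 100ms period that keeps
+ * expiring back-to-back is grown to 200ms with the quota doubled,
+ * preserving the quota/period ratio; growth stops once doubling
+ * would reach max_cfs_quota_period.
+ */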
/* reset count so we don't come right back in here */
count = 0;
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ init_cfs_rq_hmp_stats(cfs_rq);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
WARN_ON(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
+ if (rq->cfs.h_nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
/*
* called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
+ * current task is from our class.
*/
static void hrtick_update(struct rq *rq)
{
if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
}
#endif
+#ifdef CONFIG_SMP
+static bool __cpu_overutilized(int cpu, int delta);
+static bool cpu_overutilized(int cpu);
+unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+#endif
+
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
if (se->on_rq)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ inc_rq_hmp_stats(rq, p, 1);
+ }
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting while considering throttled
+ * tasks would greatly increase the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (energy_aware() && !se) {
+ if (!task_new && !rq->rd->overutilized &&
+ cpu_overutilized(rq->cpu)) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+ }
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
+
+ if (!se) {
+ sub_nr_running(rq, 1);
+ dec_rq_hmp_stats(rq, p, 1);
}
- if (!se)
- sub_nr_running(rq, 1);
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
#endif
/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig *
+ arch_scale_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
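+
+/*
+ * For example (hypothetical values): a CPU with cpu_capacity_orig = 1024
+ * currently running at half of its maximum frequency reports
+ * arch_scale_freq_capacity() = 512, so capacity_curr_of() returns
+ * (1024 * 512) >> SCHED_CAPACITY_SHIFT = 512.
+ */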
+
+struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int trg_cpu;
+ int energy;
+ int payoff;
+ struct task_struct *task;
+ struct {
+ int before;
+ int after;
+ int delta;
+ int diff;
+ } nrg;
+ struct {
+ int before;
+ int after;
+ int delta;
+ } cap;
+};
+
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
+ * energy calculations.
+ *
+ * Since util is a scale-invariant utilization defined as:
+ *
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ * capacity = capacity_orig * curr_freq/max_freq
+ *
+ * norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
+{
+ if (util >= capacity)
+ return SCHED_CAPACITY_SCALE;
+
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
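+
+/*
+ * For example (made-up numbers): util = 256 against capacity = 512 gives
+ * (256 << SCHED_CAPACITY_SHIFT) / 512 = 512, i.e. a 50% busy ratio on the
+ * 1024 scale; any util >= capacity saturates at SCHED_CAPACITY_SCALE.
+ */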
+
+static unsigned long group_max_util(struct energy_env *eenv)
+{
+ unsigned long max_util = 0;
+ unsigned long util;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+ util = cpu_util_wake(cpu, eenv->task);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->trg_cpu))
+ util += eenv->util_delta;
+
+ max_util = max(max_util, util);
+ }
+
+ return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to its
+ * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
+ * in energy calculations.
+ *
+ * Since task executions may or may not overlap in time in the group the true
+ * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
+ * when iterating over all CPUs in the group.
+ * The latter estimate is used as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned long
+group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+ unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long util, util_sum = 0;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ util = cpu_util_wake(cpu, eenv->task);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->trg_cpu))
+ util += eenv->util_delta;
+
+ util_sum += __cpu_norm_util(util, capacity);
+ }
+
+ return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
+}
+
+static int find_new_capacity(struct energy_env *eenv,
+ const struct sched_group_energy * const sge)
+{
+ int idx, max_idx = sge->nr_cap_states - 1;
+ unsigned long util = group_max_util(eenv);
+
+ /* default is max_cap if we don't find a match */
+ eenv->cap_idx = max_idx;
+
+ for (idx = 0; idx < sge->nr_cap_states; idx++) {
+ if (sge->cap_states[idx].cap >= util) {
+ eenv->cap_idx = idx;
+ break;
+ }
+ }
+
+ return eenv->cap_idx;
+}
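+
+/*
+ * For example, with hypothetical cap_states of {110, 467, 1024}: a
+ * group_max_util() of 500 selects index 2 (cap 1024), since 467 < 500;
+ * a utilization above the largest capacity falls back to the last index.
+ */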
+
+static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
+{
+ int i, state = INT_MAX;
+ int src_in_grp, dst_in_grp;
+ long grp_util = 0;
+
+ /* Find the shallowest idle state in the sched group. */
+ for_each_cpu(i, sched_group_cpus(sg))
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+ state++;
+
+ src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ if (src_in_grp == dst_in_grp) {
+ /* both CPUs under consideration are in the same group or not in
+ * either group, migration should leave idle state the same.
+ */
+ goto end;
+ }
+
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
+ */
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ grp_util += cpu_util_wake(i, eenv->task);
+ if (unlikely(i == eenv->trg_cpu))
+ grp_util += eenv->util_delta;
+ }
+
+ if (grp_util <=
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+ /* after moving, this group is at most partly
+ * occupied, so it should have some idle time.
+ */
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
+ int new_state = grp_util * max_idle_state_idx;
+ if (grp_util <= 0)
+ /* group will have no util, use lowest state */
+ new_state = max_idle_state_idx + 1;
+ else {
+ /* for partially idle, linearly map util to idle
+ * states, excluding the lowest one. This does not
+ * correspond to the state we expect to enter in
+ * reality, but an indication of what might happen.
+ */
+ new_state = min(max_idle_state_idx, (int)
+ (new_state / sg->sgc->max_capacity));
+ new_state = max_idle_state_idx - new_state;
+ }
+ state = new_state;
+ } else {
+ /* After moving, the group will be fully occupied
+ * so assume it will not be idle at all.
+ */
+ state = 0;
+ }
+end:
+ return state;
+}
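+
+/*
+ * Sketch of the linear mapping above (hypothetical numbers): with
+ * nr_idle_states = 3 (max_idle_state_idx = 1), max_capacity = 1024 and a
+ * group weight of 2, grp_util = 1024 gives min(1, 1024 / 1024) = 1 and thus
+ * new_state = 1 - 1 = 0 (shallowest state), while grp_util <= 0 selects
+ * max_idle_state_idx + 1 = 2 (deepest state).
+ */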
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working its way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+ struct cpumask visit_cpus;
+ u64 total_energy = 0;
+ int cpu_count;
+
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+ /* If a cpu is hotplugged in while we are in this function,
+ * it does not appear in the existing visit_cpus mask
+ * which came from the sched_group pointer of the
+ * sched_domain pointed at by sd_ea for either the prev
+ * or next cpu and was dereferenced in __energy_diff.
+ * Since we will dereference sd_scs later as we iterate
+ * through the CPUs we expect to visit, new CPUs can
+ * be present which are not in the visit_cpus mask.
+ * Guard this with cpu_count.
+ */
+ cpu_count = cpumask_weight(&visit_cpus);
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+ int cpu = cpumask_first(&visit_cpus);
+ struct sched_domain *sd;
+
+ /*
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
+ * This sd may have groups with cpus which were not present
+ * when we took visit_cpus.
+ */
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+ if (sd && sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg = sd->groups;
+
+ /* Has this sched_domain already been visited? */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ unsigned long group_util;
+ int sg_busy_energy, sg_idle_energy;
+ int cap_idx, idle_idx;
+
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+ else
+ eenv->sg_cap = sg;
+
+ cap_idx = find_new_capacity(eenv, sg->sge);
+
+ if (sg->group_weight == 1) {
+ /* Remove capacity of src CPU (before task move) */
+ if (eenv->trg_cpu == eenv->src_cpu &&
+ cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+ eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta -= eenv->cap.before;
+ }
+ /* Add capacity of dst CPU (after task move) */
+ if (eenv->trg_cpu == eenv->dst_cpu &&
+ cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+ eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta += eenv->cap.after;
+ }
+ }
+
+ idle_idx = group_idle_state(eenv, sg);
+ group_util = group_norm_util(eenv, sg);
+
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
+ sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+ * sg->sge->idle_states[idle_idx].power);
+
+ total_energy += sg_busy_energy + sg_idle_energy;
+
+ if (!sd->child) {
+ /*
+ * cpu_count here is the number of
+ * cpus we expect to visit in this
+ * calculation. If we race against
+ * hotplug, we can have extra cpus
+ * added to the groups we are
+ * iterating which do not appear in
+ * the visit_cpus mask. In that case
+ * we are not able to calculate energy
+ * without restarting so we will bail
+ * out and use prev_cpu this time.
+ */
+ if (!cpu_count)
+ return -EINVAL;
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+ cpu_count--;
+ }
+
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+
+ /*
+ * If we raced with hotplug and got an sd NULL-pointer;
+ * returning a wrong energy estimation is better than
+ * entering an infinite loop.
+ * Specifically: If a cpu is unplugged after we took
+ * the visit_cpus mask, it no longer has an sd_scs
+ * pointer, so when we dereference it, we get NULL.
+ */
+ if (cpumask_test_cpu(cpu, &visit_cpus))
+ return -EINVAL;
+next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
+ continue;
+ }
+
+ eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
+ return 0;
+}
+
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+ return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+static inline unsigned long task_util(struct task_struct *p);
+
+/*
+ * energy_diff(): Estimate the energy impact of changing the utilization
+ * distribution. eenv specifies the change: utilization amount, source, and
+ * destination cpu. Source or destination cpu may be -1 in which case the
+ * utilization is removed from or added to the system (e.g. task wake-up). If
+ * both are specified, the utilization is migrated.
+ */
+static inline int __energy_diff(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int sd_cpu = -1, energy_before = 0, energy_after = 0;
+ int diff, margin;
+
+ struct energy_env eenv_before = {
+ .util_delta = task_util(eenv->task),
+ .src_cpu = eenv->src_cpu,
+ .dst_cpu = eenv->dst_cpu,
+ .trg_cpu = eenv->src_cpu,
+ .nrg = { 0, 0, 0, 0},
+ .cap = { 0, 0, 0 },
+ .task = eenv->task,
+ };
+
+ if (eenv->src_cpu == eenv->dst_cpu)
+ return 0;
+
+ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+ if (!sd)
+ return 0; /* Error */
+
+ sg = sd->groups;
+
+ do {
+ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+ eenv_before.sg_top = eenv->sg_top = sg;
+
+ if (sched_group_energy(&eenv_before))
+ return 0; /* Invalid result, abort */
+ energy_before += eenv_before.energy;
+
+ /* Keep track of SRC cpu (before) capacity */
+ eenv->cap.before = eenv_before.cap.before;
+ eenv->cap.delta = eenv_before.cap.delta;
+
+ if (sched_group_energy(eenv))
+ return 0; /* Invalid result, abort */
+ energy_after += eenv->energy;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ eenv->nrg.before = energy_before;
+ eenv->nrg.after = energy_after;
+ eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+ eenv->payoff = 0;
+#ifndef CONFIG_SCHED_TUNE
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+#endif
+ /*
+ * Dead-zone margin preventing too many migrations.
+ */
+
+ margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+ diff = eenv->nrg.after - eenv->nrg.before;
+
+ eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
+
+ return eenv->nrg.diff;
+}
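+
+/*
+ * Example of the dead-zone above (made-up numbers): with nrg.before = 1000
+ * the margin is 1000 >> 6 = 15, so an estimated difference smaller than 15
+ * energy units is reported as 0 and does not justify a migration.
+ */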
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+extern bool schedtune_initialized;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ /* during early setup, we don't know the extents */
+ if (unlikely(!schedtune_initialized))
+ return energy_diff < 0 ? -1 : 1;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+#ifdef CONFIG_SCHED_DEBUG
+ {
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+ }
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_CAPACITY_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
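+
+/*
+ * Rough example, assuming schedtune_target_nrg.rdiv encodes a division by
+ * (max_power - min_power): for a 4096 wide energy range, an energy_diff of
+ * -1024 normalizes to about -(1024 << SCHED_CAPACITY_SHIFT) / 4096 = -256.
+ */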
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+ /* Compute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0) {
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ 0, -eenv->nrg.diff);
+ return eenv->nrg.diff;
+ }
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function returns a negative value each time the evaluation returns a
+ * positive payoff, which is the condition for accepting a scheduling
+ * decision.
+ */
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
+/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
* at a frequency roughly N times higher than one of its wakees. In order
* being client/server, worker/dispatcher, interrupt source or whatever is
* irrelevant, spread criteria is apparent partner count exceeds socket size.
*/
-static int wake_wide(struct task_struct *p)
+static int wake_wide(struct task_struct *p, int sibling_count_hint)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int llc_size = this_cpu_read(sd_llc_size);
+
+ if (sibling_count_hint >= llc_size)
+ return 1;
if (master < slave)
swap(master, slave);
- if (slave < factor || master < slave * factor)
+ if (slave < llc_size || master < slave * llc_size)
return 0;
return 1;
}
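+
+/*
+ * For example (hypothetical counters): with an LLC of 4 CPUs, a
+ * sibling_count_hint >= 4 forces the wide path immediately; otherwise a
+ * waker with wakee_flips = 20 and a wakee with wakee_flips = 3 still
+ * returns 0 (3 < 4), so an affine wakeup remains possible.
+ */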
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
this_eff_load = 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= capacity_of(this_cpu);
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= capacity_of(this_cpu);
+
+ if (this_load > 0) {
+ this_eff_load *= this_load +
+ effective_load(tg, this_cpu, weight, weight);
+
+ prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+ }
+
+ balanced = this_eff_load <= prev_eff_load;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+
+ if (!balanced)
+ return 0;
+
+ schedstat_inc(sd, ttwu_move_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+ return 1;
+}
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+ unsigned long capacity = capacity_of(cpu);
+
+ util += boosted_task_util(p);
+
+ return (capacity * 1024) > (util * capacity_margin);
+}
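+
+/*
+ * For example, assuming the commonly used capacity_margin of 1280: a CPU
+ * with capacity 1024 fits the task as long as its boosted utilization stays
+ * below 1024 * 1024 / 1280 = 819, i.e. roughly 80% of the capacity.
+ */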
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ if (capacity == max_capacity)
+ return true;
+
+ if (capacity * capacity_margin > max_capacity * 1024)
+ return true;
+
+ return __task_fits(p, cpu, 0);
+}
+
+static bool __cpu_overutilized(int cpu, int delta)
+{
+ return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
+}
+
+static bool cpu_overutilized(int cpu)
+{
+ return __cpu_overutilized(cpu, 0);
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct reciprocal_value schedtune_spc_rdiv;
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+ long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_CAPACITY_SCALE - S)
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ if (boost >= 0) {
+ margin = SCHED_CAPACITY_SCALE - signal;
+ margin *= boost;
+ } else
+ margin = -signal * boost;
+
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
+
+ if (boost < 0)
+ margin *= -1;
+ return margin;
+}
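+
+/*
+ * Worked example of the SPC rule (assuming boost is a percentage and
+ * schedtune_spc_rdiv encodes a division by 100): for S = 512 and B = 25,
+ * M = 25 * (1024 - 512) / 100 = 128, so the boosted signal becomes 640;
+ * for B = -25 the margin is -(512 * 25) / 100 = -128.
+ */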
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ int boost = schedtune_cpu_boost(cpu);
+
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+ int boost = schedtune_task_boost(task);
+ unsigned long util;
+ long margin;
+
+ if (boost == 0)
+ return 0;
+
+ util = task_util(task);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
- if (this_load > 0) {
- this_eff_load *= this_load +
- effective_load(tg, this_cpu, weight, weight);
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
- }
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
- balanced = this_eff_load <= prev_eff_load;
+#endif /* CONFIG_SCHED_TUNE */
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+unsigned long
+boosted_cpu_util(int cpu)
+{
+ unsigned long util = cpu_util_freq(cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
- if (!balanced)
- return 0;
+ trace_sched_boost_cpu(cpu, util, margin);
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ return util + margin;
+}
- return 1;
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+ unsigned long util = task_util(task);
+ long margin = schedtune_task_margin(task);
+
+ trace_sched_boost_task(task, util, margin);
+
+ return util + margin;
+}
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
}
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, spare_cap, max_spare_cap;
int local_group;
int i;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
load = target_load(i, load_idx);
avg_load += load;
+
+ spare_cap = capacity_spare_wake(i, p);
+
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
if (local_group) {
this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_spare = max_spare_cap;
+ } else {
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet, we must first select a rq to compute the initial
+ * utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
+
+ if (this_spare > task_util(p) / 2 &&
+ imbalance*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+
+skip_spare:
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
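+
+/*
+ * Example of the spare-capacity check in find_idlest_group() (made-up
+ * numbers): for task_util(p) = 400 the threshold is 200; with
+ * this_spare = 300, most_spare = 250 and imbalance = 112, the local group
+ * wins since 112 * 300 > 100 * 250, so NULL is returned and the task
+ * stays local.
+ */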
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
}
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ }
+
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
+
+ if (wu) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq(), eas_stats.cas_attempts);
+ }
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (wu)
+ schedstat_inc(sd, eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq(), eas_stats.cas_count);
+ }
+
+ return new_cpu;
}
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
struct sched_group *sg;
- int i = task_cpu(p);
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ unsigned long best_idle_capacity = ULONG_MAX;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
+ schedstat_inc(this_rq(), eas_stats.sis_attempts);
+
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
+ schedstat_inc(this_rq(), eas_stats.sis_idle);
+ return target;
+ }
- if (idle_cpu(target))
- return target;
+ /*
+ * If the previous cpu is cache affine and idle, don't be stupid.
+ */
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
+ schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
+ return prev;
+ }
+ }
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (!(current->flags & PF_WAKE_UP_IDLE) &&
+ !(p->flags & PF_WAKE_UP_IDLE))
+ return target;
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
for_each_lower_domain(sd) {
sg = sd->groups;
do {
+ int i;
if (!cpumask_intersects(sched_group_cpus(sg),
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
+ schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
+ schedstat_inc(sd, eas_stats.sis_suff_cap);
+ return target;
+ }
+
+ if (idle_idx < best_idle_cstate &&
+ capacity_orig <= best_idle_capacity) {
+ best_idle_cpu = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- target = cpumask_first_and(sched_group_cpus(sg),
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
+ schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
+ schedstat_inc(sd, eas_stats.sis_idle_cpu);
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+
+ if (best_idle_cpu >= 0)
+ target = best_idle_cpu;
+
done:
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
+ schedstat_inc(this_rq(), eas_stats.sis_count);
+
return target;
}
/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed. check_for_migration() looks for a better CPU for
+ * rq->curr. For that case we should return the cpu util with contributions from
+ * the currently running task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+ unsigned long util, capacity;
+
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
+
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
+
+ return (util >= capacity) ? capacity : util;
+}
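+
+/*
+ * For example (made-up numbers): with cpu_util(cpu) = 700 and the waking
+ * task p, last run on this CPU, contributing task_util(p) = 200,
+ * cpu_util_wake() reports 500 (assuming capacity_orig_of(cpu) >= 500,
+ * otherwise the value is clamped to the original capacity).
+ */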
+
+static int start_cpu(bool boosted)
+{
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+ return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle)
+{
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
+ unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long best_active_util = ULONG_MAX;
+ int best_idle_cstate = INT_MAX;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
+ struct task_struct *curr_tsk;
+
+ *backup_cpu = -1;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq(), eas_stats.fbt_attempts);
+
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
+ if (cpu < 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
+ return -1;
+ }
+
+ /* Find SD for the start CPU */
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
+ if (!sd) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
+ return -1;
+ }
+
+ /* Scan CPUs in all SDs */
+ sg = sd->groups;
+ do {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util;
+
+ if (!cpu_online(i))
+ continue;
+
+ if (walt_cpu_high_irqload(i))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ new_util = max(min_util, new_util);
+ if (new_util > capacity_orig)
+ continue;
+
+ /*
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
+ * improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals try to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP chance.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with either a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
+ */
+ if (prefer_idle) {
+
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
+
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
+ }
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
+ }
+
+ /*
+ * Enforce EAS mode
+ *
+ * For non latency sensitive tasks, skip CPUs that
+ * will be overutilized by moving the task there.
+ *
+ * The goal here is to remain in EAS mode as long as
+ * possible at least for !prefer_idle tasks.
+ */
+ if ((new_util * capacity_margin) >
+ (capacity_orig * SCHED_CAPACITY_SCALE))
+ continue;
+
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid waking up deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one such CPU is available, it sets the
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favor these CPUs to
+ * improve performance by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will make sure that energy consumption is minimized
+ * without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Select idle CPU with lower cap_orig */
+ if (capacity_orig > best_idle_min_cap_orig)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ best_idle_min_cap_orig = capacity_orig;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at a higher OPP.
+ *
+ * Thus, this case keeps track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ /* Favor CPUs with smaller capacity */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - new_util) < target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - new_util;
+ target_capacity = capacity_orig;
+ target_cpu = i;
+ }
+
+ } while (sg = sg->next, sg != sd->groups);
+
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we were not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policies priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ *  b) ACTIVE CPU where task fits and has the largest maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu != -1 && !idle_cpu(target_cpu) &&
+ best_idle_cpu != -1) {
+ curr_tsk = READ_ONCE(cpu_rq(target_cpu)->curr);
+ if (curr_tsk && schedtune_task_boost_rcu_locked(curr_tsk)) {
+ target_cpu = best_idle_cpu;
+ }
+ }
+
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq(), eas_stats.fbt_count);
+
+ return target_cpu;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
*
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
*/
-static int cpu_util(int cpu)
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
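+
+/*
+ * For example (hypothetical big.LITTLE capacities): with prev_cpu on a
+ * LITTLE of capacity_orig 512 and the waking CPU a big of 1024, the gap
+ * 512 >= (1024 >> 3) = 128 means the asymmetry matters, and wake_affine is
+ * bypassed once 512 * 1024 < task_util(p) * capacity_margin, i.e. when the
+ * task no longer comfortably fits on the LITTLE.
+ */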
+
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ struct sched_domain *sd;
+ int target_cpu = prev_cpu, tmp_target, tmp_backup;
+ bool boosted, prefer_idle;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
+ schedstat_inc(this_rq(), eas_stats.secb_attempts);
+
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
+ schedstat_inc(this_rq(), eas_stats.secb_sync);
+ return cpu;
+ }
+ }
+
+ rcu_read_lock();
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boosted = schedtune_task_boost(p) > 0;
+ prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ boosted = get_sysctl_sched_cfs_boost() > 0;
+ prefer_idle = 0;
+#endif
+
+ sync_entity_load_avg(&p->se);
+
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ /* Find a cpu with sufficient capacity */
+ tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
+
+ if (!sd)
+ goto unlock;
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
+ goto unlock;
+ }
+ }
+
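+	/*
+	 * Only consult the energy model when the candidate CPU differs from
+	 * the task's previous CPU; otherwise the placement is unchanged.
+	 */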
+ if (target_cpu != prev_cpu) {
+ int delta = 0;
+ struct energy_env eenv = {
+ .util_delta = task_util(p),
+ .src_cpu = prev_cpu,
+ .dst_cpu = target_cpu,
+ .task = p,
+ .trg_cpu = target_cpu,
+ };
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ delta = task_util(p);
+#endif
+ /* Not enough spare capacity on previous cpu */
+ if (__cpu_overutilized(prev_cpu, delta)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
+ schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
+ goto unlock;
+ }
+
+ if (energy_diff(&eenv) >= 0) {
+ /* No energy saving for target_cpu, try backup */
+ target_cpu = tmp_backup;
+ eenv.dst_cpu = target_cpu;
+ eenv.trg_cpu = target_cpu;
+ if (tmp_backup < 0 ||
+ tmp_backup == prev_cpu ||
+ energy_diff(&eenv) >= 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+ }
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ goto unlock;
+ }
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
+ schedstat_inc(this_rq(), eas_stats.secb_count);
+
+unlock:
+ rcu_read_unlock();
- return (util >= capacity) ? capacity : util;
+ return target_cpu;
}
/*
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+ int sibling_count_hint)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+#ifdef CONFIG_SCHED_HMP
+ return select_best_cpu(p, prev_cpu, 0, sync);
+#endif
+
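+	/*
+	 * Sync wakeup fast path: keep the task on this CPU when the hint is
+	 * enabled, wake_cap() reports no capacity issue and the waker is
+	 * about to leave this CPU nearly idle.
+	 */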
+ if (sd_flag & SD_BALANCE_WAKE) {
+ int _wake_cap = wake_cap(p, cpu, prev_cpu);
+
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ bool about_to_idle = (cpu_rq(cpu)->nr_running < 2);
+
+ if (sysctl_sched_sync_hint_enable && sync &&
+ !_wake_cap && about_to_idle)
+ return cpu;
+ }
+
+ record_wakee(p);
+ want_affine = !wake_wide(p, sibling_count_hint) &&
+ !_wake_cap &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
+ }
+
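+	/* While the root domain is not overutilized, use energy-aware placement */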
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+ return select_energy_cpu_brute(p, prev_cpu, sync);
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
-
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
simple:
cfs_rq = &rq->cfs;
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
idle:
+ rq->misfit_task = 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
+#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
+#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
struct lb_env {
struct sched_domain *sd;
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
+ unsigned int busiest_grp_capacity;
+ unsigned int busiest_nr_running;
unsigned int flags;
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type busiest_group_type;
struct list_head tasks;
+ enum sched_boost_policy boost_policy;
};
/*
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
+ int twf, group_cpus;
lockdep_assert_held(&env->src_rq->lock);
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
+ if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
+ if (nr_big_tasks(env->src_rq) && !is_big_task(p))
+ return 0;
+
+ if (env->boost_policy == SCHED_BOOST_ON_BIG &&
+ !task_sched_boost(p))
+ return 0;
+ }
+
+ twf = task_will_fit(p, env->dst_cpu);
+
+ /*
+ * Attempt to not pull tasks that don't fit. We may get lucky and find
+ * one that actually fits.
+ */
+ if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
+ return 0;
+
+ if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+ !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
+ return 0;
+
+ /*
+ * Group imbalance can sometimes cause work to be pulled across groups
+ * even though the group could have managed the imbalance on its own.
+ * Prevent inter-cluster migrations for big tasks when the number of
+ * tasks is lower than the capacity of the group.
+ */
+ group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
+ SCHED_CAPACITY_SCALE);
+ if (!twf && env->busiest_nr_running <= group_cpus)
+ return 0;
+
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
+ * 1) IDLE or NEWLY_IDLE balance.
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
*/
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
- if (tsk_cache_hot <= 0 ||
+ if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ if (task_in_related_thread_group(p))
+ env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
* inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
+
return p;
}
return NULL;
struct task_struct *p;
unsigned long load;
int detached = 0;
+ int orig_loop = env->loop;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
+ if (!same_cluster(env->dst_cpu, env->src_cpu))
+ env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
+
+ if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
+ env->flags |= LBF_IGNORE_BIG_TASKS;
+
+redo:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
list_move_tail(&p->se.group_node, tasks);
}
+ if (env->flags & (LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
+ tasks = &env->src_rq->cfs_tasks;
+ env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
+ env->loop = orig_loop;
+ goto redo;
+ }
+
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+ true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
+#ifdef CONFIG_SCHED_HMP
+ unsigned long sum_nr_big_tasks;
+ u64 group_cpu_load; /* Scaled load of all CPUs of the group */
+#endif
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
.avg_load = 0UL,
.sum_nr_running = 0,
.group_type = group_other,
+#ifdef CONFIG_SCHED_HMP
+ .sum_nr_big_tasks = 0UL,
+ .group_cpu_load = 0ULL,
+#endif
},
};
}
+#ifdef CONFIG_SCHED_HMP
+
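+/*
+ * Bail out of inter-cluster load balance when the local cluster has a
+ * higher power cost and the busiest group is neither spilling load nor
+ * holding big tasks that belong on a higher-capacity cluster.
+ */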
+static int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ int local_cpu, busiest_cpu;
+ int local_capacity, busiest_capacity;
+ int local_pwr_cost, busiest_pwr_cost;
+ int nr_cpus;
+ int boost = sched_boost();
+
+ if (!sysctl_sched_restrict_cluster_spill ||
+ boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
+ return 0;
+
+ local_cpu = group_first_cpu(sds->local);
+ busiest_cpu = group_first_cpu(sds->busiest);
+
+ local_capacity = cpu_max_possible_capacity(local_cpu);
+ busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
+
+ local_pwr_cost = cpu_max_power_cost(local_cpu);
+ busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
+
+ if (local_pwr_cost <= busiest_pwr_cost)
+ return 0;
+
+ if (local_capacity > busiest_capacity &&
+ sds->busiest_stat.sum_nr_big_tasks)
+ return 0;
+
+ nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
+ if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
+ (sds->busiest_stat.sum_nr_running <
+ nr_cpus * sysctl_sched_spill_nr_run))
+ return 1;
+
+ return 0;
+}
+
+#else /* CONFIG_SCHED_HMP */
+
+static inline int
+bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HMP */
+
/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
used = div_u64(avg, total);
+ /*
+ * deadline bandwidth is defined at system level so we must
+ * weight this bandwidth with the max capacity of the system.
+	 * As a reminder, avg_bw is 20 bits wide and
+	 * scale_cpu_capacity is 10 bits wide.
+ */
+ used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
return 1;
}
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+ raw_spin_lock_init(&mcc->lock);
+ mcc->val = 0;
+ mcc->cpu = -1;
+}
+
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
+
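+	/*
+	 * Update the recorded maximum if this CPU held the previous maximum
+	 * and its capacity dropped, or if its capacity now exceeds the
+	 * recorded value.
+	 */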
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ (max_capacity < capacity)) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+ cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, max_capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
}
capacity = 0;
+ max_capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
+ if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+ continue;
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ max_capacity = max(capacity, max_capacity);
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ cpumask_t *cpus = sched_group_cpus(group);
+
+ /* Revisit this later. This won't work for MT domain */
+ if (!cpu_isolated(cpumask_first(cpus))) {
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
+ }
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
return false;
}
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+ ref->sgc->max_capacity;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
- struct sg_lb_stats *sgs)
+ struct sg_lb_stats *sgs, struct lb_env *env)
{
if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task)
+ return group_misfit_task;
+
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing data
+ * - used by the nohz balance, but we want it available here
+ * so that we can see which CPUs have no tick.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+ /* only called from update_sg_lb_stats when irqs are disabled */
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+		/* rate limit updates to once per jiffy at most */
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, bool *overutilized)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, 0),
+ cpu_temp(i));
+
+ if (cpu_isolated(i))
+ continue;
+
+		/*
+		 * If we are entering idle and there are CPUs with their
+		 * tick stopped, do an update for them.
+		 */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
+#ifdef CONFIG_SCHED_HMP
+ sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
+ sgs->group_cpu_load += cpu_load(i);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (energy_aware() && cpu_overutilized(i)) {
+ *overutilized = true;
+ if (!sgs->group_misfit_task && rq->misfit_task)
+ sgs->group_misfit_task = capacity_of(i);
+ }
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Isolated CPU has no weight */
+ if (!group->group_weight) {
+ sgs->group_capacity = 0;
+ sgs->avg_load = 0;
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_other;
+ sgs->group_weight = group->group_weight;
+ } else {
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(group, sgs, env);
+ }
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+}
- sgs->group_weight = group->group_weight;
+#ifdef CONFIG_SCHED_HMP
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ if (env->idle != CPU_NOT_IDLE &&
+ cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
+ if (sgs->sum_nr_big_tasks >
+ sds->busiest_stat.sum_nr_big_tasks) {
+ env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
+ return true;
+ }
+ }
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ return false;
+}
+#else
+static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs)
+{
+ return false;
}
+#endif
/**
* update_sd_pick_busiest - return 1 on busiest group
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
+ return true;
+
if (sgs->group_type > busiest->group_type)
return true;
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
+ if (energy_aware()) {
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
+ /*
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+ }
+
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
}
#endif /* CONFIG_NUMA_BALANCING */
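+/* True iff @sd has a parent domain containing more than one sched group */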
+#define lb_sd_parent(sd) \
+ (sd->parent && sd->parent->groups != sd->parent->groups->next)
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false;
+ bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
+ sgs->group_type = group_classify(sg, sgs, env);
}
+ /*
+ * Ignore task groups with misfit tasks if local group has no
+ * capacity or if per-cpu capacity isn't higher.
+ */
+ if (energy_aware() &&
+ sgs->group_type == group_misfit_task &&
+ (!group_has_capacity(env, &sds->local_stat) ||
+ !group_smaller_cpu_capacity(sg, sds->local)))
+ sgs->group_type = group_other;
+
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
+ env->busiest_nr_running = sgs->sum_nr_running;
+ env->busiest_grp_capacity = sgs->group_capacity;
}
next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
- if (!env->sd->parent) {
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
+ if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
+ env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
+ } else {
+ if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
+ env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
}
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
+ if (energy_aware()) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
+
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+ }
+
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (energy_aware() && busiest->group_type == group_misfit_task)
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task);
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
+ goto out_balanced;
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ goto force_balance;
+
+ if (bail_inter_cluster_balance(env, &sds))
+ goto out_balanced;
+
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
+ /* Misfitting tasks should be dealt with regardless of the avg load */
+ if (energy_aware() && busiest->group_type == group_misfit_task) {
+ goto force_balance;
+ }
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
goto out_balanced;
} else {
/*
goto out_balanced;
}
-force_balance:
- /* Looks like there is an imbalance. Compute it */
- calculate_imbalance(env, &sds);
- return sds.busiest;
+force_balance:
+ env->busiest_group_type = busiest->group_type;
+ /* Looks like there is an imbalance. Compute it */
+ calculate_imbalance(env, &sds);
+ return sds.busiest;
+
+out_balanced:
+ env->imbalance = 0;
+ return NULL;
+}
+
+#ifdef CONFIG_SCHED_HMP
+static struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
+ struct rq *busiest = NULL, *busiest_big = NULL;
+ u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
+ int max_nr_big = 0, nr_big;
+ bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
+ int i;
+ cpumask_t cpus;
+
+ cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
+
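+	/*
+	 * For a big-task active balance prefer the runqueue with the most
+	 * big tasks (ties broken by cumulative runnable average); otherwise
+	 * pick the one with the highest cumulative runnable average.
+	 */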
+ for_each_cpu(i, &cpus) {
+ struct rq *rq = cpu_rq(i);
+ u64 cumulative_runnable_avg =
+ rq->hmp_stats.cumulative_runnable_avg;
+
+ if (!cpumask_test_cpu(i, env->cpus))
+ continue;
+
+ if (find_big) {
+ nr_big = nr_big_tasks(rq);
+ if (nr_big > max_nr_big ||
+ (nr_big > 0 && nr_big == max_nr_big &&
+ cumulative_runnable_avg > max_runnable_avg_big)) {
+ max_runnable_avg_big = cumulative_runnable_avg;
+ busiest_big = rq;
+ max_nr_big = nr_big;
+ continue;
+ }
+ }
+
+ if (cumulative_runnable_avg > max_runnable_avg) {
+ max_runnable_avg = cumulative_runnable_avg;
+ busiest = rq;
+ }
+ }
+
+ if (busiest_big)
+ return busiest_big;
-out_balanced:
- env->imbalance = 0;
+ env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
+ return busiest;
+}
+#else
+static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
+ struct sched_group *group)
+{
return NULL;
}
+#endif
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
+#ifdef CONFIG_SCHED_HMP
+ return find_busiest_queue_hmp(env, group);
+#endif
+
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
+ !check_cpu_capacity(rq, env->sd) &&
+ env->busiest_group_type != group_misfit_task)
continue;
/*
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
* so long as it is large enough.
*/
-#define MAX_PINNED_INTERVAL 512
+#define MAX_PINNED_INTERVAL 16
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
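+/*
+ * Extra balance failures tolerated beyond cache_nice_tries before an
+ * active balance is forced.
+ */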
+#define NEED_ACTIVE_BALANCE_THRESHOLD 10
+
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
+ if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
+ return 1;
+
if (env->idle == CPU_NEWLY_IDLE) {
/*
return 1;
}
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
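+	/*
+	 * With EAS, actively migrate a lone running task off an overutilized
+	 * lower-capacity CPU to a higher-capacity CPU that is not itself
+	 * overutilized.
+	 */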
+ if (energy_aware() &&
+ (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
+ return unlikely(sd->nr_balance_failed >
+ sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
-static int active_load_balance_cpu_stop(void *data);
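+/* First CPU in the group's balance mask that is not isolated */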
+static int group_balance_cpu_not_isolated(struct sched_group *sg)
+{
+ cpumask_t cpus;
+
+ cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
+ cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
+ return cpumask_first(&cpus);
+}
static int should_we_balance(struct lb_env *env)
{
sg_mask = sched_group_mask(sg);
/* Try to find first idle cpu */
for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
+ cpu_isolated(cpu))
continue;
balance_cpu = cpu;
}
if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
+ balance_cpu = group_balance_cpu_not_isolated(sg);
/*
* First idle cpu or the first cpu(busiest) in this sched group
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
- int ld_moved, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
- struct sched_group *group;
- struct rq *busiest;
+ int ld_moved = 0, cur_ld_moved, active_balance = 0;
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
+ struct sched_group *group = NULL;
+ struct rq *busiest = NULL;
unsigned long flags;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
- .sd = sd,
- .dst_cpu = this_cpu,
- .dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
- .idle = idle,
- .loop_break = sched_nr_migrate_break,
- .cpus = cpus,
- .fbq_type = all,
- .tasks = LIST_HEAD_INIT(env.tasks),
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
+ .idle = idle,
+ .loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
+ .fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ .imbalance = 0,
+ .flags = 0,
+ .loop = 0,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .boost_policy = sched_boost_policy(),
};
/*
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
+
+ /* The world might have changed. Validate assumptions */
+ if (busiest->nr_running <= 1) {
+ raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ env.flags &= ~LBF_ALL_PINNED;
+ goto no_move;
+ }
+
+ /*
+ * Set loop_max when rq's lock is taken to prevent a race.
+ */
+ env.loop_max = min(sysctl_sched_nr_migrate,
+ busiest->nr_running);
/*
* cur_ld_moved - load moved in current iteration
}
}
+no_move:
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
+ schedstat_inc(sd, lb_failed[idle]);
+
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
- if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
+ if (idle != CPU_NEWLY_IDLE &&
+ !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
+ }
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
- if (!busiest->active_balance) {
+ if (!busiest->active_balance &&
+ !cpu_isolated(cpu_of(busiest))) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ *continue_balancing = 0;
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ sd->nr_balance_failed =
+ sd->cache_nice_tries +
+ NEED_ACTIVE_BALANCE_THRESHOLD - 1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ /* Assumes one 'busiest' cpu that we pulled tasks from */
+ if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+
+ check_for_freq_change(this_rq, false, check_groups);
+ check_for_freq_change(busiest, false, check_groups);
+ } else {
+ check_for_freq_change(this_rq, true, false);
+ }
+ }
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
out:
+ trace_sched_load_balance(this_cpu, idle, *continue_balancing,
+ group ? group->cpumask[0] : 0,
+ busiest ? busiest->nr_running : 0,
+ env.imbalance, env.flags, ld_moved,
+ sd->balance_interval);
return ld_moved;
}
int pulled_task = 0;
u64 curr_cost = 0;
+ if (cpu_isolated(this_cpu))
+ return 0;
+
idle_enter_fair(this_rq);
/*
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (!energy_aware() &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
/*
* Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
+ * now runnable tasks on the balance rq or if
+ * continue_balancing has been unset (only possible
+ * due to active migration).
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || this_rq->nr_running > 0 ||
+ !continue_balancing)
break;
}
rcu_read_unlock();
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
- struct sched_domain *sd;
+ struct sched_domain *sd = NULL;
struct task_struct *p = NULL;
+ struct task_struct *push_task = NULL;
+ int push_task_detached = 0;
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ .busiest_nr_running = 0,
+ .busiest_grp_capacity = 0,
+ .flags = 0,
+ .loop = 0,
+ .boost_policy = sched_boost_policy(),
+ };
+ bool moved = false;
raw_spin_lock_irq(&busiest_rq->lock);
*/
BUG_ON(busiest_rq == target_rq);
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
+ if (push_task) {
+ if (task_on_rq_queued(push_task) &&
+ push_task->state == TASK_RUNNING &&
+ task_cpu(push_task) == busiest_cpu &&
+ cpu_online(target_cpu)) {
+ detach_task(push_task, &env);
+ push_task_detached = 1;
+ moved = true;
+ }
+ goto out_unlock;
+ }
+
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
}
if (likely(sd)) {
- struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
- };
-
+ env.sd = sd;
schedstat_inc(sd, alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ moved = true;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
+ push_task = busiest_rq->push_task;
+ target_cpu = busiest_rq->push_cpu;
+
+ if (push_task)
+ busiest_rq->push_task = NULL;
+
raw_spin_unlock(&busiest_rq->lock);
+ if (push_task) {
+ if (push_task_detached)
+ attach_one_task(target_rq, push_task);
+ put_task_struct(push_task);
+ clear_reserved(target_cpu);
+ }
+
if (p)
attach_one_task(target_rq, p);
local_irq_enable();
+ if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
+ int check_groups = !!(env.flags &
+ LBF_MOVED_RELATED_THREAD_GROUP_TASK);
+ check_for_freq_change(busiest_rq, false, check_groups);
+ check_for_freq_change(target_rq, false, check_groups);
+ } else if (moved) {
+ check_for_freq_change(target_rq, true, false);
+ }
+
return 0;
}
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-static inline int find_new_ilb(void)
+#ifdef CONFIG_SCHED_HMP
+static inline int find_new_hmp_ilb(int type)
+{
+ int call_cpu = raw_smp_processor_id();
+ struct sched_domain *sd;
+ int ilb;
+
+ rcu_read_lock();
+
+ /* Pick an idle cpu "closest" to call_cpu */
+ for_each_domain(call_cpu, sd) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ sched_domain_span(sd)) {
+ if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
+ cpu_max_power_cost(ilb) <=
+ cpu_max_power_cost(call_cpu))) {
+ rcu_read_unlock();
+ reset_balance_interval(ilb);
+ return ilb;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+ return nr_cpu_ids;
+}
+#else /* CONFIG_SCHED_HMP */
+static inline int find_new_hmp_ilb(int type)
+{
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
+static inline int find_new_ilb(int type)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int ilb;
+
+#ifdef CONFIG_SCHED_HMP
+ return find_new_hmp_ilb(type);
+#endif
+
+ ilb = cpumask_first(nohz.idle_cpus_mask);
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int type)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(type);
if (ilb_cpu >= nr_cpu_ids)
return;
return;
}
+void nohz_balance_clear_nohz_mask(int cpu)
+{
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
+}
+
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
* Completely isolated CPUs don't ever set, so we must test.
*/
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ nohz_balance_clear_nohz_mask(cpu);
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
/*
* If we're a completely isolated CPU, we don't play.
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
return;
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
*/
void update_max_interval(void)
{
- max_load_balance_interval = HZ*num_online_cpus()/10;
+ cpumask_t avail_mask;
+ unsigned int available_cpus;
+
+ cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+ available_cpus = cpumask_weight(&avail_mask);
+
+ max_load_balance_interval = HZ*available_cpus/10;
}
/*
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
+ cpumask_t cpus;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+ for_each_cpu(balance_cpu, &cpus) {
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
+#ifdef CONFIG_SCHED_HMP
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ struct sched_domain *sd;
+ int i;
+
+ if (rq->nr_running < 2)
+ return 0;
+
+ if (!sysctl_sched_restrict_cluster_spill ||
+ sched_boost_policy() == SCHED_BOOST_ON_ALL)
+ return 1;
+
+ if (cpu_max_power_cost(cpu) == max_power_cost)
+ return 1;
+
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (cpu_load(i) < sched_spill_load &&
+ cpu_rq(i)->nr_running <
+ sysctl_sched_spill_nr_run) {
+			/*
+			 * Change the kick type to limit to CPUs that are
+			 * of equal or lower capacity.
+			 */
+ *type = NOHZ_KICK_RESTRICT;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return 1;
+}
+#else
+static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
+{
+ return 0;
+}
+#endif
+
+static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
+{
+ unsigned long now = jiffies;
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP
+ return _nohz_kick_needed_hmp(rq, cpu, type);
+#endif
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
+
+	/* Do idle load balance if there is a misfit task */
+ if (energy_aware())
+ return rq->misfit_task;
+
+ return (rq->nr_running >= 2);
+}
+
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, int *type)
{
- unsigned long now = jiffies;
+#ifndef CONFIG_SCHED_HMP
struct sched_domain *sd;
struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
+ int nr_busy;
+#endif
+ int cpu = rq->cpu;
bool kick = false;
if (unlikely(rq->idle_balance))
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
-
- if (time_before(now, nohz.next_balance))
- return false;
-
- if (rq->nr_running >= 2)
+ if (_nohz_kick_needed(rq, cpu, type))
return true;
+#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
unlock:
rcu_read_unlock();
+#endif
return kick;
}
#else
*/
void trigger_load_balance(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ int type = NOHZ_KICK_ANY;
+
+	/*
+	 * No need to rebalance while attached to a NULL domain or while
+	 * the cpu is isolated.
+	 */
+ if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ if (nohz_kick_needed(rq, &type))
+ nohz_balancer_kick(type);
#endif
}
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+#ifdef CONFIG_SMP
+ if (energy_aware() &&
+ !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
return false;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
se->vruntime -= cfs_rq->min_vruntime;
}
- /* Catch up with the cfs_rq and remove our load when we leave */
- detach_entity_load_avg(cfs_rq, se);
+ detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
- /* Synchronize task with its cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
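+/* Re-home @p's cfs_rq and parent pointers (and se depth) for its task group */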
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ raw_spin_unlock_irq(&rq->lock);
}
return 1;
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
+#endif
+#ifdef CONFIG_SCHED_HMP
+ .inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
+ .dec_hmp_sched_stats = dec_hmp_sched_stats_fair,
+ .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
#endif
};
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/bug.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
+ unsigned long watchdog_ts; /* L: watchdog timestamp */
+
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
+ if (list_empty(&pwq->pool->worklist))
+ pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
pwq->nr_active++;
if (work_is_canceling(work))
return -ENOENT;
cpu_relax();
+ /*
+ * The queueing is in progress in another context. If we keep
+ * taking the pool->lock in a busy loop, the other context may
+ * never get the lock. Give 1 usec delay to avoid this contention.
+ */
+ udelay(1);
return -EAGAIN;
}
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
+ if (list_empty(worklist))
+ pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_DELAYED;
worklist = &pwq->delayed_works;
return;
}
- timer_stats_timer_set_start_info(&dwork->timer);
-
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
+ BUG_ON(PANIC_CORRUPTION);
dump_stack();
}
list_first_entry(&pool->worklist,
struct work_struct, entry);
+ pool->watchdog_ts = jiffies;
+
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
+ bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
- list_for_each_entry_safe(work, n, &pool->worklist, entry)
- if (get_work_pwq(work) == pwq)
+ list_for_each_entry_safe(work, n, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ if (first)
+ pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
+ }
+ first = false;
+ }
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
*/
if (need_to_create_worker(pool)) {
spin_lock(&wq_mayday_lock);
- get_pwq(pwq);
- list_move_tail(&pwq->mayday_node, &wq->maydays);
+ /*
+ * Queue iff we aren't racing destruction
+ * and somebody else hasn't queued it already.
+ */
+ if (wq->rescuer && list_empty(&pwq->mayday_node)) {
+ get_pwq(pwq);
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
+ }
spin_unlock(&wq_mayday_lock);
}
}
}
EXPORT_SYMBOL(flush_delayed_work);
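+/*
+ * Common helper for cancel_work() and cancel_delayed_work(): claim the
+ * pending state and clear it without waiting for a running callback to
+ * finish.
+ */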
+static bool __cancel_work(struct work_struct *work, bool is_dwork)
+{
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_pool_and_clear_pending(work, get_work_pool_id(work));
+ local_irq_restore(flags);
+ return ret;
+}
+
+/*
+ * See cancel_delayed_work()
+ */
+bool cancel_work(struct work_struct *work)
+{
+ return __cancel_work(work, false);
+}
+
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
*/
bool cancel_delayed_work(struct delayed_work *dwork)
{
- unsigned long flags;
- int ret;
-
- do {
- ret = try_to_grab_pending(&dwork->work, true, &flags);
- } while (unlikely(ret == -EAGAIN));
-
- if (unlikely(ret < 0))
- return false;
-
- set_work_pool_and_clear_pending(&dwork->work,
- get_work_pool_id(&dwork->work));
- local_irq_restore(flags);
- return ret;
+ return __cancel_work(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work);
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
+ pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
struct pool_workqueue *pwq;
int node;
+ /*
+ * Remove it from sysfs first so that sanity check failure doesn't
+ * lead to sysfs name conflicts.
+ */
+ workqueue_sysfs_unregister(wq);
+
/* drain it before proceeding with destruction */
drain_workqueue(wq);
+ /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
+ if (wq->rescuer) {
+ struct worker *rescuer = wq->rescuer;
+
+ /* this prevents new queueing */
+ spin_lock_irq(&wq_mayday_lock);
+ wq->rescuer = NULL;
+ spin_unlock_irq(&wq_mayday_lock);
+
+ /* rescuer will empty maydays list before exiting */
+ kthread_stop(rescuer->task);
+ kfree(rescuer);
+ }
+
/* sanity checks */
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
- workqueue_sysfs_unregister(wq);
-
- if (wq->rescuer)
- kthread_stop(wq->rescuer->task);
-
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ pr_cont(" active=%d/%d refcnt=%d%s\n",
+ pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" workers=%d", pool->nr_workers);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
+/*
+ * Workqueue watchdog.
+ *
+ * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
+ * flush dependency, a concurrency managed work item which stays RUNNING
+ * indefinitely. Workqueue stalls can be very difficult to debug as the
+ * usual warning mechanisms don't trigger and internal workqueue state is
+ * largely opaque.
+ *
+ * Workqueue watchdog monitors all worker pools periodically and dumps
+ * state if some pools failed to make forward progress for a while where
+ * forward progress is defined as the first item on ->worklist changing.
+ *
+ * This mechanism is controlled through the kernel parameter
+ * "workqueue.watchdog_thresh" which can be updated at runtime through the
+ * corresponding sysfs parameter file.
+ */
+#ifdef CONFIG_WQ_WATCHDOG
+
+static void wq_watchdog_timer_fn(unsigned long data);
+
+static unsigned long wq_watchdog_thresh = 30;
+static struct timer_list wq_watchdog_timer =
+ TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+
+static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+
+static void wq_watchdog_reset_touched(void)
+{
+ int cpu;
+
+ wq_watchdog_touched = jiffies;
+ for_each_possible_cpu(cpu)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+}
+
+static void wq_watchdog_timer_fn(unsigned long data)
+{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ bool lockup_detected = false;
+ struct worker_pool *pool;
+ int pi;
+
+ if (!thresh)
+ return;
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ unsigned long pool_ts, touched, ts;
+
+ if (list_empty(&pool->worklist))
+ continue;
+
+ /* get the latest of pool and touched timestamps */
+ pool_ts = READ_ONCE(pool->watchdog_ts);
+ touched = READ_ONCE(wq_watchdog_touched);
+
+ if (time_after(pool_ts, touched))
+ ts = pool_ts;
+ else
+ ts = touched;
+
+ if (pool->cpu >= 0) {
+ unsigned long cpu_touched =
+ READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
+ pool->cpu));
+ if (time_after(cpu_touched, ts))
+ ts = cpu_touched;
+ }
+
+ /* did we stall? */
+ if (time_after(jiffies, ts + thresh)) {
+ lockup_detected = true;
+ pr_emerg("BUG: workqueue lockup - pool");
+ pr_cont_pool_info(pool);
+ pr_cont(" stuck for %us!\n",
+ jiffies_to_msecs(jiffies - pool_ts) / 1000);
+ }
+ }
+
+ rcu_read_unlock();
+
+ if (lockup_detected)
+ show_workqueue_state();
+
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh);
+}
+
+void wq_watchdog_touch(int cpu)
+{
+ if (cpu >= 0)
+ per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ else
+ wq_watchdog_touched = jiffies;
+}
+
+static void wq_watchdog_set_thresh(unsigned long thresh)
+{
+ wq_watchdog_thresh = 0;
+ del_timer_sync(&wq_watchdog_timer);
+
+ if (thresh) {
+ wq_watchdog_thresh = thresh;
+ wq_watchdog_reset_touched();
+ mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
+ }
+}
+
+static int wq_watchdog_param_set_thresh(const char *val,
+ const struct kernel_param *kp)
+{
+ unsigned long thresh;
+ int ret;
+
+ ret = kstrtoul(val, 0, &thresh);
+ if (ret)
+ return ret;
+
+ if (system_wq)
+ wq_watchdog_set_thresh(thresh);
+ else
+ wq_watchdog_thresh = thresh;
+
+ return 0;
+}
+
+static const struct kernel_param_ops wq_watchdog_thresh_ops = {
+ .set = wq_watchdog_param_set_thresh,
+ .get = param_get_ulong,
+};
+
+module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
+ 0644);
+
+static void wq_watchdog_init(void)
+{
+ wq_watchdog_set_thresh(wq_watchdog_thresh);
+}
+
+#else /* CONFIG_WQ_WATCHDOG */
+
+static inline void wq_watchdog_init(void) { }
+
+#endif /* CONFIG_WQ_WATCHDOG */
+
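The watchdog above is driven purely by timestamps: each pool advances ->watchdog_ts when its worklist makes progress, and wq_watchdog_touch() refreshes the per-CPU "touched" timestamp so code that legitimately keeps a CPU busy is not reported as a stall (in the upstream series this is wired into touch_softlockup_watchdog()). The fragment below is a purely illustrative direct caller, assuming wq_watchdog_touch() is declared in linux/workqueue.h elsewhere in this series; my_hw_ready() is hypothetical. The threshold itself can be set at boot with workqueue.watchdog_thresh=N or, via the module_param_cb() above, at runtime through /sys/module/workqueue/parameters/watchdog_thresh.

#include <linux/workqueue.h>
#include <linux/smp.h>

/* Hypothetical busy-wait that pets the workqueue watchdog so the pool
 * running it is not flagged as stalled. */
static void wait_for_hw(void)
{
	while (!my_hw_ready()) {	/* my_hw_ready() is illustrative */
		cpu_relax();
		/* refresh this CPU's wq_watchdog_touched_cpu timestamp */
		wq_watchdog_touch(raw_smp_processor_id());
	}
}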
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
+
+ wq_watchdog_init();
+
return 0;
}
early_initcall(init_workqueues);
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
/* i_mutex is held by caller */
- if (unlikely(info->seals)) {
+ if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
if (info->seals & F_SEAL_WRITE)
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
}
shmem_falloc.waitq = &shmem_falloc_waitq;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
spin_lock(&inode->i_lock);
inode->i_private = &shmem_falloc;
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = file;
+ vma->vm_ops = &shmem_vm_ops;
+}
+
/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
if (IS_ERR(file))
return PTR_ERR(file);
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &shmem_vm_ops;
+ shmem_set_file(vma, file);
return 0;
}
return NETDEV_TX_OK;
}
- u64_stats_update_begin(&brstats->syncp);
- brstats->tx_packets++;
- brstats->tx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
-
BR_INPUT_SKB_CB(skb)->brdev = dev;
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
+ u64_stats_update_begin(&brstats->syncp);
+ brstats->tx_packets++;
+ /* Exclude ETH_HLEN from byte stats for consistency with Rx chain */
+ brstats->tx_bytes += skb->len;
+ u64_stats_update_end(&brstats->syncp);
+
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
+ /* dev_set_mac_addr() can be called by a master device on bridge's
+ * NETDEV_UNREGISTER, but since it's being destroyed do nothing
+ */
+ if (dev->reg_state != NETREG_REGISTERED)
+ return -EBUSY;
+
spin_lock_bh(&br->lock);
if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
/* Mac address will be changed in br_stp_change_bridge_id(). */
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
#include "net-sysfs.h"
static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
-static DEFINE_HASHTABLE(napi_hash, 8);
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
+ __be16 src_port = tcp_hdr(skb)->source;
+ __be16 dest_port = tcp_hdr(skb)->dest;
+
+ trace_print_skb_gso(skb, src_port, dest_port);
segs = skb_gso_segment(skb, features);
if (IS_ERR(segs)) {
goto out_kfree_skb;
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
- struct sk_buff *next, *head = NULL, *tail;
+ struct sk_buff *next, *head = NULL, *tail = NULL;
for (; skb != NULL; skb = next) {
next = skb->next;
}
out:
+ __this_cpu_add(softnet_data.gro_coalesced, NAPI_GRO_CB(skb)->count > 1);
return netif_receive_skb_internal(skb);
}
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
+ NAPI_GRO_CB(p)->flush_id = 0;
if (hash != skb_get_hash_raw(p)) {
NAPI_GRO_CB(p)->same_flow = 0;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
+static void net_rps_send_ipi(struct softnet_data *remsd)
+{
+#ifdef CONFIG_RPS
+ while (remsd) {
+ struct softnet_data *next = remsd->rps_ipi_next;
+
+ if (cpu_online(remsd->cpu)) {
+ smp_call_function_single_async(remsd->cpu, &remsd->csd);
+ } else {
+ rps_lock(remsd);
+ remsd->backlog.state = 0;
+ rps_unlock(remsd);
+ }
+ remsd = next;
+ }
+#endif
+}
+
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
- while (remsd) {
- struct softnet_data *next = remsd->rps_ipi_next;
-
- if (cpu_online(remsd->cpu))
- smp_call_function_single_async(remsd->cpu,
- &remsd->csd);
- remsd = next;
- }
+ net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
local_irq_disable();
input_queue_head_incr(sd);
if (++work >= quota) {
- local_irq_enable();
- return work;
+ goto state_changed;
}
}
napi->state = 0;
rps_unlock(sd);
- break;
+ goto state_changed;
}
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
rps_unlock(sd);
}
+state_changed:
local_irq_enable();
+ napi_gro_flush(napi, false);
+ sd->current_napi = NULL;
return work;
}
void __napi_complete(struct napi_struct *n)
{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
list_del_init(&n->poll_list);
smp_mb__before_atomic();
+ sd->current_napi = NULL;
clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
}
EXPORT_SYMBOL(netif_napi_del);
+
+struct napi_struct *get_current_napi_context(void)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+
+ return sd->current_napi;
+}
+EXPORT_SYMBOL(get_current_napi_context);
+
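get_current_napi_context() simply reports the napi_struct that napi_poll() records in softnet_data before invoking ->poll, so it is only meaningful from NAPI poll context on the current CPU. A minimal, purely illustrative consumer (my_priv and its embedded napi are hypothetical, and the declaration of get_current_napi_context() is assumed to be exported via netdevice.h elsewhere in this series):

#include <linux/netdevice.h>

struct my_priv {			/* hypothetical driver private data */
	struct napi_struct napi;
};

/* True when called from within this device's own NAPI poll. */
static bool my_in_own_napi(struct my_priv *priv)
{
	return get_current_napi_context() == &priv->napi;
}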
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
void *have;
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+
+ sd->current_napi = n;
work = n->poll(n, weight);
trace_napi_poll(n);
}
if (ops->ndo_change_mtu)
return ops->ndo_change_mtu(dev, new_mtu);
- dev->mtu = new_mtu;
+ /* Pairs with all the lockless reads of dev->mtu in the stack */
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
struct sk_buff **list_skb;
struct sk_buff *skb;
unsigned int cpu, oldcpu = (unsigned long)ocpu;
- struct softnet_data *sd, *oldsd;
+ struct softnet_data *sd, *oldsd, *remsd;
if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+#ifdef CONFIG_RPS
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
+#endif
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx_ni(skb);
}
}
- static bool inetdev_valid_mtu(unsigned int mtu)
- {
- return mtu >= IPV4_MIN_MTU;
- }
-
static void inetdev_send_gratuitous_arp(struct net_device *dev,
struct in_device *in_dev)
"igmpv3_unsolicited_report_interval"),
DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
"ignore_routes_with_linkdown"),
+ DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+ "drop_gratuitous_arp"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
"promote_secondaries"),
DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
"route_localnet"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+ "drop_unicast_in_l2_multicast"),
+ DEVINET_SYSCTL_RW_ENTRY(NF_IPV4_DEFRAG_SKIP,
+ "nf_ipv4_defrag_skip"),
},
};
rt = *rtp;
if (unlikely(!rt))
return -EFAULT;
- /*
- * We steal reference to this route, caller should not release it
- */
- *rtp = NULL;
+
cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+
+ if (!inetdev_valid_mtu(cork->fragsize))
+ return -ENETUNREACH;
+
cork->dst = &rt->dst;
+ /* We stole this route, caller should not release it. */
+ *rtp = NULL;
+
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
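ip_setup_cork() above now rejects a cork MTU below the IPv4 minimum, while an earlier hunk in this section deletes inetdev_valid_mtu() from its old location. In the upstream form of this fix the helper moves to include/linux/inetdevice.h as a static inline so both call sites can share it; a sketch of the relocated helper, reconstructed from the deleted body, would be:

/* Same logic as the removed function above, now usable from ip_output.c. */
static inline bool inetdev_valid_mtu(unsigned int mtu)
{
	return mtu >= IPV4_MIN_MTU;
}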
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
+ tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
+ arg->uid);
security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
* (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
* limit when mss is larger than 1460.
*/
- u32 init_rwnd = TCP_INIT_CWND * 2;
+ u32 init_rwnd = sysctl_tcp_default_init_rwnd;
if (mss > 1460)
init_rwnd = max((1460 * init_rwnd) / mss, 2U);
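As a worked example of the clamp above: assuming sysctl_tcp_default_init_rwnd keeps the earlier default of TCP_INIT_CWND * 2 = 20 segments (the default is set elsewhere in this series) and the path MSS is 9000 bytes, init_rwnd = max((1460 * 20) / 9000, 2) = max(3, 2) = 3 segments, so jumbo-MSS flows still start with a bounded receive window.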
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
- size += TCPOLEN_SACK_BASE_ALIGNED +
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ if (likely(opts->num_sack_blocks))
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
return size;
int sysctl_tcp_orphan_retries __read_mostly;
int sysctl_tcp_thin_linear_timeouts __read_mostly;
+/* Function to reset the tcp_ack related sysctls when the master control is reset */
+void set_tcp_default(void)
+{
+ sysctl_tcp_delack_seg = TCP_DELACK_SEG;
+}
+
+/* sysctl handler for the tcp_ack related master control */
+int tcp_proc_delayed_ack_control(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+	/* ret is 0 when input validation succeeded and the new value was
+	 * written to the sysctl table; otherwise the stack keeps using the
+	 * currently configured value.
+	 */
+ return ret;
+}
+
+/* sysctl handler for the tcp_ack related master control */
+int tcp_use_userconfig_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ if (write && ret == 0) {
+ if (!sysctl_tcp_use_userconfig)
+ set_tcp_default();
+ }
+ return ret;
+}
+
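The two handlers above only take effect once they are listed in the ipv4 sysctl table, which is not part of this hunk. The entries below are a hedged sketch of that wiring: the procnames, bounds and table name are illustrative assumptions, while sysctl_tcp_use_userconfig and sysctl_tcp_delack_seg are the variables the handlers already reference.

#include <linux/sysctl.h>

static int zero;
static int one = 1;

/* Illustrative ctl_table entries (names and bounds assumed, handlers real). */
static struct ctl_table tcp_ack_ctl_sketch[] = {
	{
		.procname	= "tcp_use_userconfig",
		.data		= &sysctl_tcp_use_userconfig,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= tcp_use_userconfig_sysctl_handler,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{
		.procname	= "tcp_delack_seg",
		.data		= &sysctl_tcp_delack_seg,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= tcp_proc_delayed_ack_control,
	},
	{ }
};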
static void tcp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
return;
}
- if (icsk->icsk_probes_out > max_probes) {
+ if (icsk->icsk_probes_out >= max_probes) {
abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
p[strlen(p) - 2] = '\0';
mod->is_dot_o = 1;
}
+ /* strip trailing .lto */
+ if (strends(p, ".lto"))
+ p[strlen(p) - 4] = '\0';
/* add to list */
mod->name = p;
* fromsec = text section
* refsymname = *.constprop.*
*
+ * Pattern 6:
+ * Hide section mismatch warnings for ELF local symbols. The goal
+ * is to eliminate false positive modpost warnings caused by
+ * compiler-generated ELF local symbol names such as ".LANCHOR1".
+ * Autogenerated symbol names bypass modpost's "Pattern 2"
+ * whitelisting, which relies on pattern-matching against symbol
+ * names to work. (One situation where gcc can autogenerate ELF
+ * local symbols is when "-fsection-anchors" is used.)
**/
static int secref_whitelist(const struct sectioncheck *mismatch,
const char *fromsec, const char *fromsym,
match(fromsym, optim_symbols))
return 0;
+ /* Check for pattern 6 */
+ if (strstarts(fromsym, ".L"))
+ return 0;
+
return 1;
}
size_t m = strspn(s + n + 1, "0123456789");
if (m && (s[n + m] == '.' || s[n + m] == 0))
s[n] = 0;
+
+ /* strip trailing .lto */
+ if (strends(s, ".lto"))
+ s[strlen(s) - 4] = '\0';
}
return s;
}
#define trace_hw_ptr_error(substream, reason)
#endif
+#define STRING_LENGTH_OF_INT 12
+#define MAX_USR_CTRL_CNT 128
+
/*
* fill ring buffer with silence
* runtime->silence_start: starting pointer to silence area
* the elapsed time to detect xruns.
*/
jdelta = curr_jiffies - runtime->hw_ptr_jiffies;
- if (jdelta < runtime->hw_ptr_buffer_jiffies / 2)
+ if ((jdelta < runtime->hw_ptr_buffer_jiffies / 2) ||
+ (runtime->hw_ptr_buffer_jiffies <= 0))
goto no_delta_check;
hdelta = jdelta - delta * HZ / runtime->rate;
xrun_threshold = runtime->hw_ptr_buffer_jiffies / 2 + 1;
switch (runtime->access) {
case SNDRV_PCM_ACCESS_MMAP_INTERLEAVED:
case SNDRV_PCM_ACCESS_RW_INTERLEAVED:
+ if ((UINT_MAX/width) < info->channel) {
+			snd_printd("%s: integer overflow in multiplication\n",
+ __func__);
+ return -EINVAL;
+ }
info->first = info->channel * width;
info->step = runtime->channels * width;
break;
case SNDRV_PCM_ACCESS_RW_NONINTERLEAVED:
{
size_t size = runtime->dma_bytes / runtime->channels;
+
+ if ((size > 0) && ((UINT_MAX/(size * 8)) < info->channel)) {
+			snd_printd("%s: integer overflow in multiplication\n",
+ __func__);
+ return -EINVAL;
+ }
info->first = info->channel * size * 8;
info->step = width;
break;
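Both checks added above follow the same idiom: before computing channel * width (or channel * size * 8) in unsigned arithmetic, confirm the second factor does not exceed UINT_MAX divided by the first. Stated on its own as a minimal sketch (this kernel predates the generic check_mul_overflow() helper, hence the division form):

#include <linux/kernel.h>

/* True when a * b would wrap an unsigned int; a == 0 never overflows. */
static inline bool umul_would_overflow(unsigned int a, unsigned int b)
{
	return a && b > UINT_MAX / a;
}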
struct snd_pcm_runtime *runtime;
unsigned long flags;
- if (PCM_RUNTIME_CHECK(substream))
+ if (snd_BUG_ON(!substream))
return;
- runtime = substream->runtime;
snd_pcm_stream_lock_irqsave(substream, flags);
+ if (PCM_RUNTIME_CHECK(substream))
+ goto _unlock;
+ runtime = substream->runtime;
+
if (!snd_pcm_running(substream) ||
snd_pcm_update_hw_ptr0(substream, 1) < 0)
goto _end;
#endif
_end:
kill_fasync(&runtime->fasync, SIGIO, POLL_IN);
+ _unlock:
snd_pcm_stream_unlock_irqrestore(substream, flags);
}
struct snd_pcm_runtime *runtime;
if (PCM_RUNTIME_CHECK(substream))
return -ENXIO;
+	/* TODO: consider returning -EINVAL here */
+ if (substream->hw_no_buffer)
+		snd_printd("%s: warning: this PCM is hostless\n", __func__);
runtime = substream->runtime;
if (snd_BUG_ON(!substream->ops->copy && !runtime->dma_area))
return -EINVAL;
kfree(info);
}
+static int pcm_volume_ctl_info(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_info *uinfo)
+{
+ uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
+ uinfo->count = 1;
+ uinfo->value.integer.min = 0;
+ uinfo->value.integer.max = 0x2000;
+ return 0;
+}
+
+static void pcm_volume_ctl_private_free(struct snd_kcontrol *kcontrol)
+{
+ struct snd_pcm_volume *info = snd_kcontrol_chip(kcontrol);
+ info->pcm->streams[info->stream].vol_kctl = NULL;
+ kfree(info);
+}
+
/**
* snd_pcm_add_chmap_ctls - create channel-mapping control elements
* @pcm: the assigned PCM instance
return 0;
}
EXPORT_SYMBOL_GPL(snd_pcm_add_chmap_ctls);
+
+/**
+ * snd_pcm_add_volume_ctls - create volume control elements
+ * @pcm: the assigned PCM instance
+ * @stream: stream direction
+ * @volume: the volume element(s) assigned to the stream
+ * @max_length: the max length of the volume parameter of the stream
+ * @private_value: the value passed to each kcontrol's private_value field
+ * @info_ret: store struct snd_pcm_volume instance if non-NULL
+ *
+ * Create volume control elements assigned to the given PCM stream(s).
+ * Returns zero on success, or a negative error value.
+ */
+int snd_pcm_add_volume_ctls(struct snd_pcm *pcm, int stream,
+ const struct snd_pcm_volume_elem *volume,
+ int max_length,
+ unsigned long private_value,
+ struct snd_pcm_volume **info_ret)
+{
+ struct snd_pcm_volume *info;
+ struct snd_kcontrol_new knew = {
+ .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+ .access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |
+ SNDRV_CTL_ELEM_ACCESS_READWRITE,
+ .info = pcm_volume_ctl_info,
+ };
+ int err;
+ int size;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+ info->pcm = pcm;
+ info->stream = stream;
+ info->volume = volume;
+ info->max_length = max_length;
+ size = sizeof("Playback ") + sizeof(" Volume") +
+ STRING_LENGTH_OF_INT*sizeof(char) + 1;
+ knew.name = kzalloc(size, GFP_KERNEL);
+ if (!knew.name) {
+ kfree(info);
+ return -ENOMEM;
+ }
+ if (stream == SNDRV_PCM_STREAM_PLAYBACK)
+ snprintf((char *)knew.name, size, "%s %d %s",
+ "Playback", pcm->device, "Volume");
+ else
+ snprintf((char *)knew.name, size, "%s %d %s",
+ "Capture", pcm->device, "Volume");
+ knew.device = pcm->device;
+ knew.count = pcm->streams[stream].substream_count;
+ knew.private_value = private_value;
+ info->kctl = snd_ctl_new1(&knew, info);
+ if (!info->kctl) {
+ kfree(info);
+ kfree(knew.name);
+ return -ENOMEM;
+ }
+ info->kctl->private_free = pcm_volume_ctl_private_free;
+ err = snd_ctl_add(pcm->card, info->kctl);
+ if (err < 0) {
+ kfree(info);
+ kfree(knew.name);
+ return -ENOMEM;
+ }
+ pcm->streams[stream].vol_kctl = info->kctl;
+ if (info_ret)
+ *info_ret = info;
+ kfree(knew.name);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(snd_pcm_add_volume_ctls);
+
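A hedged sketch of a caller for the new export: a hypothetical ASoC platform driver adding a playback volume control from its pcm_new hook. The element definition (my_vol_elem) is left empty because its callbacks depend on how struct snd_pcm_volume_elem is defined in this tree, and the max_length value passed is an assumption.

#include <sound/pcm.h>
#include <sound/soc.h>

static struct snd_pcm_volume_elem my_vol_elem;	/* fill in per-tree callbacks */

/* Hypothetical pcm_new hook registering a playback volume control. */
static int my_pcm_new(struct snd_soc_pcm_runtime *rtd)
{
	struct snd_pcm_volume *vol_info;

	return snd_pcm_add_volume_ctls(rtd->pcm, SNDRV_PCM_STREAM_PLAYBACK,
				       &my_vol_elem,
				       1,			/* max_length (assumed) */
				       (unsigned long)rtd,	/* private_value */
				       &vol_info);
}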
+static int pcm_usr_ctl_info(struct snd_kcontrol *kcontrol,
+ struct snd_ctl_elem_info *uinfo)
+{
+ uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
+ uinfo->count = MAX_USR_CTRL_CNT;
+ uinfo->value.integer.min = 0;
+ uinfo->value.integer.max = INT_MAX;
+ return 0;
+}
+
+static void pcm_usr_ctl_private_free(struct snd_kcontrol *kcontrol)
+{
+ struct snd_pcm_usr *info = snd_kcontrol_chip(kcontrol);
+ info->pcm->streams[info->stream].usr_kctl = NULL;
+ kfree(info);
+}
+
+/**
+ * snd_pcm_add_usr_ctls - create user control elements
+ * @pcm: the assigned PCM instance
+ * @stream: stream direction
+ * @usr: the user control element(s) assigned to the stream
+ * @max_length: the max length of the user parameter of the stream
+ * @max_kctrl_str_len: the max length of the kcontrol name string
+ * @private_value: the value passed to each kcontrol's private_value field
+ * @info_ret: store struct snd_pcm_usr instance if non-NULL
+ *
+ * Create user control elements assigned to the given PCM stream(s).
+ * Returns zero on success, or a negative error value.
+ */
+int snd_pcm_add_usr_ctls(struct snd_pcm *pcm, int stream,
+ const struct snd_pcm_usr_elem *usr,
+ int max_length, int max_kctrl_str_len,
+ unsigned long private_value,
+ struct snd_pcm_usr **info_ret)
+{
+ struct snd_pcm_usr *info;
+ struct snd_kcontrol_new knew = {
+ .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
+ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE,
+ .info = pcm_usr_ctl_info,
+ };
+ int err;
+ char *buf;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ pr_err("%s: snd_pcm_usr alloc failed\n", __func__);
+ return -ENOMEM;
+ }
+ info->pcm = pcm;
+ info->stream = stream;
+ info->usr = usr;
+ info->max_length = max_length;
+ buf = kzalloc(max_kctrl_str_len, GFP_KERNEL);
+ if (!buf) {
+ pr_err("%s: buffer allocation failed\n", __func__);
+ kfree(info);
+ return -ENOMEM;
+ }
+ knew.name = buf;
+ if (stream == SNDRV_PCM_STREAM_PLAYBACK)
+ snprintf(buf, max_kctrl_str_len, "%s %d %s",
+ "Playback", pcm->device, "User kcontrol");
+ else
+ snprintf(buf, max_kctrl_str_len, "%s %d %s",
+ "Capture", pcm->device, "User kcontrol");
+ knew.device = pcm->device;
+ knew.count = pcm->streams[stream].substream_count;
+ knew.private_value = private_value;
+ info->kctl = snd_ctl_new1(&knew, info);
+ if (!info->kctl) {
+ kfree(info);
+ kfree(knew.name);
+ pr_err("%s: snd_ctl_new failed\n", __func__);
+ return -ENOMEM;
+ }
+ info->kctl->private_free = pcm_usr_ctl_private_free;
+ err = snd_ctl_add(pcm->card, info->kctl);
+ if (err < 0) {
+ kfree(info);
+ kfree(knew.name);
+ pr_err("%s: snd_ctl_add failed:%d\n", __func__,
+ err);
+ return -ENOMEM;
+ }
+ pcm->streams[stream].usr_kctl = info->kctl;
+ if (info_ret)
+ *info_ret = info;
+ kfree(knew.name);
+ return 0;
+}
+EXPORT_SYMBOL(snd_pcm_add_usr_ctls);