From f84cd97e5c61fe3ce66af838dd955dee02076639 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 21 Oct 2013 07:57:41 +0200 Subject: [PATCH] s390/percpu: make use of interlocked-access facility 1 instructions Optimize this_cpu_* functions for 64 bit by making use of new instructions that came with the interlocked-access facility 1 (load-and-*) and the general-instructions-extension facility (asi, agsi). That way we get rid of the compare-and-swap loop in most cases. Code size reduction (defconfig, -march=z196): 11,555 bytes. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/percpu.h | 125 ++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 25 deletions(-) diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 41baca870d0c..061ab45faf70 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -21,7 +21,11 @@ #define ARCH_NEEDS_WEAK_PER_CPU #endif -#define arch_this_cpu_to_op(pcp, val, op) \ +/* + * We use a compare-and-swap loop since that uses less cpu cycles than + * disabling and enabling interrupts like the generic variant would do. + */ +#define arch_this_cpu_to_op_simple(pcp, val, op) \ ({ \ typedef typeof(pcp) pcp_op_T__; \ pcp_op_T__ old__, new__, prev__; \ @@ -38,30 +42,101 @@ new__; \ }) -#define this_cpu_add_1(pcp, val) arch_this_cpu_to_op(pcp, val, +) -#define this_cpu_add_2(pcp, val) arch_this_cpu_to_op(pcp, val, +) -#define this_cpu_add_4(pcp, val) arch_this_cpu_to_op(pcp, val, +) -#define this_cpu_add_8(pcp, val) arch_this_cpu_to_op(pcp, val, +) - -#define this_cpu_add_return_1(pcp, val) arch_this_cpu_to_op(pcp, val, +) -#define this_cpu_add_return_2(pcp, val) arch_this_cpu_to_op(pcp, val, +) -#define this_cpu_add_return_4(pcp, val) arch_this_cpu_to_op(pcp, val, +) -#define this_cpu_add_return_8(pcp, val) arch_this_cpu_to_op(pcp, val, +) - -#define this_cpu_and_1(pcp, val) arch_this_cpu_to_op(pcp, val, &) -#define this_cpu_and_2(pcp, val) arch_this_cpu_to_op(pcp, val, &) -#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, &) -#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op(pcp, val, &) - -#define this_cpu_or_1(pcp, val) arch_this_cpu_to_op(pcp, val, |) -#define this_cpu_or_2(pcp, val) arch_this_cpu_to_op(pcp, val, |) -#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, |) -#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, |) - -#define this_cpu_xor_1(pcp, val) arch_this_cpu_to_op(pcp, val, ^) -#define this_cpu_xor_2(pcp, val) arch_this_cpu_to_op(pcp, val, ^) -#define this_cpu_xor_4(pcp, val) arch_this_cpu_to_op(pcp, val, ^) -#define this_cpu_xor_8(pcp, val) arch_this_cpu_to_op(pcp, val, ^) +#define this_cpu_add_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_and_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_and_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_or_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +#define this_cpu_or_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +#define this_cpu_xor_1(pcp, val) arch_this_cpu_to_op_simple(pcp, val, ^) +#define this_cpu_xor_2(pcp, val) arch_this_cpu_to_op_simple(pcp, val, ^) + +#ifndef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define this_cpu_add_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_add_return_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, +) +#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, &) +#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, |) +#define this_cpu_xor_4(pcp, val) arch_this_cpu_to_op_simple(pcp, val, ^) +#define this_cpu_xor_8(pcp, val) arch_this_cpu_to_op_simple(pcp, val, ^) + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define arch_this_cpu_add(pcp, val, op1, op2, szcast) \ +{ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ val__ = (val); \ + pcp_op_T__ old__, *ptr__; \ + preempt_disable(); \ + ptr__ = __this_cpu_ptr(&(pcp)); \ + if (__builtin_constant_p(val__) && \ + ((szcast)val__ > -129) && ((szcast)val__ < 128)) { \ + asm volatile( \ + op2 " %[ptr__],%[val__]\n" \ + : [ptr__] "+Q" (*ptr__) \ + : [val__] "i" ((szcast)val__) \ + : "cc"); \ + } else { \ + asm volatile( \ + op1 " %[old__],%[val__],%[ptr__]\n" \ + : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ + : [val__] "d" (val__) \ + : "cc"); \ + } \ + preempt_enable(); \ +} + +#define this_cpu_add_4(pcp, val) arch_this_cpu_add(pcp, val, "laa", "asi", int) +#define this_cpu_add_8(pcp, val) arch_this_cpu_add(pcp, val, "laag", "agsi", long) + +#define arch_this_cpu_add_return(pcp, val, op) \ +({ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ val__ = (val); \ + pcp_op_T__ old__, *ptr__; \ + preempt_disable(); \ + ptr__ = __this_cpu_ptr(&(pcp)); \ + asm volatile( \ + op " %[old__],%[val__],%[ptr__]\n" \ + : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ + : [val__] "d" (val__) \ + : "cc"); \ + preempt_enable(); \ + old__ + val__; \ +}) + +#define this_cpu_add_return_4(pcp, val) arch_this_cpu_add_return(pcp, val, "laa") +#define this_cpu_add_return_8(pcp, val) arch_this_cpu_add_return(pcp, val, "laag") + +#define arch_this_cpu_to_op(pcp, val, op) \ +{ \ + typedef typeof(pcp) pcp_op_T__; \ + pcp_op_T__ val__ = (val); \ + pcp_op_T__ old__, *ptr__; \ + preempt_disable(); \ + ptr__ = __this_cpu_ptr(&(pcp)); \ + asm volatile( \ + op " %[old__],%[val__],%[ptr__]\n" \ + : [old__] "=d" (old__), [ptr__] "+Q" (*ptr__) \ + : [val__] "d" (val__) \ + : "cc"); \ + preempt_enable(); \ +} + +#define this_cpu_and_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lan") +#define this_cpu_and_8(pcp, val) arch_this_cpu_to_op(pcp, val, "lang") +#define this_cpu_or_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lao") +#define this_cpu_or_8(pcp, val) arch_this_cpu_to_op(pcp, val, "laog") +#define this_cpu_xor_4(pcp, val) arch_this_cpu_to_op(pcp, val, "lax") +#define this_cpu_xor_8(pcp, val) arch_this_cpu_to_op(pcp, val, "laxg") + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ #define arch_this_cpu_cmpxchg(pcp, oval, nval) \ ({ \ -- 2.11.0