From 0d7286a652371bca460357348f3b4828cd4ca214 Mon Sep 17 00:00:00 2001
From: Thomas Lively
Date: Tue, 7 Jul 2020 10:45:26 -0700
Subject: [PATCH] [WebAssembly] Avoid scalarizing vector shifts in more cases

Since WebAssembly's vector shift instructions take a scalar shift
amount rather than a vector shift amount, we have to check in ISel
that the vector shift amount is a splat. Previously, we were checking
explicitly for splat BUILD_VECTOR nodes, but this change uses the
standard utilities for detecting splat values that can handle more
complex splat patterns. Since the C++ ISel lowering is now more
general than the ISel patterns, this change also simplifies shift
lowering by using the C++ lowering for all SIMD shifts rather than
mixing C++ and normal pattern-based lowering.

This change improves ISel for shifts to the point that the
simd-shift-unroll.ll regression test no longer tests the code path it
was originally meant to test. The bug corresponding to that regression
test is no longer reproducible with its original reported reproducer,
so rather than try to fix the regression test, this change just
removes it.

Differential Revision: https://reviews.llvm.org/D83278
---
 .../Target/WebAssembly/WebAssemblyISelLowering.cpp |  20 ++--
 .../lib/Target/WebAssembly/WebAssemblyInstrSIMD.td |  55 +++------
 .../WebAssembly/simd-shift-complex-splats.ll       |  27 +++++
 llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll | 128 ---------------------
 4 files changed, 53 insertions(+), 177 deletions(-)
 create mode 100644 llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
 delete mode 100644 llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 651c504efc0..3f4ebd50159 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1677,19 +1677,13 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
   // Only manually lower vector shifts
   assert(Op.getSimpleValueType().isVector());

-  // Unroll non-splat vector shifts
-  BuildVectorSDNode *ShiftVec;
-  SDValue SplatVal;
-  if (!(ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) ||
-      !(SplatVal = ShiftVec->getSplatValue()))
+  auto ShiftVal = Op.getOperand(1);
+  if (!DAG.isSplatValue(ShiftVal, /*AllowUndefs=*/true))
     return unrollVectorShift(Op, DAG);

-  // All splats except i64x2 const splats are handled by patterns
-  auto *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
-  if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64)
-    return Op;
+  auto SplatVal = DAG.getSplatValue(ShiftVal);
+  assert(SplatVal != SDValue());

-  // i64x2 const splats are custom lowered to avoid unnecessary wraps
   unsigned Opcode;
   switch (Op.getOpcode()) {
   case ISD::SHL:
@@ -1704,9 +1698,11 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
   default:
     llvm_unreachable("unexpected opcode");
   }
-  APInt Shift = SplatConst->getAPIntValue().zextOrTrunc(32);
+
+  // Use anyext because none of the high bits can affect the shift
+  auto ScalarShift = DAG.getAnyExtOrTrunc(SplatVal, DL, MVT::i32);
   return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
-                     DAG.getConstant(Shift, DL, MVT::i32));
+                     ScalarShift);
 }

 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index b4a8a7bc42a..814bb80fb69 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -654,55 +654,36 @@ defm BITMASK : SIMDBitmask;
 // Bit shifts
 //===----------------------------------------------------------------------===//

-multiclass SIMDShift simdop> {
+multiclass SIMDShift simdop> {
   defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec, I32:$x), (outs), (ins),
-                        [(set (vec_t V128:$dst),
-                          (node V128:$vec, (vec_t shift_vec)))],
+                        [(set (vec_t V128:$dst), (node V128:$vec, I32:$x))],
                         vec#"."#name#"\t$dst, $vec, $x", vec#"."#name, simdop>;
 }

 multiclass SIMDShiftInt baseInst> {
-  defm "" : SIMDShift;
-  defm "" : SIMDShift;
-  defm "" : SIMDShift;
-  defm "" : SIMDShift;
+  defm "" : SIMDShift;
+  defm "" : SIMDShift;
+  defm "" : SIMDShift;
+  defm "" : SIMDShift;
 }

-// Left shift by scalar: shl
-defm SHL : SIMDShiftInt;
-
-// Right shift by scalar: shr_s / shr_u
-defm SHR_S : SIMDShiftInt;
-defm SHR_U : SIMDShiftInt;
-
-// Truncate i64 shift operands to i32s, except if they are already i32s
-foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in {
-def : Pat<(v2i64 (shifts[0]
-            (v2i64 V128:$vec),
-            (v2i64 (splat2 (i64 (sext I32:$x))))
-          )),
-          (v2i64 (shifts[1] (v2i64 V128:$vec), (i32 I32:$x)))>;
-def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))),
-          (v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>;
-}
-
-// 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping
+// WebAssembly SIMD shifts are nonstandard in that the shift amount is
+// an i32 rather than a vector, so they need custom nodes.
 def wasm_shift_t : SDTypeProfile<1, 2,
   [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]
 >;
 def wasm_shl : SDNode<"WebAssemblyISD::VEC_SHL", wasm_shift_t>;
 def wasm_shr_s : SDNode<"WebAssemblyISD::VEC_SHR_S", wasm_shift_t>;
 def wasm_shr_u : SDNode<"WebAssemblyISD::VEC_SHR_U", wasm_shift_t>;
-foreach shifts = [[wasm_shl, SHL_v2i64],
-                  [wasm_shr_s, SHR_S_v2i64],
-                  [wasm_shr_u, SHR_U_v2i64]] in
-def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)),
-          (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>;
+
+// Left shift by scalar: shl
+defm SHL : SIMDShiftInt;
+
+// Right shift by scalar: shr_s / shr_u
+defm SHR_S : SIMDShiftInt;
+defm SHR_U : SIMDShiftInt;

 //===----------------------------------------------------------------------===//
 // Integer binary arithmetic
@@ -766,12 +747,12 @@ def add_nuw : PatFrag<(ops node:$lhs, node:$rhs),
                       "return N->getFlags().hasNoUnsignedWrap();">;

 foreach nodes = [[v16i8, splat16], [v8i16, splat8]] in
-def : Pat<(srl
+def : Pat<(wasm_shr_u
     (add_nuw
       (add_nuw (nodes[0] V128:$lhs), (nodes[0] V128:$rhs)),
       (nodes[1] (i32 1))
     ),
-    (nodes[0] (nodes[1] (i32 1)))
+    (i32 1)
   ),
   (!cast("AVGR_U_"#nodes[0]) V128:$lhs, V128:$rhs)>;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
new file mode 100644
index 00000000000..ded430f8954
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+
+; Test that SIMD shifts can be lowered correctly even with shift
+; values that are more complex than plain splats.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+;; TODO: Optimize this further by scalarizing the add
+
+; CHECK-LABEL: shl_add:
+; CHECK-NEXT: .functype shl_add (v128, i32, i32) -> (v128)
+; CHECK-NEXT: i8x16.splat $push1=, $1
+; CHECK-NEXT: i8x16.splat $push0=, $2
+; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
+; CHECK-NEXT: i8x16.extract_lane_u $push3=, $pop2, 0
+; CHECK-NEXT: i8x16.shl $push4=, $0, $pop3
+; CHECK-NEXT: return $pop4
+define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) {
+  %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
+  %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %t2 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %vb = shufflevector <16 x i8> %t2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %shift = add <16 x i8> %va, %vb
+  %r = shl <16 x i8> %v, %shift
+  ret <16 x i8> %r
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll
deleted file mode 100644
index 2a5422cb011..00000000000
--- a/llvm/test/CodeGen/WebAssembly/simd-shift-unroll.ll
+++ /dev/null
@@ -1,128 +0,0 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-SLOW
-
-;; Test that the custom shift unrolling works correctly in cases that
-;; cause assertion failures due to illegal types when using
-;; DAG.UnrollVectorOp. Regression test for PR45178.
-
-target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
-
-; CHECK-LABEL: shl_v16i8:
-; CHECK-NEXT: .functype shl_v16i8 (v128) -> (v128)
-; CHECK-NEXT: i8x16.extract_lane_u $push0=, $0, 0
-; CHECK-NEXT: i32.const $push1=, 3
-; CHECK-NEXT: i32.shl $push2=, $pop0, $pop1
-; CHECK-NEXT: i8x16.splat $push3=, $pop2
-; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1
-; CHECK-NEXT: i8x16.replace_lane $push5=, $pop3, 1, $pop4
-; ...
-; CHECK: i8x16.extract_lane_u $push32=, $0, 15
-; CHECK-NEXT: i8x16.replace_lane $push33=, $pop31, 15, $pop32
-; CHECK-NEXT: v8x16.shuffle $push34=, $pop33, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: return $pop34
-define <16 x i8> @shl_v16i8(<16 x i8> %in) {
-  %out = shl <16 x i8> %in,
-    <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-  %ret = shufflevector <16 x i8> %out, <16 x i8> undef, <16 x i32> zeroinitializer
-  ret <16 x i8> %ret
-}
-
-; CHECK-LABEL: shr_s_v16i8:
-; CHECK-NEXT: functype shr_s_v16i8 (v128) -> (v128)
-; CHECK-NEXT: i8x16.extract_lane_s $push0=, $0, 0
-; CHECK-NEXT: i32.const $push1=, 3
-; CHECK-NEXT: i32.shr_s $push2=, $pop0, $pop1
-; CHECK-NEXT: i8x16.splat $push3=, $pop2
-; CHECK-NEXT: i8x16.extract_lane_s $push4=, $0, 1
-; CHECK-NEXT: i8x16.replace_lane $push5=, $pop3, 1, $pop4
-; ...
-; CHECK: i8x16.extract_lane_s $push32=, $0, 15
-; CHECK-NEXT: i8x16.replace_lane $push33=, $pop31, 15, $pop32
-; CHECK-NEXT: v8x16.shuffle $push34=, $pop33, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: return $pop34
-define <16 x i8> @shr_s_v16i8(<16 x i8> %in) {
-  %out = ashr <16 x i8> %in,
-    <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-  %ret = shufflevector <16 x i8> %out, <16 x i8> undef, <16 x i32> zeroinitializer
-  ret <16 x i8> %ret
-}
-
-; CHECK-LABEL: shr_u_v16i8:
-; CHECK-NEXT: functype shr_u_v16i8 (v128) -> (v128)
-; CHECK-NEXT: i8x16.extract_lane_u $push0=, $0, 0
-; CHECK-NEXT: i32.const $push1=, 3
-; CHECK-NEXT: i32.shr_u $push2=, $pop0, $pop1
-; CHECK-NEXT: i8x16.splat $push3=, $pop2
-; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1
-; CHECK-NEXT: i8x16.replace_lane $push5=, $pop3, 1, $pop4
-; ...
-; CHECK: i8x16.extract_lane_u $push32=, $0, 15
-; CHECK-NEXT: i8x16.replace_lane $push33=, $pop31, 15, $pop32
-; CHECK-NEXT: v8x16.shuffle $push34=, $pop33, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: return $pop34
-define <16 x i8> @shr_u_v16i8(<16 x i8> %in) {
-  %out = lshr <16 x i8> %in,
-    <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-  %ret = shufflevector <16 x i8> %out, <16 x i8> undef, <16 x i32> zeroinitializer
-  ret <16 x i8> %ret
-}
-
-; CHECK-LABEL: shl_v8i16:
-; CHECK-NEXT: functype shl_v8i16 (v128) -> (v128)
-; CHECK-NEXT: i16x8.extract_lane_u $push0=, $0, 0
-; CHECK-NEXT: i32.const $push1=, 9
-; CHECK-NEXT: i32.shl $push2=, $pop0, $pop1
-; CHECK-NEXT: i16x8.splat $push3=, $pop2
-; CHECK-NEXT: i16x8.extract_lane_u $push4=, $0, 1
-; CHECK-NEXT: i16x8.replace_lane $push5=, $pop3, 1, $pop4
-; ...
-; CHECK: i16x8.extract_lane_u $push16=, $0, 7
-; CHECK-NEXT: i16x8.replace_lane $push17=, $pop15, 7, $pop16
-; CHECK-NEXT: v8x16.shuffle $push18=, $pop17, $0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK-NEXT: return $pop18
-define <8 x i16> @shl_v8i16(<8 x i16> %in) {
-  %out = shl <8 x i16> %in, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
-  %ret = shufflevector <8 x i16> %out, <8 x i16> undef, <8 x i32> zeroinitializer
-  ret <8 x i16> %ret
-}
-
-; CHECK-LABEL: shr_s_v8i16:
-; CHECK-NEXT: functype shr_s_v8i16 (v128) -> (v128)
-; CHECK-NEXT: i16x8.extract_lane_s $push0=, $0, 0
-; CHECK-NEXT: i32.const $push1=, 9
-; CHECK-NEXT: i32.shr_s $push2=, $pop0, $pop1
-; CHECK-NEXT: i16x8.splat $push3=, $pop2
-; CHECK-NEXT: i16x8.extract_lane_s $push4=, $0, 1
-; CHECK-NEXT: i16x8.replace_lane $push5=, $pop3, 1, $pop4
-; ...
-; CHECK: i16x8.extract_lane_s $push16=, $0, 7
-; CHECK-NEXT: i16x8.replace_lane $push17=, $pop15, 7, $pop16
-; CHECK-NEXT: v8x16.shuffle $push18=, $pop17, $0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK-NEXT: return $pop18
-define <8 x i16> @shr_s_v8i16(<8 x i16> %in) {
-  %out = ashr <8 x i16> %in, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
-  %ret = shufflevector <8 x i16> %out, <8 x i16> undef, <8 x i32> zeroinitializer
-  ret <8 x i16> %ret
-}
-
-; CHECK-LABEL: shr_u_v8i16:
-; CHECK-NEXT: functype shr_u_v8i16 (v128) -> (v128)
-; CHECK-NEXT: i16x8.extract_lane_u $push0=, $0, 0
-; CHECK-NEXT: i32.const $push1=, 9
-; CHECK-NEXT: i32.shr_u $push2=, $pop0, $pop1
-; CHECK-NEXT: i16x8.splat $push3=, $pop2
-; CHECK-NEXT: i16x8.extract_lane_u $push4=, $0, 1
-; CHECK-NEXT: i16x8.replace_lane $push5=, $pop3, 1, $pop4
-; ...
-; CHECK: i16x8.extract_lane_u $push16=, $0, 7
-; CHECK-NEXT: i16x8.replace_lane $push17=, $pop15, 7, $pop16
-; CHECK-NEXT: v8x16.shuffle $push18=, $pop17, $0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-; CHECK-NEXT: return $pop18
-define <8 x i16> @shr_u_v8i16(<8 x i16> %in) {
-  %out = lshr <8 x i16> %in, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
-  %ret = shufflevector <8 x i16> %out, <8 x i16> undef, <8 x i32> zeroinitializer
-  ret <8 x i16> %ret
-}
-- 
2.11.0
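
For readers skimming the diff, the core of the change is the new splat handling in WebAssemblyTargetLowering::LowerShift. What follows is a condensed sketch of that path, not part of the patch itself; it assumes the Op, DAG, DL, and Opcode values visible in the WebAssemblyISelLowering.cpp hunks above and is intended only as a readability aid:

  // Scalarize only when the shift amount is not a splat. The generic
  // SelectionDAG splat detection also recognizes splats built from shuffles
  // and splats containing undef lanes, not just constant BUILD_VECTORs.
  SDValue ShiftVal = Op.getOperand(1);
  if (!DAG.isSplatValue(ShiftVal, /*AllowUndefs=*/true))
    return unrollVectorShift(Op, DAG);

  // Otherwise feed the splatted scalar to the WebAssembly shift node
  // (VEC_SHL / VEC_SHR_S / VEC_SHR_U), which takes an i32 shift amount.
  // Any-extending or truncating the scalar is fine because only the low
  // bits of the amount can affect the shift.
  SDValue SplatVal = DAG.getSplatValue(ShiftVal);
  SDValue ScalarShift = DAG.getAnyExtOrTrunc(SplatVal, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
                     ScalarShift);

The new simd-shift-complex-splats.ll test above serves as the usage example: a splat shift amount assembled from insertelement, shufflevector, and an add now selects to a single i8x16.shl rather than being scalarized lane by lane.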