[x86] eliminate redundant shuffle of horizontal math ops when both inputs are the same

author Sanjay Patel <spatel@rotateright.com>

Fri, 1 Sep 2017 21:09:04 +0000 (21:09 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Fri, 1 Sep 2017 21:09:04 +0000 (21:09 +0000)
author Sanjay Patel <spatel@rotateright.com>
Fri, 1 Sep 2017 21:09:04 +0000 (21:09 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Fri, 1 Sep 2017 21:09:04 +0000 (21:09 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index c7b383a..e0c17a2 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -29053,6 +29053,40 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
    return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
  }
  
+/// Return the horizontal op behind a no-op shuffle \p N, or an empty SDValue.
+static SDValue foldShuffleOfHorizOp(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+    return SDValue();
+
+  SDValue HOp = N->getOperand(0);
+  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
+      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
+    return SDValue();
+
+  // 128-bit horizontal math instructions are defined to operate on adjacent
+  // elements of each operand as:
+  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
+  // ...similarly for v2f64 and v8i16.
+  // TODO: 256-bit ops do not follow this element layout, so they are skipped.
+  if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+    return SDValue();
+
+  // When the operands of a horizontal math op are identical, the low half of
+  // the result is the same as the high half. If the shuffle is equivalent to
+  // copying the low half into both halves, we don't need the shuffle:
+  // shuffle (hadd X, X), undef, [low half, low half] --> hadd X, X
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
+  // but this should be tied to whatever horizontal op matching and shuffle
+  // canonicalization are producing.
+  if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+    return HOp;
+
+  return SDValue();
+}
+
  static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget &Subtarget) {
@@ -29061,10 +29095,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    // If we have legalized the vector types, look for blends of FADD and FSUB
    // nodes that we can fuse into an ADDSUB node.
-  if (TLI.isTypeLegal(VT))
+  if (TLI.isTypeLegal(VT)) {
      if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
        return AddSub;
  
+    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+      return HAddSub;
+  }
+
    // During Type Legalization, when promoting illegal vector types,
    // the backend might introduce new shuffle dag nodes and bitcasts.
    //
diff --git a/test/CodeGen/X86/haddsub-shuf.ll b/test/CodeGen/X86/haddsub-shuf.ll

index c22453b..37597c4 100644 (file)
--- a/test/CodeGen/X86/haddsub-shuf.ll
+++ b/test/CodeGen/X86/haddsub-shuf.ll
@@ -9,13 +9,11 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) {
  ; SSSE3-LABEL: hadd_v4f32:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hadd_v4f32:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
  ; AVX-NEXT:    retq
    %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
    %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -28,13 +26,11 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) {
  ; SSSE3-LABEL: hsub_v4f32:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hsub_v4f32:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
  ; AVX-NEXT:    retq
    %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
    %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -47,13 +43,11 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
  ; SSSE3-LABEL: hadd_v2f64:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hadd_v2f64:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
  ; AVX-NEXT:    retq
    %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
    %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -66,13 +60,11 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
  ; SSSE3-LABEL: hsub_v2f64:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hsub_v2f64:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
  ; AVX-NEXT:    retq
    %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
    %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -85,13 +77,11 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
  ; SSSE3-LABEL: hadd_v4i32:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hadd_v4i32:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; AVX-NEXT:    retq
    %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -104,13 +94,11 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
  ; SSSE3-LABEL: hsub_v4i32:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hsub_v4i32:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; AVX-NEXT:    retq
    %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
    %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -123,13 +111,11 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
  ; SSSE3-LABEL: hadd_v8i16:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hadd_v8i16:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; AVX-NEXT:    retq
    %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
    %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -142,13 +128,11 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
  ; SSSE3-LABEL: hsub_v8i16:
  ; SSSE3:       # BB#0:
  ; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; SSSE3-NEXT:    retq
  ;
  ; AVX-LABEL: hsub_v8i16:
  ; AVX:       # BB#0:
  ; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
  ; AVX-NEXT:    retq
    %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
    %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
author	Sanjay Patel <spatel@rotateright.com>
	Fri, 1 Sep 2017 21:09:04 +0000 (21:09 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Fri, 1 Sep 2017 21:09:04 +0000 (21:09 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/haddsub-shuf.ll		patch \| blob \| history