[x86] Teach the new vector shuffle lowering to re-use the SHUFPS

author Chandler Carruth <chandlerc@gmail.com>

Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 14613d3..2a020b5 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7800,7 +7800,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
        // To make this work, blend them together as the first step.
        int V1Index = V2AdjIndex;
        int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
-      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                         getV4X86ShuffleImm8ForMask(BlendMask, DAG));
  
        // Now proceed to reconstruct the final blend as we have the necessary
@@ -7831,7 +7831,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
                            Mask[2] < 4 ? Mask[2] : Mask[3],
                            (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                            (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
-      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                         getV4X86ShuffleImm8ForMask(BlendMask, DAG));
  
        // Now we do a normal shuffle of V1 by giving V1 as both operands to
@@ -7843,7 +7843,7 @@ static SDValue lowerVectorShuffleWithSHUPFS(SDLoc DL, MVT VT,
        NewMask[3] = Mask[2] < 4 ? 3 : 1;
      }
    }
-  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                       getV4X86ShuffleImm8ForMask(NewMask, DAG));
  }
  
@@ -9385,6 +9385,14 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
      if (isShuffleEquivalent(LoMask, 2, 10, 3, 11))
        return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+
+    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+    // have already handled any direct blends.
+    int SHUFPSMask[] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+    for (int &M : SHUFPSMask)
+      if (M >= 8)
+        M -= 4;
+    return lowerVectorShuffleWithSHUPFS(DL, MVT::v8f32, SHUFPSMask, V1, V2, DAG);
    }
  
    if (isSingleInputShuffleMask(Mask))
@@ -9531,7 +9539,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
      return DAG.getCommutedVectorShuffle(*SVOp);
  
    // When the number of V1 and V2 elements are the same, try to minimize the
-  // number of uses of V2 in the low half of the vector.
+  // number of uses of V2 in the low half of the vector. When that is tied,
+  // ensure that the sum of indices for V1 is equal to or lower than the sum
+  // indices for V2.
    if (NumV1Elements == NumV2Elements) {
      int LowV1Elements = 0, LowV2Elements = 0;
      for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -9541,6 +9551,15 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
          ++LowV1Elements;
      if (LowV2Elements > LowV1Elements)
        return DAG.getCommutedVectorShuffle(*SVOp);
+
+    int SumV1Indices = 0, SumV2Indices = 0;
+    for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+      if (SVOp->getMask()[i] >= NumElements)
+        SumV2Indices += i;
+      else if (SVOp->getMask()[i] >= 0)
+        SumV1Indices += i;
+    if (SumV2Indices < SumV1Indices)
+      return DAG.getCommutedVectorShuffle(*SVOp);
    }
  
    // For each vector width, delegate to a specialized lowering routine.
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll

index 2d4f157..311fe7e 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -153,9 +153,8 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
  define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
  ; ALL-LABEL: @shuffle_v8f32_08084c4c
  ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm1[0,0,2,0,4,4,6,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,0,3,4,5,4,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; ALL-NEXT:    vshufps {{.*}} # ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT:    vshufps {{.*}} # ymm0 = ymm0[0,2,1,3,4,6,5,7]
  ; ALL-NEXT:    retq
    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
    ret <8 x float> %shuffle
@@ -164,8 +163,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
  define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
  ; ALL-LABEL: @shuffle_v8f32_8823cc67
  ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vshufps {{.*}} # ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
  ; ALL-NEXT:    retq
    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
    ret <8 x float> %shuffle
@@ -174,9 +172,7 @@ define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
  define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
  ; ALL-LABEL: @shuffle_v8f32_9832dc76
  ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,3,2,4,5,7,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
  ; ALL-NEXT:    retq
    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
    ret <8 x float> %shuffle
@@ -185,9 +181,7 @@ define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
  define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
  ; ALL-LABEL: @shuffle_v8f32_9810dc54
  ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,1,0,4,5,5,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vshufps {{.*}} # ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
  ; ALL-NEXT:    retq
    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
    ret <8 x float> %shuffle
author	Chandler Carruth <chandlerc@gmail.com>
	Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Sun, 21 Sep 2014 13:35:14 +0000 (13:35 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v8.ll		patch \| blob \| history