From 3589550b3ef0f25f3383a63dd7071861c401de4c Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 29 Sep 2014 01:32:54 +0000
Subject: [PATCH] [x86] Refactor all of the VSELECT-as-blend lowering code to
 avoid domain crossing and generally work more like the blend emission code in
 the new vector shuffle lowering.

My goal is to have the new vector shuffle lowering just produce VSELECT
nodes that are either matched here to BLENDI or are legal and matched in
the .td files to specific blend instructions. That seems much cleaner as
there are other ways to produce a VSELECT anyways. =]

No *observable* functionality changed yet, mostly because this code
appears to be near-dead. The behavior of this lowering routine did
change though. This code being mostly dead and untestable will change
with my next commit which will also point some new tests at it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218588 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 75 +++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ed542560742..552d420b805 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11841,41 +11841,80 @@ static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
   SDValue Cond = Op.getOperand(0);
   SDValue LHS = Op.getOperand(1);
   SDValue RHS = Op.getOperand(2);
-  SDLoc dl(Op);
+  SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
 
   // There is no blend with immediate in AVX-512.
   if (VT.is512BitVector())
     return SDValue();
 
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+  // No blend instruction before SSE4.1.
+  if (!Subtarget->hasSSE41())
     return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+  // There is no byte-blend immediate controlled instruction.
+  if (EltVT == MVT::i8)
     return SDValue();
 
   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
-  // Check the mask for BLEND and build the value.
-  unsigned MaskValue = 0;
-  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
-    return SDValue();
+  auto *CondBV = cast<BuildVectorSDNode>(Cond);
 
-  // Convert i32 vectors to floating point if it is not AVX2.
-  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  unsigned BlendMask = 0;
   MVT BlendVT = VT;
-  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
-    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
-                               NumElems);
-    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
-    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  if (VT == MVT::v16i16) {
+    // v16i16 blends are completely special. We can only do them when we have
+    // a repeated blend across the two 128-bit halves and we have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+
+    for (int i = 0; i < 8; ++i) {
+      SDValue Lo = CondBV->getOperand(i);
+      SDValue Hi = CondBV->getOperand(i + 8);
+      bool IsLoZero = X86::isZeroNode(Lo);
+      bool IsHiZero = X86::isZeroNode(Hi);
+      if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
+          IsLoZero != IsHiZero)
+        // Asymmetric blends, bail.
+        return SDValue();
+      BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
+    }
+  } else {
+    // Everything else uses a generic blend mask computation with a custom type.
+    if (VT.isInteger()) {
+      if (VT.is256BitVector()) {
+        // The 256-bit integer blend instructions are only available on AVX2.
+        if (!Subtarget->hasAVX2())
+          return SDValue();
+
+        // We do the blend on v8i32 for 256-bit integer types.
+        BlendVT = MVT::v8i32;
+      } else {
+        // For 128-bit vectors we do the blend on v8i16 types.
+        BlendVT = MVT::v8i16;
+      }
+    }
+    assert(BlendVT.getVectorNumElements() <= 8 &&
+           "Cannot blend more than 8 elements with an immediate!");
+    // Scale the blend mask based on the number of elements in the selected
+    // blend type.
+    int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
+    for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
+      SDValue CondElement = CondBV->getOperand(i);
+      if (CondElement->getOpcode() != ISD::UNDEF &&
+          X86::isZeroNode(CondElement))
+        for (int j = 0; j < Scale; ++j)
+          BlendMask |= 1u << (i * Scale + j);
+    }
   }
 
-  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
-                            DAG.getConstant(MaskValue, MVT::i32));
-  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+  LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
+                                 DAG.getConstant(BlendMask, MVT::i8)));
 }
 
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
-- 
2.11.0