From 93d995719e2459a6e9ccdb2c93a8ede8fa88c899 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel.sanders@imgtec.com>
Date: Tue, 24 Sep 2013 14:20:00 +0000
Subject: [PATCH] [mips][msa] Added support for matching shf from normal IR
 (i.e. not intrinsics)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@191302 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp |  5 ++
 lib/Target/Mips/InstPrinter/MipsInstPrinter.h   |  1 +
 lib/Target/Mips/MipsISelLowering.cpp            |  1 +
 lib/Target/Mips/MipsISelLowering.h              |  1 +
 lib/Target/Mips/MipsMSAInstrInfo.td             | 19 ++++++-
 lib/Target/Mips/MipsSEISelLowering.cpp          | 74 +++++++++++++++++++++++++
 test/CodeGen/Mips/msa/shuffle.ll                | 59 +++++++++++++++++---
 7 files changed, 149 insertions(+), 11 deletions(-)

diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 26321c99243..2612e3ba1ce 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -211,6 +211,11 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
   O << MipsFCCToString((Mips::CondCode)MO.getImm());
 }
 
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+  llvm_unreachable("TODO");
+}
+
 bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
                                  unsigned OpNo, raw_ostream &OS) {
   OS << "\t" << Str << "\t";
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 1253ab0c48a..11590ad8241 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -96,6 +96,7 @@ private:
   void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
 
   bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
                   raw_ostream &OS);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 04f90833d82..90d2cc6f290 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -225,6 +225,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
   case MipsISD::VNOR:              return "MipsISD::VNOR";
   case MipsISD::VSHF:              return "MipsISD::VSHF";
+  case MipsISD::SHF:               return "MipsISD::SHF";
   default:                         return NULL;
   }
 }
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 0cb67b8d440..9e9e9959183 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -174,6 +174,7 @@ namespace llvm {
 
       // Vector Shuffle with mask as an operand
       VSHF,  // Generic shuffle
+      SHF,   // 4-element set shuffle.
 
       // Combined (XOR (OR $a, $b), -1)
       VNOR,
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index 4909743e349..5592ac5690b 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -23,6 +23,8 @@ def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
 def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>,
                                     SDTCisInt<1>, SDTCisVec<1>,
                                     SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
 
 def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
 def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
@@ -39,6 +41,7 @@ def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
 def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
                       [SDNPCommutative, SDNPAssociative]>;
 def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
 
 def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
 def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
@@ -1074,6 +1077,16 @@ class MSA_I8_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
   InstrItinClass Itinerary = itin;
 }
 
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterClass RCWD,
+                           RegisterClass RCWS = RCWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs RCWD:$wd);
+  dag InOperandList = (ins RCWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set RCWD:$wd, (MipsSHF immZExt8:$u8, RCWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
 class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterClass RCWD,
                             InstrItinClass itin = NoItinerary> {
   dag OutOperandList = (outs RCWD:$wd);
@@ -2066,9 +2079,9 @@ class SAT_U_H_DESC : MSA_BIT_H_DESC_BASE<"sat_u.h", int_mips_sat_u_h, MSA128H>;
 class SAT_U_W_DESC : MSA_BIT_W_DESC_BASE<"sat_u.w", int_mips_sat_u_w, MSA128W>;
 class SAT_U_D_DESC : MSA_BIT_D_DESC_BASE<"sat_u.d", int_mips_sat_u_d, MSA128D>;
 
-class SHF_B_DESC : MSA_I8_X_DESC_BASE<"shf.b", int_mips_shf_b, MSA128B>;
-class SHF_H_DESC : MSA_I8_X_DESC_BASE<"shf.h", int_mips_shf_h, MSA128H>;
-class SHF_W_DESC : MSA_I8_X_DESC_BASE<"shf.w", int_mips_shf_w, MSA128W>;
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128B>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128H>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128W>;
 
 class SLD_B_DESC : MSA_3R_DESC_BASE<"sld.b", int_mips_sld_b, MSA128B>;
 class SLD_H_DESC : MSA_3R_DESC_BASE<"sld.h", int_mips_sld_h, MSA128H>;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 929e91eb60e..b79e5321b69 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1428,6 +1428,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::mips_pcnt_w:
   case Intrinsic::mips_pcnt_d:
     return lowerMSAUnaryIntr(Op, DAG, ISD::CTPOP);
+  case Intrinsic::mips_shf_b:
+  case Intrinsic::mips_shf_h:
+  case Intrinsic::mips_shf_w:
+    return DAG.getNode(MipsISD::SHF, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(2), Op->getOperand(1));
   case Intrinsic::mips_sll_b:
   case Intrinsic::mips_sll_h:
   case Intrinsic::mips_sll_w:
@@ -1735,6 +1740,72 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
   return SDValue();
 }
 
+// Lower VECTOR_SHUFFLE into SHF (if possible).
+//
+// SHF splits the vector into blocks of four elements, then shuffles these
+// elements according to a <4 x i2> constant (encoded as an integer immediate).
+//
+// It is therefore possible to lower into SHF when the mask takes the form:
+//   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+// When undef's appear they are treated as if they were whatever value is
+// necessary in order to fit the above form.
+//
+// For example:
+//   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+//                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+//                                 i32 7, i32 6, i32 5, i32 4>
+// is lowered to:
+//   (SHF_H $w0, $w1, 27)
+// where the 27 comes from:
+//   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+                                       SmallVector<int, 16> Indices,
+                                       SelectionDAG &DAG) {
+  int SHFIndices[4] = { -1, -1, -1, -1 };
+
+  if (Indices.size() < 4)
+    return SDValue();
+
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Indices.size(); j += 4) {
+      int Idx = Indices[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SHFIndices[i] == -1)
+        SHFIndices[i] = Idx;
+
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      if (!(Idx == -1 || Idx == SHFIndices[i]))
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(32, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SHFIndices[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  return DAG.getNode(MipsISD::SHF, SDLoc(Op), ResTy,
+                     DAG.getConstant(Imm, MVT::i32), Op->getOperand(0));
+}
+
 // Lower VECTOR_SHUFFLE into VSHF.
 //
 // This mostly consists of converting the shuffle indices in Indices into a
@@ -1802,6 +1873,9 @@ SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
   for (int i = 0; i < ResTyNumElts; ++i)
     Indices.push_back(Node->getMaskElt(i));
 
+  SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
   return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
 }
 
diff --git a/test/CodeGen/Mips/msa/shuffle.ll b/test/CodeGen/Mips/msa/shuffle.ll
index 35a5cf8658c..9854234c719 100644
--- a/test/CodeGen/Mips/msa/shuffle.ll
+++ b/test/CodeGen/Mips/msa/shuffle.ll
@@ -156,14 +156,16 @@ define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind
   ; CHECK: .size vshf_v8i16_4
 }
 
+; Note: v4i32 only has one 4-element set so it's impossible to get a vshf.w
+; instruction when using a single vector.
+
 define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
   ; CHECK: vshf_v4i32_0:
 
   %1 = load <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo
-  ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
@@ -177,8 +179,7 @@ define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind
   %1 = load <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
-  ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
@@ -193,8 +194,7 @@ define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind
   %2 = load <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 5, i32 6, i32 4>
-  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo
-  ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R2]], 36
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
@@ -225,8 +225,7 @@ define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind
   %1 = load <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 5, i32 5, i32 1>
-  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
-  ; CHECK-DAG: vshf.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)
 
@@ -311,3 +310,47 @@ define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind
   ret void
   ; CHECK: .size vshf_v2i64_4
 }
+
+define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: shf_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 0, i32 5, i32 7, i32 6, i32 4, i32 9, i32 11, i32 10, i32 8, i32 13, i32 15, i32 14, i32 12>
+  ; CHECK-DAG: shf.b [[R3:\$w[0-9]+]], [[R1]], 45
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size shf_v16i8_0
+}
+
+define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: shf_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ; CHECK-DAG: shf.h [[R3:\$w[0-9]+]], [[R1]], 27
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size shf_v8i16_0
+}
+
+define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: shf_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size shf_v4i32_0
+}
+
+; shf.d does not exist
-- 
2.11.0