BBVectorize: Choose pair ordering to minimize shuffles

author Hal Finkel <hfinkel@anl.gov>
Wed, 31 Oct 2012 15:17:07 +0000 (15:17 +0000)
committer Hal Finkel <hfinkel@anl.gov>
Wed, 31 Oct 2012 15:17:07 +0000 (15:17 +0000)
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp

index 051606b..40277dc 100644 (file)
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -166,6 +166,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check",
    cl::init(false), cl::Hidden,
    cl::desc("When debugging is enabled, output information on the"
             " cycle-checking process"));
+
+static cl::opt<bool>
+PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, dump the basic block after"
+           " every pair is fused"));
  #endif
  
  STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
@@ -196,6 +202,7 @@ namespace {
      typedef std::pair<ValuePair, int> ValuePairWithCost;
      typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
      typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+    typedef std::pair<VPPair, unsigned> VPPairWithType;
      typedef std::pair<std::multimap<Value *, Value *>::iterator,
                std::multimap<Value *, Value *>::iterator> VPIteratorPair;
      typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
@@ -220,9 +227,16 @@ namespace {
                         DenseMap<ValuePair, int> &CandidatePairCostSavings,
                         std::vector<Value *> &PairableInsts, bool NonPow2Len);
  
+    enum PairConnectionType {
+      PairConnectionDirect,
+      PairConnectionSwap,
+      PairConnectionSplat
+    };
+
      void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
                         std::vector<Value *> &PairableInsts,
-                       std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+                       std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                       DenseMap<VPPair, unsigned> &PairConnectionTypes);
  
      void buildDepMap(BasicBlock &BB,
                         std::multimap<Value *, Value *> &CandidatePairs,
@@ -239,7 +253,11 @@ namespace {
      void fuseChosenPairs(BasicBlock &BB,
                       std::vector<Value *> &PairableInsts,
                       DenseMap<Value *, Value *>& ChosenPairs,
-                     DenseSet<ValuePair> &FixedOrderPairs);
+                     DenseSet<ValuePair> &FixedOrderPairs,
+                     DenseMap<VPPair, unsigned> &PairConnectionTypes,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairDeps);
+
  
      bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
  
@@ -256,6 +274,7 @@ namespace {
                        std::multimap<Value *, Value *> &CandidatePairs,
                        std::vector<Value *> &PairableInsts,
                        std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseMap<VPPair, unsigned> &PairConnectionTypes,
                        ValuePair P);
  
      bool pairsConflict(ValuePair P, ValuePair Q,
@@ -310,14 +329,15 @@ namespace {
  
      bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
                         unsigned o, Value *&LOp, unsigned numElemL,
-                       Type *ArgTypeL, Type *ArgTypeR,
+                       Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
                         unsigned IdxOff = 0);
  
      Value *getReplacementInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o);
+                     Instruction *J, unsigned o, bool IBeforeJ);
  
      void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands);
+                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+                     bool IBeforeJ);
  
      void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
                       Instruction *J, Instruction *K,
@@ -647,6 +667,8 @@ namespace {
      std::vector<Value *> AllPairableInsts;
      DenseMap<Value *, Value *> AllChosenPairs;
      DenseSet<ValuePair> AllFixedOrderPairs;
+    DenseMap<VPPair, unsigned> AllPairConnectionTypes;
+    std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps;
  
      do {
        std::vector<Value *> PairableInsts;
@@ -668,10 +690,18 @@ namespace {
        // Note that it only matters that both members of the second pair use some
        // element of the first pair (to allow for splatting).
  
-      std::multimap<ValuePair, ValuePair> ConnectedPairs;
-      computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+      std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps;
+      DenseMap<VPPair, unsigned> PairConnectionTypes;
+      computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs,
+                            PairConnectionTypes);
        if (ConnectedPairs.empty()) continue;
  
+      for (std::multimap<ValuePair, ValuePair>::iterator
+           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+           I != IE; ++I) {
+        ConnectedPairDeps.insert(VPPair(I->second, I->first));
+      }
+
        // Build the pairable-instruction dependency map
        DenseSet<ValuePair> PairableInstUsers;
        buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
@@ -692,12 +722,37 @@ namespace {
                                PairableInsts.end());
        AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
  
+      // Only for the chosen pairs, propagate information on fixed-order pairs,
+      // pair connections, and their types to the data structures used by the
+      // pair fusion procedures.
        for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
             IE = ChosenPairs.end(); I != IE; ++I) {
          if (FixedOrderPairs.count(*I))
            AllFixedOrderPairs.insert(*I);
          else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
            AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
+
+        for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
+             J != IE; ++J) {
+          DenseMap<VPPair, unsigned>::iterator K =
+            PairConnectionTypes.find(VPPair(*I, *J));
+          if (K != PairConnectionTypes.end()) {
+            AllPairConnectionTypes.insert(*K);
+          } else {
+            K = PairConnectionTypes.find(VPPair(*J, *I));
+            if (K != PairConnectionTypes.end())
+              AllPairConnectionTypes.insert(*K);
+          }
+        }
+      }
+
+      for (std::multimap<ValuePair, ValuePair>::iterator
+           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+           I != IE; ++I) {
+        if (AllPairConnectionTypes.count(*I)) {
+          AllConnectedPairs.insert(*I);
+          AllConnectedPairDeps.insert(VPPair(I->second, I->first));
+        }
        }
      } while (ShouldContinue);
  
@@ -711,7 +766,9 @@ namespace {
      // replaced with a vector_extract on the result.  Subsequent optimization
      // passes should coalesce the build/extract combinations.
  
-    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs);
+    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
+                    AllPairConnectionTypes,
+                    AllConnectedPairs, AllConnectedPairDeps);
  
      // It is important to cleanup here so that future iterations of this
      // function have less work to do.
@@ -1098,6 +1155,7 @@ namespace {
                        std::multimap<Value *, Value *> &CandidatePairs,
                        std::vector<Value *> &PairableInsts,
                        std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseMap<VPPair, unsigned> &PairConnectionTypes,
                        ValuePair P) {
      StoreInst *SI, *SJ;
  
@@ -1129,12 +1187,18 @@ namespace {
          VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
  
          // Look for <I, J>:
-        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
+        }
  
          // Look for <J, I>:
-        if (isSecondInIteratorPair<Value*>(*I, JPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+        if (isSecondInIteratorPair<Value*>(*I, JPairRange)) {
+          VPPair VP(P, ValuePair(*J, *I));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
+        }
        }
  
        if (Config.SplatBreaksChain) continue;
@@ -1145,8 +1209,11 @@ namespace {
              P.first == SJ->getPointerOperand())
            continue;
  
-        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+        }
        }
      }
  
@@ -1168,8 +1235,11 @@ namespace {
              P.second == SJ->getPointerOperand())
            continue;
  
-        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+        }
        }
      }
    }
@@ -1180,7 +1250,8 @@ namespace {
    void BBVectorize::computeConnectedPairs(
                        std::multimap<Value *, Value *> &CandidatePairs,
                        std::vector<Value *> &PairableInsts,
-                      std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseMap<VPPair, unsigned> &PairConnectionTypes) {
  
      for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
           PE = PairableInsts.end(); PI != PE; ++PI) {
@@ -1189,7 +1260,7 @@ namespace {
        for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
             P != choiceRange.second; ++P)
          computePairsConnectedTo(CandidatePairs, PairableInsts,
-                                ConnectedPairs, *P);
+                                ConnectedPairs, PairConnectionTypes, *P);
      }
  
      DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
@@ -1776,7 +1847,7 @@ namespace {
                                    Instruction *J, unsigned o, Value *&LOp,
                                    unsigned numElemL,
                                    Type *ArgTypeL, Type *ArgTypeH,
-                                  unsigned IdxOff) {
+                                  bool IBeforeJ, unsigned IdxOff) {
      bool ExpandedIEChain = false;
      if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
        // If we have a pure insertelement chain, then this can be rewritten
@@ -1810,8 +1881,9 @@ namespace {
            LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
                               ConstantInt::get(Type::getInt32Ty(Context),
                                                i + IdxOff),
-                             getReplacementName(I, true, o, i+1));
-          LIENext->insertBefore(J);
+                             getReplacementName(IBeforeJ ? I : J,
+                                                true, o, i+1));
+          LIENext->insertBefore(IBeforeJ ? J : I);
            LIEPrev = LIENext;
          }
  
@@ -1826,7 +1898,7 @@ namespace {
    // Returns the value to be used as the specified operand of the vector
    // instruction that fuses I with J.
    Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o) {
+                     Instruction *J, unsigned o, bool IBeforeJ) {
      Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
      Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
  
@@ -1989,8 +2061,9 @@ namespace {
            Instruction *S =
              new ShuffleVectorInst(I1, UndefValue::get(I1T),
                                    ConstantVector::get(Mask),
-                                  getReplacementName(I, true, o));
-          S->insertBefore(J);
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o));
+          S->insertBefore(IBeforeJ ? J : I);
            return S;
          }
  
@@ -2011,8 +2084,9 @@ namespace {
            Instruction *NewI1 =
              new ShuffleVectorInst(I1, UndefValue::get(I1T),
                                    ConstantVector::get(Mask),
-                                  getReplacementName(I, true, o, 1));
-          NewI1->insertBefore(J);
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o, 1));
+          NewI1->insertBefore(IBeforeJ ? J : I);
            I1 = NewI1;
            I1T = I2T;
            I1Elem = I2Elem;
@@ -2027,8 +2101,9 @@ namespace {
            Instruction *NewI2 =
              new ShuffleVectorInst(I2, UndefValue::get(I2T),
                                    ConstantVector::get(Mask),
-                                  getReplacementName(I, true, o, 1));
-          NewI2->insertBefore(J);
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o, 1));
+          NewI2->insertBefore(IBeforeJ ? J : I);
            I2 = NewI2;
            I2T = I1T;
            I2Elem = I1Elem;
@@ -2048,8 +2123,8 @@ namespace {
  
          Instruction *NewOp =
            new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
-                                getReplacementName(I, true, o));
-        NewOp->insertBefore(J);
+                                getReplacementName(IBeforeJ ? I : J, true, o));
+        NewOp->insertBefore(IBeforeJ ? J : I);
          return NewOp;
        }
      }
@@ -2057,17 +2132,17 @@ namespace {
      Type *ArgType = ArgTypeL;
      if (numElemL < numElemH) {
        if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
-                                         ArgTypeL, VArgType, 1)) {
+                                         ArgTypeL, VArgType, IBeforeJ, 1)) {
          // This is another short-circuit case: we're combining a scalar into
          // a vector that is formed by an IE chain. We've just expanded the IE
          // chain, now insert the scalar and we're done.
  
          Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
-                                               getReplacementName(I, true, o));
-        S->insertBefore(J);
+                           getReplacementName(IBeforeJ ? I : J, true, o));
+        S->insertBefore(IBeforeJ ? J : I);
          return S;
        } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
-                                ArgTypeH)) {
+                                ArgTypeH, IBeforeJ)) {
          // The two vector inputs to the shuffle must be the same length,
          // so extend the smaller vector to be the same length as the larger one.
          Instruction *NLOp;
@@ -2082,29 +2157,32 @@ namespace {
      
            NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
                                         ConstantVector::get(Mask),
-                                       getReplacementName(I, true, o, 1));
+                                       getReplacementName(IBeforeJ ? I : J,
+                                                          true, o, 1));
          } else {
            NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
-                                           getReplacementName(I, true, o, 1));
+                                           getReplacementName(IBeforeJ ? I : J,
+                                                              true, o, 1));
          }
    
-        NLOp->insertBefore(J);
+        NLOp->insertBefore(IBeforeJ ? J : I);
          LOp = NLOp;
        }
  
        ArgType = ArgTypeH;
      } else if (numElemL > numElemH) {
        if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
-                                         ArgTypeH, VArgType)) {
+                                         ArgTypeH, VArgType, IBeforeJ)) {
          Instruction *S =
            InsertElementInst::Create(LOp, HOp, 
                                      ConstantInt::get(Type::getInt32Ty(Context),
                                                       numElemL),
-                                    getReplacementName(I, true, o));
-        S->insertBefore(J);
+                                    getReplacementName(IBeforeJ ? I : J,
+                                                       true, o));
+        S->insertBefore(IBeforeJ ? J : I);
          return S;
        } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
-                                ArgTypeL)) {
+                                ArgTypeL, IBeforeJ)) {
          Instruction *NHOp;
          if (numElemH > 1) {
            std::vector<Constant *> Mask(numElemL);
@@ -2116,13 +2194,15 @@ namespace {
      
            NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
                                         ConstantVector::get(Mask),
-                                       getReplacementName(I, true, o, 1));
+                                       getReplacementName(IBeforeJ ? I : J,
+                                                          true, o, 1));
          } else {
            NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
-                                           getReplacementName(I, true, o, 1));
+                                           getReplacementName(IBeforeJ ? I : J,
+                                                              true, o, 1));
          }
    
-        NHOp->insertBefore(J);
+        NHOp->insertBefore(IBeforeJ ? J : I);
          HOp = NHOp;
        }
      }
@@ -2140,19 +2220,21 @@ namespace {
        }
  
        Instruction *BV = new ShuffleVectorInst(LOp, HOp,
-                                              ConstantVector::get(Mask),
-                                              getReplacementName(I, true, o));
-      BV->insertBefore(J);
+                          ConstantVector::get(Mask),
+                          getReplacementName(IBeforeJ ? I : J, true, o));
+      BV->insertBefore(IBeforeJ ? J : I);
        return BV;
      }
  
      Instruction *BV1 = InsertElementInst::Create(
                                            UndefValue::get(VArgType), LOp, CV0,
-                                          getReplacementName(I, true, o, 1));
-    BV1->insertBefore(I);
+                                          getReplacementName(IBeforeJ ? I : J,
+                                                             true, o, 1));
+    BV1->insertBefore(IBeforeJ ? J : I);
      Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
-                                          getReplacementName(I, true, o, 2));
-    BV2->insertBefore(J);
+                                          getReplacementName(IBeforeJ ? I : J,
+                                                             true, o, 2));
+    BV2->insertBefore(IBeforeJ ? J : I);
      return BV2;
    }
  
@@ -2160,7 +2242,8 @@ namespace {
    // to the vector instruction that fuses I with J.
    void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
                       Instruction *I, Instruction *J,
-                     SmallVector<Value *, 3> &ReplacedOperands) {
+                     SmallVector<Value *, 3> &ReplacedOperands,
+                     bool IBeforeJ) {
      unsigned NumOperands = I->getNumOperands();
  
      for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
@@ -2197,7 +2280,7 @@ namespace {
          continue;
        }
  
-      ReplacedOperands[o] = getReplacementInput(Context, I, J, o);
+      ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
      }
    }
  
@@ -2392,18 +2475,20 @@ namespace {
    void BBVectorize::fuseChosenPairs(BasicBlock &BB,
                       std::vector<Value *> &PairableInsts,
                       DenseMap<Value *, Value *> &ChosenPairs,
-                     DenseSet<ValuePair> &FixedOrderPairs) {
+                     DenseSet<ValuePair> &FixedOrderPairs,
+                     DenseMap<VPPair, unsigned> &PairConnectionTypes,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) {
      LLVMContext& Context = BB.getContext();
  
      // During the vectorization process, the order of the pairs to be fused
      // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
      // list. After a pair is fused, the flipped pair is removed from the list.
-    std::vector<ValuePair> FlippedPairs;
-    FlippedPairs.reserve(ChosenPairs.size());
+    DenseSet<ValuePair> FlippedPairs;
      for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
           E = ChosenPairs.end(); P != E; ++P)
-      FlippedPairs.push_back(ValuePair(P->second, P->first));
-    for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+      FlippedPairs.insert(ValuePair(P->second, P->first));
+    for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
           E = FlippedPairs.end(); P != E; ++P)
        ChosenPairs.insert(*P);
  
@@ -2451,37 +2536,83 @@ namespace {
  
        // If the pair must have the other order, then flip it.
        bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
+      if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
+        // This pair does not have a fixed order, and so we might want to
+        // flip it if that will yield fewer shuffles. We count the number
+        // of dependencies connected via swaps, and those directly connected,
+        // and flip the order if the number of swaps is greater.
+        bool OrigOrder = true;
+        VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J));
+        if (IP.first == ConnectedPairDeps.end()) {
+          IP = ConnectedPairDeps.equal_range(ValuePair(J, I));
+          OrigOrder = false;
+        }
+
+        if (IP.first != ConnectedPairDeps.end()) {
+          unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+          for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+               Q != IP.second; ++Q) {
+            DenseMap<VPPair, unsigned>::iterator R =
+              PairConnectionTypes.find(VPPair(Q->second, Q->first));
+            assert(R != PairConnectionTypes.end() &&
+                   "Cannot find pair connection type");
+            if (R->second == PairConnectionDirect)
+              ++NumDepsDirect;
+            else if (R->second == PairConnectionSwap)
+              ++NumDepsSwap;
+          }
+
+          if (!OrigOrder)
+            std::swap(NumDepsDirect, NumDepsSwap);
+
+          if (NumDepsSwap > NumDepsDirect) {
+            FlipPairOrder = true;
+            DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
+                            " <-> " << *J << "\n");
+          }
+        }
+      }
  
        Instruction *L = I, *H = J;
        if (FlipPairOrder)
          std::swap(H, L);
  
+      // If the pair being fused uses the opposite order from that in the pair
+      // connection map, then we need to flip the types.
+      VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L));
+      for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+           Q != IP.second; ++Q) {
+        DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q);
+        assert(R != PairConnectionTypes.end() &&
+               "Cannot find pair connection type");
+        if (R->second == PairConnectionDirect)
+          R->second = PairConnectionSwap;
+        else if (R->second == PairConnectionSwap)
+          R->second = PairConnectionDirect;
+      }
+
+      bool LBeforeH = !FlipPairOrder;
        unsigned NumOperands = I->getNumOperands();
        SmallVector<Value *, 3> ReplacedOperands(NumOperands);
-      getReplacementInputsForPair(Context, L, H, ReplacedOperands);
+      getReplacementInputsForPair(Context, L, H, ReplacedOperands,
+                                  LBeforeH);
  
        // Make a copy of the original operation, change its type to the vector
        // type and replace its operands with the vector operands.
-      Instruction *K = I->clone();
-      if (I->hasName()) K->takeName(I);
+      Instruction *K = L->clone();
+      if (L->hasName())
+        K->takeName(L);
+      else if (H->hasName())
+        K->takeName(H);
  
        if (!isa<StoreInst>(K))
          K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
  
-      combineMetadata(K, J);
+      combineMetadata(K, H);
  
        for (unsigned o = 0; o < NumOperands; ++o)
          K->setOperand(o, ReplacedOperands[o]);
  
-      // If we've flipped the memory inputs, make sure that we take the correct
-      // alignment.
-      if (FlipPairOrder) {
-        if (isa<StoreInst>(K))
-          cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
-        else if (isa<LoadInst>(K))
-          cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
-      }
-
        K->insertAfter(J);
  
        // Instruction insertion point:
@@ -2497,10 +2628,10 @@ namespace {
        moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
  
        if (!isa<StoreInst>(I)) {
-        I->replaceAllUsesWith(K1);
-        J->replaceAllUsesWith(K2);
-        AA->replaceWithNewValue(I, K1);
-        AA->replaceWithNewValue(J, K2);
+        L->replaceAllUsesWith(K1);
+        H->replaceAllUsesWith(K2);
+        AA->replaceWithNewValue(L, K1);
+        AA->replaceWithNewValue(H, K2);
        }
  
        // Instructions that may read from memory may be in the load move set.
@@ -2533,6 +2664,9 @@ namespace {
        SE->forgetValue(J);
        I->eraseFromParent();
        J->eraseFromParent();
+
+      DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
+                                               BB << "\n");
      }
  
      DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll

index 9d5d9fb..c1be622 100644 (file)
--- a/test/Transforms/BBVectorize/X86/loop1.ll
+++ b/test/Transforms/BBVectorize/X86/loop1.ll
@@ -42,8 +42,8 @@ for.body:                                         ; preds = %for.body, %entry
  ; CHECK: %mul = fmul double %0, %0
  ; CHECK: %mul3 = fmul double %0, %1
  ; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
  ; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
  ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
  ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
  ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll

index 6450f82..d11c9b9 100644 (file)
--- a/test/Transforms/BBVectorize/X86/simple.ll
+++ b/test/Transforms/BBVectorize/X86/simple.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  define double @test1(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: @test1
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll

index 32a91ce..e8e82ce 100644 (file)
--- a/test/Transforms/BBVectorize/cycle.ll
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -107,6 +107,6 @@ done:
    ret void
  ; CHECK: @test1
  ; CHECK: go:
-; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
  ; FIXME: When tree pruning is deterministic, include the entire output.
  }
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll

index bebc91a..c22ea58 100644 (file)
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -42,8 +42,8 @@ for.body:                                         ; preds = %for.body, %entry
  ; CHECK: %mul = fmul double %0, %0
  ; CHECK: %mul3 = fmul double %0, %1
  ; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
  ; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
  ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
  ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
  ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll

index d9945b5..aeaf988 100644 (file)
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -7,8 +7,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK-SL4: @test1
  ; CHECK-SL4-NOT: <2 x double>
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll

index 6844977..ae1d63b 100644 (file)
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -17,8 +17,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1,
         ret double %R
  ; CHECK: @test1
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
  ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
  ; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
@@ -43,8 +43,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
         ret double %R
  ; CHECK: @test2
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
  ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
  ; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
@@ -68,8 +68,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
         ret double %R
  ; CHECK: @test3
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
  ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
  ; CHECK: %Y1 = call <2 x double> @llvm.powi.v2f64(<2 x double> %X1, i32 %P)
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll

index 6883e84..7dd77c9 100644 (file)
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -94,13 +94,13 @@ entry:
  ; CHECK-AO: @test3
  ; CHECK-AO: %i0 = load double* %a, align 8
  ; CHECK-AO: %i1 = load double* %b, align 8
-; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
-; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
  ; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
  ; CHECK-AO: %i3 = load double* %arrayidx3, align 8
  ; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
  ; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
  ; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
  ; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
  ; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
  ; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll

index 325792a..15ecb59 100644 (file)
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
  ; CHECK: @test1
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
@@ -33,8 +33,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: @test2
  ; CHECK-NB: @test2
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll

index 7cd8133..d9a12ee 100644 (file)
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  define double @test1(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: @test1
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
@@ -29,8 +29,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
  define double @test2(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: @test2
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
@@ -40,12 +40,13 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
         %Z1 = fadd double %Y2, %B1
         %Z2 = fadd double %Y1, %B2
-; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+; CHECK: %Z1.v.i1.1 = insertelement <2 x double> undef, double %B2, i32 0
+; CHECK: %Z1.v.i1.2 = insertelement <2 x double> %Z1.v.i1.1, double %B1, i32 1
+; CHECK: %Z2 = fadd <2 x double> %Y1, %Z1.v.i1.2
         %R  = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: %Z2.v.r1 = extractelement <2 x double> %Z2, i32 0
+; CHECK: %Z2.v.r2 = extractelement <2 x double> %Z2, i32 1
+; CHECK: %R = fmul double %Z2.v.r2, %Z2.v.r1
         ret double %R
  ; CHECK: ret double %R
  }
@@ -54,8 +55,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
  define double @test3(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: @test3
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
@@ -79,8 +80,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2) {
  define double @test4(double %A1, double %A2, double %B1, double %B2) {
  ; CHECK: @test4
  ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
  ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
         %X1 = fsub double %A1, %B1
         %X2 = fsub double %A2, %B2
@@ -148,3 +149,27 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
  ; CHECK: ret <8 x i8> %R
  }
  
+; Basic depth-3 chain (flipped order)
+define double @test7(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test7
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+       %X1 = fsub double %A1, %B1
+       %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+       %Y1 = fmul double %X1, %A1
+       %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+       %Z2 = fadd double %Y2, %B2
+       %Z1 = fadd double %Y1, %B1
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+       %R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+       ret double %R
+; CHECK: ret double %R
+}
+
author	Hal Finkel <hfinkel@anl.gov>
	Wed, 31 Oct 2012 15:17:07 +0000 (15:17 +0000)
committer	Hal Finkel <hfinkel@anl.gov>
	Wed, 31 Oct 2012 15:17:07 +0000 (15:17 +0000)
lib/Transforms/Vectorize/BBVectorize.cpp		patch \| blob \| history
test/Transforms/BBVectorize/X86/loop1.ll		patch \| blob \| history
test/Transforms/BBVectorize/X86/simple.ll		patch \| blob \| history
test/Transforms/BBVectorize/cycle.ll		patch \| blob \| history
test/Transforms/BBVectorize/loop1.ll		patch \| blob \| history
test/Transforms/BBVectorize/search-limit.ll		patch \| blob \| history
test/Transforms/BBVectorize/simple-int.ll		patch \| blob \| history
test/Transforms/BBVectorize/simple-ldstr.ll		patch \| blob \| history
test/Transforms/BBVectorize/simple-sel.ll		patch \| blob \| history
test/Transforms/BBVectorize/simple.ll		patch \| blob \| history