From faed772a259443843840b58d0853cef509ff0972 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 1 Dec 2017 16:17:24 +0000 Subject: [PATCH] Revert r319531 "[SLPVectorizer] Failure to beneficially vectorize 'copyable' elements in integer binary ops." It causes builds to fail with "Instruction does not dominate all uses" (PR35497). > Patch tries to improve vectorization of the following code: > > void add1(int * __restrict dst, const int * __restrict src) { > *dst++ = *src++; > *dst++ = *src++ + 1; > *dst++ = *src++ + 2; > *dst++ = *src++ + 3; > } > Allows to vectorize even if the very first operation is not a binary add, but just a load. > > Fixed issues related to previous commit. > > Reviewers: spatel, mzolotukhin, mkuper, hfinkel, RKSimon, filcab, ABataev > > Reviewed By: ABataev, RKSimon > > Subscribers: llvm-commits, RKSimon > > Differential Revision: https://reviews.llvm.org/D28907 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319550 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/SLPVectorizer.cpp | 486 ++++++--------------- test/Transforms/SLPVectorizer/SystemZ/pr34619.ll | 52 --- test/Transforms/SLPVectorizer/X86/load-dominate.ll | 27 -- .../SLPVectorizer/X86/vect_copyable_in_binops.ll | 200 ++++++--- 4 files changed, 275 insertions(+), 490 deletions(-) delete mode 100644 test/Transforms/SLPVectorizer/SystemZ/pr34619.ll delete mode 100644 test/Transforms/SLPVectorizer/X86/load-dominate.ll diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2c5c256daf7..d30c1063c0d 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -333,7 +333,7 @@ static unsigned getAltOpcode(unsigned Op) { case Instruction::Sub: return Instruction::Add; default: - return Op; + return 0; } } @@ -346,20 +346,6 @@ static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode, return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode; } -/// Checks if the \p Opcode can be considered as an operand of a (possibly) -/// binary operation \p I. -/// \returns The code of the binary operation of instruction \p I if the -/// instruction with \p Opcode can be considered as an operand of \p I with the -/// default value. -static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) { - assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode()) - && "Invalid Opcode"); - if (Opcode != Instruction::PHI && isa(I) && - (I->getType()->isIntegerTy() || cast(I)->isFast())) - return I->getOpcode(); - return 0; -} - /// Chooses the correct key for scheduling data. If \p Op has the same (or /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p /// OpValue. @@ -381,12 +367,7 @@ namespace { struct RawInstructionsData { /// Main Opcode of the instructions going to be vectorized. unsigned Opcode = 0; - /// Position of the first instruction with the \a Opcode. - unsigned OpcodePos = 0; - /// Need an additional analysis (if at least one of the instruction is not - /// same instruction kind as an instruction at OpcodePos position in the - /// list). - bool NeedAnalysis = false; + /// The list of instructions have some instructions with alternate opcodes. 
bool HasAltOpcodes = false; }; @@ -401,38 +382,16 @@ static RawInstructionsData getMainOpcode(ArrayRef VL) { return {}; RawInstructionsData Res; unsigned Opcode = I0->getOpcode(); - unsigned AltOpcode = getAltOpcode(Opcode); - unsigned NewOpcodePos = 0; // Walk through the list of the vectorized instructions // in order to check its structure described by RawInstructionsData. for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) { auto *I = dyn_cast(VL[Cnt]); if (!I) return {}; - if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) { - if (Opcode != I->getOpcode()) { - Res.HasAltOpcodes = true; - if (Res.NeedAnalysis && isOdd(NewOpcodePos)) - std::swap(Opcode, AltOpcode); - } - continue; - } - if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) { - if (!Instruction::isBinaryOp(Opcode) || - !Instruction::isCommutative(Opcode)) { - NewOpcodePos = Cnt; - Opcode = NewOpcode; - AltOpcode = getAltOpcode(Opcode); - Res.NeedAnalysis = true; - } - } else if (tryToRepresentAsInstArg(I->getOpcode(), - cast(VL[NewOpcodePos]))) - Res.NeedAnalysis = true; - else - return {}; + if (Opcode != I->getOpcode()) + Res.HasAltOpcodes = true; } Res.Opcode = Opcode; - Res.OpcodePos = NewOpcodePos; return Res; } @@ -462,20 +421,16 @@ struct InstructionsState { static InstructionsState getSameOpcode(ArrayRef VL) { auto Res = getMainOpcode(VL); unsigned Opcode = Res.Opcode; - if (!Res.NeedAnalysis && !Res.HasAltOpcodes) - return InstructionsState(VL[Res.OpcodePos], Opcode, false); - auto *OpInst = cast(VL[Res.OpcodePos]); + if (!Res.HasAltOpcodes) + return InstructionsState(VL[0], Opcode, false); + auto *OpInst = cast(VL[0]); unsigned AltOpcode = getAltOpcode(Opcode); // Examine each element in the list instructions VL to determine // if some operations there could be considered as an alternative - // (for example as subtraction relates to addition operation) or - // operation could be an operand of a (possibly) binary operation. + // (for example as subtraction relates to addition operation). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { auto *I = cast(VL[Cnt]); unsigned InstOpcode = I->getOpcode(); - if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode)) - if (tryToRepresentAsInstArg(InstOpcode, OpInst)) - InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode; if ((Res.HasAltOpcodes && InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) || (!Res.HasAltOpcodes && InstOpcode != Opcode)) { @@ -628,7 +583,6 @@ public: void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); - ExtraScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); NumLoadsWantToKeepOrder = 0; @@ -768,40 +722,22 @@ private: /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; - - /// Info about instruction in this tree entry. - InstructionsState State; }; /// Create a new VectorizableTree entry. 
TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, - int &UserTreeIdx, const InstructionsState &S) { - assert((!Vectorized || S.Opcode != 0) && - "Vectorized TreeEntry without opcode"); + int &UserTreeIdx) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; if (Vectorized) { - Last->State = S; - unsigned AltOpcode = getAltOpcode(S.Opcode); for (int i = 0, e = VL.size(); i != e; ++i) { - unsigned RealOpcode = - (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode; - Value *Key = (cast(VL[i])->getOpcode() == RealOpcode) - ? VL[i] - : S.OpValue; - assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!"); - if (VL[i] == Key) - ScalarToTreeEntry[Key] = idx; - else - ExtraScalarToTreeEntry[VL[i]][Key] = idx; + assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; } } else { - Last->State.Opcode = 0; - Last->State.OpValue = VL[0]; - Last->State.IsAltShuffle = false; MustGather.insert(VL.begin(), VL.end()); } @@ -829,24 +765,8 @@ private: return nullptr; } - TreeEntry *getTreeEntry(Value *V, Value *OpValue) { - if (V == OpValue) - return getTreeEntry(V); - auto I = ExtraScalarToTreeEntry.find(V); - if (I != ExtraScalarToTreeEntry.end()) { - auto &STT = I->second; - auto STTI = STT.find(OpValue); - if (STTI != STT.end()) - return &VectorizableTree[STTI->second]; - } - return nullptr; - } - /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; - - /// Maps a specific scalar to its tree entry(s) with leading scalar. - SmallDenseMap> ExtraScalarToTreeEntry; + SmallDenseMap ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1169,11 +1089,9 @@ private: Action(SD); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) { - ScheduleData *SD = P.second; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - Action(SD); - } + for (auto &P : I->second) + if (P.second->SchedulingRegionID == SchedulingRegionID) + Action(P.second); } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1420,15 +1338,9 @@ void BoUpSLP::buildTree(ArrayRef Roots, continue; // For each lane: - const unsigned Opcode = Entry->State.Opcode; - const unsigned AltOpcode = getAltOpcode(Opcode); for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - if (!sameOpcodeOrAlt(Opcode, AltOpcode, - cast(Scalar)->getOpcode())) - continue; - // Check if the scalar is externally used as an extra arg. 
auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { @@ -1471,37 +1383,6 @@ void BoUpSLP::buildTree(ArrayRef Roots, } } -static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { - switch(Opcode) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - return ConstantInt::getNullValue(Ty); - case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - return ConstantInt::get(Ty, /*V=*/1); - case Instruction::FAdd: - case Instruction::FSub: - return ConstantFP::get(Ty, /*V=*/0.0); - case Instruction::FMul: - case Instruction::FDiv: - case Instruction::FRem: - return ConstantFP::get(Ty, /*V=*/1.0); - case Instruction::And: - return ConstantInt::getAllOnesValue(Ty); - default: - break; - } - llvm_unreachable("unknown binop for default constant value"); -} - void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1509,48 +1390,31 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } - // Avoid any vectors that are wider than two elements and - // with real operations less than or equal to half of vector - // to others members are operands to that operations. - unsigned AltOpcode = getAltOpcode(S.Opcode); - unsigned SameOrAlt = 0; - if (VL.size() > 2) { - for (Value *V : VL) { - auto *Instr = cast(V); - if (sameOpcodeOrAlt(S.Opcode, AltOpcode, Instr->getOpcode())) - SameOrAlt++; - } - if (SameOrAlt <= (VL.size() / 2)) { - newTreeEntry(VL, false, UserTreeIdx, S); - return; - } - } - // We now know that this is a vector of instructions of the same type from // the same block. 
@@ -1559,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1570,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1589,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (getTreeEntry(I)) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1599,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1613,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1622,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned j = i + 1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1637,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1656,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1683,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } else { BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx, S); + newTreeEntry(VL, Reuse, UserTreeIdx); return; } case Instruction::Load: { @@ -1698,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1709,7 +1573,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); 
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1731,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1746,7 +1610,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1773,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1801,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1839,25 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: - - // Check that all load operations dominate any real vector operations - for (Value *V : VL) - if (LoadInst *L = dyn_cast(V)) { - bool Dominate = false; - for (Value *V1 : VL) { - Instruction *I = dyn_cast(V1); - if (I != nullptr && - sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode()) && - DT->dominates(&*L, &*I)) - Dominate = true; - } - if (!Dominate) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); - return; - } - } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1873,21 +1719,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *VecOp : VL) { - auto *I = cast(VecOp); - if (I->getOpcode() == S.Opcode) { - Operands.push_back(I->getOperand(i)); - continue; - } - assert(Instruction::isBinaryOp(S.Opcode) && - "Expected a binary operation."); - Value *Operand = isOdd(i) - ? 
getDefaultConstantForOpcode(S.Opcode, I->getType()) - : VecOp; - Operands.push_back(Operand); - } - if (allSameType(Operands)) - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *j : VL) + Operands.push_back(cast(j)->getOperand(i)); + + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; @@ -1897,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1910,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1922,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1944,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1967,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1981,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1992,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -2005,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the 
operand vector. @@ -2029,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // then do not vectorize this instruction. if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -2048,19 +1883,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *VecOp : VL) { - auto *I = cast(VecOp); - if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) { - Operands.push_back(I->getOperand(i)); - continue; - } - assert(Instruction::isBinaryOp(S.Opcode) && - "Expected a binary operation."); - Value *Operand = isOdd(i) - ? getDefaultConstantForOpcode(S.Opcode, I->getType()) - : VecOp; - Operands.push_back(Operand); - } + for (Value *j : VL) + Operands.push_back(cast(j)->getOperand(i)); buildTree_rec(Operands, Depth + 1, UserTreeIdx); } @@ -2068,7 +1892,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2189,17 +2013,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return getGatherCost(E->Scalars); } - assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - auto *VL0 = cast(E->State.OpValue); - unsigned ShuffleOrOp = E->State.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : E->State.Opcode; + InstructionsState S = getSameOpcode(VL); + assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = cast(S.OpValue); + unsigned ShuffleOrOp = S.IsAltShuffle ? + (unsigned) Instruction::ShuffleVector : S.Opcode; switch (ShuffleOrOp) { case Instruction::PHI: return 0; case Instruction::ExtractValue: case Instruction::ExtractElement: - if (canReuseExtract(VL, E->State.OpValue)) { + if (canReuseExtract(VL, S.OpValue)) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); @@ -2243,8 +2068,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy, Builder.getInt1Ty(), VL0); - int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy, VL0); + TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); + int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); return VecCost - ScalarCost; } case Instruction::Add: @@ -2270,7 +2095,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OK_UniformConstantValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2281,33 +2106,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. 
If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - if (auto *CInt = dyn_cast(VL0->getOperand(1))) { - Op2VK = TargetTransformInfo::OK_UniformConstantValue; - const unsigned Opcode = E->State.Opcode; - for (auto *V : VL) { - auto *I = cast(V); - if (I == VL0 || Opcode != I->getOpcode()) - continue; - if (!isa(I->getOperand(1))) { - Op2VK = TargetTransformInfo::OK_AnyValue; - break; - } - if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && - CInt != cast(I->getOperand(1))) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + ConstantInt *CInt = nullptr; + for (unsigned i = 0; i < VL.size(); ++i) { + const Instruction *I = cast(VL[i]); + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + break; + } + if (i == 0) { + CInt = cast(I->getOperand(1)); + continue; } - // FIXME: Currently cost of model modification for division by power of - // 2 is handled for X86 and AArch64. Add support for other targets. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && - CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_PowerOf2; + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } + // FIXME: Currently cost of model modification for division by power of + // 2 is handled for X86 and AArch64. Add support for other targets. + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && + CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; - int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, - Op1VK, Op2VK, Op1VP, Op2VP); - int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, - Op2VK, Op1VP, Op2VP); + SmallVector Operands(VL0->operand_values()); + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, + Op1VP, Op2VP, Operands); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2373,18 +2199,23 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; - unsigned AltOpcode = getAltOpcode(E->State.Opcode); - int ScalarCost = - TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK, Op2VK) * - VL.size() / 2; - ScalarCost += - TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK) * - VL.size() / 2; + int ScalarCost = 0; + int VecCost = 0; + for (Value *i : VL) { + Instruction *I = cast(i); + if (!I) + break; + ScalarCost += + TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
- int VecCost = - TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, Op2VK); - VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK, Op2VK); + Instruction *I0 = cast(VL[0]); + VecCost = + TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); + Instruction *I1 = cast(VL[1]); + VecCost += + TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); return VecCost - ScalarCost; @@ -2450,7 +2281,7 @@ int BoUpSLP::getSpillCost() { Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.State.OpValue); + Instruction *Inst = dyn_cast(N.Scalars[0]); if (!Inst) continue; @@ -2510,7 +2341,7 @@ int BoUpSLP::getTreeCost() { for (TreeEntry &TE : VectorizableTree) { int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *TE.State.OpValue << ".\n"); + << *TE.Scalars[0] << ".\n"); Cost += C; } @@ -2531,7 +2362,7 @@ int BoUpSLP::getTreeCost() { // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].State.OpValue; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2594,15 +2425,13 @@ void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right unsigned AltOpcode = getAltOpcode(Opcode); + (void)AltOpcode; for (Value *V : VL) { auto *I = cast(V); - if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) { - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - } else { - Left.push_back(I); - Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType())); - } + assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) && + "Incorrect instruction in vector"); + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); } // Reorder if we have a commutative operation and consecutive access @@ -2651,13 +2480,8 @@ static bool shouldReorderOperands( int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - if (I.getOpcode() == Opcode) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); - } else { - VLeft = &I; - VRight = getDefaultConstantForOpcode(Opcode, I.getType()); - } + VLeft = I.getOperand(0); + VRight = I.getOperand(1); // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2721,15 +2545,8 @@ void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. auto *I = cast(VL[0]); - Value *VLeft; - Value *VRight; - if (I->getOpcode() == Opcode) { - VLeft = I->getOperand(0); - VRight = I->getOperand(1); - } else { - VLeft = I; - VRight = getDefaultConstantForOpcode(Opcode, I->getType()); - } + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? 
std::swap(VLeft, VRight); @@ -2934,11 +2751,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue << ".\n"); + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } - Instruction *VL0 = cast(E->State.OpValue); + InstructionsState S = getSameOpcode(E->Scalars); + Instruction *VL0 = cast(E->Scalars[0]); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -2951,8 +2769,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = E->State.IsAltShuffle ? - (unsigned) Instruction::ShuffleVector : E->State.Opcode; + unsigned ShuffleOrOp = S.IsAltShuffle ? + (unsigned) Instruction::ShuffleVector : S.Opcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3062,7 +2880,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (E->State.Opcode == Instruction::FCmp) + if (S.Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3114,19 +2932,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL, + reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - if (I->getOpcode() == E->State.Opcode) { - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } else { - LHSVL.push_back(V); - RHSVL.push_back( - getDefaultConstantForOpcode(E->State.Opcode, I->getType())); - } + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); } setInsertPointAfterBundle(E->Scalars, VL0); @@ -3138,7 +2950,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; Value *V = Builder.CreateBinOp( - static_cast(E->State.Opcode), LHS, RHS); + static_cast(S.Opcode), LHS, RHS); E->VectorizedValue = V; propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; @@ -3288,9 +3100,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(Instruction::isBinaryOp(E->State.Opcode) && + assert(Instruction::isBinaryOp(S.Opcode) && "Invalid Shuffle Vector Operand"); - reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL); + reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, VL0); Value *LHS = vectorizeTree(LHSVL); @@ -3301,9 +3113,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // Create a vector of LHS op1 RHS Value *V0 = Builder.CreateBinOp( - static_cast(E->State.Opcode), LHS, RHS); + static_cast(S.Opcode), LHS, RHS); - unsigned AltOpcode = getAltOpcode(E->State.Opcode); + unsigned AltOpcode = getAltOpcode(S.Opcode); // Create a vector of LHS op2 RHS Value *V1 = Builder.CreateBinOp( static_cast(AltOpcode), LHS, RHS); @@ -3325,13 +3137,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *ShuffleMask = ConstantVector::get(Mask); - InstructionsState S = getSameOpcode(EvenScalars); - assert(!S.IsAltShuffle && "Unexpected alternate opcode"); - propagateIRFlags(V0, EvenScalars, S.OpValue); - - S = getSameOpcode(OddScalars); - assert(!S.IsAltShuffle && "Unexpected alternate opcode"); - propagateIRFlags(V1, OddScalars, 
S.OpValue); + propagateIRFlags(V0, EvenScalars); + propagateIRFlags(V1, OddScalars); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); E->VectorizedValue = V; @@ -3365,7 +3172,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].State.OpValue; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3476,15 +3283,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(Entry->VectorizedValue && "Can't find vectorizable value"); // For each lane: - const unsigned Opcode = Entry->State.Opcode; - const unsigned AltOpcode = getAltOpcode(Opcode); for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - if (!sameOpcodeOrAlt(Opcode, AltOpcode, - cast(Scalar)->getOpcode())) - continue; - Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG @@ -3602,7 +3403,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, } for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V, isOneOf(OpValue, V)); + ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); if (BundleMember->IsScheduled) { @@ -3675,7 +3476,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, if (isa(OpValue)) return; - ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle; + ScheduleData *Bundle = getScheduleData(OpValue); DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3980,7 +3781,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) && + (getTreeEntry(SD->Inst) != nullptr) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -4003,13 +3804,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { ScheduleData *BundleMember = picked; while (BundleMember) { Instruction *pickedInst = BundleMember->Inst; - if (pickedInst == BundleMember->OpValue) { - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); - BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst); - } - LastScheduledInst = pickedInst; + if (LastScheduledInst->getNextNode() != pickedInst) { + BS->BB->getInstList().remove(pickedInst); + BS->BB->getInstList().insert(LastScheduledInst->getIterator(), + pickedInst); } + LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; } diff --git a/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll deleted file mode 100644 index 3855834d8a1..00000000000 --- a/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ /dev/null @@ -1,52 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < %s | FileCheck %s - -@bar = external global [4 x [4 x i32]], align 
4 -@dct_luma = external global [4 x [4 x i32]], align 4 - -define void @foo() local_unnamed_addr { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef -; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4 -; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0 -; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4 -; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], -; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 -; CHECK-NEXT: unreachable -; -entry: - %add277 = add nsw i32 undef, undef - store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4 - %0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4 - %sub355 = add nsw i32 undef, %0 - %shr.i = ashr i32 %sub355, 6 - %arrayidx372 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0 - store i32 %shr.i, i32* %arrayidx372, align 4 - %sub355.1 = add nsw i32 undef, %add277 - %shr.i.1 = ashr i32 %sub355.1, 6 - %arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1 - store i32 %shr.i.1, i32* %arrayidx372.1, align 4 - %1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4 - %sub355.2 = add nsw i32 undef, %1 - %shr.i.2 = ashr i32 %sub355.2, 6 - %arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2 - store i32 %shr.i.2, i32* %arrayidx372.2, align 4 - %2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4 - %sub355.3 = add nsw i32 undef, %2 - %shr.i.3 = ashr i32 %sub355.3, 6 - %arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 - store i32 %shr.i.3, i32* %arrayidx372.3, align 4 - unreachable -} diff --git a/test/Transforms/SLPVectorizer/X86/load-dominate.ll b/test/Transforms/SLPVectorizer/X86/load-dominate.ll deleted file mode 100644 index 7149375117d..00000000000 --- a/test/Transforms/SLPVectorizer/X86/load-dominate.ll 
+++ /dev/null @@ -1,27 +0,0 @@ -; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s - -%class.1 = type { %class.2 } -%class.2 = type { %"class.3" } -%"class.3" = type { %"struct.1", i64 } -%"struct.1" = type { [8 x i64] } - -$_ZN1C10SwitchModeEv = comdat any - -; Function Attrs: uwtable -define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { -for.body.lr.ph.i: - %or.1 = or i64 undef, 1 - store i64 %or.1, i64* undef, align 8 - %foo.1 = getelementptr inbounds %class.1, %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 - %foo.3 = load i64, i64* %foo.1, align 8 - %foo.2 = getelementptr inbounds %class.1, %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 - %foo.4 = load i64, i64* %foo.2, align 8 - %bar5 = load i64, i64* undef, align 8 - %and.2 = and i64 %or.1, %foo.3 - %and.1 = and i64 %bar5, %foo.4 - %bar3 = getelementptr inbounds %class.2, %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 - store i64 %and.2, i64* %bar3, align 8 - %bar4 = getelementptr inbounds %class.2, %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 - store i64 %and.1, i64* %bar4, align 8 - ret void -} diff --git a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 172aebe9c5d..2b593b78652 100644 --- a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,16 +43,22 @@ define void @add1(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 +; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -80,16 +86,22 @@ define void @sub0(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; 
CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -193,18 +205,22 @@ define void @addsub0(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -232,18 +248,22 @@ define void @addsub1(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store 
i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -271,16 +291,22 @@ define void @mul(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 +; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -308,16 +334,22 @@ define void @shl0(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr 
inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 +; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -421,16 +453,22 @@ define void @add1f(float* noalias %dst, float* noalias %src) { ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 +; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -458,16 +496,22 @@ define void @sub0f(float* noalias %dst, float* noalias %src) { ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = 
getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -571,18 +615,22 @@ define void @addsub0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @addsub0f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]],
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -610,18 +658,22 @@ define void @addsub1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @addsub1f(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]],
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -649,16 +701,22 @@ define void @mulf(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -767,16 +825,22 @@ define void @sub0fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
-- 
2.11.0