From 5cc6baa2303a01c6128efe94a863e32fc6d4a635 Mon Sep 17 00:00:00 2001
From: Jordan Rupprecht
Date: Mon, 1 Jul 2019 21:10:43 +0000
Subject: [PATCH] Revert [SLP] Look-ahead operand reordering heuristic.

This reverts r364478 (git commit 574cb0eb3a7ac95e62d223a60bef891171dfe321)

The patch is causing compilation timeouts.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364846 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp     | 282 ++++-----------------
 test/Transforms/SLPVectorizer/X86/lookahead.ll | 134 +++++-------
 2 files changed, 93 insertions(+), 323 deletions(-)

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b6065c8d5eb..c47f6980a36 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -147,12 +147,6 @@ static cl::opt<int> MinTreeSize(
     "slp-min-tree-size", cl::init(3), cl::Hidden,
     cl::desc("Only vectorize small trees if they are fully vectorizable"));
 
-// The maximum depth that the look-ahead score heuristic will explore.
-// The higher this value, the higher the compilation time overhead.
-static cl::opt<int> LookAheadMaxDepth(
-    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
-    cl::desc("The maximum look-ahead depth for operand reordering scores"));
-
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
@@ -714,7 +708,6 @@ public:
 
   const DataLayout &DL;
   ScalarEvolution &SE;
-  const BoUpSLP &R;
 
   /// \returns the operand data at \p OpIdx and \p Lane.
   OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -740,211 +733,6 @@ public:
     std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
   }
 
-  // The hard-coded scores listed here are not very important. When computing
-  // the scores of matching one sub-tree with another, we are basically
-  // counting the number of values that are matching. So even if all scores
-  // are set to 1, we would still get a decent matching result.
-  // However, sometimes we have to break ties. For example we may have to
-  // choose between matching loads vs matching opcodes. This is what these
-  // scores are helping us with: they provide the order of preference.
-
-  /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
-  static const int ScoreConsecutiveLoads = 3;
-  /// Constants.
-  static const int ScoreConstants = 2;
-  /// Instructions with the same opcode.
-  static const int ScoreSameOpcode = 2;
-  /// Instructions with alt opcodes (e.g., add + sub).
-  static const int ScoreAltOpcodes = 1;
-  /// Identical instructions (a.k.a. splat or broadcast).
-  static const int ScoreSplat = 1;
-  /// Matching with an undef is preferable to failing.
-  static const int ScoreUndef = 1;
-  /// Score for failing to find a decent match.
-  static const int ScoreFail = 0;
-  /// User external to the vectorized code.
-  static const int ExternalUseCost = 1;
-  /// The user is internal but in a different lane.
-  static const int UserInDiffLaneCost = ExternalUseCost;
-
-  /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
-  static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
-                             ScalarEvolution &SE) {
-    auto *LI1 = dyn_cast<LoadInst>(V1);
-    auto *LI2 = dyn_cast<LoadInst>(V2);
-    if (LI1 && LI2)
-      return isConsecutiveAccess(LI1, LI2, DL, SE)
-                 ? VLOperands::ScoreConsecutiveLoads
-                 : VLOperands::ScoreFail;
-
-    auto *C1 = dyn_cast<Constant>(V1);
-    auto *C2 = dyn_cast<Constant>(V2);
-    if (C1 && C2)
-      return VLOperands::ScoreConstants;
-
-    auto *I1 = dyn_cast<Instruction>(V1);
-    auto *I2 = dyn_cast<Instruction>(V2);
-    if (I1 && I2) {
-      if (I1 == I2)
-        return VLOperands::ScoreSplat;
-      InstructionsState S = getSameOpcode({I1, I2});
-      // Note: Only consider instructions with <= 2 operands to avoid
-      // complexity explosion.
-      if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
-        return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
-                                : VLOperands::ScoreSameOpcode;
-    }
-
-    if (isa<UndefValue>(V2))
-      return VLOperands::ScoreUndef;
-
-    return VLOperands::ScoreFail;
-  }
-
-  /// Holds the values and their lane that are taking part in the look-ahead
-  /// score calculation. This is used in the external uses cost calculation.
-  SmallDenseMap<Value *, int> InLookAheadValues;
-
-  /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
-  /// either external to the vectorized code, or require shuffling.
-  int getExternalUsesCost(const std::pair<Value *, int> &LHS,
-                          const std::pair<Value *, int> &RHS) {
-    int Cost = 0;
-    SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
-    for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
-      Value *V = Values[Idx].first;
-      // Calculate the absolute lane, using the minimum relative lane of LHS
-      // and RHS as base and Idx as the offset.
-      int Ln = std::min(LHS.second, RHS.second) + Idx;
-      assert(Ln >= 0 && "Bad lane calculation");
-      for (User *U : V->users()) {
-        if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
-          // The user is in the VectorizableTree. Check if we need to insert.
-          auto It = llvm::find(UserTE->Scalars, U);
-          assert(It != UserTE->Scalars.end() && "U is in UserTE");
-          int UserLn = std::distance(UserTE->Scalars.begin(), It);
-          assert(UserLn >= 0 && "Bad lane");
-          if (UserLn != Ln)
-            Cost += UserInDiffLaneCost;
-        } else {
-          // Check if the user is in the look-ahead code.
-          auto It2 = InLookAheadValues.find(U);
-          if (It2 != InLookAheadValues.end()) {
-            // The user is in the look-ahead code. Check the lane.
-            if (It2->second != Ln)
-              Cost += UserInDiffLaneCost;
-          } else {
-            // The user is neither in SLP tree nor in the look-ahead code.
-            Cost += ExternalUseCost;
-          }
-        }
-      }
-    }
-    return Cost;
-  }
-
-  /// Go through the operands of \p LHS and \p RHS recursively until \p
-  /// MaxLevel, and return the cumulative score. For example:
-  /// \verbatim
-  ///  A[0]  B[0]  A[1]  B[1]  C[0]  D[0]  B[1]  A[1]
-  ///     \ /         \ /         \ /         \ /
-  ///      +           +           +           +
-  ///     G1          G2          G3          G4
-  /// \endverbatim
-  /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
-  /// each level recursively, accumulating the score. It starts from matching
-  /// the additions at level 0, then moves on to the loads (level 1). The
-  /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
-  /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
-  /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
-  /// Please note that the order of the operands does not matter, as we
-  /// evaluate the score of all profitable combinations of operands. In
-  /// other words the score of G1 and G4 is the same as G1 and G2. This
-  /// heuristic is based on ideas described in:
-  ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
-  ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
-  ///   Luís F. W. Góes
-  int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
-                         const std::pair<Value *, int> &RHS, int CurrLevel,
-                         int MaxLevel) {
-
-    Value *V1 = LHS.first;
-    Value *V2 = RHS.first;
-    // Get the shallow score of V1 and V2.
-    int ShallowScoreAtThisLevel =
-        std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
-                                     getExternalUsesCost(LHS, RHS));
-    int Lane1 = LHS.second;
-    int Lane2 = RHS.second;
-
-    // If reached MaxLevel,
-    //  or if V1 and V2 are not instructions,
-    //  or if they are SPLAT,
-    //  or if they are not consecutive, early return the current cost.
-    auto *I1 = dyn_cast<Instruction>(V1);
-    auto *I2 = dyn_cast<Instruction>(V2);
-    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
-        ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
-        (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
-      return ShallowScoreAtThisLevel;
-    assert(I1 && I2 && "Should have early exited.");
-
-    // Keep track of in-tree values for determining the external-use cost.
-    InLookAheadValues[V1] = Lane1;
-    InLookAheadValues[V2] = Lane2;
-
-    // Contains the I2 operand indexes that got matched with I1 operands.
-    SmallSet<unsigned, 4> Op2Used;
-
-    // Recursion towards the operands of I1 and I2. We are trying all possible
-    // operand pairs, and keeping track of the best score.
-    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
-         OpIdx1 != NumOperands1; ++OpIdx1) {
-      // Try to pair the operand at OpIdx1 with the best operand of I2.
-      int MaxTmpScore = 0;
-      unsigned MaxOpIdx2 = 0;
-      bool FoundBest = false;
-      // If I2 is commutative try all combinations.
-      unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
-      unsigned ToIdx = isCommutative(I2)
-                           ? I2->getNumOperands()
-                           : std::min(I2->getNumOperands(), OpIdx1 + 1);
-      assert(FromIdx <= ToIdx && "Bad index");
-      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
-        // Skip operands already paired with OpIdx1.
-        if (Op2Used.count(OpIdx2))
-          continue;
-        // Recursively calculate the cost at each level.
-        int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
-                                          {I2->getOperand(OpIdx2), Lane2},
-                                          CurrLevel + 1, MaxLevel);
-        // Look for the best score.
-        if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
-          MaxTmpScore = TmpScore;
-          MaxOpIdx2 = OpIdx2;
-          FoundBest = true;
-        }
-      }
-      if (FoundBest) {
-        // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
-        Op2Used.insert(MaxOpIdx2);
-        ShallowScoreAtThisLevel += MaxTmpScore;
-      }
-    }
-    return ShallowScoreAtThisLevel;
-  }
-
-  /// \Returns the look-ahead score, which tells us how much the sub-trees
-  /// rooted at \p LHS and \p RHS match: the more they match, the higher the
-  /// score. This helps break ties in an informed way when we cannot decide on
-  /// the order of the operands by just considering the immediate
-  /// predecessors.
-  int getLookAheadScore(const std::pair<Value *, int> &LHS,
-                        const std::pair<Value *, int> &RHS) {
-    InLookAheadValues.clear();
-    return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
-  }
-
   // Search all operands in Ops[*][Lane] for the one that matches best
   // Ops[OpIdx][LastLane] and return its operand index.
   // If no good match can be found, return None.
@@ -962,6 +750,9 @@ public:
     // The linearized opcode of the operand at OpIdx, Lane.
     bool OpIdxAPO = getData(OpIdx, Lane).APO;
 
+    const unsigned BestScore = 2;
+    const unsigned GoodScore = 1;
+
     // The best operand index and its score.
     // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
     // are using the score to differentiate between the two.
@@ -990,19 +781,41 @@ public:
         // Look for an operand that matches the current mode.
         switch (RMode) {
         case ReorderingMode::Load:
+          if (isa<LoadInst>(Op)) {
+            // Figure out which is left and right, so that we can check for
+            // consecutive loads
+            bool LeftToRight = Lane > LastLane;
+            Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+            Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+            if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
+                                    cast<LoadInst>(OpRight), DL, SE))
+              BestOp.Idx = Idx;
+          }
+          break;
+        case ReorderingMode::Opcode:
+          // We accept both Instructions and Undefs, but with different scores.
+          if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
+               cast<Instruction>(Op)->getOpcode() ==
+                   cast<Instruction>(OpLastLane)->getOpcode()) ||
+              (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
+              isa<UndefValue>(Op)) {
+            // An instruction has a higher score than an undef.
+            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
+            if (Score > BestOp.Score) {
+              BestOp.Idx = Idx;
+              BestOp.Score = Score;
+            }
+          }
+          break;
         case ReorderingMode::Constant:
-        case ReorderingMode::Opcode: {
-          bool LeftToRight = Lane > LastLane;
-          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
-          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
-          unsigned Score =
-              getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
-          if (Score > BestOp.Score) {
-            BestOp.Idx = Idx;
-            BestOp.Score = Score;
+          if (isa<Constant>(Op)) {
+            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
+            if (Score > BestOp.Score) {
+              BestOp.Idx = Idx;
+              BestOp.Score = Score;
+            }
           }
           break;
-        }
         case ReorderingMode::Splat:
           if (Op == OpLastLane)
             BestOp.Idx = Idx;
@@ -1133,8 +946,8 @@ public:
 public:
   /// Initialize with all the operands of the instruction vector \p RootVL.
   VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
-             ScalarEvolution &SE, const BoUpSLP &R)
-      : DL(DL), SE(SE), R(R) {
+             ScalarEvolution &SE)
+      : DL(DL), SE(SE) {
     // Append all the operands of RootVL.
     appendOperandsOfVL(RootVL);
   }
@@ -1356,8 +1169,7 @@ private:
                                              SmallVectorImpl<Value *> &Left,
                                              SmallVectorImpl<Value *> &Right,
                                              const DataLayout &DL,
-                                             ScalarEvolution &SE,
-                                             const BoUpSLP &R);
+                                             ScalarEvolution &SE);
   struct TreeEntry {
     using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
     TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2559,7 +2371,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
         assert(P0 == SwapP0 && "Commutative Predicate mismatch");
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
       } else {
         // Collect operands - commute if it uses the swapped predicate.
         for (Value *V : VL) {
@@ -2604,7 +2416,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // have the same opcode.
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
         buildTree_rec(Left, Depth + 1, {TE, 0});
         buildTree_rec(Right, Depth + 1, {TE, 1});
         return;
@@ -2773,7 +2585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // Reorder operands if reordering would enable vectorization.
      if (isa<BinaryOperator>(VL0)) {
        ValueList Left, Right;
-        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
        buildTree_rec(Left, Depth + 1, {TE, 0});
        buildTree_rec(Right, Depth + 1, {TE, 1});
        return;
@@ -3490,15 +3302,13 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
 
 // Perform operand reordering on the instructions in VL and return the reordered
 // operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R) { +void BoUpSLP::reorderInputsAccordingToOpcode( + ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, + ScalarEvolution &SE) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE, R); + VLOperands Ops(VL, DL, SE); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); diff --git a/test/Transforms/SLPVectorizer/X86/lookahead.ll b/test/Transforms/SLPVectorizer/X86/lookahead.ll index 52f19b5b3cf..f89cae88a5f 100644 --- a/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -27,19 +27,22 @@ define void @lookahead_basic(double* %array) { ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 +; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8 +; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8 +; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8 +; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8 +; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8 +; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 +; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 +; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 +; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]] +; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] +; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] +; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]] +; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]] +; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]] +; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 +; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -161,23 +164,22 @@ define void @lookahead_alt2(double* %array) { ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5 ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6 ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* 
[[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: [[A_0:%.*]] = load double, double* [[IDX0]], align 8 +; CHECK-NEXT: [[A_1:%.*]] = load double, double* [[IDX1]], align 8 +; CHECK-NEXT: [[B_0:%.*]] = load double, double* [[IDX2]], align 8 +; CHECK-NEXT: [[B_1:%.*]] = load double, double* [[IDX3]], align 8 +; CHECK-NEXT: [[C_0:%.*]] = load double, double* [[IDX4]], align 8 +; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 +; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 +; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 +; CHECK-NEXT: [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]] +; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] +; CHECK-NEXT: [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]] +; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] +; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]] +; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]] +; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 +; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -237,28 +239,29 @@ define void @lookahead_external_uses(double* %A, double *%B, double *%C, double ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 +; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 -; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -301,46 +304,3 @@ entry: store double %A1, double *%Ext1, align 8 ret void } - - -; This checks that the lookahead code does not crash when instructions with the same opcodes have different numbers of operands (in this case the calls). 
- -%Class = type { i8 } -declare double @_ZN1i2ayEv(%Class*) -declare double @_ZN1i2axEv() - -define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) { -; CHECK-LABEL: @lookahead_crash( -; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 -; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]]) -; CHECK-NEXT: [[C1:%.*]] = call double @_ZN1i2axEv() -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: ret void -; - %IdxA0 = getelementptr inbounds double, double* %A, i64 0 - %IdxA1 = getelementptr inbounds double, double* %A, i64 1 - - %A0 = load double, double *%IdxA0, align 8 - %A1 = load double, double *%IdxA1, align 8 - - %C0 = call double @_ZN1i2ayEv(%Class *%Arg0) - %C1 = call double @_ZN1i2axEv() - - %add0 = fadd fast double %A0, %C0 - %add1 = fadd fast double %A1, %C1 - - %IdxS0 = getelementptr inbounds double, double* %S, i64 0 - %IdxS1 = getelementptr inbounds double, double* %S, i64 1 - store double %add0, double *%IdxS0, align 8 - store double %add1, double *%IdxS1, align 8 - ret void -} -- 2.11.0
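
For readers who want the intuition behind the heuristic this patch reverts, the following is a minimal, self-contained C++17 sketch of the look-ahead scoring idea documented in the removed getScoreAtLevelRec comments. It is an illustration only, not LLVM code: the names (Node, shallowScore, scoreAtLevel) and the numeric scores are invented for the example and merely mirror the spirit of ScoreConsecutiveLoads and ScoreSameOpcode, and it omits the external-use cost and commutativity handling of the real patch.

#include <cstddef>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>

// A tiny stand-in for an IR value: an opcode plus operand sub-trees.
struct Node {
  std::string Opcode;                     // e.g. "add", "load"
  std::string Base;                       // for loads: the array accessed
  int Index = 0;                          // for loads: element index
  std::vector<std::shared_ptr<Node>> Ops; // operands
};

// Score for a single pair of nodes (hypothetical tie-break values).
int shallowScore(const Node &A, const Node &B) {
  if (A.Opcode == "load" && B.Opcode == "load") {
    // Consecutive loads from the same base are the best match
    // (cf. ScoreConsecutiveLoads = 3 in the reverted code).
    if (A.Base == B.Base && B.Index == A.Index + 1)
      return 3;
    return 0;
  }
  if (A.Opcode == B.Opcode)
    return 2; // same opcode (cf. ScoreSameOpcode = 2)
  return 0;
}

// Recursively pair the operands of A and B, keep the best-scoring partner
// for each operand of A, and accumulate the score down to MaxLevel.
int scoreAtLevel(const Node &A, const Node &B, int Level, int MaxLevel) {
  int Score = shallowScore(A, B);
  if (Level == MaxLevel || Score == 0)
    return Score;
  std::set<std::size_t> Used; // operands of B already matched
  for (const auto &OpA : A.Ops) {
    int Best = 0;
    std::size_t BestIdx = 0;
    bool Found = false;
    for (std::size_t J = 0; J < B.Ops.size(); ++J) {
      if (Used.count(J))
        continue;
      int S = scoreAtLevel(*OpA, *B.Ops[J], Level + 1, MaxLevel);
      if (S > Best) {
        Best = S;
        BestIdx = J;
        Found = true;
      }
    }
    if (Found) {
      Used.insert(BestIdx);
      Score += Best;
    }
  }
  return Score;
}

int main() {
  auto Load = [](std::string Base, int Idx) {
    auto N = std::make_shared<Node>();
    N->Opcode = "load";
    N->Base = std::move(Base);
    N->Index = Idx;
    return N;
  };
  // G1 = A[0] + B[0], G2 = A[1] + B[1], G3 = C[0] + D[0]
  Node G1, G2, G3;
  G1.Opcode = G2.Opcode = G3.Opcode = "add";
  G1.Ops = {Load("A", 0), Load("B", 0)};
  G2.Ops = {Load("A", 1), Load("B", 1)};
  G3.Ops = {Load("C", 0), Load("D", 0)};
  std::cout << scoreAtLevel(G1, G2, 0, 2) << " vs "
            << scoreAtLevel(G1, G3, 0, 2) << "\n";
}

With a C++17 compiler this should print "8 vs 2": the add nodes look identical at depth 0, but looking one level deeper shows that G1 and G2 feed from consecutive loads while G1 and G3 do not, which is exactly the tie-break the reverted code computed when reordering operands, and also the recursion whose cost motivated this revert.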