enum ShuffleKind {
SK_Broadcast, ///< Broadcast element 0 to all other elements.
SK_Reverse, ///< Reverse the order of the vector.
+ SK_Alternate, ///< Choose alternate elements from two vectors: even lanes from the first, odd lanes from the second (e.g. mask <0,5,2,7>).
SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
};
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+ /// Estimate the cost overhead of an SK_Alternate shuffle of vector type \p Ty, modeled as one element extract plus one element insert per lane.
+ unsigned getAltShuffleOverhead(Type *Ty) const;
+
const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
public:
return OpCost;
}
+/// Conservative cost of an SK_Alternate shuffle producing a vector of type
+/// \p Ty: every lane is charged one element insert plus one element extract.
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Can only shuffle vectors");
+
+  // The shuffle is priced as if it were fully scalarized: each result lane is
+  // extracted from one of the two source vectors and inserted into the result.
+  //
+  // For <4 x float> with mask <0,5,2,7> that means: take lane 0 of the first
+  // vector, lane 1 of the second, lane 2 of the first and lane 3 of the
+  // second, and place them at lanes <0,1,2,3> of the result.
+  const int NumLanes = Ty->getVectorNumElements();
+  unsigned Overhead = 0;
+  for (int Lane = 0; Lane < NumLanes; ++Lane) {
+    const unsigned InsertCost =
+        TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, Lane);
+    const unsigned ExtractCost =
+        TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, Lane);
+    Overhead += InsertCost;
+    Overhead += ExtractCost;
+  }
+  return Overhead;
+}
+
unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
+ if (Kind == SK_Alternate) { // Alternate shuffles are priced per lane (extract + insert) instead of the flat cost below.
+ return getAltShuffleOverhead(Tp);
+ }
return 1;
}
unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
- // We only handle costs of reverse shuffles for now.
- if (Kind != SK_Reverse)
+ // We only handle costs of reverse and alternate shuffles for now; every other kind is forwarded to the generic implementation.
+ if (Kind != SK_Reverse && Kind != SK_Alternate)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
- // Reverse shuffle cost one instruction if we are shuffling within a double
- // word (vrev) or two if we shuffle a quad word (vrev, vext).
- { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
- { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
- { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
- { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
-
- { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
- { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
- { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
- { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
- };
+ if (Kind == SK_Reverse) {
+ static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+ // Reverse shuffle cost one instruction if we are shuffling within a
+ // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
- int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
- if (Idx == -1)
- return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+ int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx == -1) // Legalized type is not in the table: defer to the generic cost.
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- return LT.first * NEONShuffleTbl[Idx].Cost;
+ return LT.first * NEONShuffleTbl[Idx].Cost;
+ }
+ if (Kind == SK_Alternate) {
+ static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+ // Alt shuffle cost table for ARM. Cost is the number of instructions
+ // required to create the shuffled vector.
+
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ int Idx =
+ CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx == -1) // Legalized type is not in the table: defer to the generic cost.
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ return LT.first * NEONAltShuffleTbl[Idx].Cost;
+ }
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); // Not reached: Kind is SK_Reverse or SK_Alternate past the first guard, and both branches return.
}
unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
- // We only estimate the cost of reverse shuffles.
- if (Kind != SK_Reverse)
+ // We only estimate the cost of reverse and alternate shuffles; every other kind is forwarded to the generic implementation.
+ if (Kind != SK_Reverse && Kind != SK_Alternate)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
- unsigned Cost = 1;
- if (LT.second.getSizeInBits() > 128)
- Cost = 3; // Extract + insert + copy.
+ if (Kind == SK_Reverse) {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ unsigned Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
- // Multiple by the number of parts.
- return Cost * LT.first;
+ // Multiply by the number of parts.
+ return Cost * LT.first;
+ }
+
+ if (Kind == SK_Alternate) {
+ static const CostTblEntry<MVT::SimpleValueType> X86AltShuffleTbl[] = {
+ // Alt shuffle cost table for X86. Cost is the number of instructions
+ // required to create the shuffled vector.
+
+ {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v2i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i16, 8},
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 49}};
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+ int Idx = CostTableLookup(X86AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx == -1) // Legalized type is not in the table: defer to the generic cost.
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ return LT.first * X86AltShuffleTbl[Idx].Cost;
+ }
+
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); // Not reached: Kind is SK_Reverse or SK_Alternate past the first guard, and both branches return.
}
unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
return true;
}
+/// Looks up the partner opcode used to form an alternating sequence that can
+/// later be merged into a single ShuffleVector instruction.
+/// \returns the counterpart of \p Op (FAdd <-> FSub, Add <-> Sub), or 0 when
+/// \p Op has no counterpart.
+static unsigned getAltOpcode(unsigned Op) {
+  if (Op == Instruction::FAdd)
+    return Instruction::FSub;
+  if (Op == Instruction::FSub)
+    return Instruction::FAdd;
+  if (Op == Instruction::Add)
+    return Instruction::Sub;
+  if (Op == Instruction::Sub)
+    return Instruction::Add;
+  // No alternating partner exists for any other opcode.
+  return 0;
+}
+
+/// \returns true when opcode \p Op (FAdd, FSub, Add or Sub) may take part in
+/// an alternating sequence that can later be merged into a ShuffleVector
+/// instruction, and false for every other opcode.
+static bool canCombineAsAltInst(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::Sub:
+  case Instruction::Add:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// \returns Instruction::ShuffleVector if the instructions in \p VL strictly
+/// alternate between an opcode and its counterpart (fadd,fsub / fsub,fadd /
+/// add,sub / sub,add), e.g. opcodes fadd,fsub,fadd,fsub,...; otherwise 0.
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  if (VL.empty())
+    return 0;
+  // dyn_cast yields null for a non-instruction value; bail out instead of
+  // dereferencing it.
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  if (!I0)
+    return 0;
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  // An opcode without a counterpart can never start an alternating sequence.
+  if (!AltOpcode)
+    return 0;
+  // Even lanes must repeat the first opcode, odd lanes its counterpart.
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
unsigned Opcode = I0->getOpcode();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || Opcode != I->getOpcode())
+ if (!I || Opcode != I->getOpcode()) {
+ if (canCombineAsAltInst(Opcode) && i == 1)
+ return isAltInst(VL);
return 0;
+ }
}
return Opcode;
}
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
+
private:
struct TreeEntry;
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool SameTy = getSameType(VL); (void)SameTy;
+ bool isAltShuffle = false;
assert(SameTy && "Invalid types!");
if (Depth == RecursionMaxDepth) {
newTreeEntry(VL, false);
return;
}
+ unsigned Opcode = getSameOpcode(VL);
+
+ // Check that this shuffle vector refers to the alternate
+ // sequence of opcodes.
+ if (Opcode == Instruction::ShuffleVector) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ unsigned Op = I0->getOpcode();
+ if (Op != Instruction::ShuffleVector)
+ isAltShuffle = true;
+ }
// If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
- !getSameOpcode(VL)) {
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
- unsigned Opcode = getSameOpcode(VL);
-
// Check if it is safe to sink the loads or the stores.
if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
Instruction *Last = getLastInstruction(VL);
}
return;
}
+ case Instruction::ShuffleVector: {
+ // If this is not an alternate sequence of opcode like add-sub
+ // then do not vectorize this instruction.
+ if (!isAltShuffle) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
default:
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
}
return getGatherCost(E->Scalars);
}
-
- assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
- "Invalid VL");
+ unsigned Opcode = getSameOpcode(VL);
+ assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
- unsigned Opcode = VL0->getOpcode();
switch (Opcode) {
case Instruction::PHI: {
return 0;
return VecCallCost - ScalarCallCost;
}
+  case Instruction::ShuffleVector: {
+    // Cost of an alternating-opcode bundle: the scalar side pays for every
+    // lane's arithmetic instruction, the vector side pays for two full-width
+    // arithmetic operations (one per opcode) plus one SK_Alternate shuffle.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+      // cast<> asserts on a non-instruction and never returns null, so no
+      // null check is needed (the previous one was dead code).
+      Instruction *I = cast<Instruction>(VL[i]);
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+    }
+    // VecCost is equal to sum of the cost of creating 2 vectors (lane 0's
+    // opcode and lane 1's opcode, each applied to the whole vector) and the
+    // cost of creating the shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost =
+        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost +=
+        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
default:
llvm_unreachable("Unknown instruction");
}
setInsertPointAfterBundle(E->Scalars);
return Gather(E->Scalars, VecTy);
}
-
- unsigned Opcode = VL0->getOpcode();
- assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+ unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
case Instruction::PHI: {
E->VectorizedValue = V;
return V;
}
+ case Instruction::ShuffleVector: {
+ ValueList LHSVL, RHSVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) { // Collect operand 0 / operand 1 of every scalar in the bundle.
+ LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
+ setInsertPointAfterBundle(E->Scalars);
+
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
+
+ if (Value *V = alreadyVectorized(E->Scalars))
+ return V;
+
+ // Create a vector of LHS op1 RHS, where op1 is the opcode of the first scalar (VL0).
+ BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+ Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+ // Create a vector of LHS op2 RHS, where op2 is the opcode of the second scalar.
+ Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+ BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+ Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+ // Create the shuffle mask <0, e+1, 2, e+3, ...> that takes the alternating
+ // operations from the two vectors: even lanes from V0, odd lanes from V1.
+ std::vector<Constant *> Mask(E->Scalars.size());
+ unsigned e = E->Scalars.size();
+ for (unsigned i = 0; i < e; ++i) {
+ if (i & 1)
+ Mask[i] = Builder.getInt32(e + i); // Odd lane: index e + i selects lane i of the second operand (V1).
+ else
+ Mask[i] = Builder.getInt32(i); // Even lane: index i selects lane i of the first operand (V0).
+ }
+
+ Value *ShuffleMask = ConstantVector::get(Mask);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ E->VectorizedValue = V;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
default:
llvm_unreachable("unknown inst");
}
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
-
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
e = po_end(&F.getEntryBlock()); it != e; ++it) {
BasicBlock *BB = *it;
-
// Vectorize trees that end at stores.
if (unsigned count = collectStores(BB, R)) {
(void)count;
--- /dev/null
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+
+; CHECK-LABEL: @addsub
+; CHECK: [[LHS:%[0-9]+]] = add <4 x i32>
+; CHECK: [[RHS:%[0-9]+]] = add <4 x i32>
+; CHECK: [[ADD:%[0-9]+]] = add <4 x i32> [[LHS]], [[RHS]]
+; CHECK: [[SUB:%[0-9]+]] = sub <4 x i32> [[LHS]], [[RHS]]
+; CHECK: shufflevector <4 x i32> [[ADD]], <4 x i32> [[SUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind
+define void @addsub() #0 {
+entry:
+ %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %add2 = add nsw i32 %add, %add1
+ store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %4, %5
+ %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
+ %add4 = add nsw i32 %6, %7
+ %sub = sub nsw i32 %add3, %add4
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %add7 = add nsw i32 %add5, %add6
+ store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %sub10 = sub nsw i32 %add8, %add9
+ store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
+; CHECK-LABEL: @subadd
+; CHECK: [[LHS:%[0-9]+]] = add <4 x i32>
+; CHECK: [[RHS:%[0-9]+]] = add <4 x i32>
+; CHECK: [[SUB:%[0-9]+]] = sub <4 x i32> [[LHS]], [[RHS]]
+; CHECK: [[ADD:%[0-9]+]] = add <4 x i32> [[LHS]], [[RHS]]
+; CHECK: shufflevector <4 x i32> [[SUB]], <4 x i32> [[ADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind
+define void @subadd() #0 {
+entry:
+ %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %sub = sub nsw i32 %add, %add1
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
+ %add2 = add nsw i32 %4, %5
+ %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %6, %7
+ %add4 = add nsw i32 %add2, %add3
+ store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %sub7 = sub nsw i32 %add5, %add6
+ store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %add10 = add nsw i32 %add8, %add9
+ store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
+; CHECK-LABEL: @faddfsub
+; CHECK: [[FADD:%[0-9]+]] = fadd <4 x float> [[LHS:%[0-9]+]], [[RHS:%[0-9]+]]
+; CHECK: [[FSUB:%[0-9]+]] = fsub <4 x float> [[LHS]], [[RHS]]
+; CHECK: shufflevector <4 x float> [[FADD]], <4 x float> [[FSUB]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind
+define void @faddfsub() #0 {
+entry:
+ %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %add = fadd float %0, %1
+ store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %sub = fsub float %2, %3
+ store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %add1 = fadd float %4, %5
+ store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ %sub2 = fsub float %6, %7
+ store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
+; CHECK-LABEL: @fsubfadd
+; CHECK: [[FSUB:%[0-9]+]] = fsub <4 x float> [[LHS:%[0-9]+]], [[RHS:%[0-9]+]]
+; CHECK: [[FADD:%[0-9]+]] = fadd <4 x float> [[LHS]], [[RHS]]
+; CHECK: shufflevector <4 x float> [[FSUB]], <4 x float> [[FADD]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind
+define void @fsubfadd() #0 {
+entry:
+ %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %sub = fsub float %0, %1
+ store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %add = fadd float %2, %3
+ store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %sub1 = fsub float %4, %5
+ store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ %add2 = fadd float %6, %7
+ store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
+; CHECK-LABEL: @No_faddfsub
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+; Function Attrs: nounwind
+define void @No_faddfsub() #0 {
+entry:
+ %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+ %add = fadd float %0, %1
+ store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+ %add1 = fadd float %2, %3
+ store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+ %add2 = fadd float %4, %5
+ store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+ %sub = fsub float %6, %7
+ store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+