bool isLegalMaskedScatter(Type *DataType) const;
bool isLegalMaskedGather(Type *DataType) const;
+ /// Return true if the target doesn't mind keeping address values in vector
+ /// registers; if false, the vectorizer keeps address computations scalar.
+ bool prefersVectorizedAddressing() const;
+
/// \brief Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store
/// of the specified type.
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
virtual bool isLegalMaskedGather(Type *DataType) = 0;
+ virtual bool prefersVectorizedAddressing() = 0;
virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
int64_t BaseOffset, bool HasBaseReg,
int64_t Scale, unsigned AddrSpace) = 0;
bool isLegalMaskedGather(Type *DataType) override {
return Impl.isLegalMaskedGather(DataType);
}
+ bool prefersVectorizedAddressing() override {
+ return Impl.prefersVectorizedAddressing();
+ }
int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale,
unsigned AddrSpace) override {
bool isLegalMaskedGather(Type *DataType) { return false; }
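+ // Default to true; this preserves the existing behavior for targets that do
+ // not override the hook.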
+ bool prefersVectorizedAddressing() { return true; }
+
int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
// Guess that all legal addressing modes are free.
return TTIImpl->isLegalMaskedGather(DataType);
}
+bool TargetTransformInfo::prefersVectorizedAddressing() const {
+ return TTIImpl->prefersVectorizedAddressing();
+}
+
int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
int64_t BaseOffset,
bool HasBaseReg,
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
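+ // Addressing on SystemZ is done with scalar (GPR) registers, so keeping
+ // address computations scalar avoids extracts from vector registers and
+ // lets LSR optimize the addresses.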
+ bool prefersVectorizedAddressing() { return false; }
bool supportsEfficientVectorElementLoadStore() { return true; }
bool enableInterleavedAccessVectorization() { return true; }
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
}
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ if (ForcedScalars.count(VF))
+ for (auto *I : ForcedScalars.find(VF)->second)
+ Worklist.insert(I);
+
// Expand the worklist by looking through any bitcasts and getelementptr
// instructions we've already identified as scalar. This is similar to the
// expansion step in collectLoopUniforms(); however, here we're only
if (VF > 1 && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);
+ // Forced scalars do not have any scalarization overhead; their cost is the
+ // scalar instruction cost multiplied by VF.
+ if (VF > 1 && ForcedScalars.count(VF) &&
+ ForcedScalars.find(VF)->second.count(I))
+ return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);
+ // Note: Even if all instructions are scalarized, return true if any memory
+ // accesses appear in the loop, so that benefits such as address folding are
+ // still taken into account.
bool TypeNotScalarized =
VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
return VectorizationCostTy(C, TypeNotScalarized);
setWideningDecision(&I, VF, Decision, Cost);
}
}
+
+ // Make sure that any load of an address, and any other address computation,
+ // remains scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
+ SmallVector<Instruction *, 4> Worklist;
+ for (auto *I : AddrDefs)
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
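+ // Only operands defined in the same block are followed, and PHIs are
+ // skipped, keeping the search local to the address computation.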
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ AddrDefs.insert(InstOp).second)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+ // Setting the desired widening decision should ideally be handled by the
+ // cost functions, but since this involves finding out whether the loaded
+ // value is involved in an address computation, the decision is instead
+ // changed here when we know this is the case.
+ if (getWideningDecision(I, VF) == CM_Widen)
+ // Scalarize a widened load of an address.
+ setWideningDecision(I, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(I, 1)));
+ else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
+ if (Instruction *Member = Group->getMember(Idx))
+ setWideningDecision(Member, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(Member, 1)));
+ }
+ }
+ } else
+ // Make sure I gets scalarized and is given a cost estimate without
+ // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
}
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
--- /dev/null
+; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \
+; RUN: -instcombine -force-vector-width=2 < %s | FileCheck %s
+;
+; Test that the loop vectorizer does not generate vector addresses that must
+; then always be extracted.
+
+; Check that the addresses for a scalarized memory access are not extracted
+; from a vector register.
+define i32 @foo(i32* nocapture %A) {
+;CHECK-LABEL: @foo(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %0 = shl nsw i64 %index, 2
+;CHECK: %1 = shl i64 %index, 2
+;CHECK: %2 = or i64 %1, 4
+;CHECK: %3 = getelementptr inbounds i32, i32* %A, i64 %0
+;CHECK: %4 = getelementptr inbounds i32, i32* %A, i64 %2
+;CHECK: store i32 4, i32* %3, align 4
+;CHECK: store i32 4, i32* %4, align 4
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+ store i32 4, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 undef
+}
+
+; Check that a load of an address is scalarized.
+define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) {
+;CHECK-LABEL: @foo1(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %0 = or i64 %index, 1
+;CHECK: %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index
+;CHECK: %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0
+;CHECK: %3 = load i32*, i32** %1, align 8
+;CHECK: %4 = load i32*, i32** %2, align 8
+;CHECK: %5 = load i32, i32* %3, align 4
+;CHECK: %6 = load i32, i32* %4, align 4
+;CHECK: %7 = insertelement <2 x i32> undef, i32 %5, i32 0
+;CHECK: %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+;CHECK: %9 = getelementptr inbounds i32, i32* %A, i64 %index
+;CHECK: %10 = bitcast i32* %9 to <2 x i32>*
+;CHECK: store <2 x i32> %8, <2 x i32>* %10, align 4
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %ptr = getelementptr inbounds i32*, i32** %PtrPtr, i64 %indvars.iv
+ %el = load i32*, i32** %ptr
+ %v = load i32, i32* %el
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %v, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 undef
+}