SmallPtrSetImpl<Value *> &AllowedExit) {
// Reductions, Inductions and non-header phis are allowed to have exit users. All
// other instructions must not have external users.
- // TODO: Non-phi instructions can also be taught to have exit users, now that
- // we know how to extract the last scalar element from the loop.
if (!AllowedExit.count(Inst))
// Check that all of the users of the loop are inside the BB.
for (User *U : Inst->users()) {
continue;
}
+ // TODO: Instead of recording the AllowedExit, it would be good to record the
+ // complementary set: NotAllowedExit. These include (but may not be
+ // limited to):
+ // 1. Reduction phis as they represent the one-before-last value, which
+ // is not available when vectorized
+ // 2. Induction phis and increment when SCEV predicates cannot be used
+ // outside the loop - see addInductionPhi
+ // 3. Non-Phis with outside uses when SCEV predicates cannot be used
+ // outside the loop - see call to hasOutsideLoopUser in the non-phi
+ // handling below
+ // 4. FirstOrderRecurrence phis that can possibly be handled by
+ // extraction.
+ // By recording these, we can then reason about ways to vectorize each
+ // of these NotAllowedExit.
InductionDescriptor ID;
if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
addInductionPhi(Phi, ID, AllowedExit);
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ // We can safely vectorize loops where instructions within the loop are
+ // used outside the loop only if the SCEV predicates within the loop are
+ // the same as outside the loop. Allowing the exit means reusing the SCEV
+ // outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(&I);
+ continue;
+ }
ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
<< "value cannot be used outside the loop");
return false;
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getNumIncomingValues() == 1) {
auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
+ // Non-instruction incoming values will have only one value.
+ unsigned LastLane = 0;
+ if (isa<Instruction>(IncomingValue))
+ LastLane = Cost->isUniformAfterVectorization(
+ cast<Instruction>(IncomingValue), VF)
+ ? 0
+ : VF - 1;
// Can be a loop invariant incoming value or the last scalar value to be
// extracted from the vectorized loop.
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
Value *lastIncomingValue =
- getOrCreateScalarValue(IncomingValue, {UF - 1, VF - 1});
+ getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
}
}
}
// Expand Worklist in topological order: whenever a new instruction
- // is added , its users should be either already inside Worklist, or
- // out of scope. It ensures a uniform instruction will only be used
- // by uniform instructions or out of scope instructions.
+ // is added, its users should be already inside Worklist. It ensures
+ // a uniform instruction will only be used by uniform instructions.
unsigned idx = 0;
while (idx != Worklist.size()) {
Instruction *I = Worklist[idx++];
for (auto OV : I->operand_values()) {
+ // isOutOfScope operands cannot be uniform instructions.
if (isOutOfScope(OV))
continue;
+ // If all the users of the operand are uniform, then add the
+ // operand into the uniform worklist.
auto *OI = cast<Instruction>(OV);
if (llvm::all_of(OI->users(), [&](User *U) -> bool {
auto *J = cast<Instruction>(U);
- return !TheLoop->contains(J) || Worklist.count(J) ||
+ return Worklist.count(J) ||
(OI == getLoadStorePointerOperand(J) &&
isUniformDecision(J, VF));
})) {
%x.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17 , %latch ]
ret i32 %x.0.lcssa
}
+
+
+; CHECK-LABEL: @outside_user_non_phi(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+; CHECK: [[TRUNC:%[a-zA-Z0-9.]+]] = trunc <2 x i32> %predphi to <2 x i8>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i8> [[TRUNC]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ], [ [[E1]], %middle.block ]
+; %tmp17.trunc is a non-phi instruction computed inside the loop (a trunc of
+; the phi %tmp17) and its value is read after the loop through %.lcssa. The
+; expectations above require lane 1 of the widened trunc to be extracted in
+; middle.block and passed to that exit phi.
+define i8 @outside_user_non_phi() {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+ %tmp17.trunc = trunc i32 %tmp17 to i8
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ]
+ ret i8 %.lcssa
+}
+
+; CHECK-LABEL: no_vectorize_reduction_with_outside_use(
+; CHECK-NOT: <2 x i32>
+; The exit phi in %for.end reads the reduction phi %result.08 itself, i.e. the
+; value from before the final `or` of the loop. The checks above require that
+; no <2 x i32> operations are produced for this function.
+define i32 @no_vectorize_reduction_with_outside_use(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %or = or i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ; The live-out must be the reduction phi (not the load %1) so that this test
+ ; covers a reduction phi with a use outside the loop.
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %result.08, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+
+; vectorize c[i] = a[i] + b[i] loop where result of c[i] is used outside the
+; loop
+; CHECK-LABEL: sum_arrays_outside_use(
+; CHECK-LABEL: vector.memcheck:
+; CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph
+
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>*
+; CHECK: %wide.load16 = load <2 x i32>, <2 x i32>*
+; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load16
+; CHECK: store <2 x i32>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i32 [ %sum, %.lr.ph.i ], [ [[E1]], %middle.block ]
+; %sum is stored through %arrayidx3 (into %C) inside the loop and is also read
+; after the loop through %.lcssa. The expectations above require lane 1 of the
+; widened add to be extracted in middle.block for that exit phi.
+define i32 @sum_arrays_outside_use(i32* %B, i32* %A, i32* %C, i32 %N) {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %iv = phi i32 [ %ivnext, %.lr.ph.i ], [ %b.promoted, %bb ]
+ %indvars.iv = sext i32 %iv to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %Bload = load i32, i32* %arrayidx2, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %Aload = load i32, i32* %arrayidx, align 4
+ %sum = add nsw i32 %Bload, %Aload
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ store i32 %sum, i32* %arrayidx3, align 4
+ %ivnext = add nsw i32 %iv, 1
+ %tmp19 = icmp slt i32 %ivnext, %N
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %sum, %.lr.ph.i ]
+ ret i32 %.lcssa
+}
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+; CHECK-LABEL: non_uniform_live_out()
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add <2 x i32> %vec.ind, <i32 7, i32 7>
+; CHECK: [[EE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 0
+; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[EE]]
+; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9.]+]] = getelementptr inbounds i8, i8* [[GEP]], i32 0
+; CHECK-NEXT: [[BC:%[a-zA-Z0-9.]+]] = bitcast i8* [[GEP2]] to <2 x i8>*
+; CHECK-NEXT: %wide.load = load <2 x i8>, <2 x i8>* [[BC]]
+; CHECK-NEXT: [[ADD2:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, <i8 1, i8 1>
+; CHECK: store <2 x i8> [[ADD2]], <2 x i8>*
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[ADDEE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: for.end:
+; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADDEE]], %middle.block ]
+; CHECK: %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+; %i.09 is used inside the loop to form the address %arrayidx and is also read
+; after the loop through %lcssa. The expectations above extract lane 0 of the
+; widened add for the in-loop address, and lane 1 in middle.block for the
+; live-out value.
+define i32 @non_uniform_live_out() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %i.09 = add i32 %i.08, 7
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.09
+ %0 = load i8, i8* %arrayidx, align 1
+ %bump = add i8 %0, 1
+ store i8 %bump, i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 20000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %lcssa = phi i32 [%i.09, %for.body]
+ %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+ store i8 42, i8* %arrayidx.out, align 1
+ ret i32 0
+}