#include "llvm/ProfileData/SampleProf.h"
#include <list>
#include <map>
+#include <vector>
using namespace llvm;
using namespace sampleprof;
CallSiteLoc(CallLoc){};
ContextTrieNode *getChildContext(const LineLocation &CallSite,
StringRef CalleeName);
- ContextTrieNode *getChildContext(const LineLocation &CallSite);
+ ContextTrieNode *getHottestChildContext(const LineLocation &CallSite);
ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite,
StringRef CalleeName,
bool AllowCreate = true);
// call-site. The full context is identified by location of call instruction.
FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst,
StringRef CalleeName);
+ // Get samples for indirect call targets for call site at given location.
+ std::vector<const FunctionSamples *>
+ getIndirectCalleeContextSamplesFor(const DILocation *DIL);
// Query context profile for a given location. The full context
// is identified by input DILocation.
FunctionSamples *getContextSamplesFor(const DILocation *DIL);
ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite,
StringRef CalleeName) {
if (CalleeName.empty())
- return getChildContext(CallSite);
+ return getHottestChildContext(CallSite);
uint32_t Hash = nodeHash(CalleeName, CallSite);
auto It = AllChildContext.find(Hash);
}
ContextTrieNode *
-ContextTrieNode::getChildContext(const LineLocation &CallSite) {
+ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) {
// CSFDO-TODO: This could be slow, change AllChildContext so we can
// do point look up for child node by call site alone.
- // CSFDO-TODO: Return the child with max count for indirect call
+ // Retrieve the child node with max count for indirect call
ContextTrieNode *ChildNodeRet = nullptr;
+ uint64_t MaxCalleeSamples = 0;
for (auto &It : AllChildContext) {
ContextTrieNode &ChildNode = It.second;
- if (ChildNode.CallSiteLoc == CallSite) {
- if (ChildNodeRet)
- return nullptr;
- else
- ChildNodeRet = &ChildNode;
+ if (ChildNode.CallSiteLoc != CallSite)
+ continue;
+ FunctionSamples *Samples = ChildNode.getFunctionSamples();
+ if (!Samples)
+ continue;
+ if (Samples->getTotalSamples() > MaxCalleeSamples) {
+ ChildNodeRet = &ChildNode;
+ MaxCalleeSamples = Samples->getTotalSamples();
}
}
SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst,
StringRef CalleeName) {
LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n");
- // CSFDO-TODO: We use CalleeName to differentiate indirect call
- // We need to get sample for indirect callee too.
DILocation *DIL = Inst.getDebugLoc();
if (!DIL)
return nullptr;
+ // For indirect call, CalleeName will be empty, in which case the context
+ // profile for callee with largest total samples will be returned.
ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName);
if (CalleeContext) {
FunctionSamples *FSamples = CalleeContext->getFunctionSamples();
return nullptr;
}
+std::vector<const FunctionSamples *>
+SampleContextTracker::getIndirectCalleeContextSamplesFor(
+ const DILocation *DIL) {
+ std::vector<const FunctionSamples *> R;
+ if (!DIL)
+ return R;
+
+ ContextTrieNode *CallerNode = getContextFor(DIL);
+ LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
+ for (auto &It : CallerNode->getAllChildContext()) {
+ ContextTrieNode &ChildNode = It.second;
+ if (ChildNode.getCallSiteLoc() != CallSite)
+ continue;
+ if (FunctionSamples *CalleeSamples = ChildNode.getFunctionSamples())
+ R.push_back(CalleeSamples);
+ }
+
+ return R;
+}
+
FunctionSamples *
SampleContextTracker::getContextSamplesFor(const DILocation *DIL) {
assert(DIL && "Expect non-null location");
const Instruction &Inst, StringRef CalleeName) {
LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n"
<< Inst << "\n");
- // CSFDO-TODO: We also need to promote context profile from indirect
- // calls. We won't have callee names from those from call instr.
- if (CalleeName.empty())
- return;
-
// Get the caller context for the call instruction, we don't use callee
// name from call because there can be context from indirect calls too.
DILocation *DIL = Inst.getDebugLoc();
// Get the context that needs to be promoted
LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
+ // For indirect call, CalleeName will be empty, in which case we need to
+ // promote all non-inlined child context profiles.
+ if (CalleeName.empty()) {
+ for (auto &It : CallerNode->getAllChildContext()) {
+ ContextTrieNode *NodeToPromo = &It.second;
+ if (CallSite != NodeToPromo->getCallSiteLoc())
+ continue;
+ FunctionSamples *FromSamples = NodeToPromo->getFunctionSamples();
+ if (FromSamples && FromSamples->getContext().hasState(InlinedContext))
+ continue;
+ promoteMergeContextSamplesTree(*NodeToPromo);
+ }
+ return;
+ }
+
+ // Get the context for the given callee that needs to be promoted
ContextTrieNode *NodeToPromo =
CallerNode->getChildContext(CallSite, CalleeName);
if (!NodeToPromo)
LLVM_DEBUG(dbgs() << " Found context tree root to promote: "
<< FromSamples->getContext() << "\n");
+ assert(!FromSamples->getContext().hasState(InlinedContext) &&
+ "Shouldn't promote inlined context profile");
StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext();
return promoteMergeContextSamplesTree(NodeToPromo, RootContext,
ContextStrToRemove);
StringRef CalleeName) {
assert(DIL && "Expect non-null location");
- // CSSPGO-TODO: need to support indirect callee
- if (CalleeName.empty())
- return nullptr;
-
ContextTrieNode *CallContext = getContextFor(DIL);
if (!CallContext)
return nullptr;
+ // When CalleeName is empty, the child context profile with max
+ // total samples will be returned.
return CallContext->getChildContext(
FunctionSamples::getCallSiteIdentifier(DIL), CalleeName);
}
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
"Number of functions with CFG mismatched profile");
STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
+STATISTIC(NumCSInlinedHitMinLimit,
+ "Number of functions with FDO inline stopped due to min size limit");
+STATISTIC(NumCSInlinedHitMaxLimit,
+ "Number of functions with FDO inline stopped due to max size limit");
+STATISTIC(
+ NumCSInlinedHitGrowthLimit,
+ "Number of functions with FDO inline stopped due to growth size limit");
+
// Command line option to specify the file to read samples from. This is
// mainly used for debugging.
static cl::opt<std::string> SampleProfileFile(
cl::desc("Inline cold call sites in profile loader if it's beneficial "
"for code size."));
+static cl::opt<int> ProfileInlineGrowthLimit(
+ "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
+ cl::desc("The size growth ratio limit for proirity-based sample profile "
+ "loader inlining."));
+
+static cl::opt<int> ProfileInlineLimitMin(
+ "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
+ cl::desc("The lower bound of size growth limit for "
+ "proirity-based sample profile loader inlining."));
+
+static cl::opt<int> ProfileInlineLimitMax(
+ "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
+ cl::desc("The upper bound of size growth limit for "
+ "proirity-based sample profile loader inlining."));
+
+static cl::opt<int> ProfileICPThreshold(
+ "sample-profile-icp-threshold", cl::Hidden, cl::init(5),
+ cl::desc(
+ "Relative hotness threshold for indirect "
+ "call promotion in proirity-based sample profile loader inlining."));
+
+static cl::opt<int> SampleHotCallSiteThreshold(
+ "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
+ cl::desc("Hot callsite threshold for proirity-based sample profile loader "
+ "inlining."));
+
+static cl::opt<bool> CallsitePrioritizedInline(
+ "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore,
+ cl::init(false),
+ cl::desc("Use call site prioritized inlining for sample profile loader."
+ "Currently only CSSPGO is supported."));
+
static cl::opt<int> SampleColdCallSiteThreshold(
"sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining cold callsites"));
DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
};
+// Inline candidate used by iterative callsite prioritized inliner
+struct InlineCandidate {
+ CallBase *CallInstr;
+ const FunctionSamples *CalleeSamples;
+ uint64_t CallsiteCount;
+};
+
+// Inline candidate comparer using call site weight
+struct CandidateComparer {
+ bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
+ if (LHS.CallsiteCount != RHS.CallsiteCount)
+ return LHS.CallsiteCount < RHS.CallsiteCount;
+
+ // Tie breaker using GUID so we have stable/deterministic inlining order
+ assert(LHS.CalleeSamples && RHS.CalleeSamples &&
+ "Expect non-null FunctionSamples");
+ return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) <
+ RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName());
+ }
+};
+
+using CandidateQueue =
+ PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
+ CandidateComparer>;
+
/// Sample profile pass.
///
/// This pass reads profile data from the file specified by
findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
- bool inlineCallInstruction(CallBase &CB);
+ CallBase *tryPromoteIndirectCall(Function &F, StringRef CalleeName,
+ uint64_t &Sum, uint64_t Count, CallBase *I,
+ const char *&Reason);
+ bool inlineCallInstruction(CallBase &CB,
+ const FunctionSamples *CalleeSamples);
bool inlineHotFunctions(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ // Helper functions call-site prioritized BFS inliner
+ // Will change the main FDO inliner to be work list based directly in
+ // upstream, then merge this change with that and remove the duplication.
+ InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
+ bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
+ bool tryInlineCandidate(InlineCandidate &Candidate,
+ SmallVector<CallBase *, 8> &InlinedCallSites);
+ bool
+ inlineHotFunctionsWithPriority(Function &F,
+ DenseSet<GlobalValue::GUID> &InlinedGUIDs);
// Inline cold/small functions in addition to hot ones
bool shouldInlineColdCallee(CallBase &CallInst);
void emitOptimizationRemarksForInlineCandidates(
return R;
}
+ auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
+ assert(L && R && "Expect non-null FunctionSamples");
+ if (L->getEntrySamples() != R->getEntrySamples())
+ return L->getEntrySamples() > R->getEntrySamples();
+ return FunctionSamples::getGUID(L->getName()) <
+ FunctionSamples::getGUID(R->getName());
+ };
+
+ if (ProfileIsCS) {
+ auto CalleeSamples =
+ ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
+ if (CalleeSamples.empty())
+ return R;
+
+ // For CSSPGO, we only use target context profile's entry count
+ // as that already includes both inlined callee and non-inlined ones..
+ Sum = 0;
+ for (const auto *const FS : CalleeSamples) {
+ Sum += FS->getEntrySamples();
+ R.push_back(FS);
+ }
+ llvm::sort(R, FSCompare);
+ return R;
+ }
+
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return R;
Sum += NameFS.second.getEntrySamples();
R.push_back(&NameFS.second);
}
- llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) {
- if (L->getEntrySamples() != R->getEntrySamples())
- return L->getEntrySamples() > R->getEntrySamples();
- return FunctionSamples::getGUID(L->getName()) <
- FunctionSamples::getGUID(R->getName());
- });
+ llvm::sort(R, FSCompare);
}
return R;
}
return it.first->second;
}
-bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) {
+CallBase *
+SampleProfileLoader::tryPromoteIndirectCall(Function &F, StringRef CalleeName,
+ uint64_t &Sum, uint64_t Count,
+ CallBase *I, const char *&Reason) {
+ Reason = "Callee function not available";
+ // R->getValue() != &F is to prevent promoting a recursive call.
+ // If it is a recursive call, we do not inline it as it could bloat
+ // the code exponentially. There is way to better handle this, e.g.
+ // clone the caller first, and inline the cloned caller if it is
+ // recursive. As llvm does not inline recursive calls, we will
+ // simply ignore it instead of handling it explicitly.
+ auto R = SymbolMap.find(CalleeName);
+ if (R != SymbolMap.end() && R->getValue() &&
+ !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
+ R->getValue()->hasFnAttribute("use-sample-profile") &&
+ R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) {
+ auto *DI =
+ &pgo::promoteIndirectCall(*I, R->getValue(), Count, Sum, false, ORE);
+ Sum -= Count;
+ return DI;
+ }
+ return nullptr;
+}
+
+bool SampleProfileLoader::inlineCallInstruction(
+ CallBase &CB, const FunctionSamples *CalleeSamples) {
if (ExternalInlineAdvisor) {
auto Advice = ExternalInlineAdvisor->getAdvice(CB);
if (!Advice->isInliningRecommended()) {
// The call to InlineFunction erases I, so we can't pass it here.
emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
true, CSINLINE_DEBUG);
+ if (ProfileIsCS)
+ ContextTracker->markContextSamplesInlined(CalleeSamples);
+ ++NumCSInlined;
return true;
}
return false;
if (!callsiteIsHot(FS, PSI))
continue;
- const char *Reason = "Callee function not available";
- // R->getValue() != &F is to prevent promoting a recursive call.
- // If it is a recursive call, we do not inline it as it could bloat
- // the code exponentially. There is way to better handle this, e.g.
- // clone the caller first, and inline the cloned caller if it is
- // recursive. As llvm does not inline recursive calls, we will
- // simply ignore it instead of handling it explicitly.
+ const char *Reason = nullptr;
auto CalleeFunctionName = FS->getFuncName();
- auto R = SymbolMap.find(CalleeFunctionName);
- if (R != SymbolMap.end() && R->getValue() &&
- !R->getValue()->isDeclaration() &&
- R->getValue()->getSubprogram() &&
- R->getValue()->hasFnAttribute("use-sample-profile") &&
- R->getValue() != &F &&
- isLegalToPromote(*I, R->getValue(), &Reason)) {
- uint64_t C = FS->getEntrySamples();
- auto &DI =
- pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE);
- Sum -= C;
+ if (CallBase *DI =
+ tryPromoteIndirectCall(F, CalleeFunctionName, Sum,
+ FS->getEntrySamples(), I, Reason)) {
PromotedInsns.insert(I);
// If profile mismatches, we should not attempt to inline DI.
if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
- inlineCallInstruction(cast<CallBase>(DI))) {
- if (ProfileIsCS)
- ContextTracker->markContextSamplesInlined(FS);
+ inlineCallInstruction(cast<CallBase>(*DI), FS)) {
localNotInlinedCallSites.erase(I);
LocalChanged = true;
- ++NumCSInlined;
}
} else {
LLVM_DEBUG(dbgs()
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
!CalledFunction->isDeclaration()) {
- if (inlineCallInstruction(*I)) {
- if (ProfileIsCS)
- ContextTracker->markContextSamplesInlined(
- localNotInlinedCallSites[I]);
+ if (inlineCallInstruction(*I, localNotInlinedCallSites.count(I)
+ ? localNotInlinedCallSites[I]
+ : nullptr)) {
localNotInlinedCallSites.erase(I);
LocalChanged = true;
- ++NumCSInlined;
}
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
}
}
+ // For CS profile, profile for not inlined context will be merged when
+ // base profile is being trieved
+ if (ProfileIsCS)
+ return Changed;
+
// Accumulate not inlined callsite information into notInlinedSamples
for (const auto &Pair : localNotInlinedCallSites) {
CallBase *I = Pair.getFirst();
return Changed;
}
+bool SampleProfileLoader::tryInlineCandidate(
+ InlineCandidate &Candidate, SmallVector<CallBase *, 8> &InlinedCallSites) {
+
+ CallBase &CB = *Candidate.CallInstr;
+ Function *CalledFunction = CB.getCalledFunction();
+ assert(CalledFunction && "Expect a callee with definition");
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *BB = CB.getParent();
+
+ InlineCost Cost = shouldInlineCandidate(Candidate);
+ if (Cost.isNever()) {
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
+ << "incompatible inlining");
+ return false;
+ }
+
+ if (!Cost)
+ return false;
+
+ InlineFunctionInfo IFI(nullptr, GetAC);
+ if (InlineFunction(CB, IFI).isSuccess()) {
+ // The call to InlineFunction erases I, so we can't pass it here.
+ emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
+ true, CSINLINE_DEBUG);
+
+ // Now populate the list of newly exposed call sites.
+ InlinedCallSites.clear();
+ for (auto &I : IFI.InlinedCallSites)
+ InlinedCallSites.push_back(I);
+
+ if (ProfileIsCS)
+ ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
+ ++NumCSInlined;
+ return true;
+ }
+ return false;
+}
+
+bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
+ CallBase *CB) {
+ assert(CB && "Expect non-null call instruction");
+
+ if (isa<IntrinsicInst>(CB))
+ return false;
+
+ // Find the callee's profile. For indirect call, find hottest target profile.
+ const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
+ if (!CalleeSamples)
+ return false;
+
+ uint64_t CallsiteCount = 0;
+ ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent());
+ if (Weight)
+ CallsiteCount = Weight.get();
+ if (CalleeSamples)
+ CallsiteCount = std::max(CallsiteCount, CalleeSamples->getEntrySamples());
+
+ *NewCandidate = {CB, CalleeSamples, CallsiteCount};
+ return true;
+}
+
+InlineCost
+SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
+ assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
+
+ std::unique_ptr<InlineAdvice> Advice = nullptr;
+ if (ExternalInlineAdvisor) {
+ Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr);
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
+ return InlineCost::getNever("not previously inlined");
+ }
+ Advice->recordInlining();
+ return InlineCost::getAlways("previously inlined");
+ }
+
+ // Adjust threshold based on call site hotness, only do this for callsite
+ // prioritized inliner because otherwise cost-benefit check is done earlier.
+ int SampleThreshold = SampleColdCallSiteThreshold;
+ if (CallsitePrioritizedInline) {
+ if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
+ SampleThreshold = SampleHotCallSiteThreshold;
+ else if (!ProfileSizeInline)
+ return InlineCost::getNever("cold callsite");
+ }
+
+ Function *Callee = Candidate.CallInstr->getCalledFunction();
+ assert(Callee && "Expect a definition for inline candidate of direct call");
+
+ InlineParams Params = getInlineParams();
+ Params.ComputeFullInlineCost = true;
+ // Checks if there is anything in the reachable portion of the callee at
+ // this callsite that makes this inlining potentially illegal. Need to
+ // set ComputeFullInlineCost, otherwise getInlineCost may return early
+ // when cost exceeds threshold without checking all IRs in the callee.
+ // The acutal cost does not matter because we only checks isNever() to
+ // see if it is legal to inline the callsite.
+ InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
+ GetTTI(*Callee), GetAC, GetTLI);
+
+ // For old FDO inliner, we inline the call site as long as cost is not
+ // "Never". The cost-benefit check is done earlier.
+ if (!CallsitePrioritizedInline) {
+ if (Cost.isNever())
+ return Cost;
+ return InlineCost::getAlways("hot callsite previously inlined");
+ }
+
+ // Honor always inline and never inline from call analyzer
+ if (Cost.isNever() || Cost.isAlways())
+ return Cost;
+
+ // Otherwise only use the cost from call analyzer, but overwite threshold with
+ // Sample PGO threshold.
+ return InlineCost::get(Cost.getCost(), SampleThreshold);
+}
+
+bool SampleProfileLoader::inlineHotFunctionsWithPriority(
+ Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
+ assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now");
+
+ // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
+ // Profile symbol list is ignored when profile-sample-accurate is on.
+ assert((!ProfAccForSymsInList ||
+ (!ProfileSampleAccurate &&
+ !F.hasFnAttribute("profile-sample-accurate"))) &&
+ "ProfAccForSymsInList should be false when profile-sample-accurate "
+ "is enabled");
+
+ // Populating worklist with initial call sites from root inliner, along
+ // with call site weights.
+ CandidateQueue CQueue;
+ InlineCandidate NewCandidate;
+ for (auto &BB : F) {
+ for (auto &I : BB.getInstList()) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (getInlineCandidate(&NewCandidate, CB))
+ CQueue.push(NewCandidate);
+ }
+ }
+
+ // Cap the size growth from profile guided inlining. This is needed even
+ // though cost of each inline candidate already accounts for callee size,
+ // because with top-down inlining, we can grow inliner size significantly
+ // with large number of smaller inlinees each pass the cost check.
+ assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
+ "Max inline size limit should not be smaller than min inline size "
+ "limit.");
+ unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
+ SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
+ SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
+ if (ExternalInlineAdvisor)
+ SizeLimit = std::numeric_limits<unsigned>::max();
+
+ // Perform iterative BFS call site prioritized inlining
+ bool Changed = false;
+ while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
+ InlineCandidate Candidate = CQueue.top();
+ CQueue.pop();
+ CallBase *I = Candidate.CallInstr;
+ Function *CalledFunction = I->getCalledFunction();
+
+ if (CalledFunction == &F)
+ continue;
+ if (I->isIndirectCall()) {
+ if (PromotedInsns.count(I))
+ continue;
+ uint64_t Sum;
+ auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
+ uint64_t SumOrigin = Sum;
+ for (const auto *FS : CalleeSamples) {
+ // TODO: Consider disable pre-lTO ICP for MonoLTO as well
+ if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
+ FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
+ PSI->getOrCompHotCountThreshold());
+ continue;
+ }
+ uint64_t EntryCountDistributed = FS->getEntrySamples();
+ // In addition to regular inline cost check, we also need to make sure
+ // ICP isn't introducing excessive speculative checks even if individual
+ // target looks beneficial to promote and inline. That means we should
+ // only do ICP when there's a small number dominant targets.
+ if (EntryCountDistributed < SumOrigin / ProfileICPThreshold)
+ break;
+ // TODO: Fix CallAnalyzer to handle all indirect calls.
+ // For indirect call, we don't run CallAnalyzer to get InlineCost
+ // before actual inlining. This is because we could see two different
+ // types from the same definition, which makes CallAnalyzer choke as
+ // it's expecting matching parameter type on both caller and callee
+ // side. See example from PR18962 for the triggering cases (the bug was
+ // fixed, but we generate different types).
+ if (!PSI->isHotCount(EntryCountDistributed))
+ break;
+ const char *Reason = nullptr;
+ auto CalleeFunctionName = FS->getFuncName();
+ if (CallBase *DI = tryPromoteIndirectCall(
+ F, CalleeFunctionName, Sum, EntryCountDistributed, I, Reason)) {
+ // Attach function profile for promoted indirect callee, and update
+ // call site count for the promoted inline candidate too.
+ Candidate = {DI, FS, EntryCountDistributed};
+ PromotedInsns.insert(I);
+ SmallVector<CallBase *, 8> InlinedCallSites;
+ // If profile mismatches, we should not attempt to inline DI.
+ if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
+ tryInlineCandidate(Candidate, InlinedCallSites)) {
+ for (auto *CB : InlinedCallSites) {
+ if (getInlineCandidate(&NewCandidate, CB))
+ CQueue.emplace(NewCandidate);
+ }
+ Changed = true;
+ }
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "\nFailed to promote indirect call to "
+ << CalleeFunctionName << " because " << Reason << "\n");
+ }
+ }
+ } else if (CalledFunction && CalledFunction->getSubprogram() &&
+ !CalledFunction->isDeclaration()) {
+ SmallVector<CallBase *, 8> InlinedCallSites;
+ if (tryInlineCandidate(Candidate, InlinedCallSites)) {
+ for (auto *CB : InlinedCallSites) {
+ if (getInlineCandidate(&NewCandidate, CB))
+ CQueue.emplace(NewCandidate);
+ }
+ Changed = true;
+ }
+ } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
+ findCalleeFunctionSamples(*I)->findInlinedFunctions(
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
+ }
+ }
+
+ if (!CQueue.empty()) {
+ if (SizeLimit == (unsigned)ProfileInlineLimitMax)
+ ++NumCSInlinedHitMaxLimit;
+ else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
+ ++NumCSInlinedHitMinLimit;
+ else
+ ++NumCSInlinedHitGrowthLimit;
+ }
+
+ return Changed;
+}
+
/// Find equivalence classes for the given block.
///
/// This finds all the blocks that are guaranteed to execute the same
}
DenseSet<GlobalValue::GUID> InlinedGUIDs;
- Changed |= inlineHotFunctions(F, InlinedGUIDs);
+ if (ProfileIsCS && CallsitePrioritizedInline)
+ Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
+ else
+ Changed |= inlineHotFunctions(F, InlinedGUIDs);
// Compute basic block weights.
Changed |= computeBlockWeights(F);
ProfileIsCS = true;
FunctionSamples::ProfileIsCS = true;
+ // Enable priority-base inliner and size inline by default for CSSPGO.
+ if (!ProfileSizeInline.getNumOccurrences())
+ ProfileSizeInline = true;
+ if (!CallsitePrioritizedInline.getNumOccurrences())
+ CallsitePrioritizedInline = true;
+
// Tracker for profiles under different context
ContextTracker =
std::make_unique<SampleContextTracker>(Reader->getProfiles());
--- /dev/null
+[test]:63067:0
+ 1: 3345 _Z3barv:1398 _Z3foov:2059
+ 2: 100 _Z3bazv:102
+ 3: 100 _Z3zoov:102
+[test:1 @ _Z3barv]:200:100
+ 1: 100
+[test:1 @ _Z3foov]:4220:1200
+ 14: 4220
+[test:2 @ _Z3bazv]:200:100
+ 5: 100
\ No newline at end of file
--- /dev/null
+; REQUIRES: asserts
+; Test that the new FDO inliner using prioty queue will not visit same call site again and again.
+; Use debug prints as repeated call site evaluation is not visible from final inline decision.
+
+; Note that we need new pass manager to enable top-down processing for sample profile loader
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=OLD-INLINE
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=1 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=NEW-INLINE
+
+; Old inliner will evaluate the same call site three times
+; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
+; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi
+; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
+; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi
+; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
+; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi
+
+; New inliner only evaluate the same call site once
+; NEW-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
+; NEW-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi
+; NEW-INLINE-NOT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
+; NEW-INLINE-NOT: Callee context found: main:3.1 @ _Z5funcBi
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+ br label %for.body, !dbg !25
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 %add3, !dbg !27
+
+for.body: ; preds = %for.body, %entry
+ %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+ %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+ %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+ %add = add nuw nsw i32 %x.011, 1, !dbg !31
+ %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+ %add2 = add i32 %call, %r.010, !dbg !34
+ %add3 = add i32 %add2, %call1, !dbg !35
+ %dec = add nsw i32 %x.011, -1, !dbg !36
+ %cmp = icmp eq i32 %x.011, 0, !dbg !38
+ br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+ %add = add nsw i32 %x, 100000, !dbg !44
+ %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+ ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+ %cmp = icmp sgt i32 %x, 0, !dbg !57
+ br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader: ; preds = %entry
+ %cmp313 = icmp slt i32 %x, 0, !dbg !60
+ br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body: ; preds = %while.body, %entry
+ %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+ %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+ %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+ %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+ %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+ br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4: ; preds = %while.body4, %while.cond2.preheader
+ %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+ %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+ %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+ %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+ %cmp3 = icmp slt i32 %add, 0, !dbg !60
+ br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end: ; preds = %while.body4, %while.body, %while.cond2.preheader
+ %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+ ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+ %sub = add nsw i32 %x, -100000, !dbg !51
+ %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+ ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
--- /dev/null
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s
+
+define void @test(void ()*) #0 !dbg !3 {
+;; Add two direct call to force top-down order for sample profile loader
+ call void @_Z3foov(), !dbg !7
+ call void @_Z3barv(), !dbg !7
+ call void @_Z3bazv(), !dbg !7
+ %2 = alloca void ()*
+ store void ()* %0, void ()** %2
+ %3 = load void ()*, void ()** %2
+ call void %3(), !dbg !4
+ %4 = alloca void ()*
+ store void ()* %0, void ()** %4
+ %5 = load void ()*, void ()** %4
+ call void %5(), !dbg !5
+ ret void
+}
+
+define void @_Z3foov() #0 !dbg !8 {
+ ret void
+}
+
+define void @_Z3barv() #0 !dbg !9 {
+ ret void
+}
+
+define void @_Z3bazv() #0 !dbg !10 {
+ ret void
+}
+
+define void @_Z3zoov() #0 !dbg !11 {
+ ret void
+}
+
+attributes #0 = {"use-sample-profile"}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1)
+!1 = !DIFile(filename: "test.cc", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 3, unit: !0)
+!4 = !DILocation(line: 4, scope: !3)
+!5 = !DILocation(line: 5, scope: !3)
+!6 = !DILocation(line: 6, scope: !3)
+!7 = !DILocation(line: 7, scope: !3)
+!8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 29, unit: !0)
+!9 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 32, unit: !0)
+!10 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 24, unit: !0)
+!11 = distinct !DISubprogram(name: "zoo", linkageName: "_Z3zoov", scope: !1, file: !1, line: 24, unit: !0)
+
+
+; ICP-ALL: remark: test.cc:5:0: _Z3bazv inlined into test
+; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3foov inlined into test
+; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3barv inlined into test
+; ICP-ALL-NOT: remark
+
+; ICP-HOT: remark: test.cc:4:0: _Z3foov inlined into test
+; ICP-HOT-NOT: remark
--- /dev/null
+; Test for CSSPGO's new early inliner using priority queue
+
+; Note that we need new pass manager to enable top-down processing for sample profile loader
+; Test we inlined the following in top-down order with old inliner
+; main:3 @ _Z5funcAi
+; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
+; _Z5funcBi:1 @ _Z8funcLeafi
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+;
+; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, so we get less inlining for given profile
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW
+;
+; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning hot cutoff can get us the same inlining
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+;
+; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning cold sample profile inline threshold can get us the same inlining
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE
+;
+; With new FDO early inliner and tuned cutoff, we can control inlining through size growth tuning knob.
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2
+
+
+; INLINE-BASE: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10
+; INLINE-BASE: remark: merged.cpp:27:11: _Z8funcLeafi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 @ main:3:10
+; INLINE-BASE: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
+
+; INLINE-NEW: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10
+; INLINE-NEW-NOT: remark
+
+; INLINE-NEW-LIMIT1-NOT: remark
+
+; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11
+; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11
+; INLINE-NEW-LIMIT2-NOT: remark
+
+@factor = dso_local global i32 3, align 4, !dbg !0
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+ br label %for.body, !dbg !25
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 %add3, !dbg !27
+
+for.body: ; preds = %for.body, %entry
+ %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+ %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+ %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+ %add = add nuw nsw i32 %x.011, 1, !dbg !31
+ %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+ %add2 = add i32 %call, %r.010, !dbg !34
+ %add3 = add i32 %add2, %call1, !dbg !35
+ %dec = add nsw i32 %x.011, -1, !dbg !36
+ %cmp = icmp eq i32 %x.011, 0, !dbg !38
+ br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
+entry:
+ %add = add nsw i32 %x, 100000, !dbg !44
+ %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
+ ret i32 %call, !dbg !46
+}
+
+define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
+entry:
+ %cmp = icmp sgt i32 %x, 0, !dbg !57
+ br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
+
+while.cond2.preheader: ; preds = %entry
+ %cmp313 = icmp slt i32 %x, 0, !dbg !60
+ br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
+
+while.body: ; preds = %while.body, %entry
+ %x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
+ %tmp = load volatile i32, i32* @factor, align 4, !dbg !64
+ %call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
+ %sub = sub nsw i32 %x.addr.016, %call, !dbg !68
+ %cmp1 = icmp sgt i32 %sub, 0, !dbg !69
+ br i1 %cmp1, label %while.body, label %if.end, !dbg !71
+
+while.body4: ; preds = %while.body4, %while.cond2.preheader
+ %x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
+ %tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
+ %call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
+ %add = add nsw i32 %call5, %x.addr.114, !dbg !75
+ %cmp3 = icmp slt i32 %add, 0, !dbg !60
+ br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
+
+if.end: ; preds = %while.body4, %while.body, %while.cond2.preheader
+ %x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
+ ret i32 %x.addr.2, !dbg !76
+}
+
+define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
+entry:
+ %sub = add nsw i32 %x, -100000, !dbg !51
+ %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
+ ret i32 %call, !dbg !53
+}
+
+declare i32 @_Z3fibi(i32)
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 14, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
+!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!44 = !DILocation(line: 27, column: 22, scope: !40)
+!45 = !DILocation(line: 27, column: 11, scope: !40)
+!46 = !DILocation(line: 29, column: 3, scope: !40)
+!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!51 = !DILocation(line: 33, column: 22, scope: !47)
+!52 = !DILocation(line: 33, column: 11, scope: !47)
+!53 = !DILocation(line: 35, column: 3, scope: !47)
+!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!57 = !DILocation(line: 49, column: 9, scope: !58)
+!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
+!59 = !DILocation(line: 49, column: 7, scope: !54)
+!60 = !DILocation(line: 58, column: 14, scope: !61)
+!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
+!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
+!63 = !DILocation(line: 58, column: 5, scope: !61)
+!64 = !DILocation(line: 52, column: 16, scope: !65)
+!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
+!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
+!67 = !DILocation(line: 52, column: 12, scope: !65)
+!68 = !DILocation(line: 52, column: 9, scope: !65)
+!69 = !DILocation(line: 51, column: 14, scope: !70)
+!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
+!71 = !DILocation(line: 51, column: 5, scope: !70)
+!72 = !DILocation(line: 59, column: 16, scope: !73)
+!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
+!74 = !DILocation(line: 59, column: 12, scope: !73)
+!75 = !DILocation(line: 59, column: 9, scope: !73)
+!76 = !DILocation(line: 63, column: 3, scope: !54)
; based on inline decision, so post inline counts are accurate.
; Note that we need new pass manager to enable top-down processing for sample profile loader
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-prioritized-inline=0 -sample-profile-inline-size=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT
-; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile
+; Test we inlined the following in top-down order and promot rest not inlined context profile into base profile
; main:3 @ _Z5funcAi
; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
; _Z5funcBi:1 @ _Z8funcLeafi
; INLINE-ALL-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi
; INLINE-ALL-NEXT: Callee context found: main:3 @ _Z5funcAi
; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi
-; INLINE-ALL-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi(
-; INLINE-ALL-NEXT: Callee context found: main:3.1 @ _Z5funcBi
; INLINE-ALL-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z8funcLeafi
; INLINE-ALL-NEXT: Callee context found: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-ALL-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
-; INLINE-ALL-NEXT: Callee context found: main:3.1 @ _Z5funcBi
; INLINE-ALL-NEXT: Getting callee context for instr: %call.i1 = tail call i32 @_Z3fibi
; INLINE-ALL-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi
; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi
; INLINE-ALL-NEXT: Getting base profile for function: _Z8funcLeafi
; INLINE-ALL-NEXT: Merging context profile into base profile: _Z8funcLeafi
-; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile
-; main:3 @ _Z5funcAi
+; Test we inlined the following in top-down order and promot rest not inlined context profile into base profile
; _Z5funcAi:1 @ _Z8funcLeafi
; _Z5funcBi:1 @ _Z8funcLeafi
; INLINE-HOT: Getting base profile for function: main
; INLINE-HOT-NEXT: Merging context profile into base profile: main
; INLINE-HOT-NEXT: Found context tree root to promote: external:12 @ main
; INLINE-HOT-NEXT: Context promoted and merged to: main
-; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !58
+; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi
; INLINE-HOT-NEXT: Callee context found: main:3.1 @ _Z5funcBi
-; INLINE-HOT-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !63
+; INLINE-HOT-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi
; INLINE-HOT-NEXT: Callee context found: main:3 @ _Z5funcAi
; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi
; INLINE-HOT-NEXT: Merging context profile into base profile: _Z5funcAi
; INLINE-HOT-NEXT: Found context tree root to promote: main:3 @ _Z5funcAi
; INLINE-HOT-NEXT: Context promoted to: _Z5funcAi
; INLINE-HOT-NEXT: Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
+; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50
; INLINE-HOT-NEXT: Callee context found: _Z5funcAi:1 @ _Z8funcLeafi
; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi
; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
; INLINE-HOT-NEXT: Context promoted to: _Z5funcBi:1 @ _Z8funcLeafi
; INLINE-HOT-NEXT: Found context tree root to promote: externalA:17 @ _Z5funcBi
; INLINE-HOT-NEXT: Context promoted and merged to: _Z5funcBi
-; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !50
+; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi
; INLINE-HOT-NEXT: Callee context found: _Z5funcBi:1 @ _Z8funcLeafi
; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi
-; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62
-; INLINE-HOT-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69
+; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi
+; INLINE-HOT-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi
; INLINE-HOT-NEXT: Getting base profile for function: _Z8funcLeafi
; INLINE-HOT-NEXT: Merging context profile into base profile: _Z8funcLeafi
; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/profile-context-tracker.prof -o %t
; Note that we need new pass manager to enable top-down processing for sample profile loader
-; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
+; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
; main:3 @ _Z5funcAi
; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi
; _Z5funcBi:1 @ _Z8funcLeafi
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
-
-; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
-; main:3 @ _Z5funcAi
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL
+;
+; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile
; _Z5funcAi:1 @ _Z8funcLeafi
; _Z5funcBi:1 @ _Z8funcLeafi
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT
@factor = dso_local global i32 3, align 4, !dbg !0
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -sample-profile-prioritized-inline=0 -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s
; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml
; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/pseudo-probe-inline.prof -o %t2
-; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -sample-profile-prioritized-inline=0 -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s
; RUN: FileCheck %s -check-prefix=YAML < %t2.opt.yaml
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"