From 234f3b16918b5721bbc3449d11e4f317142271e7 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Stefan=20Gr=C3=A4nitz?=
Date: Sat, 11 Jan 2020 01:09:42 +0100
Subject: [PATCH] Add ThinLtoJIT example

Summary:
Prototype of a JIT compiler that utilizes ThinLTO summaries to compile modules
ahead of time. This is an implementation of the concept I presented in my
"ThinLTO Summaries in JIT Compilation" talk at the 2018 Developers' Meeting:
http://llvm.org/devmtg/2018-10/talk-abstracts.html#lt8

Up front, the JIT populates the *combined ThinLTO module index*, which provides
fast access to the global call-graph and to each function's defining module
path. Next, it loads the main function's module and compiles it. All functions
in the module are emitted with prologue instructions that *fire a discovery
flag* once execution reaches them. In parallel, the *discovery thread*
busy-watches the existing flags. Once it detects that a flag has fired, it uses
the module index to find all functions reachable from it within a given number
of calls and submits their defining modules to the compilation pipeline. While
execution continues, more flags fire and further modules are added.

Ideally, the JIT can be tuned so that in the majority of cases the code on the
execution path has been compiled ahead of time. Where this fails, a *definition
generator* is in place that loads the defining module synchronously as soon as
a missing function is reached.

Reviewers: lhames, dblaikie, jfb, tejohnson, pree-jackie, AlexDenisov, kavon

Subscribers: mgorny, mehdi_amini, inglorion, hiraditya, steven_wu, dexonsmith, arphaman, jfb, merge_guards_bot, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D72486
---
 llvm/examples/CMakeLists.txt | 1 +
 llvm/examples/ThinLtoJIT/CMakeLists.txt | 18 ++
 .../examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp | 65 ++++
 llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h | 57 ++++
 .../ThinLtoJIT/ThinLtoInstrumentationLayer.cpp | 225 ++++++++++++++
 .../ThinLtoJIT/ThinLtoInstrumentationLayer.h | 77 +++++
 llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp | 333 +++++++++++++++++++++
 llvm/examples/ThinLtoJIT/ThinLtoJIT.h | 111 +++++++
 llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp | 268 +++++++++++++++++
 llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h | 94 ++++++
 llvm/examples/ThinLtoJIT/bench | 100 +++++++
 llvm/examples/ThinLtoJIT/main.cpp | 83 +++++
 12 files changed, 1432 insertions(+)
 create mode 100644 llvm/examples/ThinLtoJIT/CMakeLists.txt
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoJIT.h
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp
 create mode 100644 llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h
 create mode 100755 llvm/examples/ThinLtoJIT/bench
 create mode 100644 llvm/examples/ThinLtoJIT/main.cpp

diff --git a/llvm/examples/CMakeLists.txt b/llvm/examples/CMakeLists.txt
index 863c12afbda..49d5f52ee07 100644
--- a/llvm/examples/CMakeLists.txt
+++ b/llvm/examples/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(Kaleidoscope)
 add_subdirectory(ModuleMaker)
 add_subdirectory(SpeculativeJIT)
 add_subdirectory(Bye)
+add_subdirectory(ThinLtoJIT)
 
 if(LLVM_ENABLE_EH AND (NOT WIN32) AND (NOT "${LLVM_NATIVE_ARCH}" STREQUAL "ARM"))
add_subdirectory(ExceptionDemo) diff --git a/llvm/examples/ThinLtoJIT/CMakeLists.txt b/llvm/examples/ThinLtoJIT/CMakeLists.txt new file mode 100644 index 00000000000..0de712dec61 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_LINK_COMPONENTS + Core + IRReader + OrcJIT + ExecutionEngine + Support + nativecodegen + Analysis + Passes + ) + +add_llvm_example(ThinLtoJIT + main.cpp + ThinLtoJIT.cpp + ThinLtoModuleIndex.cpp + ThinLtoInstrumentationLayer.cpp + ThinLtoDiscoveryThread.cpp + ) diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp new file mode 100644 index 00000000000..203532436ab --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.cpp @@ -0,0 +1,65 @@ +#include "ThinLtoDiscoveryThread.h" + +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" + +#include "ThinLtoInstrumentationLayer.h" +#include "ThinLtoModuleIndex.h" + +#include + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +void ThinLtoDiscoveryThread::operator()() { + while (KeepRunning.load()) { + std::vector Indexes = Layer.takeFlagsThatFired(); + + if (!Indexes.empty()) { + LLVM_DEBUG(dbgs() << Indexes.size() << " new flags raised\n"); + auto ReachedFunctions = Layer.takeFlagOwners(std::move(Indexes)); + + for (GlobalValue::GUID F : ReachedFunctions) { + if (GlobalValueSummary *S = GlobalIndex.getSummary(F)) { + assert(isa(S) && "Reached symbols are functions"); + GlobalIndex.discoverCalleeModulePaths(cast(S), + LookaheadLevels); + } else { + LLVM_DEBUG(dbgs() << "No summary for GUID: " << F << "\n"); + } + } + + if (GlobalIndex.getNumDiscoveredModules() > 0) + spawnLookupForHighRankModules(); + } + } +} + +void ThinLtoDiscoveryThread::spawnLookupForHighRankModules() { + std::vector Paths = GlobalIndex.selectNextPaths(); + GlobalIndex.scheduleModuleParsing(Paths); + + // In order to add modules we need exclusive access to the execution session. + std::thread([this, Paths = std::move(Paths)]() { + ES.runSessionLocked([this, Paths = std::move(Paths)]() mutable { + for (const std::string &Path : Paths) { + ThreadSafeModule TSM = GlobalIndex.takeModule(Path); + if (!TSM) + // In the meantime the module was added synchronously. + continue; + + if (Error LoadErr = AddModule(std::move(TSM))) + // Failed to add the module to the session. 
+ ES.reportError(std::move(LoadErr)); + + ++NumModulesSubmitted; + } + }); + }).detach(); +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h new file mode 100644 index 00000000000..4ca3c95dee0 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoDiscoveryThread.h @@ -0,0 +1,57 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H +#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYTHREAD_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/ModuleSummaryIndex.h" + +#include "ThinLtoJIT.h" + +#include +#include + +namespace llvm { +namespace orc { + +class ExecutionSession; +class ThinLtoModuleIndex; +class ThinLtoInstrumentationLayer; + +class ThinLtoDiscoveryThread { +public: + ThinLtoDiscoveryThread(std::atomic &RunningFlag, ExecutionSession &ES, + JITDylib *MainJD, ThinLtoInstrumentationLayer &L, + ThinLtoModuleIndex &GlobalIndex, + ThinLtoJIT::AddModuleFunction AddModule, + unsigned LookaheadLevels, bool PrintStats) + : KeepRunning(RunningFlag), ES(ES), Layer(L), GlobalIndex(GlobalIndex), + AddModule(std::move(AddModule)), LookaheadLevels(LookaheadLevels), + PrintStats(PrintStats) {} + + ~ThinLtoDiscoveryThread() { + if (PrintStats) + dump(errs()); + } + + void operator()(); + + void dump(raw_ostream &OS) { + OS << format("Modules submitted asynchronously: %d\n", NumModulesSubmitted); + } + +private: + std::atomic &KeepRunning; + ExecutionSession &ES; + ThinLtoInstrumentationLayer &Layer; + ThinLtoModuleIndex &GlobalIndex; + ThinLtoJIT::AddModuleFunction AddModule; + unsigned LookaheadLevels; + bool PrintStats; + unsigned NumModulesSubmitted{0}; + + void spawnLookupForHighRankModules(); +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp new file mode 100644 index 00000000000..c52fc9d158c --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp @@ -0,0 +1,225 @@ +#include "ThinLtoInstrumentationLayer.h" + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Process.h" + +#include + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +// TODO: Fixed set of flags may not always be enough. Make this expandable. +void ThinLtoInstrumentationLayer::allocateDiscoveryFlags(unsigned MinFlags) { + // Round up to full memory pages. + unsigned PageSize = sys::Process::getPageSizeEstimate(); + unsigned NumPagesEach = (MinFlags + (PageSize - 1)) / PageSize; + unsigned NumPagesTotal = 2 * NumPagesEach; + assert(isPowerOf2_64(PageSize) && "Adjust aligned memory alloc below"); + + // Allocate one more page to make up for size loss due to alignment. + void *Storage = std::calloc(NumPagesTotal + 1, PageSize); + uint64_t StorageAddr = reinterpret_cast(Storage); + uint64_t PageSizeDecr = PageSize - 1; + uint64_t AlignedAddr = ((StorageAddr + PageSizeDecr) & ~PageSizeDecr); + uint64_t Diff = AlignedAddr - StorageAddr; + + // For each flag we allocate one byte in each location: Incoming and Handled. 
+ // TODO: 'Handled' could be a bitset, but size must be dynamic + NumFlagsUsed.store(0); + NumFlagsAllocated = NumPagesEach * PageSize; + FlagsStorage = static_cast(Storage); + FlagsIncoming = reinterpret_cast(FlagsStorage + Diff); + FlagsHandled = FlagsIncoming + NumFlagsAllocated; + + static_assert(sizeof(FlagsIncoming[0]) == sizeof(uint8_t), "Flags are bytes"); + assert(reinterpret_cast(FlagsIncoming) % PageSize == 0); + assert(reinterpret_cast(FlagsHandled) % PageSize == 0); + assert(NumFlagsAllocated >= MinFlags); +} + +// Reserve a new set of discovery flags and return the index of the first one. +unsigned ThinLtoInstrumentationLayer::reserveDiscoveryFlags(unsigned Count) { +#ifndef NDEBUG + for (unsigned i = NumFlagsUsed.load(), e = i + Count; i < e; i++) { + assert(FlagsIncoming[i] == Clear); + } +#endif + + assert(Count > 0); + return NumFlagsUsed.fetch_add(Count); +} + +void ThinLtoInstrumentationLayer::registerDiscoveryFlagOwners( + std::vector Guids, unsigned FirstIdx) { + unsigned Count = Guids.size(); + + std::lock_guard Lock(DiscoveryFlagsInfoLock); + for (unsigned i = 0; i < Count; i++) { + assert(!FlagOwnersMap.count(FirstIdx + i) && + "Flag should not have an owner at this point"); + FlagOwnersMap[FirstIdx + i] = Guids[i]; + } +} + +std::vector ThinLtoInstrumentationLayer::takeFlagsThatFired() { + // This is only effective with the respective Release. + FlagsSync.load(std::memory_order_acquire); + + std::vector Indexes; + unsigned NumIndexesUsed = NumFlagsUsed.load(); + for (unsigned i = 0; i < NumIndexesUsed; i++) { + if (FlagsIncoming[i] == Fired && FlagsHandled[i] == Clear) { + FlagsHandled[i] = Fired; + Indexes.push_back(i); + } + } + + return Indexes; +} + +std::vector +ThinLtoInstrumentationLayer::takeFlagOwners(std::vector Indexes) { + std::vector ReachedFunctions; + std::lock_guard Lock(DiscoveryFlagsInfoLock); + + for (unsigned i : Indexes) { + auto KV = FlagOwnersMap.find(i); + assert(KV != FlagOwnersMap.end()); + ReachedFunctions.push_back(KV->second); + FlagOwnersMap.erase(KV); + } + + return ReachedFunctions; +} + +void ThinLtoInstrumentationLayer::nudgeIntoDiscovery( + std::vector Functions) { + unsigned Count = Functions.size(); + + // Registering synthetic flags in advance. We expect them to get processed + // before the respective functions get emitted. If not, the emit() function + unsigned FirstFlagIdx = reserveDiscoveryFlags(Functions.size()); + registerDiscoveryFlagOwners(std::move(Functions), FirstFlagIdx); + + // Initialize the flags as fired and force a cache sync, so discovery will + // pick them up as soon as possible. + for (unsigned i = FirstFlagIdx; i < FirstFlagIdx + Count; i++) { + FlagsIncoming[i] = Fired; + } + if (MemFence & ThinLtoJIT::FenceStaticCode) { + FlagsSync.store(0, std::memory_order_release); + } + + LLVM_DEBUG(dbgs() << "Nudged " << Count << " new functions into discovery\n"); +} + +void ThinLtoInstrumentationLayer::emit(MaterializationResponsibility R, + ThreadSafeModule TSM) { + TSM.withModuleDo([this](Module &M) { + std::vector FunctionsToInstrument; + + // We may have discovered ahead of some functions already, but we still + // instrument them all. Their notifications steer the future direction of + // discovery. + for (Function &F : M.getFunctionList()) + if (!F.isDeclaration()) + FunctionsToInstrument.push_back(&F); + + if (!FunctionsToInstrument.empty()) { + IRBuilder<> B(M.getContext()); + std::vector NewDiscoveryRoots; + + // Flags that fire must have owners registered. 
We will do it below and + // that's fine, because they can only be reached once the code is emitted. + unsigned FirstFlagIdx = + reserveDiscoveryFlags(FunctionsToInstrument.size()); + + unsigned NextFlagIdx = FirstFlagIdx; + for (Function *F : FunctionsToInstrument) { + // TODO: Emitting the write operation into an indirection stub would + // allow to skip it once we got the notification. + BasicBlock *E = &F->getEntryBlock(); + B.SetInsertPoint(BasicBlock::Create( + M.getContext(), "NotifyFunctionReachedProlog", F, E)); + compileFunctionReachedFlagSetter(B, FlagsIncoming + NextFlagIdx); + B.CreateBr(E); + + std::string GlobalName = GlobalValue::getGlobalIdentifier( + F->getName(), F->getLinkage(), M.getSourceFileName()); + NewDiscoveryRoots.push_back(GlobalValue::getGUID(GlobalName)); + ++NextFlagIdx; + } + + LLVM_DEBUG(dbgs() << "Instrumented " << NewDiscoveryRoots.size() + << " new functions in module " << M.getName() << "\n"); + + // Submit owner info, so the DiscoveryThread can evaluate the flags. + registerDiscoveryFlagOwners(std::move(NewDiscoveryRoots), FirstFlagIdx); + } + }); + + BaseLayer.emit(std::move(R), std::move(TSM)); +} + +void ThinLtoInstrumentationLayer::compileFunctionReachedFlagSetter( + IRBuilder<> &B, Flag *F) { + assert(*F == Clear); + Type *Int64Ty = Type::getInt64Ty(B.getContext()); + + // Write one immediate 8bit value to a fixed location in memory. + auto FlagAddr = pointerToJITTargetAddress(F); + Type *FlagTy = Type::getInt8Ty(B.getContext()); + B.CreateStore(ConstantInt::get(FlagTy, Fired), + B.CreateIntToPtr(ConstantInt::get(Int64Ty, FlagAddr), + FlagTy->getPointerTo())); + + if (MemFence & ThinLtoJIT::FenceJITedCode) { + // Overwrite the sync value with Release ordering. The discovery thread + // reads it with Acquire ordering. The actual value doesn't matter. 
+ static constexpr bool IsVolatile = true; + static constexpr Instruction *NoInsertBefore = nullptr; + auto SyncFlagAddr = pointerToJITTargetAddress(&FlagsSync); + + B.Insert( + new StoreInst(ConstantInt::get(Int64Ty, 0), + B.CreateIntToPtr(ConstantInt::get(Int64Ty, SyncFlagAddr), + Int64Ty->getPointerTo()), + IsVolatile, MaybeAlign(64), AtomicOrdering::Release, + SyncScope::System, NoInsertBefore)); + } +} + +void ThinLtoInstrumentationLayer::dump(raw_ostream &OS) { + OS << "Discovery flags stats\n"; + + unsigned NumFlagsFired = 0; + for (unsigned i = 0; i < NumFlagsAllocated; i++) { + if (FlagsIncoming[i] == Fired) + ++NumFlagsFired; + } + OS << "Alloc: " << format("%6.d", NumFlagsAllocated) << "\n"; + OS << "Issued: " << format("%6.d", NumFlagsUsed.load()) << "\n"; + OS << "Fired: " << format("%6.d", NumFlagsFired) << "\n"; + + unsigned RemainingFlagOwners = 0; + for (const auto &_ : FlagOwnersMap) { + ++RemainingFlagOwners; + (void)_; + } + OS << "\nFlagOwnersMap has " << RemainingFlagOwners + << " remaining entries.\n"; +} + +ThinLtoInstrumentationLayer::~ThinLtoInstrumentationLayer() { + std::free(FlagsStorage); +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h new file mode 100644 index 00000000000..cd872078947 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h @@ -0,0 +1,77 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H +#define LLVM_EXAMPLES_THINLTOJIT_DISCOVERYLAYER_H + +#include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/Layer.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" + +#include "ThinLtoJIT.h" + +#include +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class ThinLtoInstrumentationLayer : public IRLayer { +public: + ThinLtoInstrumentationLayer(ExecutionSession &ES, IRCompileLayer &BaseLayer, + ThinLtoJIT::ExplicitMemoryBarrier MemFence, + unsigned FlagsPerBucket) + : IRLayer(ES, BaseLayer.getManglingOptions()), BaseLayer(BaseLayer), + MemFence(MemFence) { + // TODO: So far we only allocate one bucket. + allocateDiscoveryFlags(FlagsPerBucket); + } + + ~ThinLtoInstrumentationLayer() override; + + void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override; + + unsigned reserveDiscoveryFlags(unsigned Count); + void registerDiscoveryFlagOwners(std::vector Guids, + unsigned FirstIdx); + + void nudgeIntoDiscovery(std::vector Functions); + + std::vector takeFlagsThatFired(); + std::vector takeFlagOwners(std::vector Indexes); + + void dump(raw_ostream &OS); + +private: + IRCompileLayer &BaseLayer; + ThinLtoJIT::ExplicitMemoryBarrier MemFence; + + enum Flag : uint8_t { Clear = 0, Fired = 1 }; + + // Lock-free read access. + uint8_t *FlagsStorage; + Flag *FlagsIncoming; // lock-free write by design + Flag *FlagsHandled; + unsigned NumFlagsAllocated; + std::atomic NumFlagsUsed; // spin-lock + + // Acquire/release sync between writers and reader + std::atomic FlagsSync; + + // STL container requires locking for both, read and write access. 
+ mutable std::mutex DiscoveryFlagsInfoLock; + std::map FlagOwnersMap; + + void allocateDiscoveryFlags(unsigned MinFlags); + void compileFunctionReachedFlagSetter(IRBuilder<> &B, Flag *F); +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp new file mode 100644 index 00000000000..394c1308fd6 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.cpp @@ -0,0 +1,333 @@ +#include "ThinLtoJIT.h" + +#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/Support/Debug.h" + +#include "ThinLtoDiscoveryThread.h" +#include "ThinLtoInstrumentationLayer.h" +#include "ThinLtoModuleIndex.h" + +#include +#include +#include + +#ifndef NDEBUG +#include +#endif + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +class ThinLtoDefinitionGenerator : public JITDylib::DefinitionGenerator { +public: + ThinLtoDefinitionGenerator(ThinLtoModuleIndex &GlobalIndex, + ThinLtoInstrumentationLayer &InstrumentationLayer, + ThinLtoJIT::AddModuleFunction AddModule, + char Prefix, bool AllowNudge, bool PrintStats) + : GlobalIndex(GlobalIndex), InstrumentationLayer(InstrumentationLayer), + AddModule(std::move(AddModule)), ManglePrefix(Prefix), + AllowNudgeIntoDiscovery(AllowNudge), PrintStats(PrintStats) {} + + ~ThinLtoDefinitionGenerator() { + if (PrintStats) + dump(errs()); + } + + Error tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) override; + + void dump(raw_ostream &OS) { + OS << format("Modules submitted synchronously: %d\n", NumModulesMissed); + } + +private: + ThinLtoModuleIndex &GlobalIndex; + ThinLtoInstrumentationLayer &InstrumentationLayer; + ThinLtoJIT::AddModuleFunction AddModule; + char ManglePrefix; + bool AllowNudgeIntoDiscovery; + bool PrintStats; + unsigned NumModulesMissed{0}; + + // ThinLTO summaries encode unprefixed names. + StringRef stripGlobalManglePrefix(StringRef Symbol) const { + bool Strip = (ManglePrefix != '\0' && Symbol[0] == ManglePrefix); + return Strip ? StringRef(Symbol.data() + 1, Symbol.size() - 1) : Symbol; + } +}; + +Error ThinLtoDefinitionGenerator::tryToGenerate( + LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) { + std::set ModulePaths; + std::vector NewDiscoveryRoots; + + for (const auto &KV : Symbols) { + StringRef UnmangledName = stripGlobalManglePrefix(*KV.first); + auto Guid = GlobalValue::getGUID(UnmangledName); + if (GlobalValueSummary *S = GlobalIndex.getSummary(Guid)) { + // We could have discovered it ahead of time. + LLVM_DEBUG(dbgs() << format("Failed to discover symbol: %s\n", + UnmangledName.str().c_str())); + ModulePaths.insert(S->modulePath()); + if (AllowNudgeIntoDiscovery && isa(S)) { + NewDiscoveryRoots.push_back(Guid); + } + } + } + + NumModulesMissed += ModulePaths.size(); + + // Parse the requested modules if it hasn't happened yet. 
+ GlobalIndex.scheduleModuleParsing(ModulePaths); + + for (StringRef Path : ModulePaths) { + ThreadSafeModule TSM = GlobalIndex.takeModule(Path); + assert(TSM && "We own the session lock, no asynchronous access possible"); + + if (Error LoadErr = AddModule(std::move(TSM))) + // Failed to add the module to the session. + return LoadErr; + + LLVM_DEBUG(dbgs() << "Generator: added " << Path << " synchronously\n"); + } + + // Requested functions that we failed to discover ahead of time, are likely + // close to the execution front. We can anticipate to run into them as soon + // as execution continues and trigger their discovery flags already now. This + // behavior is enabled with the 'allow-nudge' option and implemented below. + // On the one hand, it may give us a head start in a moment where discovery + // was lacking behind. On the other hand, we may bet on the wrong horse and + // waste extra time speculating in the wrong direction. + if (!NewDiscoveryRoots.empty()) { + assert(AllowNudgeIntoDiscovery); + InstrumentationLayer.nudgeIntoDiscovery(std::move(NewDiscoveryRoots)); + } + + return Error::success(); +} + +ThinLtoJIT::ThinLtoJIT(ArrayRef InputFiles, + StringRef MainFunctionName, unsigned LookaheadLevels, + unsigned NumCompileThreads, unsigned NumLoadThreads, + unsigned DiscoveryFlagsPerBucket, + ExplicitMemoryBarrier MemFence, + bool AllowNudgeIntoDiscovery, bool PrintStats, + Error &Err) { + ErrorAsOutParameter ErrAsOutParam(&Err); + + // Populate the module index, so we know which modules exist and we can find + // the one that defines the main function. + GlobalIndex = std::make_unique(ES, NumLoadThreads); + for (StringRef F : InputFiles) { + if (auto Err = GlobalIndex->add(F)) + ES.reportError(std::move(Err)); + } + + // Load the module that defines the main function. + auto TSM = setupMainModule(MainFunctionName); + if (!TSM) { + Err = TSM.takeError(); + return; + } + + // Infer target-specific utils from the main module. + ThreadSafeModule MainModule = std::move(*TSM); + auto JTMB = setupTargetUtils(MainModule.getModuleUnlocked()); + if (!JTMB) { + Err = JTMB.takeError(); + return; + } + + // Set up the JIT compile pipeline. + setupLayers(std::move(*JTMB), NumCompileThreads, DiscoveryFlagsPerBucket, + MemFence); + + // We can use the mangler now. Remember the mangled name of the main function. + MainFunctionMangled = (*Mangle)(MainFunctionName); + + // We are restricted to a single dylib currently. Add runtime overrides and + // symbol generators. + MainJD = &ES.createJITDylib("main"); + Err = setupJITDylib(MainJD, AllowNudgeIntoDiscovery, PrintStats); + if (Err) + return; + + // Spawn discovery thread and let it add newly discovered modules to the JIT. 
+ setupDiscovery(MainJD, LookaheadLevels, PrintStats); + + Err = AddModule(std::move(MainModule)); + if (Err) + return; + + if (AllowNudgeIntoDiscovery) { + auto MainFunctionGuid = GlobalValue::getGUID(MainFunctionName); + InstrumentationLayer->nudgeIntoDiscovery({MainFunctionGuid}); + } +} + +Expected ThinLtoJIT::setupMainModule(StringRef MainFunction) { + Optional M = GlobalIndex->getModulePathForSymbol(MainFunction); + if (!M) { + std::string Buffer; + raw_string_ostream OS(Buffer); + OS << "No ValueInfo for symbol '" << MainFunction; + OS << "' in provided modules: "; + for (StringRef P : GlobalIndex->getAllModulePaths()) + OS << P << " "; + OS << "\n"; + return createStringError(inconvertibleErrorCode(), OS.str()); + } + + if (auto TSM = GlobalIndex->parseModuleFromFile(*M)) + return TSM; + + return createStringError(inconvertibleErrorCode(), + "Failed to parse main module"); +} + +Expected ThinLtoJIT::setupTargetUtils(Module *M) { + std::string T = M->getTargetTriple(); + JITTargetMachineBuilder JTMB(Triple(T.empty() ? sys::getProcessTriple() : T)); + + // CallThroughManager is ABI-specific + auto LCTM = createLocalLazyCallThroughManager( + JTMB.getTargetTriple(), ES, + pointerToJITTargetAddress(exitOnLazyCallThroughFailure)); + if (!LCTM) + return LCTM.takeError(); + CallThroughManager = std::move(*LCTM); + + // Use DataLayout or the given module or fall back to the host's default. + DL = DataLayout(M); + if (DL.getStringRepresentation().empty()) { + auto HostDL = JTMB.getDefaultDataLayoutForTarget(); + if (!HostDL) + return HostDL.takeError(); + DL = std::move(*HostDL); + if (Error Err = applyDataLayout(M)) + return std::move(Err); + } + + // Now that we know the target data layout we can setup the mangler. + Mangle = std::make_unique(ES, DL); + return JTMB; +} + +Error ThinLtoJIT::applyDataLayout(Module *M) { + if (M->getDataLayout().isDefault()) + M->setDataLayout(DL); + + if (M->getDataLayout() != DL) + return make_error( + "Added modules have incompatible data layouts", + inconvertibleErrorCode()); + + return Error::success(); +} + +static bool IsTrivialModule(MaterializationUnit *MU) { + StringRef ModuleName = MU->getName(); + return ModuleName == "" || ModuleName == "" || + ModuleName == ""; +} + +void ThinLtoJIT::setupLayers(JITTargetMachineBuilder JTMB, + unsigned NumCompileThreads, + unsigned DiscoveryFlagsPerBucket, + ExplicitMemoryBarrier MemFence) { + ObjLinkingLayer = std::make_unique( + ES, []() { return std::make_unique(); }); + + CompileLayer = std::make_unique( + ES, *ObjLinkingLayer, std::make_unique(JTMB)); + + InstrumentationLayer = std::make_unique( + ES, *CompileLayer, MemFence, DiscoveryFlagsPerBucket); + + OnDemandLayer = std::make_unique( + ES, *InstrumentationLayer, *CallThroughManager, + createLocalIndirectStubsManagerBuilder(JTMB.getTargetTriple())); + // Don't break up modules. Insert stubs on module boundaries. + OnDemandLayer->setPartitionFunction(CompileOnDemandLayer::compileWholeModule); + + // Delegate compilation to the thread pool. + CompileThreads = std::make_unique(NumCompileThreads); + ES.setDispatchMaterialization( + [this](JITDylib &JD, std::unique_ptr MU) { + if (IsTrivialModule(MU.get())) { + // This should be quick and we may save a few session locks. + MU->doMaterialize(JD); + } else { + // FIXME: Drop the std::shared_ptr workaround once ThreadPool::async() + // accepts llvm::unique_function to define jobs. 
+ auto SharedMU = std::shared_ptr(std::move(MU)); + CompileThreads->async( + [MU = std::move(SharedMU), &JD]() { MU->doMaterialize(JD); }); + } + }); + + AddModule = [this](ThreadSafeModule TSM) -> Error { + assert(MainJD && "Setup MainJD JITDylib before calling"); + Module *M = TSM.getModuleUnlocked(); + if (Error Err = applyDataLayout(M)) + return Err; + VModuleKey Id = GlobalIndex->getModuleId(M->getName()); + return OnDemandLayer->add(*MainJD, std::move(TSM), Id); + }; +} + +void ThinLtoJIT::setupDiscovery(JITDylib *MainJD, unsigned LookaheadLevels, + bool PrintStats) { + JitRunning.store(true); + DiscoveryThreadWorker = std::make_unique( + JitRunning, ES, MainJD, *InstrumentationLayer, *GlobalIndex, AddModule, + LookaheadLevels, PrintStats); + + DiscoveryThread = std::thread(std::ref(*DiscoveryThreadWorker)); +} + +Error ThinLtoJIT::setupJITDylib(JITDylib *JD, bool AllowNudge, + bool PrintStats) { + // Register symbols for C++ static destructors. + LocalCXXRuntimeOverrides CXXRuntimeoverrides; + Error Err = CXXRuntimeoverrides.enable(*JD, *Mangle); + if (Err) + return Err; + + // Lookup symbol names in the global ThinLTO module index first + char Prefix = DL.getGlobalPrefix(); + JD->addGenerator(std::make_unique( + *GlobalIndex, *InstrumentationLayer, AddModule, Prefix, AllowNudge, + PrintStats)); + + // Then try lookup in the host process. + auto HostLookup = DynamicLibrarySearchGenerator::GetForCurrentProcess(Prefix); + if (!HostLookup) + return HostLookup.takeError(); + JD->addGenerator(std::move(*HostLookup)); + + return Error::success(); +} + +ThinLtoJIT::~ThinLtoJIT() { + // Signal the DiscoveryThread to shut down. + JitRunning.store(false); + DiscoveryThread.join(); + + // Wait for potential compile actions to finish. + CompileThreads->wait(); +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoJIT.h b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h new file mode 100644 index 00000000000..4c2fddfd577 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoJIT.h @@ -0,0 +1,111 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H +#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJIT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ThreadPool.h" + +#include +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class ThinLtoDiscoveryThread; +class ThinLtoInstrumentationLayer; +class ThinLtoModuleIndex; + +class CompileOnDemandLayer; +class IRCompileLayer; +class RTDyldObjectLinkingLayer; + +class JITDylib; +class JITTargetMachineBuilder; +class LazyCallThroughManager; +class MangleAndInterner; + +class ThinLtoJIT { +public: + using AddModuleFunction = std::function; + + enum ExplicitMemoryBarrier { + NeverFence = 0, + FenceStaticCode = 1, + FenceJITedCode = 2, + AlwaysFence = 3 + }; + + ThinLtoJIT(ArrayRef InputFiles, StringRef MainFunctionName, + unsigned LookaheadLevels, unsigned NumCompileThreads, + unsigned NumLoadThreads, unsigned DiscoveryFlagsPerBucket, + ExplicitMemoryBarrier MemFence, bool AllowNudgeIntoDiscovery, + bool PrintStats, Error &Err); + ~ThinLtoJIT(); + + ThinLtoJIT(const ThinLtoJIT &) = delete; + ThinLtoJIT &operator=(const ThinLtoJIT &) = delete; + ThinLtoJIT(ThinLtoJIT &&) = delete; + ThinLtoJIT 
&operator=(ThinLtoJIT &&) = delete; + + Expected main(ArrayRef Args) { + auto MainSym = ES.lookup({MainJD}, MainFunctionMangled); + if (!MainSym) + return MainSym.takeError(); + + using MainFn = int(int, char *[]); + auto Main = jitTargetAddressToFunction(MainSym->getAddress()); + + return runAsMain(Main, Args, StringRef("ThinLtoJIT")); + } + +private: + ExecutionSession ES; + DataLayout DL{""}; + + JITDylib *MainJD; + SymbolStringPtr MainFunctionMangled; + std::unique_ptr CompileThreads; + std::unique_ptr GlobalIndex; + + AddModuleFunction AddModule; + std::unique_ptr ObjLinkingLayer; + std::unique_ptr CompileLayer; + std::unique_ptr InstrumentationLayer; + std::unique_ptr OnDemandLayer; + + std::atomic JitRunning; + std::thread DiscoveryThread; + std::unique_ptr DiscoveryThreadWorker; + + std::unique_ptr Mangle; + std::unique_ptr CallThroughManager; + + void setupLayers(JITTargetMachineBuilder JTMB, unsigned NumCompileThreads, + unsigned DiscoveryFlagsPerBucket, + ExplicitMemoryBarrier MemFence); + Error setupJITDylib(JITDylib *JD, bool AllowNudge, bool PrintStats); + void setupDiscovery(JITDylib *MainJD, unsigned LookaheadLevels, + bool PrintStats); + Expected setupMainModule(StringRef MainFunction); + Expected setupTargetUtils(Module *M); + Error applyDataLayout(Module *M); + + static void exitOnLazyCallThroughFailure() { + errs() << "Compilation failed. Aborting.\n"; + exit(1); + } +}; + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp new file mode 100644 index 00000000000..596c9356826 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp @@ -0,0 +1,268 @@ +#include "ThinLtoModuleIndex.h" + +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +#define DEBUG_TYPE "thinltojit" + +namespace llvm { +namespace orc { + +Error ThinLtoModuleIndex::add(StringRef InputPath) { + auto Buffer = errorOrToExpected(MemoryBuffer::getFile(InputPath)); + if (!Buffer) + return Buffer.takeError(); + + Error ParseErr = readModuleSummaryIndex((*Buffer)->getMemBufferRef(), + CombinedSummaryIndex, NextModuleId); + if (ParseErr) + return ParseErr; + +#ifndef NDEBUG + auto Paths = getAllModulePaths(); + unsigned TotalPaths = Paths.size(); + std::sort(Paths.begin(), Paths.end()); + Paths.erase(std::unique(Paths.begin(), Paths.end()), Paths.end()); + assert(TotalPaths == Paths.size() && "Module paths must be unique"); +#endif + + ++NextModuleId; + return Error::success(); +} + +std::vector ThinLtoModuleIndex::getAllModulePaths() const { + auto ModuleTable = CombinedSummaryIndex.modulePaths(); + + std::vector Paths; + Paths.resize(ModuleTable.size()); + + for (const auto &KV : ModuleTable) { + assert(Paths[KV.second.first].empty() && "IDs are unique and continuous"); + Paths[KV.second.first] = KV.first(); + } + + return Paths; +} + +GlobalValueSummary * +ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const { + ValueInfo VI = CombinedSummaryIndex.getValueInfo(Function); + if (!VI || VI.getSummaryList().empty()) + return nullptr; + + // There can be more than one symbol with the same GUID, in the case of same- + // named locals in different but same-named source files that were compiled in + // their respective directories (so the source file name and resulting GUID 
is + // the same). We avoid this by checking that module paths are unique upon + // add(). + // + // TODO: We can still get duplicates on symbols declared with + // attribute((weak)), a GNU extension supported by gcc and clang. + // We should support it by looking for a symbol in the current module + // or in the same module as the caller. + assert(VI.getSummaryList().size() == 1 && "Weak symbols not yet supported"); + + return VI.getSummaryList().front().get()->getBaseObject(); +} + +Optional +ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const { + if (GlobalValueSummary *S = getSummary(GlobalValue::getGUID(Name))) + return S->modulePath(); + return None; // We don't know the symbol. +} + +void ThinLtoModuleIndex::scheduleModuleParsingPrelocked(StringRef Path) { + // Once the module was scheduled, we can call takeModule(). + auto ScheduledIt = ScheduledModules.find(Path); + if (ScheduledIt != ScheduledModules.end()) + return; + + auto Worker = [this](std::string Path) { + if (auto TSM = doParseModule(Path)) { + std::lock_guard Lock(ParsedModulesLock); + ParsedModules[Path] = std::move(*TSM); + + LLVM_DEBUG(dbgs() << "Finished parsing module: " << Path << "\n"); + } else { + ES.reportError(TSM.takeError()); + } + }; + + LLVM_DEBUG(dbgs() << "Schedule module for parsing: " << Path << "\n"); + ScheduledModules[Path] = ParseModuleWorkers.async(Worker, Path.str()); +} + +ThreadSafeModule ThinLtoModuleIndex::takeModule(StringRef Path) { + std::unique_lock ParseLock(ParsedModulesLock); + + auto ParsedIt = ParsedModules.find(Path); + if (ParsedIt == ParsedModules.end()) { + ParseLock.unlock(); + + // The module is not ready, wait for the future we stored. + std::unique_lock ScheduleLock(ScheduledModulesLock); + auto ScheduledIt = ScheduledModules.find(Path); + assert(ScheduledIt != ScheduledModules.end() && + "Don't call for unscheduled modules"); + std::shared_future Future = ScheduledIt->getValue(); + ScheduleLock.unlock(); + Future.get(); + + ParseLock.lock(); + ParsedIt = ParsedModules.find(Path); + assert(ParsedIt != ParsedModules.end() && "Must be ready now"); + } + + // We only add each module once. If it's not here anymore, we can skip it. + ThreadSafeModule TSM = std::move(ParsedIt->getValue()); + ParsedIt->getValue() = ThreadSafeModule(); + return TSM; +} + +ThreadSafeModule ThinLtoModuleIndex::parseModuleFromFile(StringRef Path) { + { + std::lock_guard ScheduleLock(ScheduledModulesLock); + scheduleModuleParsingPrelocked(Path); + } + return takeModule(Path); +} + +Expected ThinLtoModuleIndex::doParseModule(StringRef Path) { + // TODO: make a SMDiagnosticError class for this + SMDiagnostic Err; + auto Ctx = std::make_unique(); + auto M = parseIRFile(Path, Err, *Ctx); + if (!M) { + std::string ErrDescription; + { + raw_string_ostream S(ErrDescription); + Err.print("ThinLtoJIT", S); + } + return createStringError(inconvertibleErrorCode(), + "Failed to load module from file '%s' (%s)", + Path.data(), ErrDescription.c_str()); + } + + return ThreadSafeModule(std::move(M), std::move(Ctx)); +} + +// We don't filter visited functions. Discovery will often be retriggered +// from the middle of already visited functions and it aims to reach a little +// further each time. +void ThinLtoModuleIndex::discoverCalleeModulePaths(FunctionSummary *S, + unsigned LookaheadLevels) { + // Populate initial worklist + std::vector Worklist; + addToWorklist(Worklist, S->calls()); + unsigned Distance = 0; + + while (++Distance < LookaheadLevels) { + // Process current worklist and populate a new one. 
+ std::vector NextWorklist; + for (FunctionSummary *F : Worklist) { + updatePathRank(F->modulePath(), Distance); + addToWorklist(NextWorklist, F->calls()); + } + Worklist = std::move(NextWorklist); + } + + // Process the last worklist without filling a new one + for (FunctionSummary *F : Worklist) { + updatePathRank(F->modulePath(), Distance); + } + + // Reset counts for known paths (includes both, scheduled and parsed modules). + std::lock_guard Lock(ScheduledModulesLock); + for (const auto &KV : ScheduledModules) { + PathRank[KV.first()].Count = 0; + } +} + +void ThinLtoModuleIndex::addToWorklist( + std::vector &List, + ArrayRef Calls) { + for (const auto &Edge : Calls) { + const auto &SummaryList = Edge.first.getSummaryList(); + if (!SummaryList.empty()) { + GlobalValueSummary *S = SummaryList.front().get()->getBaseObject(); + assert(isa(S) && "Callees must be functions"); + List.push_back(cast(S)); + } + } +} + +// PathRank is global and continuous. +void ThinLtoModuleIndex::updatePathRank(StringRef Path, unsigned Distance) { + auto &Entry = PathRank[Path]; + Entry.Count += 1; + Entry.MinDist = std::min(Entry.MinDist, Distance); + assert(Entry.MinDist > 0 && "We want it as a divisor"); +}; + +// TODO: The size of a ThreadPool's task queue is not accessible. It would +// be great to know in order to estimate how many modules we schedule. The +// more we schedule, the less precise is the ranking. The less we schedule, +// the higher the risk for downtime. +std::vector ThinLtoModuleIndex::selectNextPaths() { + struct ScorePath { + float Score; + unsigned MinDist; + StringRef Path; + }; + + std::vector Candidates; + Candidates.reserve(PathRank.size()); + for (const auto &KV : PathRank) { + float Score = static_cast(KV.second.Count) / KV.second.MinDist; + if (Score > .0f) { + Candidates.push_back({Score, KV.second.MinDist, KV.first()}); + } + } + + // Sort candidates by descending score. + std::sort(Candidates.begin(), Candidates.end(), + [](const ScorePath &LHS, const ScorePath &RHS) { + return LHS.Score > RHS.Score; + }); + + // Sort highest score candidates by ascending minimal distance. + size_t Selected = + std::min(std::max(NumParseModuleThreads, Candidates.size() / 2), + Candidates.size()); + std::sort(Candidates.begin(), Candidates.begin() + Selected, + [](const ScorePath &LHS, const ScorePath &RHS) { + return LHS.MinDist < RHS.MinDist; + }); + + std::vector Paths; + Paths.reserve(Selected); + for (unsigned i = 0; i < Selected; i++) { + Paths.push_back(Candidates[i].Path.str()); + } + + LLVM_DEBUG(dbgs() << "ModuleIndex: select " << Paths.size() << " out of " + << Candidates.size() << " discovered paths\n"); + + return Paths; +} + +unsigned ThinLtoModuleIndex::getNumDiscoveredModules() const { + // TODO: It would probably be more efficient to track the number of + // unscheduled modules. 
+ unsigned NonNullItems = 0; + for (const auto &KV : PathRank) + if (KV.second.Count > 0) + ++NonNullItems; + return NonNullItems; +} + +} // namespace orc +} // namespace llvm diff --git a/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h new file mode 100644 index 00000000000..a6574be5c39 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.h @@ -0,0 +1,94 @@ +#ifndef LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H +#define LLVM_EXAMPLES_THINLTOJIT_THINLTOJITMODULEINDEX_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ThreadPool.h" + +#include +#include +#include +#include +#include + +namespace llvm { +namespace orc { + +class SymbolStringPtr; + +class ThinLtoModuleIndex { + static constexpr bool HaveGVs = false; + +public: + ThinLtoModuleIndex(ExecutionSession &ES, unsigned ParseModuleThreads) + : ES(ES), CombinedSummaryIndex(HaveGVs), + ParseModuleWorkers(ParseModuleThreads), + NumParseModuleThreads(ParseModuleThreads) {} + + Error add(StringRef InputPath); + GlobalValueSummary *getSummary(GlobalValue::GUID Function) const; + std::vector getAllModulePaths() const; + Optional getModulePathForSymbol(StringRef Name) const; + + template void scheduleModuleParsing(const RangeT &Paths); + ThreadSafeModule takeModule(StringRef Path); + + // Blocking module parsing, returns a Null-module on error. + // Only used for the main module. + ThreadSafeModule parseModuleFromFile(StringRef Path); + + std::vector selectNextPaths(); + unsigned getNumDiscoveredModules() const; + void discoverCalleeModulePaths(FunctionSummary *S, unsigned LookaheadLevels); + + VModuleKey getModuleId(StringRef Path) const { + return CombinedSummaryIndex.getModuleId(Path); + } + +private: + ExecutionSession &ES; + ModuleSummaryIndex CombinedSummaryIndex; + uint64_t NextModuleId{0}; + + struct PathRankEntry { + uint32_t Count{0}; + uint32_t MinDist{100}; + }; + StringMap PathRank; + + ThreadPool ParseModuleWorkers; + unsigned NumParseModuleThreads; + + std::mutex ScheduledModulesLock; + StringMap> ScheduledModules; + + std::mutex ParsedModulesLock; + StringMap ParsedModules; + + void updatePathRank(StringRef Path, unsigned Distance); + void addToWorklist(std::vector &List, + ArrayRef Calls); + + std::vector selectAllPaths(); + std::vector selectHotPaths(unsigned Count); + + void scheduleModuleParsingPrelocked(StringRef Path); + Expected doParseModule(StringRef Path); +}; + +template +inline void ThinLtoModuleIndex::scheduleModuleParsing(const RangeT &Paths) { + std::lock_guard Lock(ScheduledModulesLock); + for (const auto &Path : Paths) { + scheduleModuleParsingPrelocked(Path); + } +} + +} // namespace orc +} // namespace llvm + +#endif diff --git a/llvm/examples/ThinLtoJIT/bench b/llvm/examples/ThinLtoJIT/bench new file mode 100755 index 00000000000..796697eb594 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/bench @@ -0,0 +1,100 @@ +#!/bin/bash +#set -x + +if [ $# -gt 2 ]; then + TOOLS_DIR="$1" + SOURCE_DIR="$2" + MAIN_SOURCE_FILE="$3" +else + echo "Usage: bench
[]" + exit 1 +fi + +if [ $# -gt 3 ]; then + SYS_ROOT="$4" +else + SYS_ROOT="/" +fi + +function check_tool () +{ + if [ -e "${TOOLS_DIR}/$1" ]; then + echo "Found: $1" + else + echo "!!! Cannot find required tool, please provide it in the LLVM binaries folder: $1" + fi +} + +check_tool lli +check_tool SpeculativeJIT +check_tool ThinLtoJIT + +SKIP_BITCODE_GEN=0 +if [[ -e bc-default || -e bc-thinlto || -e ll-default || -e ll-thinlto ]]; then + echo "Skipping bitcode generation: output directories existing" + echo "Please clean up manually: rm -R bc-default bc-thinlto ll-default ll-thinlto" + SKIP_BITCODE_GEN=1 +else + check_tool clang + check_tool llvm-dis + check_tool llvm-lto + mkdir bc-default + mkdir bc-thinlto + mkdir ll-default + mkdir ll-thinlto +fi + +ROOT_DIR=$(pwd) +ALL_BITCODE_FILES="" + +MAIN_FILE_BASENAME=$(basename "${MAIN_SOURCE_FILE%.c*}") +LLI_EXTRA_MODULES="" + +for f in ${SOURCE_DIR}/*.c* ; do + BASE_NAME=$(basename "${f%.c*}") + + if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then + echo "Compile: $f -> ${BASE_NAME}.bc" + + ${TOOLS_DIR}/clang -c -I ${SOURCE_DIR} ${CFLAGS} -isysroot ${SYS_ROOT} -emit-llvm \ + -o "bc-default/${BASE_NAME}.bc" "$f" + ${TOOLS_DIR}/clang -c -I ${SOURCE_DIR} ${CFLAGS} -isysroot ${SYS_ROOT} -flto=thin \ + -o "bc-thinlto/${BASE_NAME}.bc" "$f" + + echo "Disassemble ${BASE_NAME}.bc -> ${BASE_NAME}.ll" + ${TOOLS_DIR}/llvm-dis bc-default/${BASE_NAME}.bc -o ll-default/${BASE_NAME}.ll + ${TOOLS_DIR}/llvm-dis bc-thinlto/${BASE_NAME}.bc -o ll-thinlto/${BASE_NAME}.ll + fi + + ALL_BITCODE_FILES="${ALL_BITCODE_FILES} ${BASE_NAME}.bc" + if [ "${BASE_NAME}" != "${MAIN_FILE_BASENAME}" ]; then + LLI_EXTRA_MODULES="${LLI_EXTRA_MODULES} -extra-module=${BASE_NAME}.bc" + fi +done + +if [ ${SKIP_BITCODE_GEN} -eq 0 ]; then + echo "Link global index file: index.thinlto.bc" + cd ${ROOT_DIR}/bc-thinlto + ${TOOLS_DIR}/llvm-lto --thinlto -o ${ROOT_DIR}/bc-thinlto/index ${ALL_BITCODE_FILES} + + echo "Disassemble global index file: index.thinlto.ll" + cd ${ROOT_DIR}/ll-thinlto + ${TOOLS_DIR}/llvm-dis -o index.thinlto.ll ${ROOT_DIR}/bc-thinlto/index.thinlto.bc +fi + +set -x +cd ${ROOT_DIR}/bc-default +time (${TOOLS_DIR}/clang -o ${MAIN_FILE_BASENAME} -O0 ${LDFLAGS} ${ALL_BITCODE_FILES} && ./${MAIN_FILE_BASENAME} ${EXEC_ARGS} 1>/dev/null) +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=mcjit "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-mcjit "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -compile-threads=8 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 -O1 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/lli ${LLI_EXTRA_MODULES} -jit-kind=orc-lazy -per-module-lazy -compile-threads=8 -O0 "${MAIN_FILE_BASENAME}.bc" ${EXEC_ARGS} 1>/dev/null +time ${TOOLS_DIR}/SpeculativeJIT -num-threads=8 ${ALL_BITCODE_FILES} --args ${EXEC_ARGS} 1>/dev/null + +cd ${ROOT_DIR}/bc-thinlto +#time (${TOOLS_DIR}/clang -flto=thin -o test ${ALL_BITCODE_FILES} && 
./test ${EXEC_ARGS} 1>/dev/null) +time ${TOOLS_DIR}/ThinLtoJIT index.thinlto.bc --args ${EXEC_ARGS} 1>/dev/null diff --git a/llvm/examples/ThinLtoJIT/main.cpp b/llvm/examples/ThinLtoJIT/main.cpp new file mode 100644 index 00000000000..5a338e94744 --- /dev/null +++ b/llvm/examples/ThinLtoJIT/main.cpp @@ -0,0 +1,83 @@ +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/TargetSelect.h" + +#include "ThinLtoJIT.h" + +#include +#include + +using namespace llvm; + +static cl::list + InputFiles(cl::Positional, cl::OneOrMore, + cl::desc("")); + +static cl::list InputArgs("args", cl::Positional, + cl::desc("..."), + cl::ZeroOrMore, cl::PositionalEatsArgs); + +static cl::opt CompileThreads("compile-threads", cl::Optional, + cl::desc("Number of compile threads"), + cl::init(4)); + +static cl::opt LoadThreads("load-threads", cl::Optional, + cl::desc("Number of module load threads"), + cl::init(8)); + +static cl::opt + LookaheadLevels("lookahead", cl::Optional, + cl::desc("Calls to look ahead of execution"), cl::init(4)); + +static cl::opt DiscoveryFlagsBucketSize( + "discovery-flag-bucket-size", cl::Optional, + cl::desc("Flags per bucket (rounds up to memory pages)"), cl::init(4096)); + +static cl::opt + MemFence("mem-fence", + cl::desc("Control memory fences for cache synchronization"), + cl::init(orc::ThinLtoJIT::NeverFence), + cl::values(clEnumValN(orc::ThinLtoJIT::NeverFence, "never", + "No use of memory fences"), + clEnumValN(orc::ThinLtoJIT::FenceStaticCode, "static", + "Use of memory fences in static code only"), + clEnumValN(orc::ThinLtoJIT::FenceJITedCode, "jited", + "Install memory fences in JITed code only"), + clEnumValN(orc::ThinLtoJIT::AlwaysFence, "always", + "Always use of memory fences"))); + +static cl::opt + AllowNudge("allow-nudge", + cl::desc("Allow the symbol generator to nudge symbols into " + "discovery even though they haven't been reached"), + cl::init(false)); + +static cl::opt PrintStats("print-stats", + cl::desc("Print module stats on shutdown"), + cl::init(false)); + +int main(int argc, char *argv[]) { + InitLLVM X(argc, argv); + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + cl::ParseCommandLineOptions(argc, argv, "ThinLtoJIT"); + + Error Err = Error::success(); + auto atLeastOne = [](unsigned N) { return std::max(1u, N); }; + + orc::ThinLtoJIT Jit(InputFiles, "main", atLeastOne(LookaheadLevels), + atLeastOne(CompileThreads), atLeastOne(LoadThreads), + DiscoveryFlagsBucketSize, MemFence, AllowNudge, + PrintStats, Err); + if (Err) { + logAllUnhandledErrors(std::move(Err), errs(), "[ThinLtoJIT] "); + exit(1); + } + + ExitOnError ExitOnErr; + ExitOnErr.setBanner("[ThinLtoJIT] "); + + return ExitOnErr(Jit.main(InputArgs)); +} -- 2.11.0
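
To make the discovery handshake from the summary concrete: an instrumented function prologue performs a single byte store into a flag array, and the discovery thread polls those bytes, with an acquire/release exchange on a shared sync word to bound staleness. Below is a minimal standalone sketch of that handshake in plain C++. It is not part of the patch: functionReachedProlog and takeFlagsThatFired only loosely mirror ThinLtoInstrumentationLayer, the array size is arbitrary, and relaxed atomics stand in for the plain byte stores the JIT emits into compiled code.

#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

enum Flag : uint8_t { Clear = 0, Fired = 1 };
static constexpr unsigned NumFlags = 8;

// Illustrative stand-ins for the patch's FlagsIncoming/FlagsHandled arrays.
static std::atomic<uint8_t> FlagsIncoming[NumFlags]; // written by "JITed" prologues
static uint8_t FlagsHandled[NumFlags] = {};          // only touched by the poller
static std::atomic<uint64_t> FlagsSync{0};           // acquire/release handshake
static std::atomic<bool> KeepRunning{true};

// What an instrumented prologue does: mark the function's flag as fired, then
// publish via a release store on the shared sync word.
void functionReachedProlog(unsigned FlagIdx) {
  FlagsIncoming[FlagIdx].store(Fired, std::memory_order_relaxed);
  FlagsSync.store(0, std::memory_order_release);
}

// Discovery-thread step: collect flags that fired since the last round.
std::vector<unsigned> takeFlagsThatFired() {
  (void)FlagsSync.load(std::memory_order_acquire); // pairs with the release above
  std::vector<unsigned> Indexes;
  for (unsigned i = 0; i < NumFlags; ++i)
    if (FlagsIncoming[i].load(std::memory_order_relaxed) == Fired &&
        FlagsHandled[i] == Clear) {
      FlagsHandled[i] = Fired;
      Indexes.push_back(i);
    }
  return Indexes;
}

int main() {
  std::thread Discovery([] {
    while (KeepRunning.load())
      for (unsigned Idx : takeFlagsThatFired())
        std::printf("discovered flag %u\n", Idx);
  });

  functionReachedProlog(3); // pretend execution reached function #3
  functionReachedProlog(5);

  std::this_thread::sleep_for(std::chrono::milliseconds(10));
  KeepRunning.store(false);
  Discovery.join();
}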
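
The lookahead itself is a bounded walk over the summary call graph: starting from a newly reached function, visit callees up to a given number of calls away and record which modules define them. The following rough standalone approximation runs over a toy call graph; FunctionNode, collectCalleeModules, and the module names are invented for illustration and do not use the real ModuleSummaryIndex API that discoverCalleeModulePaths works with.

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct FunctionNode {
  std::string ModulePath;           // module that defines this function
  std::vector<std::string> Callees; // names of called functions
};

// Collect the module paths of all functions reachable from Root within
// LookaheadLevels calls -- the same idea as discoverCalleeModulePaths().
std::set<std::string>
collectCalleeModules(const std::map<std::string, FunctionNode> &CallGraph,
                     const std::string &Root, unsigned LookaheadLevels) {
  std::set<std::string> Paths;
  std::vector<std::string> Worklist{Root};

  for (unsigned Distance = 0; Distance < LookaheadLevels; ++Distance) {
    std::vector<std::string> NextWorklist;
    for (const std::string &Name : Worklist) {
      auto It = CallGraph.find(Name);
      if (It == CallGraph.end())
        continue; // no summary for this function
      for (const std::string &Callee : It->second.Callees) {
        auto CalleeIt = CallGraph.find(Callee);
        if (CalleeIt == CallGraph.end())
          continue;
        Paths.insert(CalleeIt->second.ModulePath);
        NextWorklist.push_back(Callee);
      }
    }
    Worklist = std::move(NextWorklist);
  }
  return Paths;
}

int main() {
  std::map<std::string, FunctionNode> CG = {
      {"main", {"main.bc", {"parse", "eval"}}},
      {"parse", {"parser.bc", {"lex"}}},
      {"lex", {"lexer.bc", {}}},
      {"eval", {"eval.bc", {"builtin_sqrt"}}},
      {"builtin_sqrt", {"math.bc", {}}},
  };
  for (const std::string &P : collectCalleeModules(CG, "main", 2))
    std::printf("%s\n", P.c_str()); // eval.bc, lexer.bc, math.bc, parser.bc
  return 0;
}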
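
Once callee modules are discovered, the index ranks each path by how many reachable functions it defines (Count) versus how close it is to the execution front (MinDist), keeps the better-scoring half (but at least a thread-pool's worth), and parses the nearest of those first. A simplified standalone sketch of that selection policy follows; PathRankEntry and MinSelect are illustrative placeholders rather than the patch's PathRank bookkeeping or NumParseModuleThreads.

#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct PathRankEntry {
  unsigned Count;   // reachable functions defined in this module
  unsigned MinDist; // minimal call distance from a fired flag (> 0)
};

// Pick the most promising module paths to parse next: highest Count/MinDist
// score first, then the nearest ones among the selected half.
std::vector<std::string>
selectNextPaths(const std::map<std::string, PathRankEntry> &PathRank,
                size_t MinSelect) {
  struct ScorePath {
    float Score;
    unsigned MinDist;
    std::string Path;
  };

  std::vector<ScorePath> Candidates;
  for (const auto &KV : PathRank) {
    float Score = float(KV.second.Count) / KV.second.MinDist;
    if (Score > 0.0f)
      Candidates.push_back({Score, KV.second.MinDist, KV.first});
  }

  // Best scores first.
  std::sort(Candidates.begin(), Candidates.end(),
            [](const ScorePath &L, const ScorePath &R) { return L.Score > R.Score; });

  // Take at least MinSelect entries, otherwise the better half, and order the
  // selection by ascending distance so the closest modules get parsed first.
  size_t Selected =
      std::min(std::max(MinSelect, Candidates.size() / 2), Candidates.size());
  std::sort(Candidates.begin(), Candidates.begin() + Selected,
            [](const ScorePath &L, const ScorePath &R) { return L.MinDist < R.MinDist; });

  std::vector<std::string> Paths;
  for (size_t i = 0; i < Selected; ++i)
    Paths.push_back(Candidates[i].Path);
  return Paths;
}

int main() {
  std::map<std::string, PathRankEntry> Rank = {
      {"parser.bc", {5, 2}}, {"eval.bc", {2, 1}}, {"math.bc", {1, 3}}};
  for (const std::string &P : selectNextPaths(Rank, 2))
    std::printf("%s\n", P.c_str()); // eval.bc, parser.bc
  return 0;
}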