From a57d5d0ef66c1b060c94cb1297e2558d653c240c Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Wed, 12 Nov 2025 16:13:14 -0800 Subject: [PATCH 01/10] Add JLJITLinkMemoryManager (ports memory manager to JITLink) (#60105) Ports our RTDyLD memory manager to JITLink in order to avoid memory use regressions after switching to JITLink everywhere (#60031). This is a direct port: finalization must happen all at once, because it invalidates all allocation `wr_ptr`s. I decided it wasn't worth it to associate `OnFinalizedFunction` callbacks with each block, since they are large enough to make it extremely likely that all in-flight allocations land in the same block; everything must be relocated before finalization can happen. (cherry picked from commit 6fa0e756d1464e4eae9e27acb78815766e90251d) --- src/cgmemmgr.cpp | 233 +++++++++++++++++++++++++++++++++++++--------- src/jitlayers.cpp | 7 +- 2 files changed, 188 insertions(+), 52 deletions(-) diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index c257d2a2e3331..7cf358ddf1e95 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -3,7 +3,11 @@ #include "llvm-version.h" #include "platform.h" +#include +#include +#include #include + #include "julia.h" #include "julia_internal.h" @@ -458,18 +462,27 @@ struct Block { } }; +struct Allocation { + // Address to write to (the one returned by the allocation function) + void *wr_addr; + // Runtime address + void *rt_addr; + size_t sz; + bool relocated; +}; + class RWAllocator { static constexpr int nblocks = 8; Block blocks[nblocks]{}; public: RWAllocator() JL_NOTSAFEPOINT = default; - void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT + Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; for (int i = 0;i < nblocks && blocks[i].ptr;i++) { if (void *ptr = blocks[i].alloc(size, align)) - return ptr; + return {ptr, ptr, size, false}; if (blocks[i].avail < min_size) { min_size = blocks[i].avail; min_id = i; @@ -477,7 +490,8 @@ class RWAllocator { } size_t block_size = get_block_size(size); blocks[min_id].reset(map_anon_page(block_size), block_size); - return blocks[min_id].alloc(size, align); + void *ptr = blocks[min_id].alloc(size, align); + return {ptr, ptr, size, false}; } }; @@ -517,16 +531,6 @@ struct SplitPtrBlock : public Block { } }; -struct Allocation { - // Address to write to (the one returned by the allocation function) - void *wr_addr; - // Runtime address - void *rt_addr; - size_t sz; - bool relocated; -}; - -template class ROAllocator { protected: static constexpr int nblocks = 8; @@ -554,7 +558,7 @@ class ROAllocator { } // Allocations that have not been finalized yet. SmallVector allocations; - void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT + Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; @@ -570,8 +574,9 @@ class ROAllocator { wr_ptr = get_wr_ptr(block, ptr, size, align); } block.state |= SplitPtrBlock::Alloc; - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); - return wr_ptr; + Allocation a{wr_ptr, ptr, size, false}; + allocations.push_back(a); + return a; } if (block.avail < min_size) { min_size = block.avail; @@ -592,18 +597,21 @@ class ROAllocator { #ifdef _OS_WINDOWS_ block.state = SplitPtrBlock::Alloc; void *wr_ptr = get_wr_ptr(block, ptr, size, align); - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); + Allocation a{wr_ptr, ptr, size, false}; + allocations.push_back(a); ptr = wr_ptr; #else block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc; - allocations.push_back(Allocation{ptr, ptr, size, false}); + Allocation a{ptr, ptr, size, false}; + allocations.push_back(a); #endif - return ptr; + return a; } }; -template -class DualMapAllocator : public ROAllocator { +class DualMapAllocator : public ROAllocator { + bool exec; + protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override JL_NOTSAFEPOINT { @@ -664,7 +672,7 @@ class DualMapAllocator : public ROAllocator { } } public: - DualMapAllocator() JL_NOTSAFEPOINT + DualMapAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec) { assert(anon_hdl != -1); } @@ -677,13 +685,13 @@ class DualMapAllocator : public ROAllocator { finalize_block(block, true); block.reset(nullptr, 0); } - ROAllocator::finalize(); + ROAllocator::finalize(); } }; #ifdef _OS_LINUX_ -template -class SelfMemAllocator : public ROAllocator { +class SelfMemAllocator : public ROAllocator { + bool exec; SmallVector temp_buff; protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, @@ -720,9 +728,7 @@ class SelfMemAllocator : public ROAllocator { } } public: - SelfMemAllocator() JL_NOTSAFEPOINT - : ROAllocator(), - temp_buff() + SelfMemAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec), temp_buff() { assert(get_self_mem_fd() != -1); } @@ -756,11 +762,25 @@ class SelfMemAllocator : public ROAllocator { } if (cached) temp_buff.resize(1); - ROAllocator::finalize(); + ROAllocator::finalize(); } }; #endif // _OS_LINUX_ +std::pair, std::unique_ptr> +get_preferred_allocators() JL_NOTSAFEPOINT +{ +#ifdef _OS_LINUX_ + if (get_self_mem_fd() != -1) + return {std::make_unique(false), + std::make_unique(true)}; +#endif + if (init_shared_map() != -1) + return {std::make_unique(false), + std::make_unique(true)}; + return {}; +} + class RTDyldMemoryManagerJL : public SectionMemoryManager { struct EHFrame { uint8_t *addr; @@ -770,8 +790,8 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { void operator=(const RTDyldMemoryManagerJL&) = delete; SmallVector pending_eh; RWAllocator rw_alloc; - std::unique_ptr> ro_alloc; - std::unique_ptr> exe_alloc; + std::unique_ptr ro_alloc; + std::unique_ptr exe_alloc; size_t total_allocated; public: @@ -779,20 +799,9 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { : SectionMemoryManager(), pending_eh(), rw_alloc(), - ro_alloc(), - exe_alloc(), total_allocated(0) { -#ifdef _OS_LINUX_ - if (!ro_alloc && get_self_mem_fd() != -1) { - ro_alloc.reset(new SelfMemAllocator()); - exe_alloc.reset(new SelfMemAllocator()); - } -#endif - if (!ro_alloc && init_shared_map() != -1) { - ro_alloc.reset(new DualMapAllocator()); - exe_alloc.reset(new DualMapAllocator()); - } + std::tie(ro_alloc, exe_alloc) = get_preferred_allocators(); } ~RTDyldMemoryManagerJL() override JL_NOTSAFEPOINT { @@ -845,7 +854,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size); if (exe_alloc) - return (uint8_t*)exe_alloc->alloc(Size, Alignment); + return (uint8_t*)exe_alloc->alloc(Size, Alignment).wr_addr; return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, SectionName); } @@ -860,9 +869,9 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size); if (!isReadOnly) - return (uint8_t*)rw_alloc.alloc(Size, Alignment); + return (uint8_t*)rw_alloc.alloc(Size, Alignment).wr_addr; if (ro_alloc) - return (uint8_t*)ro_alloc->alloc(Size, Alignment); + return (uint8_t*)ro_alloc->alloc(Size, Alignment).wr_addr; return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, isReadOnly); } @@ -917,6 +926,133 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, } #endif +class JLJITLinkMemoryManager : public jitlink::JITLinkMemoryManager { + using OnFinalizedFunction = + jitlink::JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction; + + std::mutex Mutex; + RWAllocator RWAlloc; + std::unique_ptr ROAlloc; + std::unique_ptr ExeAlloc; + SmallVector FinalizedCallbacks; + uint32_t InFlight{0}; + +public: + class InFlightAlloc; + + static std::unique_ptr Create() + { + auto [ROAlloc, ExeAlloc] = get_preferred_allocators(); + if (ROAlloc && ExeAlloc) + return std::unique_ptr( + new JLJITLinkMemoryManager(std::move(ROAlloc), std::move(ExeAlloc))); + + return cantFail( + orc::MapperJITLinkMemoryManager::CreateWithMapper( + /*Reservation Granularity*/ 16 * 1024 * 1024)); + } + + void allocate(const jitlink::JITLinkDylib *JD, jitlink::LinkGraph &G, + OnAllocatedFunction OnAllocated) override; + + void deallocate(std::vector Allocs, + OnDeallocatedFunction OnDeallocated) override + { + jl_unreachable(); + } + +protected: + JLJITLinkMemoryManager(std::unique_ptr ROAlloc, + std::unique_ptr ExeAlloc) + : ROAlloc(std::move(ROAlloc)), ExeAlloc(std::move(ExeAlloc)) + { + } + + void finalize(OnFinalizedFunction OnFinalized) + { + SmallVector Callbacks; + { + std::unique_lock Lock{Mutex}; + FinalizedCallbacks.push_back(std::move(OnFinalized)); + + if (--InFlight > 0) + return; + + ROAlloc->finalize(); + ExeAlloc->finalize(); + Callbacks = std::move(FinalizedCallbacks); + } + + for (auto &CB : Callbacks) + std::move(CB)(FinalizedAlloc{}); + } +}; + +class JLJITLinkMemoryManager::InFlightAlloc + : public jitlink::JITLinkMemoryManager::InFlightAlloc { + JLJITLinkMemoryManager &MM; + jitlink::LinkGraph &G; + +public: + InFlightAlloc(JLJITLinkMemoryManager &MM, jitlink::LinkGraph &G) : MM(MM), G(G) {} + + void abandon(OnAbandonedFunction OnAbandoned) override { jl_unreachable(); } + + void finalize(OnFinalizedFunction OnFinalized) override + { + auto *GP = &G; + MM.finalize([GP, OnFinalized = + std::move(OnFinalized)](Expected FA) mutable { + if (!FA) + return OnFinalized(FA.takeError()); + // Need to handle dealloc actions when we GC code + auto E = orc::shared::runFinalizeActions(GP->allocActions()); + if (!E) + return OnFinalized(E.takeError()); + OnFinalized(std::move(FA)); + }); + } +}; + +using orc::MemProt; + +void JLJITLinkMemoryManager::allocate(const jitlink::JITLinkDylib *JD, + jitlink::LinkGraph &G, + OnAllocatedFunction OnAllocated) +{ + jitlink::BasicLayout BL{G}; + + { + std::unique_lock Lock{Mutex}; + for (auto &[AG, Seg] : BL.segments()) { + if (AG.getMemLifetime() == orc::MemLifetime::NoAlloc) + continue; + assert(AG.getMemLifetime() == orc::MemLifetime::Standard); + + auto Prot = AG.getMemProt(); + uint64_t Alignment = Seg.Alignment.value(); + uint64_t Size = Seg.ContentSize + Seg.ZeroFillSize; + Allocation Alloc; + if (Prot == (MemProt::Read | MemProt::Write)) + Alloc = RWAlloc.alloc(Size, Alignment); + else if (Prot == MemProt::Read) + Alloc = ROAlloc->alloc(Size, Alignment); + else if (Prot == (MemProt::Read | MemProt::Exec)) + Alloc = ExeAlloc->alloc(Size, Alignment); + else + abort(); + + Seg.Addr = orc::ExecutorAddr::fromPtr(Alloc.rt_addr); + Seg.WorkingMem = (char *)Alloc.wr_addr; + } + } + + if (auto Err = BL.apply()) + return OnAllocated(std::move(Err)); + + ++InFlight; + OnAllocated(std::make_unique(*this, G)); +} } RTDyldMemoryManager* createRTDyldMemoryManager() JL_NOTSAFEPOINT @@ -928,3 +1064,8 @@ size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT { return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes(); } + +std::unique_ptr createJITLinkMemoryManager() +{ + return JLJITLinkMemoryManager::Create(); +} diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 3ea95ea42f596..299e5d39d2c9f 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -1156,12 +1156,6 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin { #pragma clang diagnostic ignored "-Wunused-function" #endif -// TODO: Port our memory management optimisations to JITLink instead of using the -// default InProcessMemoryManager. -std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT { - return cantFail(orc::MapperJITLinkMemoryManager::CreateWithMapper(/*Reservation Granularity*/ 16 * 1024 * 1024)); -} - #ifdef _COMPILER_CLANG_ #pragma clang diagnostic pop #endif @@ -1185,6 +1179,7 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar { }; RTDyldMemoryManager *createRTDyldMemoryManager(void) JL_NOTSAFEPOINT; +std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT; // A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { From 5c0221d20ed3250de0bbf7cc70add4b1c473b84b Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Wed, 20 Aug 2025 10:58:21 -0700 Subject: [PATCH 02/10] Use temporary files for aotcompile outputs instead of memory (cherry picked from commit d844566a08b644ef206d81af2cfab92b82142bf9) --- src/aotcompile.cpp | 429 ++++++++++++++++++++++++++++++--------------- 1 file changed, 286 insertions(+), 143 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 6009bd435534c..7853856000934 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1410,15 +1410,118 @@ struct ShardTimers { } }; +class AOTOutput { +public: + AOTOutput(const Twine &prefix, const char *suffix) + : name((prefix + "." + suffix).str()), state(OPEN) + { + std::error_code err = sys::fs::createTemporaryFile(prefix, suffix, fd, path); + if (err) + jl_errorf("failed to create temporary file: %s", err.message().c_str()); + } + ~AOTOutput() { remove(); } + AOTOutput(const AOTOutput &) = delete; + AOTOutput &operator=(const AOTOutput &) = delete; + AOTOutput(AOTOutput &&other) noexcept + : name(std::move(other.name)), + state(other.state), + fd(other.fd), + path(std::move(other.path)) + { + other.state = EMPTY; + } + AOTOutput &operator=(AOTOutput &&other) noexcept + { + remove(); + name = std::move(other.name); + std::swap(state, other.state); + fd = other.fd; + path = std::move(other.path); + return *this; + } + + std::unique_ptr ostream() + { + open(); + return std::make_unique(fd, false); + } + + ErrorOr> memorybuf() + { + open(); + auto f = sys::fs::convertFDToNativeFile(fd); + sys::fs::file_status status; + if (auto err = sys::fs::status(fd, status)) + return err; + return MemoryBuffer::getOpenFile(f, name, status.getSize(), false); + } + + StringRef get_name() { return name; } + + void open() + { + using namespace sys::fs; + assert(state == EXISTS || state == OPEN); + if (state == OPEN) + return; + auto err = openFileForReadWrite(path, fd, CD_OpenExisting, OF_None); + if (err) + jl_errorf("failed to open temporary file %s\n", path.c_str()); + state = OPEN; + } + + void close() + { + if (state == OPEN) { + auto f = sys::fs::convertFDToNativeFile(fd); + (void)sys::fs::closeFile(f); + state = EXISTS; + } + } + + void remove() + { + close(); + if (state == EXISTS) { + assert(!path.empty()); + (void)sys::fs::remove(path); + state = EMPTY; + } + } + +private: + std::string name; + enum { EMPTY, EXISTS, OPEN } state; + int fd; + SmallString<128> path; +}; + struct AOTOutputs { - SmallVector unopt, opt, obj, asm_; + AOTOutputs(const char *bc_fname, const char *unopt_bc_fname, const char *obj_fname, + const char *asm_fname) + : bc_fname(bc_fname), + unopt_bc_fname(unopt_bc_fname), + obj_fname(obj_fname), + asm_fname(asm_fname) + { + if (bc_fname) + opt.emplace(); + if (unopt_bc_fname) + unopt.emplace(); + if (obj_fname) + obj.emplace(); + if (asm_fname) + asm_.emplace(); + } + + std::mutex lock; + const char *bc_fname, *unopt_bc_fname, *obj_fname, *asm_fname; + // If one of the vectors is present, this output is being requested. + std::optional> unopt, opt, obj, asm_; }; // Perform the actual optimization and emission of the output files -static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimers &timers, - bool unopt, bool opt, bool obj, bool asm_) { - assert((unopt || opt || obj || asm_) && "no output requested"); - AOTOutputs out; +static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM, ShardTimers *timer = nullptr) { auto TM = std::unique_ptr( SourceTM.getTarget().createTargetMachine( SourceTM.getTargetTriple().str(), @@ -1429,23 +1532,35 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer SourceTM.getCodeModel(), SourceTM.getOptLevel())); fixupTM(*TM); - if (unopt) { - timers.unopt.startTimer(); - raw_svector_ostream OS(out.unopt); - PassBuilder PB; - AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; - ModulePassManager MPM; - MPM.addPass(BitcodeWriterPass(OS)); - MPM.run(M, AM.MAM); - timers.unopt.stopTimer(); - } - if (!opt && !obj && !asm_) { - return out; + if (outputs.unopt) { + if (timer) + timer->unopt.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "unopt.bc"}; + auto OS = out.ostream(); + { + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(*OS)); + MPM.run(M, AM.MAM); + } + if (timer) + timer->unopt.stopTimer(); + OS->flush(); + out.close(); + { + std::lock_guard guard{outputs.lock}; + outputs.unopt->push_back(std::move(out)); + } + } + if (!outputs.opt && !outputs.obj && !outputs.asm_) { + return; } assert(!verifyLLVMIR(M)); { - timers.optimize.startTimer(); + if (timer) + timer->optimize.startTimer(); auto PMTM = std::unique_ptr( SourceTM.getTarget().createTargetMachine( @@ -1507,51 +1622,85 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer injectCRTAlias(M, "__truncsdbf2", "julia__truncdfbf2", FunctionType::get(Type::getBFloatTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false)); } - timers.optimize.stopTimer(); + if (timer) + timer->optimize.stopTimer(); } - if (opt) { - timers.opt.startTimer(); - raw_svector_ostream OS(out.opt); - PassBuilder PB; - AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; - ModulePassManager MPM; - MPM.addPass(BitcodeWriterPass(OS)); - MPM.run(M, AM.MAM); - timers.opt.stopTimer(); + if (outputs.opt) { + if (timer) + timer->opt.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "bc"}; + auto OS = out.ostream(); + { + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(*OS)); + MPM.run(M, AM.MAM); + } + OS->flush(); + out.close(); + if (timer) + timer->opt.stopTimer(); + { + std::lock_guard guard{outputs.lock}; + outputs.opt->push_back(std::move(out)); + } } - if (obj) { - timers.obj.startTimer(); - raw_svector_ostream OS(out.obj); - legacy::PassManager emitter; - addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (outputs.obj) { + if (timer) + timer->obj.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "o"}; + auto OS = out.ostream(); + { + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); #if JL_LLVM_VERSION >= 180000 - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CodeGenFileType::ObjectFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CodeGenFileType::ObjectFile, false)) #else - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CGFT_ObjectFile, false)) #endif - jl_safe_printf("ERROR: target does not support generation of object files\n"); - emitter.run(M); - timers.obj.stopTimer(); + jl_safe_printf("ERROR: target does not support generation of object files\n"); + emitter.run(M); + } + OS->flush(); + out.close(); + if (timer) + timer->obj.stopTimer(); + { + std::lock_guard guard{outputs.lock}; + outputs.obj->push_back(std::move(out)); + } } - if (asm_) { - timers.asm_.startTimer(); - raw_svector_ostream OS(out.asm_); - legacy::PassManager emitter; - addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (outputs.asm_) { + if (timer) + timer->asm_.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "s"}; + auto OS = out.ostream(); + { + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); #if JL_LLVM_VERSION >= 180000 - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CodeGenFileType::AssemblyFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, + CodeGenFileType::AssemblyFile, false)) #else - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CGFT_AssemblyFile, false)) #endif - jl_safe_printf("ERROR: target does not support generation of assembly files\n"); - emitter.run(M); - timers.asm_.stopTimer(); + jl_safe_printf( + "ERROR: target does not support generation of assembly files\n"); + emitter.run(M); + } + OS->flush(); + out.close(); + if (timer) + timer->asm_.stopTimer(); + { + std::lock_guard guard{outputs.lock}; + outputs.asm_->push_back(std::move(out)); + } } - - return out; } // serialize module to bitcode @@ -1724,14 +1873,53 @@ extern "C" void lambda_trampoline(void* arg) { delete func; } +template +static void +add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, ModuleReleasedFunc module_released) +{ + { + JL_TIMING(NATIVE_AOT, NATIVE_Opt); + // convert gvars to the expected offset table format for shard 0 + if (M.getGlobalVariable("jl_gvars")) { + auto gvars = consume_gv(M, "jl_gvars", false); + Type *T_size = M.getDataLayout().getIntPtrType(M.getContext()); + emit_offset_table(M, T_size, gvars, "jl_gvar", + "_0"); // module flag "julia.mv.suffix" + M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0"); + } + add_output_impl(outputs, M, TM); + } + // Don't need M anymore + module_released(M); +} + +static bool should_report_image_timings() +{ + bool report_timings = false; + if (auto env = getenv("JULIA_IMAGE_TIMINGS")) { + char *endptr; + unsigned long val = strtoul(env, &endptr, 10); + if (endptr != env && !*endptr && val <= 1) { + report_timings = val; + } else { + if (StringRef("true").compare_insensitive(env) == 0) + report_timings = true; + else if (StringRef("false").compare_insensitive(env) == 0) + report_timings = false; + else + errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n"; + } + } + return report_timings; +} + // Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, // as well as partitioning, serialization, and deserialization. template -static SmallVector add_output(Module &M, TargetMachine &TM, StringRef name, unsigned threads, - bool unopt_out, bool opt_out, bool obj_out, bool asm_out, ModuleReleasedFunc module_released) { - SmallVector outputs(threads); +static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, + unsigned threads, ModuleReleasedFunc module_released) +{ assert(threads); - assert(unopt_out || opt_out || obj_out || asm_out); // Timers for timing purposes TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); SmallVector timers(threads); @@ -1751,49 +1939,7 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri Timer partition_timer("partition", "Partition module", timer_group); Timer serialize_timer("serialize", "Serialize module", timer_group); Timer output_timer("output", "Add outputs", timer_group); - bool report_timings = false; - if (auto env = getenv("JULIA_IMAGE_TIMINGS")) { - char *endptr; - unsigned long val = strtoul(env, &endptr, 10); - if (endptr != env && !*endptr && val <= 1) { - report_timings = val; - } else { - if (StringRef("true").compare_insensitive(env) == 0) - report_timings = true; - else if (StringRef("false").compare_insensitive(env) == 0) - report_timings = false; - else - errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n"; - } - } - // Single-threaded case - if (threads == 1) { - output_timer.startTimer(); - { - JL_TIMING(NATIVE_AOT, NATIVE_Opt); - // convert gvars to the expected offset table format for shard 0 - if (M.getGlobalVariable("jl_gvars")) { - auto gvars = consume_gv(M, "jl_gvars", false); - Type *T_size = M.getDataLayout().getIntPtrType(M.getContext()); - emit_offset_table(M, T_size, gvars, "jl_gvar", "_0"); // module flag "julia.mv.suffix" - M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0"); - } - outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out); - } - output_timer.stopTimer(); - // Don't need M anymore - module_released(M); - - if (!report_timings) { - timer_group.clear(); - } else { - timer_group.print(dbgs(), true); - for (auto &t : timers) { - t.print(dbgs(), true); - } - } - return outputs; - } + bool report_timings = should_report_image_timings(); partition_timer.startTimer(); uint64_t counter = 0; @@ -1804,6 +1950,7 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri G.setName("jl_ext_" + Twine(counter++)); } } + auto partitions = partitionModule(M, threads); partition_timer.stopTimer(); @@ -1826,7 +1973,7 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri ctx.setDiscardValueNames(true); // Lazily deserialize the entire module timers[i].deserialize.startTimer(); - auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx); + auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx); // Make sure this also fails with only julia, but not LLVM assertions enabled, // otherwise, the first error we hit is the LLVM module verification failure, // which will look very confusing, because the module was partially deserialized. @@ -1842,6 +1989,7 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri timers[i].construct.startTimer(); std::string suffix = "_" + std::to_string(i); construct_vars(*M, partitions[i], suffix); + M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str()); M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix)); // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file // or it may skip emitting debug info for that file. Here set it to ./julia#N @@ -1850,7 +1998,7 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri CU->replaceOperandWith(0, topfile); timers[i].construct.stopTimer(); - outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out); + add_output_impl(outputs, *M, TM, &timers[i]); }; auto arg = new std::function(func); uv_thread_create(&workers[i], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes @@ -1881,7 +2029,6 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri } dbgs() << "]\n"; } - return outputs; } extern int jl_is_timing_passes; @@ -2016,13 +2163,8 @@ void jl_dump_native_impl(void *native_code, OverrideStackAlignment = M.getOverrideStackAlignment(); }); - auto compile = [&](Module &M, StringRef name, unsigned threads, auto module_released) { - return add_output(M, *SourceTM, name, threads, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, module_released); - }; + AOTOutputs outputs{bc_fname, unopt_bc_fname, obj_fname, asm_fname}; - SmallVector sysimg_outputs; - SmallVector data_outputs; - SmallVector metadata_outputs; if (z) { JL_TIMING(NATIVE_AOT, NATIVE_Sysimg); LLVMContext Context; @@ -2053,10 +2195,10 @@ void jl_dump_native_impl(void *native_code, // Results in serious memory savings ios_close(z); free(z); - // Note that we don't set z to null, this allows the check in WRITE_ARCHIVE + // Note that we don't set z to null, this allows the check in write_archive // to function as expected // no need to free the module/context, destructor handles that - sysimg_outputs = compile(sysimgM, "sysimg", 1, [](Module &) {}); + add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", [](Module &){}); } const bool imaging_mode = true; @@ -2161,14 +2303,15 @@ void jl_dump_native_impl(void *native_code, auto lock = TSCtx.getLock(); auto dataM = data->M.getModuleUnlocked(); - data_outputs = compile(*dataM, "text", threads, [data, &lock, &TSCtx](Module &) { - // Delete data when add_output thinks it's done with it - // Saves memory for use when multithreading - auto lock2 = std::move(lock); - delete data; - // Drop last reference to shared LLVM::Context - auto TSCtx2 = std::move(TSCtx); - }); + add_output(outputs, *dataM, *SourceTM, "text", threads, + [data, &lock, &TSCtx](Module &) { + // Delete data when add_output thinks it's done with it + // Saves memory for use when multithreading + auto lock2 = std::move(lock); + delete data; + // Drop last reference to shared LLVM::Context + auto TSCtx2 = std::move(TSCtx); + }); } if (params->emit_metadata) { @@ -2258,7 +2401,7 @@ void jl_dump_native_impl(void *native_code, } // no need to free module/context, destructor handles that - metadata_outputs = compile(metadataM, "data", 1, [](Module &) {}); + add_output_no_partition(outputs, metadataM, *SourceTM, "data", [](Module &) {}); } { @@ -2270,32 +2413,32 @@ void jl_dump_native_impl(void *native_code, #else #define WritingMode true #endif -#define WRITE_ARCHIVE(fname, field, prefix, suffix) \ - if (fname) {\ - SmallVector archive; \ - SmallVector filenames; \ - SmallVector buffers; \ - for (size_t i = 0; i < threads; i++) { \ - filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \ - buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \ - } \ - filenames.push_back("metadata" prefix suffix); \ - buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \ - if (z) { \ - filenames.push_back("sysimg" prefix suffix); \ - buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \ - } \ - for (size_t i = 0; i < filenames.size(); i++) { \ - archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \ - } \ - handleAllErrors(writeArchive(fname, archive, WritingMode, Kind, true, false), reportWriterError); \ - } - - WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc"); - WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc"); - WRITE_ARCHIVE(obj_fname, obj, "", ".o"); - WRITE_ARCHIVE(asm_fname, asm_, "", ".s"); -#undef WRITE_ARCHIVE + auto write_archive = [&](const char *fname, SmallVector &outputs) { + if (!fname) + return; + SmallVector archive; + // Must be SmallString<0> so StringRefs in NewArchiveMembers aren't invalidated + SmallVector, 0> buffers; + for (auto &out : outputs) { + auto buf = out.memorybuf(); + if (buf.getError()) + jl_errorf("failed to read temporary object file: %s", + buf.getError().message().c_str()); + buffers.push_back(std::move(*buf)); + archive.push_back(NewArchiveMember{*buffers.back()}); + } + handleAllErrors(writeArchive(fname, archive, WritingMode, Kind, true, false), + reportWriterError); + }; + + if (outputs.unopt) + write_archive(unopt_bc_fname, *outputs.unopt); + if (outputs.opt) + write_archive(bc_fname, *outputs.opt); + if (outputs.obj) + write_archive(obj_fname, *outputs.obj); + if (outputs.asm_) + write_archive(asm_fname, *outputs.asm_); } } From 76e8a80bd23af4a96ad4b290c32b297d0225aca3 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Wed, 20 Aug 2025 11:22:25 -0700 Subject: [PATCH 03/10] Use more partitions than threads in aotcompile (cherry picked from commit 1f159908e15a1b99762f9419de9a5b0b736f375b) --- src/aotcompile.cpp | 119 ++++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 49 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 7853856000934..304f63d69faa8 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1000,11 +1000,11 @@ static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_ptr) { } // See src/processor.h for documentation about this table. Corresponds to jl_image_header_t. -static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) { +static GlobalVariable *emit_image_header(Module &M, unsigned shards, unsigned nfvars, unsigned ngvars) { constexpr uint32_t version = 1; std::array header{ version, - threads, + shards, nfvars, ngvars, }; @@ -1913,17 +1913,9 @@ static bool should_report_image_timings() return report_timings; } -// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, -// as well as partitioning, serialization, and deserialization. -template -static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, - unsigned threads, ModuleReleasedFunc module_released) +static void initialize_shard_timers(StringRef name, SmallVector &timers) { - assert(threads); - // Timers for timing purposes - TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); - SmallVector timers(threads); - for (unsigned i = 0; i < threads; ++i) { + for (unsigned i = 0; i < timers.size(); ++i) { auto idx = std::to_string(i); timers[i].name = "shard_" + idx; timers[i].desc = ("Timings for " + name + " module shard " + idx).str(); @@ -1936,6 +1928,18 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String timers[i].obj.init("obj_" + idx, "Emit object file"); timers[i].asm_.init("asm_" + idx, "Emit assembly file"); } +} + +// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, +// as well as partitioning, serialization, and deserialization. +template +static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, + unsigned threads, unsigned shards, + ModuleReleasedFunc module_released) +{ + assert(threads); + // Timers for timing purposes + TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); Timer partition_timer("partition", "Partition module", timer_group); Timer serialize_timer("serialize", "Serialize module", timer_group); Timer output_timer("output", "Add outputs", timer_group); @@ -1951,7 +1955,7 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String } } - auto partitions = partitionModule(M, threads); + auto partitions = partitionModule(M, shards); partition_timer.stopTimer(); serialize_timer.startTimer(); @@ -1961,47 +1965,59 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String // Don't need M anymore, since we'll only read from serialized from now on module_released(M); + SmallVector timers(shards); + initialize_shard_timers(name, timers); + + std::atomic next_part = 0; + output_timer.startTimer(); // Start all of the worker threads { JL_TIMING(NATIVE_AOT, NATIVE_Opt); std::vector workers(threads); - for (unsigned i = 0; i < threads; i++) { - std::function func = [&, i]() { - LLVMContext ctx; - ctx.setDiscardValueNames(true); - // Lazily deserialize the entire module - timers[i].deserialize.startTimer(); - auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx); - // Make sure this also fails with only julia, but not LLVM assertions enabled, - // otherwise, the first error we hit is the LLVM module verification failure, - // which will look very confusing, because the module was partially deserialized. - bool deser_succeeded = (bool)EM; - auto M = cantFail(std::move(EM), "Error loading module"); - assert(deser_succeeded); (void)deser_succeeded; - timers[i].deserialize.stopTimer(); - - timers[i].materialize.startTimer(); - materializePreserved(*M, partitions[i]); - timers[i].materialize.stopTimer(); - - timers[i].construct.startTimer(); - std::string suffix = "_" + std::to_string(i); - construct_vars(*M, partitions[i], suffix); - M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str()); - M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix)); - // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file - // or it may skip emitting debug info for that file. Here set it to ./julia#N - DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), "."); - for (DICompileUnit *CU : M->debug_compile_units()) - CU->replaceOperandWith(0, topfile); - timers[i].construct.stopTimer(); - - add_output_impl(outputs, *M, TM, &timers[i]); + for (unsigned tid = 0; tid < threads; tid++) { + std::function func = [&]() { + while (1) { + unsigned i = std::atomic_fetch_add(&next_part, 1); + if (i >= shards) + return; + + Partition &partition = partitions[i]; + LLVMContext ctx; + ctx.setDiscardValueNames(true); + // Lazily deserialize the entire module + timers[i].deserialize.startTimer(); + auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx); + // Make sure this also fails with only julia, but not LLVM assertions enabled, + // otherwise, the first error we hit is the LLVM module verification failure, + // which will look very confusing, because the module was partially deserialized. + bool deser_succeeded = (bool)EM; + auto M = cantFail(std::move(EM), "Error loading module"); + assert(deser_succeeded); (void)deser_succeeded; + timers[i].deserialize.stopTimer(); + + timers[i].materialize.startTimer(); + materializePreserved(*M, partition); + timers[i].materialize.stopTimer(); + + timers[i].construct.startTimer(); + std::string suffix = "_" + std::to_string(i); + construct_vars(*M, partition, suffix); + M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str()); + M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix)); + // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file + // or it may skip emitting debug info for that file. Here set it to ./julia#N + DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), "."); + for (DICompileUnit *CU : M->debug_compile_units()) + CU->replaceOperandWith(0, topfile); + timers[i].construct.stopTimer(); + + add_output_impl(outputs, *M, TM, &timers[i]); + } }; auto arg = new std::function(func); - uv_thread_create(&workers[i], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes + uv_thread_create(&workers[tid], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes } // Wait for all of the worker threads to finish @@ -2297,13 +2313,18 @@ void jl_dump_native_impl(void *native_code, has_veccall = !!dataM.getModuleFlag("julia.mv.veccall"); }); + size_t nshards; { // Don't use withModuleDo here since we delete the TSM midway through auto TSCtx = data->M.getContext(); auto lock = TSCtx.getLock(); auto dataM = data->M.getModuleUnlocked(); - add_output(outputs, *dataM, *SourceTM, "text", threads, + auto info = compute_module_info(*dataM); + constexpr size_t weight_per_partition = 500000; + nshards = std::max(1, info.weight / weight_per_partition); + + add_output(outputs, *dataM, *SourceTM, "text", threads, nshards, [data, &lock, &TSCtx](Module &) { // Delete data when add_output thinks it's done with it // Saves memory for use when multithreading @@ -2372,9 +2393,9 @@ void jl_dump_native_impl(void *native_code, auto target_ids = new GlobalVariable(metadataM, value->getType(), true, GlobalVariable::InternalLinkage, value, "jl_dispatch_target_ids"); - auto shards = emit_shard_table(metadataM, T_size, T_psize, threads); + auto shards = emit_shard_table(metadataM, T_size, T_psize, nshards); auto ptls = emit_ptls_table(metadataM, T_size, T_ptr); - auto header = emit_image_header(metadataM, threads, nfvars, ngvars); + auto header = emit_image_header(metadataM, nshards, nfvars, ngvars); auto AT = ArrayType::get(T_size, sizeof(jl_small_typeof) / sizeof(void*)); auto jl_small_typeof_copy = new GlobalVariable(metadataM, AT, false, GlobalVariable::ExternalLinkage, From 213eb10fda9a71a21c950d9a8bca6aa6b352ce23 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Thu, 21 Aug 2025 10:44:48 -0700 Subject: [PATCH 04/10] Fix partitioning modules with zero fvars or gvars (cherry picked from commit 2c517642a968538e5d02ddf9678dc2ce3d9fde3e) --- src/aotcompile.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 304f63d69faa8..1645fb5fd8d62 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1024,22 +1024,24 @@ static void get_fvars_gvars(Module &M, DenseMap &fvars, assert(gvars_gv); assert(fvars_idxs); assert(gvars_idxs); - auto fvars_init = cast(fvars_gv->getInitializer()); - auto gvars_init = cast(gvars_gv->getInitializer()); - for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { - auto gv = cast(fvars_init->getOperand(i)->stripPointerCasts()); - assert(gv && gv->hasName() && "fvar must be a named global"); - assert(!fvars.count(gv) && "Duplicate fvar"); - fvars[gv] = i; - } - assert(fvars.size() == fvars_init->getNumOperands()); - for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { - auto gv = cast(gvars_init->getOperand(i)->stripPointerCasts()); - assert(gv && gv->hasName() && "gvar must be a named global"); - assert(!gvars.count(gv) && "Duplicate gvar"); - gvars[gv] = i; - } - assert(gvars.size() == gvars_init->getNumOperands()); + if (auto fvars_init = dyn_cast(fvars_gv->getInitializer())) { + for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { + auto gv = cast(fvars_init->getOperand(i)->stripPointerCasts()); + assert(gv && gv->hasName() && "fvar must be a named global"); + assert(!fvars.count(gv) && "Duplicate fvar"); + fvars[gv] = i; + } + assert(fvars.size() == fvars_init->getNumOperands()); + } + if (auto gvars_init = dyn_cast(gvars_gv->getInitializer())) { + for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { + auto gv = cast(gvars_init->getOperand(i)->stripPointerCasts()); + assert(gv && gv->hasName() && "gvar must be a named global"); + assert(!gvars.count(gv) && "Duplicate gvar"); + gvars[gv] = i; + } + assert(gvars.size() == gvars_init->getNumOperands()); + } fvars_gv->eraseFromParent(); gvars_gv->eraseFromParent(); fvars_idxs->eraseFromParent(); From 58aa0eeda2ea98da9bfbc557ec2300217b32f4e9 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Thu, 21 Aug 2025 12:24:21 -0700 Subject: [PATCH 05/10] Reduce partition weight to 100000, add JULIA_IMAGE_PARTITION_WEIGHT Also reuse already-computed ModuleInfo (cherry picked from commit be73aabf9e5130dce3cbca1a7e7080b2df76fd3e) --- src/aotcompile.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 1645fb5fd8d62..fa891b84d9710 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -2221,6 +2221,7 @@ void jl_dump_native_impl(void *native_code, const bool imaging_mode = true; unsigned threads = 1; + unsigned nshards; unsigned nfvars = 0; unsigned ngvars = 0; @@ -2278,6 +2279,17 @@ void jl_dump_native_impl(void *native_code, ); threads = compute_image_thread_count(module_info); LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n"); + + char *weight_s = getenv("JULIA_IMAGE_PARTITION_WEIGHT"); + size_t weight = 100000; + char *end; + if (weight_s) { + size_t x = strtol(weight_s, &end, 10); + if (weight_s != end) + weight = x; + } + nshards = std::max(1, module_info.weight / weight); + nfvars = data->jl_sysimg_fvars.size(); ngvars = data->jl_sysimg_gvars.size(); emit_table(dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); @@ -2315,17 +2327,11 @@ void jl_dump_native_impl(void *native_code, has_veccall = !!dataM.getModuleFlag("julia.mv.veccall"); }); - size_t nshards; { // Don't use withModuleDo here since we delete the TSM midway through auto TSCtx = data->M.getContext(); auto lock = TSCtx.getLock(); auto dataM = data->M.getModuleUnlocked(); - - auto info = compute_module_info(*dataM); - constexpr size_t weight_per_partition = 500000; - nshards = std::max(1, info.weight / weight_per_partition); - add_output(outputs, *dataM, *SourceTM, "text", threads, nshards, [data, &lock, &TSCtx](Module &) { // Delete data when add_output thinks it's done with it From c0dfbff2059b188990352558536048adabacf28f Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Thu, 21 Aug 2025 15:59:38 -0700 Subject: [PATCH 06/10] Use temporary buffers with madvise on Unix (cherry picked from commit e64d4bd0a277134a892277485ff701f6396a6922) --- src/aotcompile.cpp | 80 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 18 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index fa891b84d9710..f6bbede0c25ee 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -42,6 +42,9 @@ #include #include +#ifndef _OS_WINDOWS_ +#include +#endif using namespace llvm; @@ -1412,14 +1415,22 @@ struct ShardTimers { } }; +#ifdef _OS_WINDOWS_ +#define JL_USE_TEMP_FILES +#endif + class AOTOutput { public: - AOTOutput(const Twine &prefix, const char *suffix) - : name((prefix + "." + suffix).str()), state(OPEN) + AOTOutput(const Twine &prefix, const char *suffix) : name((prefix + "." + suffix).str()) { +#ifdef JL_USE_TEMP_FILES std::error_code err = sys::fs::createTemporaryFile(prefix, suffix, fd, path); if (err) jl_errorf("failed to create temporary file: %s", err.message().c_str()); + state = OPEN; +#else + state = MEMORY; +#endif } ~AOTOutput() { remove(); } AOTOutput(const AOTOutput &) = delete; @@ -1428,43 +1439,53 @@ class AOTOutput { : name(std::move(other.name)), state(other.state), fd(other.fd), - path(std::move(other.path)) + path(std::move(other.path)), + buf(std::move(other.buf)) { other.state = EMPTY; } AOTOutput &operator=(AOTOutput &&other) noexcept { remove(); - name = std::move(other.name); + std::swap(name, other.name); std::swap(state, other.state); - fd = other.fd; - path = std::move(other.path); + std::swap(fd, other.fd); + std::swap(path, other.path); + std::swap(buf, other.buf); return *this; } std::unique_ptr ostream() { open(); - return std::make_unique(fd, false); + if (state == OPEN) + return std::make_unique(fd, false); + else + return std::make_unique(buf); } ErrorOr> memorybuf() { open(); - auto f = sys::fs::convertFDToNativeFile(fd); - sys::fs::file_status status; - if (auto err = sys::fs::status(fd, status)) - return err; - return MemoryBuffer::getOpenFile(f, name, status.getSize(), false); + if (state == OPEN) { + auto f = sys::fs::convertFDToNativeFile(fd); + sys::fs::file_status status; + if (auto err = sys::fs::status(fd, status)) + return err; + return MemoryBuffer::getOpenFile(f, name, status.getSize(), false); + } + else if (state == MEMORY) { + return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name, + false); + } + jl_unreachable(); } - StringRef get_name() { return name; } - void open() { using namespace sys::fs; - assert(state == EXISTS || state == OPEN); - if (state == OPEN) + assert(state == EXISTS || state == OPEN || state == MEMORY); + if (state == OPEN || state == MEMORY) return; auto err = openFileForReadWrite(path, fd, CD_OpenExisting, OF_None); if (err) @@ -1479,6 +1500,20 @@ class AOTOutput { (void)sys::fs::closeFile(f); state = EXISTS; } + else if (state == MEMORY) { + void *p = (void *)((uintptr_t)buf.data() & ~(jl_page_size - 1)); + size_t s = LLT_ALIGN(buf.size(), jl_page_size); +#if defined(_OS_DARWIN_) || defined(_OS_FREEBSD_) || defined(_OS_OPENBSD_) + if (s > 0) + madvise(p, s, MADV_DONTNEED); +#elif defined(_OS_LINUX_) && defined(MADV_COLD) + if (s > 0) + madvise(p, s, MADV_COLD); +#else + (void)p; + (void)s; +#endif + } } void remove() @@ -1489,13 +1524,22 @@ class AOTOutput { (void)sys::fs::remove(path); state = EMPTY; } + else if (state == MEMORY) { + buf.clear(); + } } private: std::string name; - enum { EMPTY, EXISTS, OPEN } state; + enum { + EMPTY, // Temporary file removed/buffer freed + EXISTS, // Temporary file exists but is not open (save FDs) + OPEN, // Temporary file exists and is open + MEMORY, // Contents are stored in memory + } state; int fd; SmallString<128> path; + SmallVector buf; }; struct AOTOutputs { @@ -2221,7 +2265,7 @@ void jl_dump_native_impl(void *native_code, const bool imaging_mode = true; unsigned threads = 1; - unsigned nshards; + unsigned nshards = 1; unsigned nfvars = 0; unsigned ngvars = 0; From f1b783d6d006996ca6afdc2daea54d9a6c44edd3 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Fri, 22 Aug 2025 14:45:30 -0700 Subject: [PATCH 07/10] Use raw_fd_ostream to support old LLVM (cherry picked from commit 040d85f193a6c511397bc026f15ab390a8fb45d5) --- src/aotcompile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index f6bbede0c25ee..90b429e9f68d3 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1459,7 +1459,7 @@ class AOTOutput { { open(); if (state == OPEN) - return std::make_unique(fd, false); + return std::make_unique(fd, false); else return std::make_unique(buf); } From c67de81c8350d85562ae4cfb07c7b795dea4a5d2 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Fri, 22 Aug 2025 15:17:47 -0700 Subject: [PATCH 08/10] Use memory buffers exclusively on Unix, temp file on Windows (cherry picked from commit 78093988a261310af1f21cdcba4de8bf96407e9f) --- src/aotcompile.cpp | 123 ++++++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 90b429e9f68d3..2af7ac243d6cb 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -42,7 +42,9 @@ #include #include -#ifndef _OS_WINDOWS_ +#ifdef _OS_WINDOWS_ +#include +#else #include #endif @@ -1415,19 +1417,33 @@ struct ShardTimers { } }; -#ifdef _OS_WINDOWS_ -#define JL_USE_TEMP_FILES -#endif +// If an AOTOutput is greater than this many bytes, we should write it to a +// temporary file on Windows, or MADV_DONTNEED/MADV_COLD it on Unix to alleviate +// memory pressure. Except in rare cases, this should be triggered only by the +// output containing the heap image. +constexpr size_t jl_large_aotoutput = 64 * 1024 * 1024; // 64 MiB class AOTOutput { public: AOTOutput(const Twine &prefix, const char *suffix) : name((prefix + "." + suffix).str()) { -#ifdef JL_USE_TEMP_FILES - std::error_code err = sys::fs::createTemporaryFile(prefix, suffix, fd, path); - if (err) - jl_errorf("failed to create temporary file: %s", err.message().c_str()); - state = OPEN; +#ifdef _OS_WINDOWS_ + SmallString<128> path; + SmallVector path_utf16; + auto model = prefix + "-%%%%%%." + suffix; + sys::fs::createUniquePath(model, path, true); + auto fail = [&]() { + jl_errorf("failed to create temporary file: %s", path.c_str()); + }; + if (sys::windows::widenPath(path, path_utf16)) + fail(); + file = CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ, nullptr, CREATE_ALWAYS, + FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr); + if (file == INVALID_HANDLE_VALUE) + fail(); + fd = _open_osfhandle((intptr_t)file, 0); + state = TMP_OPEN; #else state = MEMORY; #endif @@ -1438,9 +1454,12 @@ class AOTOutput { AOTOutput(AOTOutput &&other) noexcept : name(std::move(other.name)), state(other.state), +#ifdef _OS_WINDOWS_ + file(other.file), fd(other.fd), - path(std::move(other.path)), +#endif buf(std::move(other.buf)) + { other.state = EMPTY; } @@ -1449,58 +1468,45 @@ class AOTOutput { remove(); std::swap(name, other.name); std::swap(state, other.state); +#ifdef _OS_WINDOWS_ + std::swap(file, other.file); std::swap(fd, other.fd); - std::swap(path, other.path); +#endif std::swap(buf, other.buf); return *this; } std::unique_ptr ostream() { - open(); - if (state == OPEN) +#ifdef _OS_WINDOWS_ + if (state == TMP_OPEN) { return std::make_unique(fd, false); - else - return std::make_unique(buf); + } +#endif + assert(state == MEMORY); + return std::make_unique(buf); } ErrorOr> memorybuf() { - open(); - if (state == OPEN) { - auto f = sys::fs::convertFDToNativeFile(fd); +#ifdef _OS_WINDOWS_ + if (state == TMP_OPEN) { sys::fs::file_status status; if (auto err = sys::fs::status(fd, status)) return err; - return MemoryBuffer::getOpenFile(f, name, status.getSize(), false); - } - else if (state == MEMORY) { - return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name, - false); + return MemoryBuffer::getOpenFile(file, name, status.getSize(), false); } - jl_unreachable(); - } - - void open() - { - using namespace sys::fs; - assert(state == EXISTS || state == OPEN || state == MEMORY); - if (state == OPEN || state == MEMORY) - return; - auto err = openFileForReadWrite(path, fd, CD_OpenExisting, OF_None); - if (err) - jl_errorf("failed to open temporary file %s\n", path.c_str()); - state = OPEN; +#endif + assert(state == MEMORY); + return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name, false); } - void close() + // Signal that we are done with writing to this output for the time being; + // inform the operating system it should page the memory out if we're + // running low. + void done() { - if (state == OPEN) { - auto f = sys::fs::convertFDToNativeFile(fd); - (void)sys::fs::closeFile(f); - state = EXISTS; - } - else if (state == MEMORY) { + if (state == MEMORY && buf.size() >= jl_large_aotoutput) { void *p = (void *)((uintptr_t)buf.data() & ~(jl_page_size - 1)); size_t s = LLT_ALIGN(buf.size(), jl_page_size); #if defined(_OS_DARWIN_) || defined(_OS_FREEBSD_) || defined(_OS_OPENBSD_) @@ -1518,27 +1524,30 @@ class AOTOutput { void remove() { - close(); - if (state == EXISTS) { - assert(!path.empty()); - (void)sys::fs::remove(path); +#ifdef _OS_WINDOWS_ + if (state == TMP_OPEN) { + close(fd); state = EMPTY; + return; } - else if (state == MEMORY) { +#endif + if (state == MEMORY) { buf.clear(); + state = EMPTY; } } private: std::string name; enum { - EMPTY, // Temporary file removed/buffer freed - EXISTS, // Temporary file exists but is not open (save FDs) - OPEN, // Temporary file exists and is open - MEMORY, // Contents are stored in memory + EMPTY, // Temporary file removed/buffer freed + TMP_OPEN, // Temporary file exists and is open, but will be deleted on close (Windows). + MEMORY, // Contents are stored in memory } state; +#ifdef _OS_WINDOWS_ + HANDLE file; int fd; - SmallString<128> path; +#endif SmallVector buf; }; @@ -1593,7 +1602,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc if (timer) timer->unopt.stopTimer(); OS->flush(); - out.close(); + out.done(); { std::lock_guard guard{outputs.lock}; outputs.unopt->push_back(std::move(out)); @@ -1685,7 +1694,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc MPM.run(M, AM.MAM); } OS->flush(); - out.close(); + out.done(); if (timer) timer->opt.stopTimer(); { @@ -1711,7 +1720,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc emitter.run(M); } OS->flush(); - out.close(); + out.done(); if (timer) timer->obj.stopTimer(); { @@ -1739,7 +1748,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc emitter.run(M); } OS->flush(); - out.close(); + out.done(); if (timer) timer->asm_.stopTimer(); { From f89bc0a7051c8ef32fece0b408fdb272d28db6d5 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Mon, 25 Aug 2025 09:57:59 -0700 Subject: [PATCH 09/10] Do not serialize module if only one shard is necessary (cherry picked from commit b66bc0152bd75b401ce112336d7f6c5bd0d6ebac) --- src/aotcompile.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 2af7ac243d6cb..fdcfd69198185 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1993,6 +1993,11 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String ModuleReleasedFunc module_released) { assert(threads); + if (shards <= 1) { + add_output_no_partition(outputs, M, TM, name, module_released); + return; + } + // Timers for timing purposes TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); Timer partition_timer("partition", "Partition module", timer_group); @@ -2334,7 +2339,7 @@ void jl_dump_native_impl(void *native_code, LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n"); char *weight_s = getenv("JULIA_IMAGE_PARTITION_WEIGHT"); - size_t weight = 100000; + size_t weight = 500000; char *end; if (weight_s) { size_t x = strtol(weight_s, &end, 10); From 6d04991a8c16be6602d5b56742c196465af9e95d Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Mon, 25 Aug 2025 10:15:37 -0700 Subject: [PATCH 10/10] Use temporary file only for sysimgM (Windows) (cherry picked from commit 57f420a21e5a3005f30ebcdab6d8d97a40502708) --- src/aotcompile.cpp | 80 ++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index fdcfd69198185..4df5c9090e488 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1417,36 +1417,41 @@ struct ShardTimers { } }; -// If an AOTOutput is greater than this many bytes, we should write it to a -// temporary file on Windows, or MADV_DONTNEED/MADV_COLD it on Unix to alleviate -// memory pressure. Except in rare cases, this should be triggered only by the -// output containing the heap image. +// If an AOTOutput is greater than this many bytes, madvise +// MADV_DONTNEED/MADV_COLD it on Unix to alleviate memory pressure. Except in +// rare cases, this should be triggered only by the output containing the heap +// image. constexpr size_t jl_large_aotoutput = 64 * 1024 * 1024; // 64 MiB class AOTOutput { public: - AOTOutput(const Twine &prefix, const char *suffix) : name((prefix + "." + suffix).str()) + // If large = true and we are on Windows, use a temporary file. + AOTOutput(const Twine &prefix, const char *suffix, bool large = false) + : name((prefix + "." + suffix).str()) { #ifdef _OS_WINDOWS_ - SmallString<128> path; - SmallVector path_utf16; - auto model = prefix + "-%%%%%%." + suffix; - sys::fs::createUniquePath(model, path, true); - auto fail = [&]() { - jl_errorf("failed to create temporary file: %s", path.c_str()); - }; - if (sys::windows::widenPath(path, path_utf16)) - fail(); - file = CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ, nullptr, CREATE_ALWAYS, - FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr); - if (file == INVALID_HANDLE_VALUE) - fail(); - fd = _open_osfhandle((intptr_t)file, 0); - state = TMP_OPEN; -#else - state = MEMORY; + if (large) { + SmallString<128> path; + SmallVector path_utf16; + auto model = prefix + "-%%%%%%." + suffix; + sys::fs::createUniquePath(model, path, true); + auto fail = [&]() { + jl_errorf("failed to create temporary file: %s", path.c_str()); + }; + if (sys::windows::widenPath(path, path_utf16)) + fail(); + file = + CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ, nullptr, CREATE_ALWAYS, + FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr); + if (file == INVALID_HANDLE_VALUE) + fail(); + fd = _open_osfhandle((intptr_t)file, 0); + state = TMP_OPEN; + return; + } #endif + state = MEMORY; } ~AOTOutput() { remove(); } AOTOutput(const AOTOutput &) = delete; @@ -1576,7 +1581,9 @@ struct AOTOutputs { }; // Perform the actual optimization and emission of the output files -static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM, ShardTimers *timer = nullptr) { +static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM, + bool large = false, ShardTimers *timer = nullptr) +{ auto TM = std::unique_ptr( SourceTM.getTarget().createTargetMachine( SourceTM.getTargetTriple().str(), @@ -1590,7 +1597,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc if (outputs.unopt) { if (timer) timer->unopt.startTimer(); - AOTOutput out{M.getModuleIdentifier(), "unopt.bc"}; + AOTOutput out{M.getModuleIdentifier(), "unopt.bc", large}; auto OS = out.ostream(); { PassBuilder PB; @@ -1684,7 +1691,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc if (outputs.opt) { if (timer) timer->opt.startTimer(); - AOTOutput out{M.getModuleIdentifier(), "bc"}; + AOTOutput out{M.getModuleIdentifier(), "bc", large}; auto OS = out.ostream(); { PassBuilder PB; @@ -1706,7 +1713,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc if (outputs.obj) { if (timer) timer->obj.startTimer(); - AOTOutput out{M.getModuleIdentifier(), "o"}; + AOTOutput out{M.getModuleIdentifier(), "o", large}; auto OS = out.ostream(); { legacy::PassManager emitter; @@ -1732,7 +1739,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc if (outputs.asm_) { if (timer) timer->asm_.startTimer(); - AOTOutput out{M.getModuleIdentifier(), "s"}; + AOTOutput out{M.getModuleIdentifier(), "s", large}; auto OS = out.ostream(); { legacy::PassManager emitter; @@ -1929,8 +1936,9 @@ extern "C" void lambda_trampoline(void* arg) { } template -static void -add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, ModuleReleasedFunc module_released) +static void add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, + StringRef name, bool large, + ModuleReleasedFunc module_released) { { JL_TIMING(NATIVE_AOT, NATIVE_Opt); @@ -1942,7 +1950,7 @@ add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, Strin "_0"); // module flag "julia.mv.suffix" M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0"); } - add_output_impl(outputs, M, TM); + add_output_impl(outputs, M, TM, large); } // Don't need M anymore module_released(M); @@ -1994,7 +2002,7 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String { assert(threads); if (shards <= 1) { - add_output_no_partition(outputs, M, TM, name, module_released); + add_output_no_partition(outputs, M, TM, name, false, module_released); return; } @@ -2073,7 +2081,7 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String CU->replaceOperandWith(0, topfile); timers[i].construct.stopTimer(); - add_output_impl(outputs, *M, TM, &timers[i]); + add_output_impl(outputs, *M, TM, false, &timers[i]); } }; auto arg = new std::function(func); @@ -2274,7 +2282,8 @@ void jl_dump_native_impl(void *native_code, // Note that we don't set z to null, this allows the check in write_archive // to function as expected // no need to free the module/context, destructor handles that - add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", [](Module &){}); + add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", true, + [](Module &) {}); } const bool imaging_mode = true; @@ -2488,7 +2497,8 @@ void jl_dump_native_impl(void *native_code, } // no need to free module/context, destructor handles that - add_output_no_partition(outputs, metadataM, *SourceTM, "data", [](Module &) {}); + add_output_no_partition(outputs, metadataM, *SourceTM, "data", false, + [](Module &) {}); } {