From a57d5d0ef66c1b060c94cb1297e2558d653c240c Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Wed, 12 Nov 2025 16:13:14 -0800
Subject: [PATCH 01/10] Add JLJITLinkMemoryManager (ports memory manager to
 JITLink) (#60105)

Ports our RTDyld memory manager to JITLink in order to avoid memory-use
regressions after switching to JITLink everywhere (#60031). This is a
direct port: finalization must happen all at once, because it
invalidates the `wr_ptr`s of all allocations. I decided it wasn't worth
associating `OnFinalizedFunction` callbacks with each block, since
blocks are large enough to make it extremely likely that all in-flight
allocations land in the same block; everything must be relocated before
finalization can happen.

(cherry picked from commit 6fa0e756d1464e4eae9e27acb78815766e90251d)
---
 src/cgmemmgr.cpp  | 233 +++++++++++++++++++++++++++++++++++++---------
 src/jitlayers.cpp |   7 +-
 2 files changed, 188 insertions(+), 52 deletions(-)
diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp
index c257d2a2e3331..7cf358ddf1e95 100644
--- a/src/cgmemmgr.cpp
+++ b/src/cgmemmgr.cpp
@@ -3,7 +3,11 @@
 #include "llvm-version.h"
 #include "platform.h"
 
+#include <llvm/ExecutionEngine/JITLink/JITLink.h>
+#include <llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h>
+#include <llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h>
 #include <llvm/ExecutionEngine/SectionMemoryManager.h>
+
 #include "julia.h"
 #include "julia_internal.h"
 
@@ -458,18 +462,27 @@ struct Block {
     }
 };
 
+struct Allocation { // result of one RWAllocator/ROAllocator alloc() call
+    // Address to write to (what the RTDyld allocate*Section functions hand back)
+    void *wr_addr;
+    // Runtime address
+    void *rt_addr;
+    size_t sz; // the size that was passed to alloc()
+    bool relocated; // alloc() always stores false; presumably set once relocated -- TODO confirm
+};
+
 class RWAllocator {
     static constexpr int nblocks = 8;
     Block blocks[nblocks]{};
 public:
     RWAllocator() JL_NOTSAFEPOINT = default;
-    void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT
+    Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT
     {
         size_t min_size = (size_t)-1;
         int min_id = 0;
         for (int i = 0;i < nblocks && blocks[i].ptr;i++) {
             if (void *ptr = blocks[i].alloc(size, align))
-                return ptr;
+                return {ptr, ptr, size, false};
             if (blocks[i].avail < min_size) {
                 min_size = blocks[i].avail;
                 min_id = i;
@@ -477,7 +490,8 @@ class RWAllocator {
         }
         size_t block_size = get_block_size(size);
         blocks[min_id].reset(map_anon_page(block_size), block_size);
-        return blocks[min_id].alloc(size, align);
+        void *ptr = blocks[min_id].alloc(size, align);
+        return {ptr, ptr, size, false};
     }
 };
 
@@ -517,16 +531,6 @@ struct SplitPtrBlock : public Block {
     }
 };
 
-struct Allocation {
-    // Address to write to (the one returned by the allocation function)
-    void *wr_addr;
-    // Runtime address
-    void *rt_addr;
-    size_t sz;
-    bool relocated;
-};
-
-template<bool exec>
 class ROAllocator {
 protected:
     static constexpr int nblocks = 8;
@@ -554,7 +558,7 @@ class ROAllocator {
     }
     // Allocations that have not been finalized yet.
     SmallVector<Allocation, 16> allocations;
-    void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT
+    Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT
     {
         size_t min_size = (size_t)-1;
         int min_id = 0;
@@ -570,8 +574,9 @@ class ROAllocator {
                     wr_ptr = get_wr_ptr(block, ptr, size, align);
                 }
                 block.state |= SplitPtrBlock::Alloc;
-                allocations.push_back(Allocation{wr_ptr, ptr, size, false});
-                return wr_ptr;
+                Allocation a{wr_ptr, ptr, size, false};
+                allocations.push_back(a);
+                return a;
             }
             if (block.avail < min_size) {
                 min_size = block.avail;
@@ -592,18 +597,21 @@ class ROAllocator {
 #ifdef _OS_WINDOWS_
         block.state = SplitPtrBlock::Alloc;
         void *wr_ptr = get_wr_ptr(block, ptr, size, align);
-        allocations.push_back(Allocation{wr_ptr, ptr, size, false});
+        Allocation a{wr_ptr, ptr, size, false};
+        allocations.push_back(a);
         ptr = wr_ptr;
 #else
         block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc;
-        allocations.push_back(Allocation{ptr, ptr, size, false});
+        Allocation a{ptr, ptr, size, false};
+        allocations.push_back(a);
 #endif
-        return ptr;
+        return a;
     }
 };
 
-template<bool exec>
-class DualMapAllocator : public ROAllocator<exec> {
+class DualMapAllocator : public ROAllocator {
+    bool exec;
+
 protected:
     void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override JL_NOTSAFEPOINT
     {
@@ -664,7 +672,7 @@ class DualMapAllocator : public ROAllocator<exec> {
         }
     }
 public:
-    DualMapAllocator() JL_NOTSAFEPOINT
+    DualMapAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec)
     {
         assert(anon_hdl != -1);
     }
@@ -677,13 +685,13 @@ class DualMapAllocator : public ROAllocator<exec> {
             finalize_block(block, true);
             block.reset(nullptr, 0);
         }
-        ROAllocator<exec>::finalize();
+        ROAllocator::finalize();
     }
 };
 
 #ifdef _OS_LINUX_
-template<bool exec>
-class SelfMemAllocator : public ROAllocator<exec> {
+class SelfMemAllocator : public ROAllocator {
+    bool exec;
     SmallVector<Block, 16> temp_buff;
 protected:
     void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr,
@@ -720,9 +728,7 @@ class SelfMemAllocator : public ROAllocator<exec> {
         }
     }
 public:
-    SelfMemAllocator() JL_NOTSAFEPOINT
-        : ROAllocator<exec>(),
-          temp_buff()
+    SelfMemAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec), temp_buff()
     {
         assert(get_self_mem_fd() != -1);
     }
@@ -756,11 +762,25 @@ class SelfMemAllocator : public ROAllocator<exec> {
         }
         if (cached)
             temp_buff.resize(1);
-        ROAllocator<exec>::finalize();
+        ROAllocator::finalize();
     }
 };
 #endif // _OS_LINUX_
 
+std::pair<std::unique_ptr<ROAllocator>, std::unique_ptr<ROAllocator>>
+get_preferred_allocators() JL_NOTSAFEPOINT // returns {non-exec, exec} allocators
+{
+#ifdef _OS_LINUX_
+    if (get_self_mem_fd() != -1) // tried first on Linux
+        return {std::make_unique<SelfMemAllocator>(false),
+                std::make_unique<SelfMemAllocator>(true)};
+#endif
+    if (init_shared_map() != -1) // otherwise use the dual-mapping allocator
+        return {std::make_unique<DualMapAllocator>(false),
+                std::make_unique<DualMapAllocator>(true)};
+    return {}; // neither is available: both pointers are null
+}
+
 class RTDyldMemoryManagerJL : public SectionMemoryManager {
     struct EHFrame {
         uint8_t *addr;
@@ -770,8 +790,8 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager {
     void operator=(const RTDyldMemoryManagerJL&) = delete;
     SmallVector<EHFrame, 16> pending_eh;
     RWAllocator rw_alloc;
-    std::unique_ptr<ROAllocator<false>> ro_alloc;
-    std::unique_ptr<ROAllocator<true>> exe_alloc;
+    std::unique_ptr<ROAllocator> ro_alloc;
+    std::unique_ptr<ROAllocator> exe_alloc;
     size_t total_allocated;
 
 public:
@@ -779,20 +799,9 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager {
         : SectionMemoryManager(),
           pending_eh(),
           rw_alloc(),
-          ro_alloc(),
-          exe_alloc(),
           total_allocated(0)
     {
-#ifdef _OS_LINUX_
-        if (!ro_alloc && get_self_mem_fd() != -1) {
-            ro_alloc.reset(new SelfMemAllocator<false>());
-            exe_alloc.reset(new SelfMemAllocator<true>());
-        }
-#endif
-        if (!ro_alloc && init_shared_map() != -1) {
-            ro_alloc.reset(new DualMapAllocator<false>());
-            exe_alloc.reset(new DualMapAllocator<true>());
-        }
+        std::tie(ro_alloc, exe_alloc) = get_preferred_allocators();
     }
     ~RTDyldMemoryManagerJL() override JL_NOTSAFEPOINT
     {
@@ -845,7 +854,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size,
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size);
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size);
     if (exe_alloc)
-        return (uint8_t*)exe_alloc->alloc(Size, Alignment);
+        return (uint8_t*)exe_alloc->alloc(Size, Alignment).wr_addr;
     return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID,
                                                      SectionName);
 }
@@ -860,9 +869,9 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size,
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size);
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size);
     if (!isReadOnly)
-        return (uint8_t*)rw_alloc.alloc(Size, Alignment);
+        return (uint8_t*)rw_alloc.alloc(Size, Alignment).wr_addr;
     if (ro_alloc)
-        return (uint8_t*)ro_alloc->alloc(Size, Alignment);
+        return (uint8_t*)ro_alloc->alloc(Size, Alignment).wr_addr;
     return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
                                                      SectionName, isReadOnly);
 }
@@ -917,6 +926,133 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr,
 }
 #endif
 
+class JLJITLinkMemoryManager : public jitlink::JITLinkMemoryManager {
+    using OnFinalizedFunction =
+        jitlink::JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction;
+    // JITLink memory manager that places segments using RWAllocator/ROAllocator.
+    std::mutex Mutex; // held around allocator use and FinalizedCallbacks updates
+    RWAllocator RWAlloc; // serves Read|Write segments
+    std::unique_ptr<ROAllocator> ROAlloc; // serves Read-only segments
+    std::unique_ptr<ROAllocator> ExeAlloc; // serves Read|Exec segments
+    SmallVector<OnFinalizedFunction> FinalizedCallbacks; // queued until InFlight reaches 0
+    uint32_t InFlight{0}; // allocate() calls not yet matched by finalize()
+
+public:
+    class InFlightAlloc;
+    // Uses the custom allocators when both exist, otherwise LLVM's mapper-based manager.
+    static std::unique_ptr<JITLinkMemoryManager> Create()
+    {
+        auto [ROAlloc, ExeAlloc] = get_preferred_allocators();
+        if (ROAlloc && ExeAlloc)
+            return std::unique_ptr<JLJITLinkMemoryManager>(
+                new JLJITLinkMemoryManager(std::move(ROAlloc), std::move(ExeAlloc)));
+
+        return cantFail(
+            orc::MapperJITLinkMemoryManager::CreateWithMapper<orc::InProcessMemoryMapper>(
+                /*Reservation Granularity*/ 16 * 1024 * 1024));
+    }
+    // Assigns memory to every segment of G; on success passes an InFlightAlloc to OnAllocated.
+    void allocate(const jitlink::JITLinkDylib *JD, jitlink::LinkGraph &G,
+                  OnAllocatedFunction OnAllocated) override;
+    // Deallocation is not implemented; this path is marked unreachable.
+    void deallocate(std::vector<FinalizedAlloc> Allocs,
+                    OnDeallocatedFunction OnDeallocated) override
+    {
+        jl_unreachable();
+    }
+
+protected:
+    JLJITLinkMemoryManager(std::unique_ptr<ROAllocator> ROAlloc,
+                           std::unique_ptr<ROAllocator> ExeAlloc)
+      : ROAlloc(std::move(ROAlloc)), ExeAlloc(std::move(ExeAlloc))
+    {
+    }
+    // Called from InFlightAlloc::finalize: queues OnFinalized until all in-flight allocations arrive.
+    void finalize(OnFinalizedFunction OnFinalized)
+    {
+        SmallVector<OnFinalizedFunction> Callbacks;
+        {
+            std::unique_lock Lock{Mutex};
+            FinalizedCallbacks.push_back(std::move(OnFinalized));
+
+            if (--InFlight > 0)
+                return; // others are still in flight; the callback stays queued
+            // Last one: finalize both ROAllocators and take the whole callback batch.
+            ROAlloc->finalize();
+            ExeAlloc->finalize();
+            Callbacks = std::move(FinalizedCallbacks);
+        }
+        // The callbacks run after Mutex has been released.
+        for (auto &CB : Callbacks)
+            std::move(CB)(FinalizedAlloc{});
+    }
+};
+
+class JLJITLinkMemoryManager::InFlightAlloc
+  : public jitlink::JITLinkMemoryManager::InFlightAlloc {
+    JLJITLinkMemoryManager &MM;
+    jitlink::LinkGraph &G;
+    // Handed to OnAllocated by allocate(); keeps references to its manager and graph.
+public:
+    InFlightAlloc(JLJITLinkMemoryManager &MM, jitlink::LinkGraph &G) : MM(MM), G(G) {}
+    // Abandoning an allocation is not implemented; this path is marked unreachable.
+    void abandon(OnAbandonedFunction OnAbandoned) override { jl_unreachable(); }
+    // Registers with MM a callback that runs G's finalize actions, then calls OnFinalized.
+    void finalize(OnFinalizedFunction OnFinalized) override
+    {
+        auto *GP = &G; // captured by pointer; assumes G outlives the callback -- TODO confirm
+        MM.finalize([GP, OnFinalized =
+                             std::move(OnFinalized)](Expected<FinalizedAlloc> FA) mutable {
+            if (!FA)
+                return OnFinalized(FA.takeError());
+            // Need to handle dealloc actions when we GC code (E's value is unused for now)
+            auto E = orc::shared::runFinalizeActions(GP->allocActions());
+            if (!E)
+                return OnFinalized(E.takeError());
+            OnFinalized(std::move(FA));
+        });
+    }
+};
+
+using orc::MemProt;
+
+void JLJITLinkMemoryManager::allocate(const jitlink::JITLinkDylib *JD,
+                                      jitlink::LinkGraph &G,
+                                      OnAllocatedFunction OnAllocated)
+{
+    // Give every segment of G memory from RWAlloc/ROAlloc/ExeAlloc, then apply the layout.
+    jitlink::BasicLayout BL{G};
+    // Held until InFlight is bumped: finalize() must not observe a zero count (and
+    // finalize the allocators) while this allocation is still being set up.
+    std::unique_lock Lock{Mutex};
+    for (auto &[AG, Seg] : BL.segments()) {
+        if (AG.getMemLifetime() == orc::MemLifetime::NoAlloc)
+            continue;
+        assert(AG.getMemLifetime() == orc::MemLifetime::Standard);
+        auto Prot = AG.getMemProt();
+        uint64_t Alignment = Seg.Alignment.value();
+        uint64_t Size = Seg.ContentSize + Seg.ZeroFillSize;
+        Allocation Alloc;
+        if (Prot == (MemProt::Read | MemProt::Write))
+            Alloc = RWAlloc.alloc(Size, Alignment);
+        else if (Prot == MemProt::Read)
+            Alloc = ROAlloc->alloc(Size, Alignment);
+        else if (Prot == (MemProt::Read | MemProt::Exec))
+            Alloc = ExeAlloc->alloc(Size, Alignment);
+        else
+            abort(); // no allocator handles any other protection combination
+        // rt_addr is the runtime address; wr_addr is where the content gets written.
+        Seg.Addr = orc::ExecutorAddr::fromPtr(Alloc.rt_addr);
+        Seg.WorkingMem = (char *)Alloc.wr_addr;
+    }
+    if (auto Err = BL.apply()) {
+        Lock.unlock(); // never invoke the callback with Mutex held
+        return OnAllocated(std::move(Err));
+    }
+    ++InFlight; // paired with the --InFlight in finalize(), now under the same lock
+    Lock.unlock();
+    OnAllocated(std::make_unique<InFlightAlloc>(*this, G));
+}
 }
 
 RTDyldMemoryManager* createRTDyldMemoryManager() JL_NOTSAFEPOINT
@@ -928,3 +1064,8 @@ size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT
 {
     return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes();
 }
+
+std::unique_ptr<jitlink::JITLinkMemoryManager> createJITLinkMemoryManager()
+{
+    return JLJITLinkMemoryManager::Create(); // forwards to the class's factory function
+}
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 3ea95ea42f596..299e5d39d2c9f 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -1156,12 +1156,6 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin {
 #pragma clang diagnostic ignored "-Wunused-function"
 #endif
 
-// TODO: Port our memory management optimisations to JITLink instead of using the
-// default InProcessMemoryManager.
-std::unique_ptr<jitlink::JITLinkMemoryManager> createJITLinkMemoryManager() JL_NOTSAFEPOINT {
-    return cantFail(orc::MapperJITLinkMemoryManager::CreateWithMapper<orc::InProcessMemoryMapper>(/*Reservation Granularity*/ 16 * 1024 * 1024));
-}
-
 #ifdef _COMPILER_CLANG_
 #pragma clang diagnostic pop
 #endif
@@ -1185,6 +1179,7 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar {
 };
 
 RTDyldMemoryManager *createRTDyldMemoryManager(void) JL_NOTSAFEPOINT;
+std::unique_ptr<jitlink::JITLinkMemoryManager> createJITLinkMemoryManager() JL_NOTSAFEPOINT;
 
 // A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr
 class ForwardingMemoryManager : public RuntimeDyld::MemoryManager {

From 5c0221d20ed3250de0bbf7cc70add4b1c473b84b Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Wed, 20 Aug 2025 10:58:21 -0700
Subject: [PATCH 02/10] Use temporary files for aotcompile outputs instead of
 memory

(cherry picked from commit d844566a08b644ef206d81af2cfab92b82142bf9)
---
 src/aotcompile.cpp | 429 ++++++++++++++++++++++++++++++---------------
 1 file changed, 286 insertions(+), 143 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 6009bd435534c..7853856000934 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1410,15 +1410,118 @@ struct ShardTimers {
     }
 };
 
+class AOTOutput { // Move-only owner of one temporary file; removes it on destruction
+public:
+    AOTOutput(const Twine &prefix, const char *suffix)
+      : name((prefix + "." + suffix).str()), state(OPEN)
+    {
+        std::error_code err = sys::fs::createTemporaryFile(prefix, suffix, fd, path);
+        if (err)
+            jl_errorf("failed to create temporary file: %s", err.message().c_str());
+    }
+    ~AOTOutput() { remove(); }
+    AOTOutput(const AOTOutput &) = delete;
+    AOTOutput &operator=(const AOTOutput &) = delete;
+    AOTOutput(AOTOutput &&other) noexcept
+      : name(std::move(other.name)), state(other.state), fd(other.fd),
+        path(std::move(other.path))
+    {
+        other.state = EMPTY; // the moved-from object must not close/remove the file
+    }
+    AOTOutput &operator=(AOTOutput &&other) noexcept
+    {
+        if (this == &other)
+            return *this; // self-move would otherwise delete our own file
+        remove(); // release what we own; this leaves state == EMPTY ...
+        name = std::move(other.name);
+        std::swap(state, other.state); // ... so the swap also marks `other` EMPTY
+        fd = other.fd;
+        path = std::move(other.path);
+        return *this;
+    }
+    // Stream over fd (file reopened if needed), created with shouldClose = false.
+    std::unique_ptr<raw_pwrite_stream> ostream()
+    {
+        open();
+        return std::make_unique<raw_fd_stream>(fd, false);
+    }
+    // Buffer over the file, sized from sys::fs::status and labelled with `name`.
+    ErrorOr<std::unique_ptr<MemoryBuffer>> memorybuf()
+    {
+        open();
+        auto f = sys::fs::convertFDToNativeFile(fd);
+        sys::fs::file_status status;
+        if (auto err = sys::fs::status(fd, status))
+            return err;
+        return MemoryBuffer::getOpenFile(f, name, status.getSize(), false);
+    }
+    // "<prefix>.<suffix>" as given to the constructor (not the on-disk path).
+    StringRef get_name() const { return name; }
+    // Ensure fd is open, reopening the existing file; must not be called when EMPTY.
+    void open()
+    {
+        using namespace sys::fs;
+        assert(state == EXISTS || state == OPEN);
+        if (state == OPEN)
+            return;
+        auto err = openFileForReadWrite(path, fd, CD_OpenExisting, OF_None);
+        if (err)
+            jl_errorf("failed to open temporary file %s: %s", path.c_str(), err.message().c_str());
+        state = OPEN;
+    }
+    // Close fd but keep the file on disk (OPEN -> EXISTS); no-op otherwise.
+    void close()
+    {
+        if (state == OPEN) {
+            auto f = sys::fs::convertFDToNativeFile(fd);
+            (void)sys::fs::closeFile(f);
+            state = EXISTS;
+        }
+    }
+    // Close and delete the file (-> EMPTY); no-op if already EMPTY.
+    void remove()
+    {
+        close();
+        if (state == EXISTS) {
+            assert(!path.empty());
+            (void)sys::fs::remove(path);
+            state = EMPTY;
+        }
+    }
+
+private:
+    std::string name;
+    enum { EMPTY, EXISTS, OPEN } state; // no file / file on disk, fd closed / fd open
+    int fd;
+    SmallString<128> path;
+};
+
 struct AOTOutputs {
-    SmallVector<char, 0> unopt, opt, obj, asm_;
+    AOTOutputs(const char *bc_fname, const char *unopt_bc_fname, const char *obj_fname,
+               const char *asm_fname)
+      : bc_fname(bc_fname),
+        unopt_bc_fname(unopt_bc_fname),
+        obj_fname(obj_fname),
+        asm_fname(asm_fname)
+    { // engage one vector per output kind whose file name is non-null
+        if (bc_fname)
+            opt.emplace();
+        if (unopt_bc_fname)
+            unopt.emplace();
+        if (obj_fname)
+            obj.emplace();
+        if (asm_fname)
+            asm_.emplace();
+    }
+    // add_output_impl holds `lock` around each push_back into the vectors below.
+    std::mutex lock;
+    const char *bc_fname, *unopt_bc_fname, *obj_fname, *asm_fname; // as given to the constructor
+    // A vector is engaged iff its *_fname is non-null, i.e. that output was requested.
+    std::optional<SmallVector<AOTOutput, 0>> unopt, opt, obj, asm_;
 };
 
 // Perform the actual optimization and emission of the output files
-static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimers &timers,
-        bool unopt, bool opt, bool obj, bool asm_) {
-    assert((unopt || opt || obj || asm_) && "no output requested");
-    AOTOutputs out;
+static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM, ShardTimers *timer = nullptr) {
     auto TM = std::unique_ptr<TargetMachine>(
         SourceTM.getTarget().createTargetMachine(
             SourceTM.getTargetTriple().str(),
@@ -1429,23 +1532,35 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer
             SourceTM.getCodeModel(),
             SourceTM.getOptLevel()));
     fixupTM(*TM);
-    if (unopt) {
-        timers.unopt.startTimer();
-        raw_svector_ostream OS(out.unopt);
-        PassBuilder PB;
-        AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
-        ModulePassManager MPM;
-        MPM.addPass(BitcodeWriterPass(OS));
-        MPM.run(M, AM.MAM);
-        timers.unopt.stopTimer();
-    }
-    if (!opt && !obj && !asm_) {
-        return out;
+    if (outputs.unopt) {
+        if (timer)
+            timer->unopt.startTimer();
+        AOTOutput out{M.getModuleIdentifier(), "unopt.bc"};
+        auto OS = out.ostream();
+        {
+            PassBuilder PB;
+            AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
+            ModulePassManager MPM;
+            MPM.addPass(BitcodeWriterPass(*OS));
+            MPM.run(M, AM.MAM);
+        }
+        if (timer)
+            timer->unopt.stopTimer();
+        OS->flush();
+        out.close();
+        {
+            std::lock_guard guard{outputs.lock};
+            outputs.unopt->push_back(std::move(out));
+        }
+    }
+    if (!outputs.opt && !outputs.obj && !outputs.asm_) {
+        return;
     }
     assert(!verifyLLVMIR(M));
 
     {
-        timers.optimize.startTimer();
+        if (timer)
+            timer->optimize.startTimer();
 
         auto PMTM = std::unique_ptr<TargetMachine>(
             SourceTM.getTarget().createTargetMachine(
@@ -1507,51 +1622,85 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer
             injectCRTAlias(M, "__truncsdbf2", "julia__truncdfbf2",
                     FunctionType::get(Type::getBFloatTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
         }
-        timers.optimize.stopTimer();
+        if (timer)
+            timer->optimize.stopTimer();
     }
 
-    if (opt) {
-        timers.opt.startTimer();
-        raw_svector_ostream OS(out.opt);
-        PassBuilder PB;
-        AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
-        ModulePassManager MPM;
-        MPM.addPass(BitcodeWriterPass(OS));
-        MPM.run(M, AM.MAM);
-        timers.opt.stopTimer();
+    if (outputs.opt) {
+        if (timer)
+            timer->opt.startTimer();
+        AOTOutput out{M.getModuleIdentifier(), "bc"};
+        auto OS = out.ostream();
+        {
+            PassBuilder PB;
+            AnalysisManagers AM{*TM, PB, OptimizationLevel::O0};
+            ModulePassManager MPM;
+            MPM.addPass(BitcodeWriterPass(*OS));
+            MPM.run(M, AM.MAM);
+        }
+        OS->flush();
+        out.close();
+        if (timer)
+            timer->opt.stopTimer();
+        {
+            std::lock_guard guard{outputs.lock};
+            outputs.opt->push_back(std::move(out));
+        }
     }
 
-    if (obj) {
-        timers.obj.startTimer();
-        raw_svector_ostream OS(out.obj);
-        legacy::PassManager emitter;
-        addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+    if (outputs.obj) {
+        if (timer)
+            timer->obj.startTimer();
+        AOTOutput out{M.getModuleIdentifier(), "o"};
+        auto OS = out.ostream();
+        {
+            legacy::PassManager emitter;
+            addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
 #if JL_LLVM_VERSION >= 180000
-        if (TM->addPassesToEmitFile(emitter, OS, nullptr, CodeGenFileType::ObjectFile, false))
+            if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CodeGenFileType::ObjectFile, false))
 #else
-        if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false))
+            if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CGFT_ObjectFile, false))
 #endif
-            jl_safe_printf("ERROR: target does not support generation of object files\n");
-        emitter.run(M);
-        timers.obj.stopTimer();
+                jl_safe_printf("ERROR: target does not support generation of object files\n");
+            emitter.run(M);
+        }
+        OS->flush();
+        out.close();
+        if (timer)
+            timer->obj.stopTimer();
+        {
+            std::lock_guard guard{outputs.lock};
+            outputs.obj->push_back(std::move(out));
+        }
     }
 
-    if (asm_) {
-        timers.asm_.startTimer();
-        raw_svector_ostream OS(out.asm_);
-        legacy::PassManager emitter;
-        addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
+    if (outputs.asm_) {
+        if (timer)
+            timer->asm_.startTimer();
+        AOTOutput out{M.getModuleIdentifier(), "s"};
+        auto OS = out.ostream();
+        {
+            legacy::PassManager emitter;
+            addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis());
 #if JL_LLVM_VERSION >= 180000
-        if (TM->addPassesToEmitFile(emitter, OS, nullptr, CodeGenFileType::AssemblyFile, false))
+            if (TM->addPassesToEmitFile(emitter, *OS, nullptr,
+                                        CodeGenFileType::AssemblyFile, false))
 #else
-        if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false))
+            if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CGFT_AssemblyFile, false))
 #endif
-            jl_safe_printf("ERROR: target does not support generation of assembly files\n");
-        emitter.run(M);
-        timers.asm_.stopTimer();
+                jl_safe_printf(
+                    "ERROR: target does not support generation of assembly files\n");
+            emitter.run(M);
+        }
+        OS->flush();
+        out.close();
+        if (timer)
+            timer->asm_.stopTimer();
+        {
+            std::lock_guard guard{outputs.lock};
+            outputs.asm_->push_back(std::move(out));
+        }
     }
-
-    return out;
 }
 
 // serialize module to bitcode
@@ -1724,14 +1873,53 @@ extern "C" void lambda_trampoline(void* arg) {
     delete func;
 }
 
+template<typename ModuleReleasedFunc>
+static void // emits M's outputs as one shard (suffix "_0") without calling partitionModule
+add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, ModuleReleasedFunc module_released)
+{ // NOTE(review): `name` is not used in this body
+    {
+        JL_TIMING(NATIVE_AOT, NATIVE_Opt);
+        // convert gvars to the expected offset table format for shard 0
+        if (M.getGlobalVariable("jl_gvars")) {
+            auto gvars = consume_gv<Constant>(M, "jl_gvars", false);
+            Type *T_size = M.getDataLayout().getIntPtrType(M.getContext());
+            emit_offset_table(M, T_size, gvars, "jl_gvar",
+                              "_0"); // module flag "julia.mv.suffix"
+            M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0");
+        }
+        add_output_impl(outputs, M, TM); // timer defaults to nullptr: no ShardTimers here
+    }
+    // M is no longer needed by this function; hand it to the caller's callback
+    module_released(M);
+}
+
+// Decide whether image-compilation timings should be printed, based on the
+// JULIA_IMAGE_TIMINGS environment variable (0/1 or true/false, case-insensitive).
+static bool should_report_image_timings()
+{
+    const char *setting = getenv("JULIA_IMAGE_TIMINGS");
+    if (!setting)
+        return false; // unset: timings stay off
+    char *tail;
+    unsigned long number = strtoul(setting, &tail, 10);
+    // Accept a fully-consumed decimal number whose value is 0 or 1.
+    if (tail != setting && *tail == '\0' && number <= 1)
+        return number != 0;
+    if (StringRef("true").compare_insensitive(setting) == 0)
+        return true;
+    // Anything other than "false" is invalid: warn, then fall back to off.
+    if (StringRef("false").compare_insensitive(setting) != 0)
+        errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << setting << "\n";
+    return false;
+}
+
 // Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading,
 // as well as partitioning, serialization, and deserialization.
 template<typename ModuleReleasedFunc>
-static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, StringRef name, unsigned threads,
-                bool unopt_out, bool opt_out, bool obj_out, bool asm_out, ModuleReleasedFunc module_released) {
-    SmallVector<AOTOutputs, 16> outputs(threads);
+static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name,
+                       unsigned threads, ModuleReleasedFunc module_released)
+{
     assert(threads);
-    assert(unopt_out || opt_out || obj_out || asm_out);
     // Timers for timing purposes
     TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str());
     SmallVector<ShardTimers, 1> timers(threads);
@@ -1751,49 +1939,7 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
     Timer partition_timer("partition", "Partition module", timer_group);
     Timer serialize_timer("serialize", "Serialize module", timer_group);
     Timer output_timer("output", "Add outputs", timer_group);
-    bool report_timings = false;
-    if (auto env = getenv("JULIA_IMAGE_TIMINGS")) {
-        char *endptr;
-        unsigned long val = strtoul(env, &endptr, 10);
-        if (endptr != env && !*endptr && val <= 1) {
-            report_timings = val;
-        } else {
-            if (StringRef("true").compare_insensitive(env) == 0)
-                report_timings = true;
-            else if (StringRef("false").compare_insensitive(env) == 0)
-                report_timings = false;
-            else
-                errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n";
-        }
-    }
-    // Single-threaded case
-    if (threads == 1) {
-        output_timer.startTimer();
-        {
-            JL_TIMING(NATIVE_AOT, NATIVE_Opt);
-            // convert gvars to the expected offset table format for shard 0
-            if (M.getGlobalVariable("jl_gvars")) {
-                auto gvars = consume_gv<Constant>(M, "jl_gvars", false);
-                Type *T_size = M.getDataLayout().getIntPtrType(M.getContext());
-                emit_offset_table(M, T_size, gvars, "jl_gvar", "_0"); // module flag "julia.mv.suffix"
-                M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0");
-            }
-            outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out);
-        }
-        output_timer.stopTimer();
-        // Don't need M anymore
-        module_released(M);
-
-        if (!report_timings) {
-            timer_group.clear();
-        } else {
-            timer_group.print(dbgs(), true);
-            for (auto &t : timers) {
-                t.print(dbgs(), true);
-            }
-        }
-        return outputs;
-    }
+    bool report_timings = should_report_image_timings();
 
     partition_timer.startTimer();
     uint64_t counter = 0;
@@ -1804,6 +1950,7 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
             G.setName("jl_ext_" + Twine(counter++));
         }
     }
+
     auto partitions = partitionModule(M, threads);
     partition_timer.stopTimer();
 
@@ -1826,7 +1973,7 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
                 ctx.setDiscardValueNames(true);
                 // Lazily deserialize the entire module
                 timers[i].deserialize.startTimer();
-                auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx);
+                auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx);
                 // Make sure this also fails with only julia, but not LLVM assertions enabled,
                 // otherwise, the first error we hit is the LLVM module verification failure,
                 // which will look very confusing, because the module was partially deserialized.
@@ -1842,6 +1989,7 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
                 timers[i].construct.startTimer();
                 std::string suffix = "_" + std::to_string(i);
                 construct_vars(*M, partitions[i], suffix);
+                M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str());
                 M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix));
                 // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
                 // or it may skip emitting debug info for that file. Here set it to ./julia#N
@@ -1850,7 +1998,7 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
                     CU->replaceOperandWith(0, topfile);
                 timers[i].construct.stopTimer();
 
-                outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out);
+                add_output_impl(outputs, *M, TM, &timers[i]);
             };
             auto arg = new std::function<void()>(func);
             uv_thread_create(&workers[i], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes
@@ -1881,7 +2029,6 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
         }
         dbgs() << "]\n";
     }
-    return outputs;
 }
 
 extern int jl_is_timing_passes;
@@ -2016,13 +2163,8 @@ void jl_dump_native_impl(void *native_code,
         OverrideStackAlignment = M.getOverrideStackAlignment();
     });
 
-    auto compile = [&](Module &M, StringRef name, unsigned threads, auto module_released) {
-        return add_output(M, *SourceTM, name, threads, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, module_released);
-    };
+    AOTOutputs outputs{bc_fname, unopt_bc_fname, obj_fname, asm_fname};
 
-    SmallVector<AOTOutputs, 16> sysimg_outputs;
-    SmallVector<AOTOutputs, 16> data_outputs;
-    SmallVector<AOTOutputs, 16> metadata_outputs;
     if (z) {
         JL_TIMING(NATIVE_AOT, NATIVE_Sysimg);
         LLVMContext Context;
@@ -2053,10 +2195,10 @@ void jl_dump_native_impl(void *native_code,
         // Results in serious memory savings
         ios_close(z);
         free(z);
-        // Note that we don't set z to null, this allows the check in WRITE_ARCHIVE
+        // Note that we don't set z to null, this allows the check in write_archive
         // to function as expected
         // no need to free the module/context, destructor handles that
-        sysimg_outputs = compile(sysimgM, "sysimg", 1, [](Module &) {});
+        add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", [](Module &){});
     }
 
     const bool imaging_mode = true;
@@ -2161,14 +2303,15 @@ void jl_dump_native_impl(void *native_code,
         auto lock = TSCtx.getLock();
         auto dataM = data->M.getModuleUnlocked();
 
-        data_outputs = compile(*dataM, "text", threads, [data, &lock, &TSCtx](Module &) {
-            // Delete data when add_output thinks it's done with it
-            // Saves memory for use when multithreading
-            auto lock2 = std::move(lock);
-            delete data;
-            // Drop last reference to shared LLVM::Context
-            auto TSCtx2 = std::move(TSCtx);
-        });
+        add_output(outputs, *dataM, *SourceTM, "text", threads,
+                   [data, &lock, &TSCtx](Module &) {
+                       // Delete data when add_output thinks it's done with it
+                       // Saves memory for use when multithreading
+                       auto lock2 = std::move(lock);
+                       delete data;
+                       // Drop last reference to shared LLVM::Context
+                       auto TSCtx2 = std::move(TSCtx);
+                   });
     }
 
     if (params->emit_metadata) {
@@ -2258,7 +2401,7 @@ void jl_dump_native_impl(void *native_code,
         }
 
         // no need to free module/context, destructor handles that
-        metadata_outputs = compile(metadataM, "data", 1, [](Module &) {});
+        add_output_no_partition(outputs, metadataM, *SourceTM, "data", [](Module &) {});
     }
 
     {
@@ -2270,32 +2413,32 @@ void jl_dump_native_impl(void *native_code,
 #else
 #define WritingMode true
 #endif
-#define WRITE_ARCHIVE(fname, field, prefix, suffix) \
-    if (fname) {\
-        SmallVector<NewArchiveMember, 0> archive; \
-        SmallVector<std::string, 16> filenames; \
-        SmallVector<StringRef, 16> buffers; \
-        for (size_t i = 0; i < threads; i++) { \
-            filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \
-            buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \
-        } \
-        filenames.push_back("metadata" prefix suffix); \
-        buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \
-        if (z) { \
-            filenames.push_back("sysimg" prefix suffix); \
-            buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \
-        } \
-        for (size_t i = 0; i < filenames.size(); i++) { \
-            archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \
-        } \
-        handleAllErrors(writeArchive(fname, archive, WritingMode, Kind, true, false), reportWriterError); \
-    }
-
-        WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc");
-        WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc");
-        WRITE_ARCHIVE(obj_fname, obj, "", ".o");
-        WRITE_ARCHIVE(asm_fname, asm_, "", ".s");
-#undef WRITE_ARCHIVE
+        auto write_archive = [&](const char *fname, SmallVector<AOTOutput, 0> &outputs) {
+            if (!fname)
+                return;
+            SmallVector<NewArchiveMember, 0> archive;
+            // Keeps every MemoryBuffer alive until writeArchive runs; each NewArchiveMember only refers to its buffer
+            SmallVector<std::unique_ptr<MemoryBuffer>, 0> buffers;
+            for (auto &out : outputs) {
+                auto buf = out.memorybuf();
+                if (buf.getError())
+                    jl_errorf("failed to read temporary object file: %s",
+                              buf.getError().message().c_str());
+                buffers.push_back(std::move(*buf));
+                archive.push_back(NewArchiveMember{*buffers.back()});
+            }
+            handleAllErrors(writeArchive(fname, archive, WritingMode, Kind, true, false),
+                            reportWriterError);
+        };
+
+        if (outputs.unopt)
+            write_archive(unopt_bc_fname, *outputs.unopt);
+        if (outputs.opt)
+            write_archive(bc_fname, *outputs.opt);
+        if (outputs.obj)
+            write_archive(obj_fname, *outputs.obj);
+        if (outputs.asm_)
+            write_archive(asm_fname, *outputs.asm_);
     }
 }
 

From 76e8a80bd23af4a96ad4b290c32b297d0225aca3 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Wed, 20 Aug 2025 11:22:25 -0700
Subject: [PATCH 03/10] Use more partitions than threads in aotcompile

(cherry picked from commit 1f159908e15a1b99762f9419de9a5b0b736f375b)
---
 src/aotcompile.cpp | 119 ++++++++++++++++++++++++++-------------------
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 7853856000934..304f63d69faa8 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1000,11 +1000,11 @@ static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_ptr) {
 }
 
 // See src/processor.h for documentation about this table. Corresponds to jl_image_header_t.
-static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) {
+static GlobalVariable *emit_image_header(Module &M, unsigned shards, unsigned nfvars, unsigned ngvars) {
     constexpr uint32_t version = 1;
     std::array<uint32_t, 4> header{
         version,
-        threads,
+        shards,
         nfvars,
         ngvars,
     };
@@ -1913,17 +1913,9 @@ static bool should_report_image_timings()
     return report_timings;
 }
 
-// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading,
-// as well as partitioning, serialization, and deserialization.
-template<typename ModuleReleasedFunc>
-static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name,
-                       unsigned threads, ModuleReleasedFunc module_released)
+static void initialize_shard_timers(StringRef name, SmallVector<ShardTimers, 1> &timers)
 {
-    assert(threads);
-    // Timers for timing purposes
-    TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str());
-    SmallVector<ShardTimers, 1> timers(threads);
-    for (unsigned i = 0; i < threads; ++i) {
+    for (unsigned i = 0; i < timers.size(); ++i) {
         auto idx = std::to_string(i);
         timers[i].name = "shard_" + idx;
         timers[i].desc = ("Timings for " + name + " module shard " + idx).str();
@@ -1936,6 +1928,18 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String
         timers[i].obj.init("obj_" + idx, "Emit object file");
         timers[i].asm_.init("asm_" + idx, "Emit assembly file");
     }
+}
+
+// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading,
+// as well as partitioning, serialization, and deserialization.
+template<typename ModuleReleasedFunc>
+static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name,
+                       unsigned threads, unsigned shards,
+                       ModuleReleasedFunc module_released)
+{
+    assert(threads);
+    // Timers for timing purposes
+    TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str());
     Timer partition_timer("partition", "Partition module", timer_group);
     Timer serialize_timer("serialize", "Serialize module", timer_group);
     Timer output_timer("output", "Add outputs", timer_group);
@@ -1951,7 +1955,7 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String
         }
     }
 
-    auto partitions = partitionModule(M, threads);
+    auto partitions = partitionModule(M, shards);
     partition_timer.stopTimer();
 
     serialize_timer.startTimer();
@@ -1961,47 +1965,59 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String
     // Don't need M anymore, since we'll only read from serialized from now on
     module_released(M);
 
+    SmallVector<ShardTimers, 1> timers(shards);
+    initialize_shard_timers(name, timers);
+
+    std::atomic<unsigned> next_part = 0;
+
     output_timer.startTimer();
 
     // Start all of the worker threads
     {
         JL_TIMING(NATIVE_AOT, NATIVE_Opt);
         std::vector<uv_thread_t> workers(threads);
-        for (unsigned i = 0; i < threads; i++) {
-            std::function<void()> func = [&, i]() {
-                LLVMContext ctx;
-                ctx.setDiscardValueNames(true);
-                // Lazily deserialize the entire module
-                timers[i].deserialize.startTimer();
-                auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx);
-                // Make sure this also fails with only julia, but not LLVM assertions enabled,
-                // otherwise, the first error we hit is the LLVM module verification failure,
-                // which will look very confusing, because the module was partially deserialized.
-                bool deser_succeeded = (bool)EM;
-                auto M = cantFail(std::move(EM), "Error loading module");
-                assert(deser_succeeded); (void)deser_succeeded;
-                timers[i].deserialize.stopTimer();
-
-                timers[i].materialize.startTimer();
-                materializePreserved(*M, partitions[i]);
-                timers[i].materialize.stopTimer();
-
-                timers[i].construct.startTimer();
-                std::string suffix = "_" + std::to_string(i);
-                construct_vars(*M, partitions[i], suffix);
-                M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str());
-                M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix));
-                // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
-                // or it may skip emitting debug info for that file. Here set it to ./julia#N
-                DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), ".");
-                for (DICompileUnit *CU : M->debug_compile_units())
-                    CU->replaceOperandWith(0, topfile);
-                timers[i].construct.stopTimer();
-
-                add_output_impl(outputs, *M, TM, &timers[i]);
+        for (unsigned tid = 0; tid < threads; tid++) {
+            std::function<void()> func = [&]() {
+                while (1) {
+                    unsigned i = std::atomic_fetch_add(&next_part, 1);
+                    if (i >= shards)
+                        return;
+
+                    Partition &partition = partitions[i];
+                    LLVMContext ctx;
+                    ctx.setDiscardValueNames(true);
+                    // Lazily deserialize the entire module
+                    timers[i].deserialize.startTimer();
+                    auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx);
+                    // Make sure this also fails with only julia, but not LLVM assertions enabled,
+                    // otherwise, the first error we hit is the LLVM module verification failure,
+                    // which will look very confusing, because the module was partially deserialized.
+                    bool deser_succeeded = (bool)EM;
+                    auto M = cantFail(std::move(EM), "Error loading module");
+                    assert(deser_succeeded); (void)deser_succeeded;
+                    timers[i].deserialize.stopTimer();
+
+                    timers[i].materialize.startTimer();
+                    materializePreserved(*M, partition);
+                    timers[i].materialize.stopTimer();
+
+                    timers[i].construct.startTimer();
+                    std::string suffix = "_" + std::to_string(i);
+                    construct_vars(*M, partition, suffix);
+                    M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str());
+                    M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix));
+                    // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
+                    // or it may skip emitting debug info for that file. Here set it to ./julia#N
+                    DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), ".");
+                    for (DICompileUnit *CU : M->debug_compile_units())
+                        CU->replaceOperandWith(0, topfile);
+                    timers[i].construct.stopTimer();
+
+                    add_output_impl(outputs, *M, TM, &timers[i]);
+                }
             };
             auto arg = new std::function<void()>(func);
-            uv_thread_create(&workers[i], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes
+            uv_thread_create(&workers[tid], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes
         }
 
         // Wait for all of the worker threads to finish
@@ -2297,13 +2313,18 @@ void jl_dump_native_impl(void *native_code,
         has_veccall = !!dataM.getModuleFlag("julia.mv.veccall");
     });
 
+    size_t nshards;
     {
         // Don't use withModuleDo here since we delete the TSM midway through
         auto TSCtx = data->M.getContext();
         auto lock = TSCtx.getLock();
         auto dataM = data->M.getModuleUnlocked();
 
-        add_output(outputs, *dataM, *SourceTM, "text", threads,
+        auto info = compute_module_info(*dataM);
+        constexpr size_t weight_per_partition = 500000;
+        nshards = std::max<size_t>(1, info.weight / weight_per_partition);
+
+        add_output(outputs, *dataM, *SourceTM, "text", threads, nshards,
                    [data, &lock, &TSCtx](Module &) {
                        // Delete data when add_output thinks it's done with it
                        // Saves memory for use when multithreading
@@ -2372,9 +2393,9 @@ void jl_dump_native_impl(void *native_code,
             auto target_ids = new GlobalVariable(metadataM, value->getType(), true,
                                         GlobalVariable::InternalLinkage,
                                         value, "jl_dispatch_target_ids");
-            auto shards = emit_shard_table(metadataM, T_size, T_psize, threads);
+            auto shards = emit_shard_table(metadataM, T_size, T_psize, nshards);
             auto ptls = emit_ptls_table(metadataM, T_size, T_ptr);
-            auto header = emit_image_header(metadataM, threads, nfvars, ngvars);
+            auto header = emit_image_header(metadataM, nshards, nfvars, ngvars);
             auto AT = ArrayType::get(T_size, sizeof(jl_small_typeof) / sizeof(void*));
             auto jl_small_typeof_copy = new GlobalVariable(metadataM, AT, false,
                                                         GlobalVariable::ExternalLinkage,

From 213eb10fda9a71a21c950d9a8bca6aa6b352ce23 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Thu, 21 Aug 2025 10:44:48 -0700
Subject: [PATCH 04/10] Fix partitioning modules with zero fvars or gvars

(cherry picked from commit 2c517642a968538e5d02ddf9678dc2ce3d9fde3e)
---
 src/aotcompile.cpp | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 304f63d69faa8..1645fb5fd8d62 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1024,22 +1024,24 @@ static void get_fvars_gvars(Module &M, DenseMap<GlobalValue *, unsigned> &fvars,
     assert(gvars_gv);
     assert(fvars_idxs);
     assert(gvars_idxs);
-    auto fvars_init = cast<ConstantArray>(fvars_gv->getInitializer());
-    auto gvars_init = cast<ConstantArray>(gvars_gv->getInitializer());
-    for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) {
-        auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts());
-        assert(gv && gv->hasName() && "fvar must be a named global");
-        assert(!fvars.count(gv) && "Duplicate fvar");
-        fvars[gv] = i;
-    }
-    assert(fvars.size() == fvars_init->getNumOperands());
-    for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) {
-        auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts());
-        assert(gv && gv->hasName() && "gvar must be a named global");
-        assert(!gvars.count(gv) && "Duplicate gvar");
-        gvars[gv] = i;
-    }
-    assert(gvars.size() == gvars_init->getNumOperands());
+    if (auto fvars_init = dyn_cast<ConstantArray>(fvars_gv->getInitializer())) {
+        for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) {
+            auto gv = cast<GlobalValue>(fvars_init->getOperand(i)->stripPointerCasts());
+            assert(gv && gv->hasName() && "fvar must be a named global");
+            assert(!fvars.count(gv) && "Duplicate fvar");
+            fvars[gv] = i;
+        }
+        assert(fvars.size() == fvars_init->getNumOperands());
+    }
+    if (auto gvars_init = dyn_cast<ConstantArray>(gvars_gv->getInitializer())) {
+        for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) {
+            auto gv = cast<GlobalValue>(gvars_init->getOperand(i)->stripPointerCasts());
+            assert(gv && gv->hasName() && "gvar must be a named global");
+            assert(!gvars.count(gv) && "Duplicate gvar");
+            gvars[gv] = i;
+        }
+        assert(gvars.size() == gvars_init->getNumOperands());
+    }
     fvars_gv->eraseFromParent();
     gvars_gv->eraseFromParent();
     fvars_idxs->eraseFromParent();

From 58aa0eeda2ea98da9bfbc557ec2300217b32f4e9 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Thu, 21 Aug 2025 12:24:21 -0700
Subject: [PATCH 05/10] Reduce partition weight to 100000, add
 JULIA_IMAGE_PARTITION_WEIGHT

Also reuse already-computed ModuleInfo

(cherry picked from commit be73aabf9e5130dce3cbca1a7e7080b2df76fd3e)
---
 src/aotcompile.cpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 1645fb5fd8d62..fa891b84d9710 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -2221,6 +2221,7 @@ void jl_dump_native_impl(void *native_code,
 
     const bool imaging_mode = true;
     unsigned threads = 1;
+    unsigned nshards;
     unsigned nfvars = 0;
     unsigned ngvars = 0;
 
@@ -2278,6 +2279,17 @@ void jl_dump_native_impl(void *native_code,
             );
             threads = compute_image_thread_count(module_info);
             LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n");
+
+            size_t weight = 100000;
+            if (const char *weight_s = getenv("JULIA_IMAGE_PARTITION_WEIGHT")) {
+                char *end;
+                long x = strtol(weight_s, &end, 10);
+                // Ignore unparsable or non-positive values; 0 would divide by zero below
+                if (weight_s != end && x > 0)
+                    weight = (size_t)x;
+            }
+            nshards = std::max<size_t>(1, module_info.weight / weight);
+
             nfvars = data->jl_sysimg_fvars.size();
             ngvars = data->jl_sysimg_gvars.size();
             emit_table(dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize);
@@ -2315,17 +2327,11 @@ void jl_dump_native_impl(void *native_code,
         has_veccall = !!dataM.getModuleFlag("julia.mv.veccall");
     });
 
-    size_t nshards;
     {
         // Don't use withModuleDo here since we delete the TSM midway through
         auto TSCtx = data->M.getContext();
         auto lock = TSCtx.getLock();
         auto dataM = data->M.getModuleUnlocked();
-
-        auto info = compute_module_info(*dataM);
-        constexpr size_t weight_per_partition = 500000;
-        nshards = std::max<size_t>(1, info.weight / weight_per_partition);
-
         add_output(outputs, *dataM, *SourceTM, "text", threads, nshards,
                    [data, &lock, &TSCtx](Module &) {
                        // Delete data when add_output thinks it's done with it

From c0dfbff2059b188990352558536048adabacf28f Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Thu, 21 Aug 2025 15:59:38 -0700
Subject: [PATCH 06/10] Use temporary buffers with madvise on Unix

(cherry picked from commit e64d4bd0a277134a892277485ff701f6396a6922)
---
 src/aotcompile.cpp | 80 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index fa891b84d9710..f6bbede0c25ee 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -42,6 +42,9 @@
 #include <llvm/Support/FormatAdapters.h>
 #include <llvm/Linker/Linker.h>
 
+#ifndef _OS_WINDOWS_
+#include <sys/mman.h>
+#endif
 
 using namespace llvm;
 
@@ -1412,14 +1415,22 @@ struct ShardTimers {
     }
 };
 
+#ifdef _OS_WINDOWS_
+#define JL_USE_TEMP_FILES
+#endif
+
 class AOTOutput {
 public:
-    AOTOutput(const Twine &prefix, const char *suffix)
-      : name((prefix + "." + suffix).str()), state(OPEN)
+    AOTOutput(const Twine &prefix, const char *suffix) : name((prefix + "." + suffix).str())
     {
+#ifdef JL_USE_TEMP_FILES
         std::error_code err = sys::fs::createTemporaryFile(prefix, suffix, fd, path);
         if (err)
             jl_errorf("failed to create temporary file: %s", err.message().c_str());
+        state = OPEN;
+#else
+        state = MEMORY;
+#endif
     }
     ~AOTOutput() { remove(); }
     AOTOutput(const AOTOutput &) = delete;
@@ -1428,43 +1439,53 @@ class AOTOutput {
       : name(std::move(other.name)),
         state(other.state),
         fd(other.fd),
-        path(std::move(other.path))
+        path(std::move(other.path)),
+        buf(std::move(other.buf))
     {
         other.state = EMPTY;
     }
     AOTOutput &operator=(AOTOutput &&other) noexcept
     {
         remove();
-        name = std::move(other.name);
+        std::swap(name, other.name);
         std::swap(state, other.state);
-        fd = other.fd;
-        path = std::move(other.path);
+        std::swap(fd, other.fd);
+        std::swap(path, other.path);
+        std::swap(buf, other.buf);
         return *this;
     }
 
     std::unique_ptr<raw_pwrite_stream> ostream()
     {
         open();
-        return std::make_unique<raw_fd_stream>(fd, false);
+        if (state == OPEN)
+            return std::make_unique<raw_fd_stream>(fd, false);
+        else
+            return std::make_unique<raw_svector_ostream>(buf);
     }
 
     ErrorOr<std::unique_ptr<MemoryBuffer>> memorybuf()
     {
         open();
-        auto f = sys::fs::convertFDToNativeFile(fd);
-        sys::fs::file_status status;
-        if (auto err = sys::fs::status(fd, status))
-            return err;
-        return MemoryBuffer::getOpenFile(f, name, status.getSize(), false);
+        if (state == OPEN) {
+            auto f = sys::fs::convertFDToNativeFile(fd);
+            sys::fs::file_status status;
+            if (auto err = sys::fs::status(fd, status))
+                return err;
+            return MemoryBuffer::getOpenFile(f, name, status.getSize(), false);
+        }
+        else if (state == MEMORY) {
+            return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name,
+                                              false);
+        }
+        jl_unreachable();
     }
 
-    StringRef get_name() { return name; }
-
     void open()
     {
         using namespace sys::fs;
-        assert(state == EXISTS || state == OPEN);
-        if (state == OPEN)
+        assert(state == EXISTS || state == OPEN || state == MEMORY);
+        if (state == OPEN || state == MEMORY)
             return;
         auto err = openFileForReadWrite(path, fd, CD_OpenExisting, OF_None);
         if (err)
@@ -1479,6 +1500,20 @@ class AOTOutput {
             (void)sys::fs::closeFile(f);
             state = EXISTS;
         }
+        else if (state == MEMORY) {
+            void *p = (void *)((uintptr_t)buf.data() & ~(jl_page_size - 1));
+            size_t s = LLT_ALIGN(buf.size(), jl_page_size);
+#if defined(_OS_DARWIN_) || defined(_OS_FREEBSD_) || defined(_OS_OPENBSD_)
+            if (s > 0)
+                madvise(p, s, MADV_DONTNEED);
+#elif defined(_OS_LINUX_) && defined(MADV_COLD)
+            if (s > 0)
+                madvise(p, s, MADV_COLD);
+#else
+            (void)p;
+            (void)s;
+#endif
+        }
     }
 
     void remove()
@@ -1489,13 +1524,22 @@ class AOTOutput {
             (void)sys::fs::remove(path);
             state = EMPTY;
         }
+        else if (state == MEMORY) {
+            buf.clear();
+        }
     }
 
 private:
     std::string name;
-    enum { EMPTY, EXISTS, OPEN } state;
+    enum {
+        EMPTY,               // Temporary file removed/buffer freed
+        EXISTS,              // Temporary file exists but is not open (save FDs)
+        OPEN,                // Temporary file exists and is open
+        MEMORY,              // Contents are stored in memory
+    } state;
     int fd;
     SmallString<128> path;
+    SmallVector<char, 0> buf;
 };
 
 struct AOTOutputs {
@@ -2221,7 +2265,7 @@ void jl_dump_native_impl(void *native_code,
 
     const bool imaging_mode = true;
     unsigned threads = 1;
-    unsigned nshards;
+    unsigned nshards = 1;
     unsigned nfvars = 0;
     unsigned ngvars = 0;
 

From f1b783d6d006996ca6afdc2daea54d9a6c44edd3 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Fri, 22 Aug 2025 14:45:30 -0700
Subject: [PATCH 07/10] Use raw_fd_ostream to support old LLVM

(cherry picked from commit 040d85f193a6c511397bc026f15ab390a8fb45d5)
---
 src/aotcompile.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index f6bbede0c25ee..90b429e9f68d3 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1459,7 +1459,7 @@ class AOTOutput {
     {
         open();
         if (state == OPEN)
-            return std::make_unique<raw_fd_stream>(fd, false);
+            return std::make_unique<raw_fd_ostream>(fd, false);
         else
             return std::make_unique<raw_svector_ostream>(buf);
     }

From c67de81c8350d85562ae4cfb07c7b795dea4a5d2 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Fri, 22 Aug 2025 15:17:47 -0700
Subject: [PATCH 08/10] Use memory buffers exclusively on Unix, temp file on
 Windows

(cherry picked from commit 78093988a261310af1f21cdcba4de8bf96407e9f)
---
 src/aotcompile.cpp | 123 ++++++++++++++++++++++++---------------------
 1 file changed, 66 insertions(+), 57 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 90b429e9f68d3..2af7ac243d6cb 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -42,7 +42,9 @@
 #include <llvm/Support/FormatAdapters.h>
 #include <llvm/Linker/Linker.h>
 
-#ifndef _OS_WINDOWS_
+#ifdef _OS_WINDOWS_
+#include <llvm/Support/Windows/WindowsSupport.h>
+#else
 #include <sys/mman.h>
 #endif
 
@@ -1415,19 +1417,33 @@ struct ShardTimers {
     }
 };
 
-#ifdef _OS_WINDOWS_
-#define JL_USE_TEMP_FILES
-#endif
+// If an AOTOutput is greater than this many bytes, we should write it to a
+// temporary file on Windows, or MADV_DONTNEED/MADV_COLD it on Unix to alleviate
+// memory pressure.  Except in rare cases, this should be triggered only by the
+// output containing the heap image.
+constexpr size_t jl_large_aotoutput = 64 * 1024 * 1024; // 64 MiB
 
 class AOTOutput {
 public:
     AOTOutput(const Twine &prefix, const char *suffix) : name((prefix + "." + suffix).str())
     {
-#ifdef JL_USE_TEMP_FILES
-        std::error_code err = sys::fs::createTemporaryFile(prefix, suffix, fd, path);
-        if (err)
-            jl_errorf("failed to create temporary file: %s", err.message().c_str());
-        state = OPEN;
+#ifdef _OS_WINDOWS_
+        SmallString<128> path;
+        SmallVector<wchar_t, 128> path_utf16;
+        // Build the Twine inline: a named Twine would outlive the temporaries it points to
+        sys::fs::createUniquePath(prefix + "-%%%%%%." + suffix, path, true);
+        auto fail = [&]() {
+            jl_errorf("failed to create temporary file: %s", path.c_str());
+        };
+        if (sys::windows::widenPath(path, path_utf16))
+            fail();
+        file = CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE,
+                           FILE_SHARE_READ, nullptr, CREATE_ALWAYS,
+                           FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr);
+        if (file == INVALID_HANDLE_VALUE)
+            fail();
+        fd = _open_osfhandle((intptr_t)file, 0);
+        state = TMP_OPEN;
 #else
         state = MEMORY;
 #endif
@@ -1438,9 +1454,12 @@ class AOTOutput {
     AOTOutput(AOTOutput &&other) noexcept
       : name(std::move(other.name)),
         state(other.state),
+#ifdef _OS_WINDOWS_
+        file(other.file),
         fd(other.fd),
-        path(std::move(other.path)),
+#endif
         buf(std::move(other.buf))
+
     {
         other.state = EMPTY;
     }
@@ -1449,58 +1468,45 @@ class AOTOutput {
         remove();
         std::swap(name, other.name);
         std::swap(state, other.state);
+#ifdef _OS_WINDOWS_
+        std::swap(file, other.file);
         std::swap(fd, other.fd);
-        std::swap(path, other.path);
+#endif
         std::swap(buf, other.buf);
         return *this;
     }
 
     std::unique_ptr<raw_pwrite_stream> ostream()
     {
-        open();
-        if (state == OPEN)
+#ifdef _OS_WINDOWS_
+        if (state == TMP_OPEN) {
             return std::make_unique<raw_fd_ostream>(fd, false);
-        else
-            return std::make_unique<raw_svector_ostream>(buf);
+        }
+#endif
+        assert(state == MEMORY);
+        return std::make_unique<raw_svector_ostream>(buf);
     }
 
     ErrorOr<std::unique_ptr<MemoryBuffer>> memorybuf()
     {
-        open();
-        if (state == OPEN) {
-            auto f = sys::fs::convertFDToNativeFile(fd);
+#ifdef _OS_WINDOWS_
+        if (state == TMP_OPEN) {
             sys::fs::file_status status;
             if (auto err = sys::fs::status(fd, status))
                 return err;
-            return MemoryBuffer::getOpenFile(f, name, status.getSize(), false);
-        }
-        else if (state == MEMORY) {
-            return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name,
-                                              false);
+            return MemoryBuffer::getOpenFile(file, name, status.getSize(), false);
         }
-        jl_unreachable();
-    }
-
-    void open()
-    {
-        using namespace sys::fs;
-        assert(state == EXISTS || state == OPEN || state == MEMORY);
-        if (state == OPEN || state == MEMORY)
-            return;
-        auto err = openFileForReadWrite(path, fd, CD_OpenExisting, OF_None);
-        if (err)
-            jl_errorf("failed to open temporary file %s\n", path.c_str());
-        state = OPEN;
+#endif
+        assert(state == MEMORY);
+        return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name, false);
     }
 
-    void close()
+    // Signal that we are done with writing to this output for the time being;
+    // inform the operating system it should page the memory out if we're
+    // running low.
+    void done()
     {
-        if (state == OPEN) {
-            auto f = sys::fs::convertFDToNativeFile(fd);
-            (void)sys::fs::closeFile(f);
-            state = EXISTS;
-        }
-        else if (state == MEMORY) {
+        if (state == MEMORY && buf.size() >= jl_large_aotoutput) {
             void *p = (void *)((uintptr_t)buf.data() & ~(jl_page_size - 1));
             size_t s = LLT_ALIGN(buf.size(), jl_page_size);
 #if defined(_OS_DARWIN_) || defined(_OS_FREEBSD_) || defined(_OS_OPENBSD_)
@@ -1518,27 +1524,30 @@ class AOTOutput {
 
     void remove()
     {
-        close();
-        if (state == EXISTS) {
-            assert(!path.empty());
-            (void)sys::fs::remove(path);
+#ifdef _OS_WINDOWS_
+        if (state == TMP_OPEN) {
+            close(fd);
             state = EMPTY;
+            return;
         }
-        else if (state == MEMORY) {
+#endif
+        if (state == MEMORY) {
             buf.clear();
+            state = EMPTY;
         }
     }
 
 private:
     std::string name;
     enum {
-        EMPTY,               // Temporary file removed/buffer freed
-        EXISTS,              // Temporary file exists but is not open (save FDs)
-        OPEN,                // Temporary file exists and is open
-        MEMORY,              // Contents are stored in memory
+        EMPTY,    // Temporary file removed/buffer freed
+        TMP_OPEN, // Temporary file exists and is open, but will be deleted on close (Windows).
+        MEMORY,   // Contents are stored in memory
     } state;
+#ifdef _OS_WINDOWS_
+    HANDLE file = INVALID_HANDLE_VALUE; // Temporary file handle; set when state == TMP_OPEN
-    int fd;
+    int fd = -1;                        // CRT descriptor opened on `file`
-    SmallString<128> path;
+#endif
     SmallVector<char, 0> buf;
 };
 
@@ -1593,7 +1602,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
         if (timer)
             timer->unopt.stopTimer();
         OS->flush();
-        out.close();
+        out.done();
         {
             std::lock_guard guard{outputs.lock};
             outputs.unopt->push_back(std::move(out));
@@ -1685,7 +1694,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
             MPM.run(M, AM.MAM);
         }
         OS->flush();
-        out.close();
+        out.done();
         if (timer)
             timer->opt.stopTimer();
         {
@@ -1711,7 +1720,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
             emitter.run(M);
         }
         OS->flush();
-        out.close();
+        out.done();
         if (timer)
             timer->obj.stopTimer();
         {
@@ -1739,7 +1748,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
             emitter.run(M);
         }
         OS->flush();
-        out.close();
+        out.done();
         if (timer)
             timer->asm_.stopTimer();
         {

From f89bc0a7051c8ef32fece0b408fdb272d28db6d5 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Mon, 25 Aug 2025 09:57:59 -0700
Subject: [PATCH 09/10] Do not serialize module if only one shard is necessary

(cherry picked from commit b66bc0152bd75b401ce112336d7f6c5bd0d6ebac)
---
 src/aotcompile.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 2af7ac243d6cb..fdcfd69198185 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1993,6 +1993,11 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String
                        ModuleReleasedFunc module_released)
 {
     assert(threads);
+    if (shards <= 1) {
+        add_output_no_partition(outputs, M, TM, name, module_released);
+        return;
+    }
+
     // Timers for timing purposes
     TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str());
     Timer partition_timer("partition", "Partition module", timer_group);
@@ -2334,7 +2339,7 @@ void jl_dump_native_impl(void *native_code,
             LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n");
 
             char *weight_s = getenv("JULIA_IMAGE_PARTITION_WEIGHT");
-            size_t weight = 100000;
+            size_t weight = 500000;
             char *end;
             if (weight_s) {
                 size_t x = strtol(weight_s, &end, 10);

From 6d04991a8c16be6602d5b56742c196465af9e95d Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Mon, 25 Aug 2025 10:15:37 -0700
Subject: [PATCH 10/10] Use temporary file only for sysimgM (Windows)

(cherry picked from commit 57f420a21e5a3005f30ebcdab6d8d97a40502708)
---
 src/aotcompile.cpp | 80 ++++++++++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index fdcfd69198185..4df5c9090e488 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1417,36 +1417,41 @@ struct ShardTimers {
     }
 };
 
-// If an AOTOutput is greater than this many bytes, we should write it to a
-// temporary file on Windows, or MADV_DONTNEED/MADV_COLD it on Unix to alleviate
-// memory pressure.  Except in rare cases, this should be triggered only by the
-// output containing the heap image.
+// If an AOTOutput buffer is at least this many bytes, it is madvise'd with
+// MADV_DONTNEED/MADV_COLD on Unix to alleviate memory pressure.  Except in
+// rare cases, this should be triggered only by the output containing the heap
+// image.
 constexpr size_t jl_large_aotoutput = 64 * 1024 * 1024; // 64 MiB
 
 class AOTOutput {
 public:
-    AOTOutput(const Twine &prefix, const char *suffix) : name((prefix + "." + suffix).str())
+    // If large and on Windows, use a delete-on-close temporary file instead of memory.
+    AOTOutput(const Twine &prefix, const char *suffix, bool large = false)
+      : name((prefix + "." + suffix).str())
     {
 #ifdef _OS_WINDOWS_
-        SmallString<128> path;
-        SmallVector<wchar_t, 128> path_utf16;
-        auto model = prefix + "-%%%%%%." + suffix;
-        sys::fs::createUniquePath(model, path, true);
-        auto fail = [&]() {
-            jl_errorf("failed to create temporary file: %s", path.c_str());
-        };
-        if (sys::windows::widenPath(path, path_utf16))
-            fail();
-        file = CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE,
-                           FILE_SHARE_READ, nullptr, CREATE_ALWAYS,
-                           FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr);
-        if (file == INVALID_HANDLE_VALUE)
-            fail();
-        fd = _open_osfhandle((intptr_t)file, 0);
-        state = TMP_OPEN;
-#else
-        state = MEMORY;
+        if (large) {
+            SmallString<128> path;
+            SmallVector<wchar_t, 128> path_utf16;
+            // Build the model inline: a stored Twine dangles once its statement ends.
+            sys::fs::createUniquePath(prefix + "-%%%%%%." + suffix, path, true);
+            auto fail = [&]() {
+                jl_errorf("failed to create temporary file: %s", path.c_str());
+            };
+            if (sys::windows::widenPath(path, path_utf16))
+                fail();
+            file =
+                CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE,
+                            FILE_SHARE_READ, nullptr, CREATE_ALWAYS,
+                            FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr);
+            fd = file == INVALID_HANDLE_VALUE ? -1 : _open_osfhandle((intptr_t)file, 0);
+            if (fd == -1) // CreateFileW or _open_osfhandle failed
+                fail();
+            state = TMP_OPEN;
+            return;
+        }
 #endif
+        state = MEMORY;
     }
     ~AOTOutput() { remove(); }
     AOTOutput(const AOTOutput &) = delete;
@@ -1576,7 +1581,9 @@ struct AOTOutputs {
 };
 
 // Perform the actual optimization and emission of the output files
-static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM, ShardTimers *timer = nullptr) {
+static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM,
+                            bool large = false, ShardTimers *timer = nullptr)
+{
     auto TM = std::unique_ptr<TargetMachine>(
         SourceTM.getTarget().createTargetMachine(
             SourceTM.getTargetTriple().str(),
@@ -1590,7 +1597,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
     if (outputs.unopt) {
         if (timer)
             timer->unopt.startTimer();
-        AOTOutput out{M.getModuleIdentifier(), "unopt.bc"};
+        AOTOutput out{M.getModuleIdentifier(), "unopt.bc", large};
         auto OS = out.ostream();
         {
             PassBuilder PB;
@@ -1684,7 +1691,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
     if (outputs.opt) {
         if (timer)
             timer->opt.startTimer();
-        AOTOutput out{M.getModuleIdentifier(), "bc"};
+        AOTOutput out{M.getModuleIdentifier(), "bc", large};
         auto OS = out.ostream();
         {
             PassBuilder PB;
@@ -1706,7 +1713,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
     if (outputs.obj) {
         if (timer)
             timer->obj.startTimer();
-        AOTOutput out{M.getModuleIdentifier(), "o"};
+        AOTOutput out{M.getModuleIdentifier(), "o", large};
         auto OS = out.ostream();
         {
             legacy::PassManager emitter;
@@ -1732,7 +1739,7 @@ static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &Sourc
     if (outputs.asm_) {
         if (timer)
             timer->asm_.startTimer();
-        AOTOutput out{M.getModuleIdentifier(), "s"};
+        AOTOutput out{M.getModuleIdentifier(), "s", large};
         auto OS = out.ostream();
         {
             legacy::PassManager emitter;
@@ -1929,8 +1936,9 @@ extern "C" void lambda_trampoline(void* arg) {
 }
 
 template<typename ModuleReleasedFunc>
-static void
-add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, ModuleReleasedFunc module_released)
+static void add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM,
+                                    StringRef name, bool large,
+                                    ModuleReleasedFunc module_released)
 {
     {
         JL_TIMING(NATIVE_AOT, NATIVE_Opt);
@@ -1942,7 +1950,7 @@ add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, Strin
                               "_0"); // module flag "julia.mv.suffix"
             M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0");
         }
-        add_output_impl(outputs, M, TM);
+        add_output_impl(outputs, M, TM, large);
     }
     // Don't need M anymore
     module_released(M);
@@ -1994,7 +2002,7 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String
 {
     assert(threads);
     if (shards <= 1) {
-        add_output_no_partition(outputs, M, TM, name, module_released);
+        add_output_no_partition(outputs, M, TM, name, false, module_released);
         return;
     }
 
@@ -2073,7 +2081,7 @@ static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, String
                         CU->replaceOperandWith(0, topfile);
                     timers[i].construct.stopTimer();
 
-                    add_output_impl(outputs, *M, TM, &timers[i]);
+                    add_output_impl(outputs, *M, TM, false, &timers[i]);
                 }
             };
             auto arg = new std::function<void()>(func);
@@ -2274,7 +2282,8 @@ void jl_dump_native_impl(void *native_code,
         // Note that we don't set z to null, this allows the check in write_archive
         // to function as expected
         // no need to free the module/context, destructor handles that
-        add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", [](Module &){});
+        add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", true,
+                                [](Module &) {});
     }
 
     const bool imaging_mode = true;
@@ -2488,7 +2497,8 @@ void jl_dump_native_impl(void *native_code,
         }
 
         // no need to free module/context, destructor handles that
-        add_output_no_partition(outputs, metadataM, *SourceTM, "data", [](Module &) {});
+        add_output_no_partition(outputs, metadataM, *SourceTM, "data", false,
+                                [](Module &) {});
     }
 
     {