From 054645025e3dadb987394e28b54ae7c9ee0afbc7 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Wed, 12 Nov 2025 16:13:14 -0800
Subject: [PATCH 1/2] Add JLJITLinkMemoryManager (ports memory manager to
 JITLink) (#60105)

Ports our RTDyld memory manager to JITLink in order to avoid memory use
regressions after switching to JITLink everywhere (#60031). This is a
direct port: finalization must happen all at once, because it
invalidates all allocation `wr_ptr`s. I decided it wasn't worth it to
associate `OnFinalizedFunction` callbacks with each block, since blocks
are large enough to make it extremely likely that all in-flight
allocations land in the same block; everything must be relocated before
finalization can happen.
---
 src/cgmemmgr.cpp  | 233 +++++++++++++++++++++++++++++++++++++---------
 src/jitlayers.cpp |   7 +-
 2 files changed, 188 insertions(+), 52 deletions(-)
diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp
index 99f78b81bf0b2..e36f9f80cfccf 100644
--- a/src/cgmemmgr.cpp
+++ b/src/cgmemmgr.cpp
@@ -3,7 +3,11 @@
 #include "llvm-version.h"
 #include "platform.h"
 
+#include <llvm/ExecutionEngine/JITLink/JITLink.h>
+#include <llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h>
+#include <llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h>
 #include <llvm/ExecutionEngine/SectionMemoryManager.h>
+
 #include "julia.h"
 #include "julia_internal.h"
 
@@ -460,18 +464,27 @@ struct Block {
     }
 };
 
+struct Allocation {
+    // Address to write to; equal to rt_addr unless a separate writable pointer is used
+    void *wr_addr;
+    // Runtime address
+    void *rt_addr;
+    size_t sz; // size that was passed to alloc()
+    bool relocated; // alloc() always creates the record with this set to false
+};
+
 class RWAllocator {
     static constexpr int nblocks = 8;
     Block blocks[nblocks]{};
 public:
     RWAllocator() JL_NOTSAFEPOINT = default;
-    void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT
+    Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT
     {
         size_t min_size = (size_t)-1;
         int min_id = 0;
         for (int i = 0;i < nblocks && blocks[i].ptr;i++) {
             if (void *ptr = blocks[i].alloc(size, align))
-                return ptr;
+                return {ptr, ptr, size, false};
             if (blocks[i].avail < min_size) {
                 min_size = blocks[i].avail;
                 min_id = i;
@@ -479,7 +492,8 @@ class RWAllocator {
         }
         size_t block_size = get_block_size(size);
         blocks[min_id].reset(map_anon_page(block_size), block_size);
-        return blocks[min_id].alloc(size, align);
+        void *ptr = blocks[min_id].alloc(size, align);
+        return {ptr, ptr, size, false};
     }
 };
 
@@ -519,16 +533,6 @@ struct SplitPtrBlock : public Block {
     }
 };
 
-struct Allocation {
-    // Address to write to (the one returned by the allocation function)
-    void *wr_addr;
-    // Runtime address
-    void *rt_addr;
-    size_t sz;
-    bool relocated;
-};
-
-template<bool exec>
 class ROAllocator {
 protected:
     static constexpr int nblocks = 8;
@@ -556,7 +560,7 @@ class ROAllocator {
     }
     // Allocations that have not been finalized yet.
     SmallVector<Allocation, 16> allocations;
-    void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT
+    Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT
     {
         size_t min_size = (size_t)-1;
         int min_id = 0;
@@ -572,8 +576,9 @@ class ROAllocator {
                     wr_ptr = get_wr_ptr(block, ptr, size, align);
                 }
                 block.state |= SplitPtrBlock::Alloc;
-                allocations.push_back(Allocation{wr_ptr, ptr, size, false});
-                return wr_ptr;
+                Allocation a{wr_ptr, ptr, size, false};
+                allocations.push_back(a);
+                return a;
             }
             if (block.avail < min_size) {
                 min_size = block.avail;
@@ -594,18 +599,21 @@ class ROAllocator {
 #ifdef _OS_WINDOWS_
         block.state = SplitPtrBlock::Alloc;
         void *wr_ptr = get_wr_ptr(block, ptr, size, align);
-        allocations.push_back(Allocation{wr_ptr, ptr, size, false});
+        Allocation a{wr_ptr, ptr, size, false};
+        allocations.push_back(a);
         ptr = wr_ptr;
 #else
         block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc;
-        allocations.push_back(Allocation{ptr, ptr, size, false});
+        Allocation a{ptr, ptr, size, false};
+        allocations.push_back(a);
 #endif
-        return ptr;
+        return a;
     }
 };
 
-template<bool exec>
-class DualMapAllocator : public ROAllocator<exec> {
+class DualMapAllocator : public ROAllocator {
+    bool exec;
+
 protected:
     void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override JL_NOTSAFEPOINT
     {
@@ -666,7 +674,7 @@ class DualMapAllocator : public ROAllocator<exec> {
         }
     }
 public:
-    DualMapAllocator() JL_NOTSAFEPOINT
+    DualMapAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec)
     {
         assert(anon_hdl != -1);
     }
@@ -679,13 +687,13 @@ class DualMapAllocator : public ROAllocator<exec> {
             finalize_block(block, true);
             block.reset(nullptr, 0);
         }
-        ROAllocator<exec>::finalize();
+        ROAllocator::finalize();
     }
 };
 
 #ifdef _OS_LINUX_
-template<bool exec>
-class SelfMemAllocator : public ROAllocator<exec> {
+class SelfMemAllocator : public ROAllocator {
+    bool exec;
     SmallVector<Block, 16> temp_buff;
 protected:
     void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr,
@@ -722,9 +730,7 @@ class SelfMemAllocator : public ROAllocator<exec> {
         }
     }
 public:
-    SelfMemAllocator() JL_NOTSAFEPOINT
-        : ROAllocator<exec>(),
-          temp_buff()
+    SelfMemAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec), temp_buff()
     {
         assert(get_self_mem_fd() != -1);
     }
@@ -758,11 +764,25 @@ class SelfMemAllocator : public ROAllocator<exec> {
         }
         if (cached)
             temp_buff.resize(1);
-        ROAllocator<exec>::finalize();
+        ROAllocator::finalize();
     }
 };
 #endif // _OS_LINUX_
 
+std::pair<std::unique_ptr<ROAllocator>, std::unique_ptr<ROAllocator>>
+get_preferred_allocators() JL_NOTSAFEPOINT // returns {exec=false, exec=true} allocators
+{
+#ifdef _OS_LINUX_
+    if (get_self_mem_fd() != -1) // tried first on Linux
+        return {std::make_unique<SelfMemAllocator>(false),
+                std::make_unique<SelfMemAllocator>(true)};
+#endif
+    if (init_shared_map() != -1) // only reached when the self-mem path was not taken
+        return {std::make_unique<DualMapAllocator>(false),
+                std::make_unique<DualMapAllocator>(true)};
+    return {}; // neither mechanism is usable: both pointers are null
+}
+
 class RTDyldMemoryManagerJL : public SectionMemoryManager {
     struct EHFrame {
         uint8_t *addr;
@@ -772,8 +792,8 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager {
     void operator=(const RTDyldMemoryManagerJL&) = delete;
     SmallVector<EHFrame, 16> pending_eh;
     RWAllocator rw_alloc;
-    std::unique_ptr<ROAllocator<false>> ro_alloc;
-    std::unique_ptr<ROAllocator<true>> exe_alloc;
+    std::unique_ptr<ROAllocator> ro_alloc;
+    std::unique_ptr<ROAllocator> exe_alloc;
     size_t total_allocated;
 
 public:
@@ -781,20 +801,9 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager {
         : SectionMemoryManager(),
           pending_eh(),
           rw_alloc(),
-          ro_alloc(),
-          exe_alloc(),
           total_allocated(0)
     {
-#ifdef _OS_LINUX_
-        if (!ro_alloc && get_self_mem_fd() != -1) {
-            ro_alloc.reset(new SelfMemAllocator<false>());
-            exe_alloc.reset(new SelfMemAllocator<true>());
-        }
-#endif
-        if (!ro_alloc && init_shared_map() != -1) {
-            ro_alloc.reset(new DualMapAllocator<false>());
-            exe_alloc.reset(new DualMapAllocator<true>());
-        }
+        std::tie(ro_alloc, exe_alloc) = get_preferred_allocators();
     }
     ~RTDyldMemoryManagerJL() override JL_NOTSAFEPOINT
     {
@@ -847,7 +856,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size,
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size);
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size);
     if (exe_alloc)
-        return (uint8_t*)exe_alloc->alloc(Size, Alignment);
+        return (uint8_t*)exe_alloc->alloc(Size, Alignment).wr_addr;
     return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID,
                                                      SectionName);
 }
@@ -862,9 +871,9 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size,
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size);
     jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size);
     if (!isReadOnly)
-        return (uint8_t*)rw_alloc.alloc(Size, Alignment);
+        return (uint8_t*)rw_alloc.alloc(Size, Alignment).wr_addr;
     if (ro_alloc)
-        return (uint8_t*)ro_alloc->alloc(Size, Alignment);
+        return (uint8_t*)ro_alloc->alloc(Size, Alignment).wr_addr;
     return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
                                                      SectionName, isReadOnly);
 }
@@ -919,6 +928,133 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr,
 }
 #endif
 
+class JLJITLinkMemoryManager : public jitlink::JITLinkMemoryManager {
+    using OnFinalizedFunction =
+        jitlink::JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction;
+    // Mutex is held around the allocator calls in allocate() and the bookkeeping in finalize().
+    std::mutex Mutex;
+    RWAllocator RWAlloc; // serves Read|Write segments
+    std::unique_ptr<ROAllocator> ROAlloc; // serves Read-only segments
+    std::unique_ptr<ROAllocator> ExeAlloc; // serves Read|Exec segments
+    SmallVector<OnFinalizedFunction> FinalizedCallbacks; // queued until InFlight reaches zero
+    uint32_t InFlight{0}; // incremented by allocate(), decremented by finalize()
+
+public:
+    class InFlightAlloc;
+    // Returns this manager when both allocators are available, else LLVM's mapper-based one.
+    static std::unique_ptr<JITLinkMemoryManager> Create()
+    {
+        auto [ROAlloc, ExeAlloc] = get_preferred_allocators();
+        if (ROAlloc && ExeAlloc)
+            return std::unique_ptr<JLJITLinkMemoryManager>(
+                new JLJITLinkMemoryManager(std::move(ROAlloc), std::move(ExeAlloc)));
+
+        return cantFail(
+            orc::MapperJITLinkMemoryManager::CreateWithMapper<orc::InProcessMemoryMapper>(
+                /*Reservation Granularity*/ 16 * 1024 * 1024));
+    }
+
+    void allocate(const jitlink::JITLinkDylib *JD, jitlink::LinkGraph &G,
+                  OnAllocatedFunction OnAllocated) override;
+    // deallocate() is not implemented: it only calls jl_unreachable().
+    void deallocate(std::vector<FinalizedAlloc> Allocs,
+                    OnDeallocatedFunction OnDeallocated) override
+    {
+        jl_unreachable();
+    }
+
+protected:
+    JLJITLinkMemoryManager(std::unique_ptr<ROAllocator> ROAlloc,
+                           std::unique_ptr<ROAllocator> ExeAlloc)
+      : ROAlloc(std::move(ROAlloc)), ExeAlloc(std::move(ExeAlloc))
+    {
+    }
+    // Queues OnFinalized; the call that brings InFlight to zero finalizes ROAlloc/ExeAlloc and runs the queue.
+    void finalize(OnFinalizedFunction OnFinalized)
+    {
+        SmallVector<OnFinalizedFunction> Callbacks;
+        {
+            std::unique_lock Lock{Mutex};
+            FinalizedCallbacks.push_back(std::move(OnFinalized));
+            // Other allocations are still in flight: leave the callback queued.
+            if (--InFlight > 0)
+                return;
+            // Last one out: finalize both allocators once and take the whole queue.
+            ROAlloc->finalize();
+            ExeAlloc->finalize();
+            Callbacks = std::move(FinalizedCallbacks);
+        }
+        // The callbacks run after the lock scope above has ended.
+        for (auto &CB : Callbacks)
+            std::move(CB)(FinalizedAlloc{});
+    }
+};
+
+class JLJITLinkMemoryManager::InFlightAlloc
+  : public jitlink::JITLinkMemoryManager::InFlightAlloc {
+    JLJITLinkMemoryManager &MM;
+    jitlink::LinkGraph &G;
+    // Both are stored as plain references, so MM and G must outlive this object.
+public:
+    InFlightAlloc(JLJITLinkMemoryManager &MM, jitlink::LinkGraph &G) : MM(MM), G(G) {}
+    // abandon() is not implemented: it only calls jl_unreachable().
+    void abandon(OnAbandonedFunction OnAbandoned) override { jl_unreachable(); }
+    // Defers to MM.finalize(); the wrapped callback runs G's finalize actions before OnFinalized.
+    void finalize(OnFinalizedFunction OnFinalized) override
+    {
+        auto *GP = &G; // NOTE(review): MM may queue the lambda, so G must still be alive later — confirm
+        MM.finalize([GP, OnFinalized =
+                             std::move(OnFinalized)](Expected<FinalizedAlloc> FA) mutable {
+            if (!FA)
+                return OnFinalized(FA.takeError());
+            // The value in E is dropped on success; dealloc actions need handling when we GC code
+            auto E = orc::shared::runFinalizeActions(GP->allocActions());
+            if (!E)
+                return OnFinalized(E.takeError());
+            OnFinalized(std::move(FA));
+        });
+    }
+};
+
+using orc::MemProt;
+
+void JLJITLinkMemoryManager::allocate(const jitlink::JITLinkDylib *JD,
+                                      jitlink::LinkGraph &G,
+                                      OnAllocatedFunction OnAllocated)
+{
+    jitlink::BasicLayout BL{G};
+    // Place every segment and bump InFlight in one critical section: finalize() reads
+    // InFlight under Mutex, and must not see zero while this graph is still unwritten.
+    auto Err = [&]() -> llvm::Error {
+        std::unique_lock Lock{Mutex};
+        for (auto &[AG, Seg] : BL.segments()) {
+            if (AG.getMemLifetime() == orc::MemLifetime::NoAlloc)
+                continue;
+            assert(AG.getMemLifetime() == orc::MemLifetime::Standard);
+            auto Prot = AG.getMemProt();
+            uint64_t Alignment = Seg.Alignment.value();
+            uint64_t Size = Seg.ContentSize + Seg.ZeroFillSize;
+            Allocation Alloc;
+            if (Prot == (MemProt::Read | MemProt::Write))
+                Alloc = RWAlloc.alloc(Size, Alignment);
+            else if (Prot == MemProt::Read)
+                Alloc = ROAlloc->alloc(Size, Alignment);
+            else if (Prot == (MemProt::Read | MemProt::Exec))
+                Alloc = ExeAlloc->alloc(Size, Alignment);
+            else
+                abort(); // no allocator is wired up for any other protection combination
+            Seg.Addr = orc::ExecutorAddr::fromPtr(Alloc.rt_addr); // runtime address
+            Seg.WorkingMem = (char *)Alloc.wr_addr; // address the linker writes through
+        }
+        if (auto ApplyErr = BL.apply())
+            return ApplyErr;
+        ++InFlight; // counted only when an InFlightAlloc is actually handed out below
+        return llvm::Error::success();
+    }();
+    if (Err)
+        return OnAllocated(std::move(Err));
+    OnAllocated(std::make_unique<InFlightAlloc>(*this, G));
+}
 }
 
 RTDyldMemoryManager* createRTDyldMemoryManager() JL_NOTSAFEPOINT
@@ -930,3 +1066,8 @@ size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT
 {
     return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes();
 }
+
+std::unique_ptr<jitlink::JITLinkMemoryManager> createJITLinkMemoryManager()
+{ // NOTE(review): jitlayers.cpp declares this JL_NOTSAFEPOINT; the annotation is absent here — confirm
+    return JLJITLinkMemoryManager::Create(); // custom manager, or LLVM's mapper-based fallback
+}
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 0773d1a6c16a1..90091cc1f38db 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -1208,12 +1208,6 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin {
 #pragma clang diagnostic ignored "-Wunused-function"
 #endif
 
-// TODO: Port our memory management optimisations to JITLink instead of using the
-// default InProcessMemoryManager.
-std::unique_ptr<jitlink::JITLinkMemoryManager> createJITLinkMemoryManager() JL_NOTSAFEPOINT {
-    return cantFail(orc::MapperJITLinkMemoryManager::CreateWithMapper<orc::InProcessMemoryMapper>(/*Reservation Granularity*/ 16 * 1024 * 1024));
-}
-
 #ifdef _COMPILER_CLANG_
 #pragma clang diagnostic pop
 #endif
@@ -1237,6 +1231,7 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar {
 };
 
 RTDyldMemoryManager *createRTDyldMemoryManager(void) JL_NOTSAFEPOINT;
+std::unique_ptr<jitlink::JITLinkMemoryManager> createJITLinkMemoryManager() JL_NOTSAFEPOINT;
 
 // A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr
 class ForwardingMemoryManager : public RuntimeDyld::MemoryManager {

From 068186c191fc3d0ba7a1cf54ee36ae4ff55c1c08 Mon Sep 17 00:00:00 2001
From: Sam Schweigel <sam.schweigel@juliahub.com>
Date: Mon, 24 Nov 2025 14:08:06 -0800
Subject: [PATCH 2/2] Fix aarch64 macOS crash when SIP disabled
 (JLJITLinkMemoryManager)

Apple ARM CPUs treat the `ic ivau` instruction as a memory read, which causes a
confusing crash in DualMapAllocator if we try using that instruction on a
wr_addr that has been mprotected to `Prot::NO`, since we are still holding the
allocator lock.

For Apple aarch64 systems with SIP disabled, this will result in some memory
savings, since DualMapAllocator will now work there.  Like before, other JITLink
platforms, namely Linux aarch64 and RISC-V, will benefit too.

This re-lands #60105, after it was reverted in #60196.  Thanks @giordano!
---
 src/cgmemmgr.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp
index e36f9f80cfccf..d3f0be0ec4cbf 100644
--- a/src/cgmemmgr.cpp
+++ b/src/cgmemmgr.cpp
@@ -548,13 +548,12 @@ class ROAllocator {
     virtual ~ROAllocator() JL_NOTSAFEPOINT {}
     virtual void finalize() JL_NOTSAFEPOINT
     {
-        for (auto &alloc: allocations) {
-            // ensure the mapped pages are consistent
-            sys::Memory::InvalidateInstructionCache(alloc.wr_addr,
-                                                    alloc.sz);
-            sys::Memory::InvalidateInstructionCache(alloc.rt_addr,
-                                                    alloc.sz);
-        }
+        // Note: on some aarch64 platforms, like Apple CPUs, we need read
+        // permission in order to invalidate instruction cache lines.  We are
+        // not guaranteed to have read permission on the wr_addr when using
+        // DualMapAllocator.
+        for (auto &alloc : allocations)
+            sys::Memory::InvalidateInstructionCache(alloc.rt_addr, alloc.sz);
         completed.clear();
         allocations.clear();
     }