From 054645025e3dadb987394e28b54ae7c9ee0afbc7 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Wed, 12 Nov 2025 16:13:14 -0800 Subject: [PATCH 1/2] Add JLJITLinkMemoryManager (ports memory manager to JITLink) (#60105) Ports our RTDyLD memory manager to JITLink in order to avoid memory use regressions after switching to JITLink everywhere (#60031). This is a direct port: finalization must happen all at once, because it invalidates all allocation `wr_ptr`s. I decided it wasn't worth it to associate `OnFinalizedFunction` callbacks with each block, since they are large enough to make it extremely likely that all in-flight allocations land in the same block; everything must be relocated before finalization can happen. --- src/cgmemmgr.cpp | 233 +++++++++++++++++++++++++++++++++++++--------- src/jitlayers.cpp | 7 +- 2 files changed, 188 insertions(+), 52 deletions(-) diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index 99f78b81bf0b2..e36f9f80cfccf 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -3,7 +3,11 @@ #include "llvm-version.h" #include "platform.h" +#include +#include +#include #include + #include "julia.h" #include "julia_internal.h" @@ -460,18 +464,27 @@ struct Block { } }; +struct Allocation { + // Address to write to (the one returned by the allocation function) + void *wr_addr; + // Runtime address + void *rt_addr; + size_t sz; + bool relocated; +}; + class RWAllocator { static constexpr int nblocks = 8; Block blocks[nblocks]{}; public: RWAllocator() JL_NOTSAFEPOINT = default; - void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT + Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; for (int i = 0;i < nblocks && blocks[i].ptr;i++) { if (void *ptr = blocks[i].alloc(size, align)) - return ptr; + return {ptr, ptr, size, false}; if (blocks[i].avail < min_size) { min_size = blocks[i].avail; min_id = i; @@ -479,7 +492,8 @@ class RWAllocator { } size_t block_size = get_block_size(size); blocks[min_id].reset(map_anon_page(block_size), block_size); - return blocks[min_id].alloc(size, align); + void *ptr = blocks[min_id].alloc(size, align); + return {ptr, ptr, size, false}; } }; @@ -519,16 +533,6 @@ struct SplitPtrBlock : public Block { } }; -struct Allocation { - // Address to write to (the one returned by the allocation function) - void *wr_addr; - // Runtime address - void *rt_addr; - size_t sz; - bool relocated; -}; - -template class ROAllocator { protected: static constexpr int nblocks = 8; @@ -556,7 +560,7 @@ class ROAllocator { } // Allocations that have not been finalized yet. SmallVector allocations; - void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT + Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; @@ -572,8 +576,9 @@ class ROAllocator { wr_ptr = get_wr_ptr(block, ptr, size, align); } block.state |= SplitPtrBlock::Alloc; - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); - return wr_ptr; + Allocation a{wr_ptr, ptr, size, false}; + allocations.push_back(a); + return a; } if (block.avail < min_size) { min_size = block.avail; @@ -594,18 +599,21 @@ class ROAllocator { #ifdef _OS_WINDOWS_ block.state = SplitPtrBlock::Alloc; void *wr_ptr = get_wr_ptr(block, ptr, size, align); - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); + Allocation a{wr_ptr, ptr, size, false}; + allocations.push_back(a); ptr = wr_ptr; #else block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc; - allocations.push_back(Allocation{ptr, ptr, size, false}); + Allocation a{ptr, ptr, size, false}; + allocations.push_back(a); #endif - return ptr; + return a; } }; -template -class DualMapAllocator : public ROAllocator { +class DualMapAllocator : public ROAllocator { + bool exec; + protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override JL_NOTSAFEPOINT { @@ -666,7 +674,7 @@ class DualMapAllocator : public ROAllocator { } } public: - DualMapAllocator() JL_NOTSAFEPOINT + DualMapAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec) { assert(anon_hdl != -1); } @@ -679,13 +687,13 @@ class DualMapAllocator : public ROAllocator { finalize_block(block, true); block.reset(nullptr, 0); } - ROAllocator::finalize(); + ROAllocator::finalize(); } }; #ifdef _OS_LINUX_ -template -class SelfMemAllocator : public ROAllocator { +class SelfMemAllocator : public ROAllocator { + bool exec; SmallVector temp_buff; protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, @@ -722,9 +730,7 @@ class SelfMemAllocator : public ROAllocator { } } public: - SelfMemAllocator() JL_NOTSAFEPOINT - : ROAllocator(), - temp_buff() + SelfMemAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec), temp_buff() { assert(get_self_mem_fd() != -1); } @@ -758,11 +764,25 @@ class SelfMemAllocator : public ROAllocator { } if (cached) temp_buff.resize(1); - ROAllocator::finalize(); + ROAllocator::finalize(); } }; #endif // _OS_LINUX_ +std::pair, std::unique_ptr> +get_preferred_allocators() JL_NOTSAFEPOINT +{ +#ifdef _OS_LINUX_ + if (get_self_mem_fd() != -1) + return {std::make_unique(false), + std::make_unique(true)}; +#endif + if (init_shared_map() != -1) + return {std::make_unique(false), + std::make_unique(true)}; + return {}; +} + class RTDyldMemoryManagerJL : public SectionMemoryManager { struct EHFrame { uint8_t *addr; @@ -772,8 +792,8 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { void operator=(const RTDyldMemoryManagerJL&) = delete; SmallVector pending_eh; RWAllocator rw_alloc; - std::unique_ptr> ro_alloc; - std::unique_ptr> exe_alloc; + std::unique_ptr ro_alloc; + std::unique_ptr exe_alloc; size_t total_allocated; public: @@ -781,20 +801,9 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { : SectionMemoryManager(), pending_eh(), rw_alloc(), - ro_alloc(), - exe_alloc(), total_allocated(0) { -#ifdef _OS_LINUX_ - if (!ro_alloc && get_self_mem_fd() != -1) { - ro_alloc.reset(new SelfMemAllocator()); - exe_alloc.reset(new SelfMemAllocator()); - } -#endif - if (!ro_alloc && init_shared_map() != -1) { - ro_alloc.reset(new DualMapAllocator()); - exe_alloc.reset(new DualMapAllocator()); - } + std::tie(ro_alloc, exe_alloc) = get_preferred_allocators(); } ~RTDyldMemoryManagerJL() override JL_NOTSAFEPOINT { @@ -847,7 +856,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size); if (exe_alloc) - return (uint8_t*)exe_alloc->alloc(Size, Alignment); + return (uint8_t*)exe_alloc->alloc(Size, Alignment).wr_addr; return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, SectionName); } @@ -862,9 +871,9 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size); if (!isReadOnly) - return (uint8_t*)rw_alloc.alloc(Size, Alignment); + return (uint8_t*)rw_alloc.alloc(Size, Alignment).wr_addr; if (ro_alloc) - return (uint8_t*)ro_alloc->alloc(Size, Alignment); + return (uint8_t*)ro_alloc->alloc(Size, Alignment).wr_addr; return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, isReadOnly); } @@ -919,6 +928,133 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, } #endif +class JLJITLinkMemoryManager : public jitlink::JITLinkMemoryManager { + using OnFinalizedFunction = + jitlink::JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction; + + std::mutex Mutex; + RWAllocator RWAlloc; + std::unique_ptr ROAlloc; + std::unique_ptr ExeAlloc; + SmallVector FinalizedCallbacks; + uint32_t InFlight{0}; + +public: + class InFlightAlloc; + + static std::unique_ptr Create() + { + auto [ROAlloc, ExeAlloc] = get_preferred_allocators(); + if (ROAlloc && ExeAlloc) + return std::unique_ptr( + new JLJITLinkMemoryManager(std::move(ROAlloc), std::move(ExeAlloc))); + + return cantFail( + orc::MapperJITLinkMemoryManager::CreateWithMapper( + /*Reservation Granularity*/ 16 * 1024 * 1024)); + } + + void allocate(const jitlink::JITLinkDylib *JD, jitlink::LinkGraph &G, + OnAllocatedFunction OnAllocated) override; + + void deallocate(std::vector Allocs, + OnDeallocatedFunction OnDeallocated) override + { + jl_unreachable(); + } + +protected: + JLJITLinkMemoryManager(std::unique_ptr ROAlloc, + std::unique_ptr ExeAlloc) + : ROAlloc(std::move(ROAlloc)), ExeAlloc(std::move(ExeAlloc)) + { + } + + void finalize(OnFinalizedFunction OnFinalized) + { + SmallVector Callbacks; + { + std::unique_lock Lock{Mutex}; + FinalizedCallbacks.push_back(std::move(OnFinalized)); + + if (--InFlight > 0) + return; + + ROAlloc->finalize(); + ExeAlloc->finalize(); + Callbacks = std::move(FinalizedCallbacks); + } + + for (auto &CB : Callbacks) + std::move(CB)(FinalizedAlloc{}); + } +}; + +class JLJITLinkMemoryManager::InFlightAlloc + : public jitlink::JITLinkMemoryManager::InFlightAlloc { + JLJITLinkMemoryManager &MM; + jitlink::LinkGraph &G; + +public: + InFlightAlloc(JLJITLinkMemoryManager &MM, jitlink::LinkGraph &G) : MM(MM), G(G) {} + + void abandon(OnAbandonedFunction OnAbandoned) override { jl_unreachable(); } + + void finalize(OnFinalizedFunction OnFinalized) override + { + auto *GP = &G; + MM.finalize([GP, OnFinalized = + std::move(OnFinalized)](Expected FA) mutable { + if (!FA) + return OnFinalized(FA.takeError()); + // Need to handle dealloc actions when we GC code + auto E = orc::shared::runFinalizeActions(GP->allocActions()); + if (!E) + return OnFinalized(E.takeError()); + OnFinalized(std::move(FA)); + }); + } +}; + +using orc::MemProt; + +void JLJITLinkMemoryManager::allocate(const jitlink::JITLinkDylib *JD, + jitlink::LinkGraph &G, + OnAllocatedFunction OnAllocated) +{ + jitlink::BasicLayout BL{G}; + + { + std::unique_lock Lock{Mutex}; + for (auto &[AG, Seg] : BL.segments()) { + if (AG.getMemLifetime() == orc::MemLifetime::NoAlloc) + continue; + assert(AG.getMemLifetime() == orc::MemLifetime::Standard); + + auto Prot = AG.getMemProt(); + uint64_t Alignment = Seg.Alignment.value(); + uint64_t Size = Seg.ContentSize + Seg.ZeroFillSize; + Allocation Alloc; + if (Prot == (MemProt::Read | MemProt::Write)) + Alloc = RWAlloc.alloc(Size, Alignment); + else if (Prot == MemProt::Read) + Alloc = ROAlloc->alloc(Size, Alignment); + else if (Prot == (MemProt::Read | MemProt::Exec)) + Alloc = ExeAlloc->alloc(Size, Alignment); + else + abort(); + + Seg.Addr = orc::ExecutorAddr::fromPtr(Alloc.rt_addr); + Seg.WorkingMem = (char *)Alloc.wr_addr; + } + } + + if (auto Err = BL.apply()) + return OnAllocated(std::move(Err)); + + ++InFlight; + OnAllocated(std::make_unique(*this, G)); +} } RTDyldMemoryManager* createRTDyldMemoryManager() JL_NOTSAFEPOINT @@ -930,3 +1066,8 @@ size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT { return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes(); } + +std::unique_ptr createJITLinkMemoryManager() +{ + return JLJITLinkMemoryManager::Create(); +} diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 0773d1a6c16a1..90091cc1f38db 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -1208,12 +1208,6 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin { #pragma clang diagnostic ignored "-Wunused-function" #endif -// TODO: Port our memory management optimisations to JITLink instead of using the -// default InProcessMemoryManager. -std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT { - return cantFail(orc::MapperJITLinkMemoryManager::CreateWithMapper(/*Reservation Granularity*/ 16 * 1024 * 1024)); -} - #ifdef _COMPILER_CLANG_ #pragma clang diagnostic pop #endif @@ -1237,6 +1231,7 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar { }; RTDyldMemoryManager *createRTDyldMemoryManager(void) JL_NOTSAFEPOINT; +std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT; // A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { From 068186c191fc3d0ba7a1cf54ee36ae4ff55c1c08 Mon Sep 17 00:00:00 2001 From: Sam Schweigel Date: Mon, 24 Nov 2025 14:08:06 -0800 Subject: [PATCH 2/2] Fix aarch64 macOS crash when SIP disabled (JLJITLinkMemoryManager) Apple ARM CPUs treat the `ic ivau` as a memory read, which causes a confusing crash in DualMapAllocator if we try using it on a wr_addr that has been mprotected to `Prot::NO`, since we are still holding the allocator lock. For Apple aarch64 systems with SIP disabled, this will result in some memory savings, since DualMapAllocator will now work there. Like before, other JITLink platforms, namely Linux aarch64 and RISC-V, will benefit too. This re-lands #60105, after it was reverted in #60196. Thanks @giordano! --- src/cgmemmgr.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index e36f9f80cfccf..d3f0be0ec4cbf 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -548,13 +548,12 @@ class ROAllocator { virtual ~ROAllocator() JL_NOTSAFEPOINT {} virtual void finalize() JL_NOTSAFEPOINT { - for (auto &alloc: allocations) { - // ensure the mapped pages are consistent - sys::Memory::InvalidateInstructionCache(alloc.wr_addr, - alloc.sz); - sys::Memory::InvalidateInstructionCache(alloc.rt_addr, - alloc.sz); - } + // Note: on some aarch64 platforms, like Apple CPUs, we need read + // permission in order to invalidate instruction cache lines. We are + // not guaranteed to have read permission on the wr_addr when using + // DualMapAllocator. + for (auto &alloc : allocations) + sys::Memory::InvalidateInstructionCache(alloc.rt_addr, alloc.sz); completed.clear(); allocations.clear(); }