From e8154a529f9ecf34e1ade2376a74b7841d40019e Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 24 Mar 2024 11:49:47 +0100 Subject: [PATCH] JitArm64: Increase farcode & nearcode cache size This is a JitArm64 version of 219610d8a0e5a2c12d074314c6f2e62a4b43d7e4. Due to limitations on how far you can jump with a single AArch64 branch instruction, going above the former limit of 128 MiB of code (counting nearcode and farcode combined) requires a bit of restructuring. With the restructuring in place, the limit now is 256 MiB. See the new large comment in Jit.h for a description of the new memory layout. --- Source/Core/Common/CodeBlock.h | 6 +- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 152 ++++++++++++++++------ Source/Core/Core/PowerPC/JitArm64/Jit.h | 37 +++++- 3 files changed, 148 insertions(+), 47 deletions(-) diff --git a/Source/Core/Common/CodeBlock.h b/Source/Core/Common/CodeBlock.h index fbd4cdb00a..9efc82edeb 100644 --- a/Source/Core/Common/CodeBlock.h +++ b/Source/Core/Common/CodeBlock.h @@ -82,6 +82,10 @@ public: } bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } + bool IsInSpaceOrChildSpace(const u8* ptr) const + { + return ptr >= region && ptr < (region + total_region_size); + } void WriteProtect(bool allow_execute) { Common::WriteProtectMemory(region, region_size, allow_execute); @@ -106,7 +110,7 @@ public: bool HasChildren() const { return region_size != total_region_size; } u8* AllocChildCodeSpace(size_t child_size) { - ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); + ASSERT_MSG(DYNA_REC, child_size <= GetSpaceLeft(), "Insufficient space for child allocation."); u8* child_region = region + region_size - child_size; region_size -= child_size; ResetCodePtr(); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 1064da5517..cf30f546c7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -4,6 +4,7 @@ #include "Core/PowerPC/JitArm64/Jit.h" #include +#include #include "Common/Arm64Emitter.h" #include "Common/CommonTypes.h" @@ -29,13 +30,13 @@ using namespace Arm64Gen; -constexpr size_t CODE_SIZE = 1024 * 1024 * 32; +constexpr size_t NEAR_CODE_SIZE = 1024 * 1024 * 64; // We use a bigger farcode size for JitArm64 than Jit64, because JitArm64 always emits farcode // for the slow path of each loadstore instruction. Jit64 postpones emitting farcode until the // farcode actually is needed, saving it from having to emit farcode for most instructions. // TODO: Perhaps implement something similar to Jit64. But using more RAM isn't much of a problem. -constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64; -constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64; +constexpr size_t FAR_CODE_SIZE = 1024 * 1024 * 64; +constexpr size_t TOTAL_CODE_SIZE = NEAR_CODE_SIZE * 2 + FAR_CODE_SIZE * 2; JitArm64::JitArm64(Core::System& system) : JitBase(system), m_float_emit(this) { @@ -49,9 +50,18 @@ void JitArm64::Init() RefreshConfig(); - const size_t child_code_size = jo.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE; - AllocCodeSpace(CODE_SIZE + child_code_size); - AddChildCodeSpace(&m_far_code, child_code_size); + // We want the regions to be laid out in this order in memory: + // m_far_code_0, m_near_code_0, m_near_code_1, m_far_code_1. + // AddChildCodeSpace grabs space from the end of the parent region, + // so we have to call AddChildCodeSpace in reverse order. + AllocCodeSpace(TOTAL_CODE_SIZE); + AddChildCodeSpace(&m_far_code_1, FAR_CODE_SIZE); + AddChildCodeSpace(&m_near_code_1, NEAR_CODE_SIZE); + AddChildCodeSpace(&m_near_code_0, NEAR_CODE_SIZE); + AddChildCodeSpace(&m_far_code_0, FAR_CODE_SIZE); + ASSERT(m_far_code_0.GetCodeEnd() == m_near_code_0.GetCodePtr()); + ASSERT(m_near_code_0.GetCodeEnd() == m_near_code_1.GetCodePtr()); + ASSERT(m_near_code_1.GetCodeEnd() == m_far_code_1.GetCodePtr()); jo.optimizeGatherPipe = true; SetBlockLinkingEnabled(true); @@ -66,9 +76,7 @@ void JitArm64::Init() InitBLROptimization(); - GenerateAsm(); - - ResetFreeMemoryRanges(); + GenerateAsmAndResetFreeMemoryRanges(); } void JitArm64::SetBlockLinkingEnabled(bool enabled) @@ -113,7 +121,7 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx) success = HandleStackFault(); // If the fault is in JIT code space, look for fastmem areas. - if (!success && IsInSpace(reinterpret_cast(ctx->CTX_PC))) + if (!success && IsInSpaceOrChildSpace(reinterpret_cast(ctx->CTX_PC))) { auto& memory = m_system.GetMemory(); if (memory.IsAddressInFastmemArea(reinterpret_cast(access_address))) @@ -153,22 +161,47 @@ void JitArm64::ClearCache() blocks.Clear(); blocks.ClearRangesToFree(); const Common::ScopedJITPageWriteAndNoExecute enable_jit_page_writes; - ClearCodeSpace(); - m_far_code.ClearCodeSpace(); + m_far_code_0.ClearCodeSpace(); + m_near_code_0.ClearCodeSpace(); + m_near_code_1.ClearCodeSpace(); + m_far_code_1.ClearCodeSpace(); RefreshConfig(); + GenerateAsmAndResetFreeMemoryRanges(); +} + +void JitArm64::GenerateAsmAndResetFreeMemoryRanges() +{ + SetCodePtr(m_near_code_1.GetWritableCodePtr(), m_near_code_1.GetWritableCodeEnd()); + m_far_code.SetCodePtr(m_far_code_1.GetWritableCodePtr(), m_far_code_1.GetWritableCodeEnd()); + + const u8* routines_near_start = GetCodePtr(); + const u8* routines_far_start = m_far_code.GetCodePtr(); + GenerateAsm(); - ResetFreeMemoryRanges(); + const u8* routines_near_end = GetCodePtr(); + const u8* routines_far_end = m_far_code.GetCodePtr(); + + ResetFreeMemoryRanges(routines_near_end - routines_near_start, + routines_far_end - routines_far_start); } -void JitArm64::ResetFreeMemoryRanges() +void JitArm64::ResetFreeMemoryRanges(size_t routines_near_size, size_t routines_far_size) { // Set the near and far code regions as unused. - m_free_ranges_near.clear(); - m_free_ranges_near.insert(GetWritableCodePtr(), GetWritableCodeEnd()); - m_free_ranges_far.clear(); - m_free_ranges_far.insert(m_far_code.GetWritableCodePtr(), m_far_code.GetWritableCodeEnd()); + m_free_ranges_far_0.clear(); + m_free_ranges_far_0.insert(m_far_code_0.GetWritableCodePtr() + routines_near_size, + m_far_code_0.GetWritableCodeEnd()); + m_free_ranges_near_0.clear(); + m_free_ranges_near_0.insert(m_near_code_0.GetWritableCodePtr(), + m_near_code_0.GetWritableCodeEnd()); + m_free_ranges_near_1.clear(); + m_free_ranges_near_1.insert(m_near_code_1.GetWritableCodePtr() + routines_near_size, + m_near_code_1.GetWritableCodeEnd()); + m_free_ranges_far_1.clear(); + m_free_ranges_far_1.insert(m_far_code_1.GetWritableCodePtr() + routines_far_size, + m_far_code_1.GetWritableCodeEnd()); } void JitArm64::Shutdown() @@ -889,11 +922,17 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) ++last_fastmem_area; m_fault_to_handler.erase(first_fastmem_area, last_fastmem_area); - m_free_ranges_near.insert(range.first, range.second); + if (range.first < m_near_code_0.GetCodeEnd()) + m_free_ranges_near_0.insert(range.first, range.second); + else + m_free_ranges_near_1.insert(range.first, range.second); } for (auto range : blocks.GetRangesToFreeFar()) { - m_free_ranges_far.insert(range.first, range.second); + if (range.first < m_far_code_0.GetCodeEnd()) + m_free_ranges_far_0.insert(range.first, range.second); + else + m_free_ranges_far_1.insert(range.first, range.second); } blocks.ClearRangesToFree(); @@ -939,7 +978,7 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) return; } - if (SetEmitterStateToFreeCodeRegion()) + if (std::optional code_region_index = SetEmitterStateToFreeCodeRegion()) { u8* near_start = GetWritableCodePtr(); u8* far_start = m_far_code.GetWritableCodePtr(); @@ -952,10 +991,16 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) // Mark the memory regions that this code block uses as used in the local rangesets. u8* near_end = GetWritableCodePtr(); if (near_start != near_end) - m_free_ranges_near.erase(near_start, near_end); + { + (code_region_index == 0 ? m_free_ranges_near_0 : m_free_ranges_near_1) + .erase(near_start, near_end); + } u8* far_end = m_far_code.GetWritableCodePtr(); if (far_start != far_end) - m_free_ranges_far.erase(far_start, far_end); + { + (code_region_index == 0 ? m_free_ranges_far_0 : m_free_ranges_far_1) + .erase(far_start, far_end); + } // Store the used memory regions in the block so we know what to mark as unused when the // block gets invalidated. @@ -984,27 +1029,52 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) exit(-1); } -bool JitArm64::SetEmitterStateToFreeCodeRegion() +std::optional JitArm64::SetEmitterStateToFreeCodeRegion() { - // Find the largest free memory blocks and set code emitters to point at them. - // If we can't find a free block return false instead, which will trigger a JIT cache clear. - auto free_near = m_free_ranges_near.by_size_begin(); - if (free_near == m_free_ranges_near.by_size_end()) - { - WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in near code region."); - return false; - } - SetCodePtr(free_near.from(), free_near.to()); + // Find some large free memory blocks and set code emitters to point at them. If we can't find + // free blocks, return std::nullopt instead, which will trigger a JIT cache clear. + const auto free_near_0 = m_free_ranges_near_0.by_size_begin(); + const auto free_near_1 = m_free_ranges_near_1.by_size_begin(); + const auto free_far_0 = m_free_ranges_far_0.by_size_begin(); + const auto free_far_1 = m_free_ranges_far_1.by_size_begin(); - auto free_far = m_free_ranges_far.by_size_begin(); - if (free_far == m_free_ranges_far.by_size_end()) - { - WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in far code region."); - return false; - } - m_far_code.SetCodePtr(free_far.from(), free_far.to()); + const size_t free_near_1_size = free_near_1.to() - free_near_1.from(); + const size_t free_far_1_size = free_far_1.to() - free_far_1.from(); + const size_t free_1_smallest_size = std::min(free_near_1_size, free_far_1_size); - return true; + if (free_1_smallest_size >= 1024 * 1024) + { + // Don't use region 0 unless region 1 is getting full. This improves cache friendliness. + SetCodePtr(free_near_1.from(), free_near_1.to()); + m_far_code.SetCodePtr(free_far_1.from(), free_far_1.to()); + return std::make_optional(1); + } + + const size_t free_near_0_size = free_near_0.to() - free_near_0.from(); + const size_t free_far_0_size = free_far_0.to() - free_far_0.from(); + const size_t free_0_smallest_size = std::min(free_near_0_size, free_far_0_size); + + if (free_0_smallest_size == 0 && free_1_smallest_size == 0) + { + if (free_near_0_size == 0 && free_near_1_size == 0) + WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in near code regions."); + else if (free_far_0_size == 0 && free_far_1_size == 0) + WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in far code regions."); + return std::nullopt; + } + + if (free_0_smallest_size > free_1_smallest_size) + { + SetCodePtr(free_near_0.from(), free_near_0.to()); + m_far_code.SetCodePtr(free_far_0.from(), free_far_0.to()); + return std::make_optional(0); + } + else + { + SetCodePtr(free_near_1.from(), free_near_1.to()); + m_far_code.SetCodePtr(free_far_1.from(), free_far_1.to()); + return std::make_optional(1); + } } bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index a572b2e38f..6ab99d3322 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -285,14 +286,16 @@ protected: void Trace(); // Finds a free memory region and sets the near and far code emitters to point at that region. - // Returns false if no free memory region can be found for either of the two. - bool SetEmitterStateToFreeCodeRegion(); + // On success, returns the index of the memory region (either 0 or 1). + // If either near code or far code is full, returns std::nullopt. + std::optional SetEmitterStateToFreeCodeRegion(); void DoDownCount(); void Cleanup(); void ResetStack(); - void ResetFreeMemoryRanges(); + void GenerateAsmAndResetFreeMemoryRanges(); + void ResetFreeMemoryRanges(size_t routines_near_size, size_t routines_far_size); void IntializeSpeculativeConstants(); @@ -372,6 +375,28 @@ protected: Arm64Gen::ARM64FloatEmitter m_float_emit; + // Because B instructions can't jump farther than +/- 128 MiB, code memory is allocated like this: + // + // m_far_code_0: x MiB of unused space, followed by 64 - x MiB of far code + // m_near_code_0: 64 MiB of near code + // m_near_code_1: x MiB of asm routines, followed by 64 - x MiB of near code + // m_far_code_1: 64 MiB of far code + // + // This ensures that: + // + // * Any code in m_near_code_0 can reach any code in m_far_code_0, and vice versa + // * Any code in m_near_code_1 can reach any code in m_far_code_1, and vice versa + // * Any near code can reach any near code + // * Any code can reach any asm routine + // + // m_far_code_0 and m_far_code_1 can't reach each other, but that isn't needed, because all blocks + // have their entry points in near code. + + Arm64Gen::ARM64CodeBlock m_near_code_0; + Arm64Gen::ARM64CodeBlock m_near_code_1; + Arm64Gen::ARM64CodeBlock m_far_code_0; + Arm64Gen::ARM64CodeBlock m_far_code_1; + Arm64Gen::ARM64CodeBlock m_far_code; bool m_in_far_code = false; @@ -380,6 +405,8 @@ protected: u8* m_near_code_end = nullptr; bool m_near_code_write_failed = false; - HyoutaUtilities::RangeSizeSet m_free_ranges_near; - HyoutaUtilities::RangeSizeSet m_free_ranges_far; + HyoutaUtilities::RangeSizeSet m_free_ranges_near_0; + HyoutaUtilities::RangeSizeSet m_free_ranges_near_1; + HyoutaUtilities::RangeSizeSet m_free_ranges_far_0; + HyoutaUtilities::RangeSizeSet m_free_ranges_far_1; };