Merge pull request #8765 from AdmiralCurtiss/jit-reuse-memory

Jit64 codegen space reuse.
This commit is contained in:
JMC47
2020-09-09 08:16:09 -04:00
committed by GitHub
21 changed files with 1235 additions and 30 deletions

View File

@ -310,7 +310,7 @@ void ARM64XEmitter::SetCodePtrUnsafe(u8* ptr)
m_code = ptr;
}
void ARM64XEmitter::SetCodePtr(u8* ptr)
void ARM64XEmitter::SetCodePtr(u8* ptr, u8* end, bool write_failed)
{
SetCodePtrUnsafe(ptr);
m_lastCacheFlushEnd = ptr;

View File

@ -540,7 +540,11 @@ public:
}
virtual ~ARM64XEmitter() {}
void SetCodePtr(u8* ptr);
// 'end' and 'write_failed' are unused in the ARM code emitter at the moment.
// They're just here for interface compatibility with the x64 code emitter.
void SetCodePtr(u8* ptr, u8* end, bool write_failed = false);
void SetCodePtrUnsafe(u8* ptr);
void ReserveCodeSpace(u32 bytes);
u8* AlignCode16();

View File

@ -55,7 +55,7 @@ public:
region_size = size;
total_region_size = size;
region = static_cast<u8*>(Common::AllocateExecutableMemory(total_region_size));
T::SetCodePtr(region);
T::SetCodePtr(region, region + size);
}
// Always clear code space with breakpoints, so that if someone accidentally executes
@ -86,7 +86,7 @@ public:
// Cannot currently be undone. Will write protect the entire code region.
// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
void WriteProtect() { Common::WriteProtectMemory(region, region_size, true); }
void ResetCodePtr() { T::SetCodePtr(region); }
void ResetCodePtr() { T::SetCodePtr(region, region + region_size); }
size_t GetSpaceLeft() const
{
ASSERT(static_cast<size_t>(T::GetCodePtr() - region) < region_size);

View File

@ -101,9 +101,11 @@ enum class FloatOp
Invalid = -1,
};
void XEmitter::SetCodePtr(u8* ptr)
// Repositions the emitter and replaces its bounds and failure state in one call.
//   ptr:          address the next emitted byte will be written to.
//   end:          stored as m_code_end; the WriteN()/ReserveCodeSpace() helpers refuse any
//                 write that would extend past this address.
//   write_failed: value the m_write_failed flag is set to (the declaration defaults it to
//                 false, so a plain SetCodePtr(ptr, end) clears an earlier failure).
void XEmitter::SetCodePtr(u8* ptr, u8* end, bool write_failed)
{
code = ptr;
m_code_end = end;
m_write_failed = write_failed;
}
const u8* XEmitter::GetCodePtr() const
@ -116,31 +118,76 @@ u8* XEmitter::GetWritableCodePtr()
return code;
}
// Returns the current end-of-region bound (m_code_end) as a read-only pointer.
const u8* XEmitter::GetCodeEnd() const
{
return m_code_end;
}
// Returns the current end-of-region bound (m_code_end) as a mutable pointer, e.g. so that it
// can be handed back to SetCodePtr() later.
u8* XEmitter::GetWritableCodeEnd()
{
return m_code_end;
}
// Emits one byte at the current write position and steps past it.
// When the write position has already reached m_code_end nothing is written; instead the
// position is pinned to m_code_end and m_write_failed is raised.
void XEmitter::Write8(u8 value)
{
  const bool has_room = code < m_code_end;
  if (has_room)
  {
    *code = value;
    ++code;
    return;
  }

  // Out of space: refuse the write and remember that it happened.
  code = m_code_end;
  m_write_failed = true;
}
// Emits a 16-bit value at the current write position and steps past it.
// The value is copied with memcpy, so the destination does not need to be aligned.
// When the two bytes would not fit before m_code_end nothing is written; instead the
// position is pinned to m_code_end and m_write_failed is raised.
void XEmitter::Write16(u16 value)
{
  if (!(code + sizeof(u16) > m_code_end))
  {
    std::memcpy(code, &value, sizeof(u16));
    code += sizeof(u16);
    return;
  }

  // Out of space: refuse the write and remember that it happened.
  code = m_code_end;
  m_write_failed = true;
}
// Emits a 32-bit value at the current write position and steps past it.
// The value is copied with memcpy, so the destination does not need to be aligned.
// When the four bytes would not fit before m_code_end nothing is written; instead the
// position is pinned to m_code_end and m_write_failed is raised.
void XEmitter::Write32(u32 value)
{
  if (!(code + sizeof(u32) > m_code_end))
  {
    std::memcpy(code, &value, sizeof(u32));
    code += sizeof(u32);
    return;
  }

  // Out of space: refuse the write and remember that it happened.
  code = m_code_end;
  m_write_failed = true;
}
// Emits a 64-bit value at the current write position and steps past it.
// The value is copied with memcpy, so the destination does not need to be aligned.
// When the eight bytes would not fit before m_code_end nothing is written; instead the
// position is pinned to m_code_end and m_write_failed is raised.
void XEmitter::Write64(u64 value)
{
  if (!(code + sizeof(u64) > m_code_end))
  {
    std::memcpy(code, &value, sizeof(u64));
    code += sizeof(u64);
    return;
  }

  // Out of space: refuse the write and remember that it happened.
  code = m_code_end;
  m_write_failed = true;
}
// Fills the next `bytes` bytes with 0xCC filler and advances the write position past them.
// When the requested span would extend past m_code_end nothing is written; instead the
// position is pinned to m_code_end and m_write_failed is raised.
void XEmitter::ReserveCodeSpace(int bytes)
{
  const bool fits = !(code + bytes > m_code_end);
  if (!fits)
  {
    code = m_code_end;
    m_write_failed = true;
    return;
  }

  // Counting down leaves a non-positive request as a no-op.
  for (int remaining = bytes; remaining > 0; --remaining)
    *code++ = 0xCC;
}
@ -454,6 +501,13 @@ FixupBranch XEmitter::CALL()
branch.ptr = code + 5;
Write8(0xE8);
Write32(0);
// If we couldn't write the full call instruction, indicate that in the returned FixupBranch by
// setting the branch's address to null. This will prevent a later SetJumpTarget() from writing to
// invalid memory.
if (HasWriteFailed())
branch.ptr = nullptr;
return branch;
}
@ -473,6 +527,13 @@ FixupBranch XEmitter::J(bool force5bytes)
Write8(0xE9);
Write32(0);
}
// If we couldn't write the full jump instruction, indicate that in the returned FixupBranch by
// setting the branch's address to null. This will prevent a later SetJumpTarget() from writing to
// invalid memory.
if (HasWriteFailed())
branch.ptr = nullptr;
return branch;
}
@ -493,6 +554,13 @@ FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes)
Write8(0x80 + conditionCode);
Write32(0);
}
// If we couldn't write the full jump instruction, indicate that in the returned FixupBranch by
// setting the branch's address to null. This will prevent a later SetJumpTarget() from writing to
// invalid memory.
if (HasWriteFailed())
branch.ptr = nullptr;
return branch;
}
@ -518,6 +586,9 @@ void XEmitter::J_CC(CCFlags conditionCode, const u8* addr)
void XEmitter::SetJumpTarget(const FixupBranch& branch)
{
if (!branch.ptr)
return;
if (branch.type == FixupBranch::Type::Branch8Bit)
{
s64 distance = (s64)(code - branch.ptr);

View File

@ -329,9 +329,19 @@ class XEmitter
{
friend struct OpArg; // for Write8 etc
private:
// Pointer to memory where code will be emitted to.
u8* code = nullptr;
// Pointer past the end of the memory region we're allowed to emit to.
// Writes that would reach this memory are refused and will set the m_write_failed flag instead.
u8* m_code_end = nullptr;
bool flags_locked = false;
// Set to true when a write request happens that would write past m_code_end.
// Must be cleared with SetCodePtr() afterwards.
bool m_write_failed = false;
void CheckFlags();
void Rex(int w, int r, int x, int b);
@ -378,9 +388,9 @@ protected:
public:
XEmitter() = default;
explicit XEmitter(u8* code_ptr) : code{code_ptr} {}
explicit XEmitter(u8* code_ptr, u8* code_end) : code(code_ptr), m_code_end(code_end) {}
virtual ~XEmitter() = default;
void SetCodePtr(u8* ptr);
void SetCodePtr(u8* ptr, u8* end, bool write_failed = false);
void ReserveCodeSpace(int bytes);
u8* AlignCodeTo(size_t alignment);
u8* AlignCode4();
@ -388,9 +398,16 @@ public:
u8* AlignCodePage();
const u8* GetCodePtr() const;
u8* GetWritableCodePtr();
const u8* GetCodeEnd() const;
u8* GetWritableCodeEnd();
void LockFlags() { flags_locked = true; }
void UnlockFlags() { flags_locked = false; }
// Should be checked after a block of code has been generated to see if the code has been
// successfully written to memory. Do not call the generated code when this returns true!
bool HasWriteFailed() const { return m_write_failed; }
// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
// string instr.,

View File

@ -555,6 +555,7 @@ PUBLIC
inputcommon
${MBEDTLS_LIBRARIES}
pugixml
RangeSet::RangeSet
sfml-network
sfml-system
videonull

View File

@ -281,7 +281,7 @@ bool Jit64::BackPatch(u32 emAddress, SContext* ctx)
u8* start = info.start;
// Patch the original memory operation.
XEmitter emitter(start);
XEmitter emitter(start, start + info.len);
emitter.JMP(trampoline, true);
// NOPs become dead code
const u8* end = info.start + info.len;
@ -351,6 +351,7 @@ void Jit64::Init()
AddChildCodeSpace(&trampolines, trampolines_size);
AddChildCodeSpace(&m_far_code, farcode_size);
m_const_pool.Init(AllocChildCodeSpace(constpool_size), constpool_size);
ResetCodePtr();
// BLR optimization has the same consequences as block linking, as well as
// depending on the fault handler to be safe in the event of excessive BL.
@ -375,17 +376,30 @@ void Jit64::Init()
code_block.m_gpa = &js.gpa;
code_block.m_fpa = &js.fpa;
EnableOptimization();
ResetFreeMemoryRanges();
}
// Throws away all generated code and the bookkeeping that refers to it, then starts over
// with completely free near and far code regions.
void Jit64::ClearCache()
{
blocks.Clear();
// Ranges queued for freeing are discarded here; the free-range sets are rebuilt from scratch
// by ResetFreeMemoryRanges() below.
blocks.ClearRangesToFree();
trampolines.ClearCodeSpace();
m_far_code.ClearCodeSpace();
m_const_pool.Clear();
ClearCodeSpace();
Clear();
UpdateMemoryOptions();
// Must run after the code spaces above have been reset, since it reads the far code
// emitter's current pointer and end to define the free far range.
ResetFreeMemoryRanges();
}
void Jit64::ResetFreeMemoryRanges()
{
// Set the entire near and far code regions as unused.
m_free_ranges_near.clear();
m_free_ranges_near.insert(region, region + region_size);
m_free_ranges_far.clear();
m_free_ranges_far.insert(m_far_code.GetWritableCodePtr(), m_far_code.GetWritableCodeEnd());
}
void Jit64::Shutdown()
@ -720,6 +734,11 @@ void Jit64::Trace()
}
// Entry point for compiling the block at em_address. Forwards to the two-argument overload
// with clear_cache_and_retry_on_failure set, so that running out of code space leads to a
// cache clear followed by a single retry.
void Jit64::Jit(u32 em_address)
{
Jit(em_address, true);
}
void Jit64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
{
if (m_cleanup_after_stackfault)
{
@ -731,18 +750,23 @@ void Jit64::Jit(u32 em_address)
#endif
}
if (IsAlmostFull() || m_far_code.IsAlmostFull() || trampolines.IsAlmostFull() ||
SConfig::GetInstance().bJITNoBlockCache)
if (trampolines.IsAlmostFull() || SConfig::GetInstance().bJITNoBlockCache)
{
if (!SConfig::GetInstance().bJITNoBlockCache)
{
const auto reason =
IsAlmostFull() ? "main" : m_far_code.IsAlmostFull() ? "far" : "trampoline";
WARN_LOG(POWERPC, "flushing %s code cache, please report if this happens a lot", reason);
WARN_LOG(POWERPC, "flushing trampoline code cache, please report if this happens a lot");
}
ClearCache();
}
// Check if any code blocks have been freed in the block cache and transfer this information to
// the local rangesets to allow overwriting them with new code.
for (auto range : blocks.GetRangesToFreeNear())
m_free_ranges_near.insert(range.first, range.second);
for (auto range : blocks.GetRangesToFreeFar())
m_free_ranges_far.insert(range.first, range.second);
blocks.ClearRangesToFree();
std::size_t block_size = m_code_buffer.size();
if (SConfig::GetInstance().bEnableDebugging)
@ -785,12 +809,75 @@ void Jit64::Jit(u32 em_address)
return;
}
JitBlock* b = blocks.AllocateBlock(em_address);
DoJit(em_address, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
if (SetEmitterStateToFreeCodeRegion())
{
u8* near_start = GetWritableCodePtr();
u8* far_start = m_far_code.GetWritableCodePtr();
JitBlock* b = blocks.AllocateBlock(em_address);
if (DoJit(em_address, b, nextPC))
{
// Code generation succeeded.
// Mark the memory regions that this code block uses as used in the local rangesets.
u8* near_end = GetWritableCodePtr();
if (near_start != near_end)
m_free_ranges_near.erase(near_start, near_end);
u8* far_end = m_far_code.GetWritableCodePtr();
if (far_start != far_end)
m_free_ranges_far.erase(far_start, far_end);
// Store the used memory regions in the block so we know what to mark as unused when the
// block gets invalidated.
b->near_begin = near_start;
b->near_end = near_end;
b->far_begin = far_start;
b->far_end = far_end;
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
return;
}
}
if (clear_cache_and_retry_on_failure)
{
// Code generation failed due to not enough free space in either the near or far code regions.
// Clear the entire JIT cache and retry.
WARN_LOG(POWERPC, "flushing code caches, please report if this happens a lot");
ClearCache();
Jit(em_address, false);
return;
}
PanicAlertT("JIT failed to find code space after a cache clear. This should never happen. Please "
"report this incident on the bug tracker. Dolphin will now exit.");
exit(-1);
}
u8* Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
// Points the near and far code emitters at the first entry of their respective by-size
// free-range listings (intended to be the largest free block in each region).
// Returns false, after logging a warning, as soon as one of the two regions has no free range
// left; the caller reacts to that by clearing the JIT cache. Note that the near emitter has
// already been repositioned when the failure is in the far region.
bool Jit64::SetEmitterStateToFreeCodeRegion()
{
  auto near_candidate = m_free_ranges_near.by_size_begin();
  if (near_candidate != m_free_ranges_near.by_size_end())
  {
    SetCodePtr(near_candidate.from(), near_candidate.to());

    auto far_candidate = m_free_ranges_far.by_size_begin();
    if (far_candidate != m_free_ranges_far.by_size_end())
    {
      m_far_code.SetCodePtr(far_candidate.from(), far_candidate.to());
      return true;
    }

    WARN_LOG(POWERPC, "Failed to find free memory region in far code region.");
    return false;
  }

  WARN_LOG(POWERPC, "Failed to find free memory region in near code region.");
  return false;
}
bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
{
js.firstFPInstructionFound = false;
js.isLastInstruction = false;
@ -1091,6 +1178,16 @@ u8* Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
WriteExit(nextPC);
}
if (HasWriteFailed() || m_far_code.HasWriteFailed())
{
if (HasWriteFailed())
WARN_LOG(POWERPC, "JIT ran out of space in near code region during code generation.");
if (m_far_code.HasWriteFailed())
WARN_LOG(POWERPC, "JIT ran out of space in far code region during code generation.");
return false;
}
b->codeSize = (u32)(GetCodePtr() - start);
b->originalSize = code_block.m_num_instructions;
@ -1098,7 +1195,7 @@ u8* Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
LogGeneratedX86(code_block.m_num_instructions, m_code_buffer, start, b);
#endif
return start;
return true;
}
BitSet8 Jit64::ComputeStaticGQRs(const PPCAnalyst::CodeBlock& cb) const

View File

@ -18,6 +18,8 @@
// ----------
#pragma once
#include <rangeset/rangesizeset.h>
#include "Common/CommonTypes.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
@ -56,7 +58,12 @@ public:
// Jit!
void Jit(u32 em_address) override;
u8* DoJit(u32 em_address, JitBlock* b, u32 nextPC);
void Jit(u32 em_address, bool clear_cache_and_retry_on_failure);
bool DoJit(u32 em_address, JitBlock* b, u32 nextPC);
// Finds a free memory region and sets the near and far code emitters to point at that region.
// Returns false if no free memory region can be found for either of the two.
bool SetEmitterStateToFreeCodeRegion();
BitSet32 CallerSavedRegistersInUse() const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
@ -243,6 +250,8 @@ private:
void AllocStack();
void FreeStack();
void ResetFreeMemoryRanges();
JitBlockCache blocks{*this};
TrampolineCache trampolines{*this};
@ -254,6 +263,9 @@ private:
bool m_enable_blr_optimization;
bool m_cleanup_after_stackfault;
u8* m_stack;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far;
};
void LogGeneratedX86(size_t size, const PPCAnalyst::CodeBuffer& code_buffer, const u8* normalEntry,

View File

@ -21,9 +21,9 @@ void JitBlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const JitBl
u8* location = source.exitPtrs;
const u8* address = dest ? dest->checkedEntry : dispatcher;
Gen::XEmitter emit(location);
if (source.call)
{
Gen::XEmitter emit(location, location + 5);
emit.CALL(address);
}
else
@ -31,19 +31,57 @@ void JitBlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const JitBl
// If we're going to link with the next block, there is no need
// to emit JMP. So just NOP out the gap to the next block.
// Support up to 3 additional bytes because of alignment.
s64 offset = address - emit.GetCodePtr();
s64 offset = address - location;
if (offset > 0 && offset <= 5 + 3)
{
Gen::XEmitter emit(location, location + offset);
emit.NOP(offset);
}
else
{
Gen::XEmitter emit(location, location + 5);
emit.JMP(address, true);
}
}
}
void JitBlockCache::WriteDestroyBlock(const JitBlock& block)
{
// Only clear the entry points as we might still be within this block.
Gen::XEmitter emit(block.checkedEntry);
Gen::XEmitter emit(block.checkedEntry, block.checkedEntry + 1);
emit.INT3();
Gen::XEmitter emit2(block.normalEntry);
Gen::XEmitter emit2(block.normalEntry, block.normalEntry + 1);
emit2.INT3();
}
// Runs the base block cache initialization and then empties both lists of code ranges that
// are waiting to be freed, so no stale ranges carry over.
void JitBlockCache::Init()
{
JitBaseBlockCache::Init();
ClearRangesToFree();
}
// Destroys the block through the base implementation and then queues the near and far code
// ranges it occupied, so they can be handed back as free space later.
// Empty ranges (begin == end) are not queued.
void JitBlockCache::DestroyBlock(JitBlock& block)
{
  JitBaseBlockCache::DestroyBlock(block);

  const bool occupies_near = block.near_begin != block.near_end;
  const bool occupies_far = block.far_begin != block.far_end;

  if (occupies_near)
    m_ranges_to_free_on_next_codegen_near.emplace_back(block.near_begin, block.near_end);

  if (occupies_far)
    m_ranges_to_free_on_next_codegen_far.emplace_back(block.far_begin, block.far_end);
}
// Returns the (begin, end) near-code ranges queued by DestroyBlock() since the list was last
// cleared. The reference stays tied to this cache's internal list.
const std::vector<std::pair<u8*, u8*>>& JitBlockCache::GetRangesToFreeNear() const
{
return m_ranges_to_free_on_next_codegen_near;
}
// Returns the (begin, end) far-code ranges queued by DestroyBlock() since the list was last
// cleared. The reference stays tied to this cache's internal list.
const std::vector<std::pair<u8*, u8*>>& JitBlockCache::GetRangesToFreeFar() const
{
return m_ranges_to_free_on_next_codegen_far;
}
// Empties both the near and far lists of ranges waiting to be freed. Call this once the
// queued ranges have been consumed (or are no longer relevant).
void JitBlockCache::ClearRangesToFree()
{
m_ranges_to_free_on_next_codegen_near.clear();
m_ranges_to_free_on_next_codegen_far.clear();
}

View File

@ -4,6 +4,8 @@
#pragma once
#include <vector>
#include "Core/PowerPC/JitCommon/JitCache.h"
class JitBase;
@ -13,7 +15,19 @@ class JitBlockCache : public JitBaseBlockCache
public:
explicit JitBlockCache(JitBase& jit);
void Init() override;
void DestroyBlock(JitBlock& block) override;
const std::vector<std::pair<u8*, u8*>>& GetRangesToFreeNear() const;
const std::vector<std::pair<u8*, u8*>>& GetRangesToFreeFar() const;
void ClearRangesToFree();
private:
void WriteLinkBlock(const JitBlock::LinkData& source, const JitBlock* dest) override;
void WriteDestroyBlock(const JitBlock& block) override;
std::vector<std::pair<u8*, u8*>> m_ranges_to_free_on_next_codegen_near;
std::vector<std::pair<u8*, u8*>> m_ranges_to_free_on_next_codegen_far;
};

View File

@ -80,13 +80,16 @@ void EmuCodeBlock::MemoryExceptionCheck()
void EmuCodeBlock::SwitchToFarCode()
{
m_near_code = GetWritableCodePtr();
SetCodePtr(m_far_code.GetWritableCodePtr());
m_near_code_end = GetWritableCodeEnd();
m_near_code_write_failed = HasWriteFailed();
SetCodePtr(m_far_code.GetWritableCodePtr(), m_far_code.GetWritableCodeEnd(),
m_far_code.HasWriteFailed());
}
void EmuCodeBlock::SwitchToNearCode()
{
m_far_code.SetCodePtr(GetWritableCodePtr());
SetCodePtr(m_near_code);
m_far_code.SetCodePtr(GetWritableCodePtr(), GetWritableCodeEnd(), HasWriteFailed());
SetCodePtr(m_near_code, m_near_code_end, m_near_code_write_failed);
}
FixupBranch EmuCodeBlock::CheckIfSafeAddress(const OpArg& reg_value, X64Reg reg_addr,

View File

@ -131,7 +131,11 @@ protected:
Jit64& m_jit;
ConstantPool m_const_pool;
FarCodeCache m_far_code;
u8* m_near_code; // Backed up when we switch to far code.
// Backed up when we switch to far code.
u8* m_near_code;
u8* m_near_code_end;
bool m_near_code_write_failed;
std::unordered_map<u8*, TrampolineInfo> m_back_patch_info;
std::unordered_map<u8*, u8*> m_exception_handler_at_loc;

View File

@ -22,6 +22,12 @@ class JitBase;
// so this struct needs to have a standard layout.
struct JitBlockData
{
// Memory range this code block takes up in near and far code caches.
u8* near_begin;
u8* near_end;
u8* far_begin;
u8* far_end;
// A special entry point for block linking; usually used to check the
// downcount.
u8* checkedEntry;
@ -130,7 +136,7 @@ public:
explicit JitBaseBlockCache(JitBase& jit);
virtual ~JitBaseBlockCache();
void Init();
virtual void Init();
void Shutdown();
void Clear();
void Reset();
@ -159,6 +165,8 @@ public:
u32* GetBlockBitSet() const;
protected:
virtual void DestroyBlock(JitBlock& block);
JitBase& m_jit;
private:
@ -168,7 +176,6 @@ private:
void LinkBlockExits(JitBlock& block);
void LinkBlock(JitBlock& block);
void UnlinkBlock(const JitBlock& block);
void DestroyBlock(JitBlock& block);
JitBlock* MoveBlockIntoFastCache(u32 em_address, u32 msr);

View File

@ -93,6 +93,7 @@ protected:
emitter.reset(new X64CodeBlock());
emitter->AllocCodeSpace(4096);
code_buffer = emitter->GetWritableCodePtr();
code_buffer_end = emitter->GetWritableCodeEnd();
disasm.reset(new disassembler);
disasm->set_syntax_intel();
@ -158,12 +159,13 @@ protected:
EXPECT_EQ(expected_norm, disasmed_norm);
// Reset code buffer afterwards.
emitter->SetCodePtr(code_buffer);
emitter->SetCodePtr(code_buffer, code_buffer_end);
}
std::unique_ptr<X64CodeBlock> emitter;
std::unique_ptr<disassembler> disasm;
u8* code_buffer;
u8* code_buffer_end;
};
#define TEST_INSTR_NO_OPERANDS(Name, ExpectedDisasm) \

View File

@ -41,6 +41,7 @@
<AdditionalIncludeDirectories>$(ExternalsDir)OpenAL\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(ExternalsDir)picojson;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(ExternalsDir)pugixml;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(ExternalsDir)rangeset\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(ExternalsDir)SFML\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(ExternalsDir)soundtouch;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(ExternalsDir)Vulkan\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>