From 789975e35092621988058e19e92f599781445e11 Mon Sep 17 00:00:00 2001 From: hthh Date: Fri, 17 Jun 2016 21:31:27 +1000 Subject: [PATCH] Jit: FIFO optimization improvements This introduces speculative constants, allowing FIFO writes to be optimized in more places. It also clarifies the guarantees of the FIFO optimization, changing the location of some of the checks and potentially avoiding redundant checks. --- .../Core/Core/PowerPC/CachedInterpreter.cpp | 2 +- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 51 +++++++++++++++++-- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 13 +++-- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 2 +- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 2 +- Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp | 2 +- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 9 ++-- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 4 +- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 2 +- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 3 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 2 +- Source/Core/Core/PowerPC/JitInterface.cpp | 3 ++ Source/Core/Core/PowerPC/JitInterface.h | 3 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 6 ++- Source/Core/Core/PowerPC/PPCAnalyst.h | 3 ++ 16 files changed, 81 insertions(+), 28 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter.cpp index 5a560d9d24..ddf3eaa59d 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter.cpp @@ -136,7 +136,7 @@ void CachedInterpreter::Jit(u32 address) js.blockStart = PC; js.firstFPInstructionFound = false; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.downcountAmount = 0; js.curBlock = b; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index ba5975ab24..dba9d22a05 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -349,7 +349,7 @@ bool 
Jit64::Cleanup() { bool did_something = false; - if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) { ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction((void*)&GPFifo::FastCheckGatherPipe); @@ -597,7 +597,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc js.firstFPInstructionFound = false; js.isLastInstruction = false; js.blockStart = em_address; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; js.curBlock = b; js.numLoadStoreInst = 0; @@ -690,6 +690,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc } } + if (js.noSpeculativeConstantsAddresses.find(js.blockStart) == + js.noSpeculativeConstantsAddresses.end()) + { + IntializeSpeculativeConstants(); + } + // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) { @@ -724,10 +730,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc js.fifoWriteAddresses.find(ops[i].address) != js.fifoWriteAddresses.end(); // Gather pipe writes using an immediate address are explicitly tracked. 
- if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo)) + if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo)) { - if (js.fifoBytesThisBlock >= 32) - js.fifoBytesThisBlock -= 32; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); @@ -967,3 +972,39 @@ void Jit64::EnableOptimization() analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); } + +void Jit64::IntializeSpeculativeConstants() +{ + // If the block depends on an input register which looks like a gather pipe or MMIO related + // constant, guess that it is actually a constant input, and specialize the block based on this + // assumption. This happens when there are branches in code writing to the gather pipe, but only + // the first block loads the constant. + // Insert a check at the start of the block to verify that the value is actually constant. + // This can save a lot of backpatching and optimize gather pipe writes in more places. 
+ const u8* target = nullptr; + for (auto i : code_block.m_gpr_inputs) + { + u32 compileTimeValue = PowerPC::ppcState.gpr[i]; + if (PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue) || + PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue - 0x8000) || + compileTimeValue == 0xCC000000) + { + if (!target) + { + SwitchToFarCode(); + target = GetCodePtr(); + MOV(32, PPCSTATE(pc), Imm32(js.blockStart)); + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunctionC( + reinterpret_cast<void*>(&JitInterface::CompileExceptionCheck), + static_cast<u32>(JitInterface::ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS)); + ABI_PopRegistersAndAdjustStack({}, 0); + JMP(asm_routines.dispatcher, true); + SwitchToNearCode(); + } + CMP(32, PPCSTATE(gpr[i]), Imm32(compileTimeValue)); + J_CC(CC_NZ, target); + gpr.SetImmediate32(i, compileTimeValue, false); + } + } +} diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 4fce2e91bf..9ff580d1a2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -68,6 +68,8 @@ public: BitSet32 CallerSavedRegistersInUse() const; BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const; + void IntializeSpeculativeConstants(); + JitBlockCache* GetBlockCache() override { return &blocks; } void Trace(); diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index bc5384d481..67baa2f485 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -227,10 +227,12 @@ void RegCache::DiscardRegContentsIfCached(size_t preg) } } -void GPRRegCache::SetImmediate32(size_t preg, u32 immValue) +void GPRRegCache::SetImmediate32(size_t preg, u32 immValue, bool dirty) { + // "dirty" can be false to avoid redundantly flushing an immediate when + // processing speculative constants.
DiscardRegContentsIfCached(preg); - regs[preg].away = true; + regs[preg].away |= dirty; regs[preg].location = Imm32(immValue); } @@ -282,10 +284,7 @@ void RegCache::KillImmediate(size_t preg, bool doLoad, bool makeDirty) void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) { - if (!regs[i].away && regs[i].location.IsImm()) - PanicAlert("Bad immediate"); - - if (!regs[i].away || (regs[i].away && regs[i].location.IsImm())) + if (!regs[i].away || regs[i].location.IsImm()) { X64Reg xr = GetFreeXReg(); if (xregs[xr].dirty) @@ -294,7 +293,7 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) PanicAlert("GetFreeXReg returned locked register"); xregs[xr].free = false; xregs[xr].ppcReg = i; - xregs[xr].dirty = makeDirty || regs[i].location.IsImm(); + xregs[xr].dirty = makeDirty || regs[i].away; if (doLoad) LoadRegister(i, xr); for (size_t j = 0; j < regs.size(); j++) diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 14ed139a42..f47e57e377 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -161,7 +161,7 @@ public: void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; Gen::OpArg GetDefaultLocation(size_t reg) const override; const Gen::X64Reg* GetAllocationOrder(size_t* count) override; - void SetImmediate32(size_t preg, u32 immValue); + void SetImmediate32(size_t preg, u32 immValue, bool dirty = true); BitSet32 GetRegUtilization() override; BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; }; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 2059587dd6..bab950f531 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -607,6 +607,6 @@ void Jit64::eieio(UGeckoInstruction inst) // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block, // which is 
generally safe. However postponing FIFO writes across eieio instructions // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection). - if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) js.mustCheckFifo = true; } diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index 03d8253493..ba6c00e36c 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -515,7 +515,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc { js.isLastInstruction = false; js.blockStart = em_address; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.curBlock = b; jit->js.numLoadStoreInst = 0; jit->js.numFloatingPointInst = 0; diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index e19343393a..a49b8486e0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -150,7 +150,7 @@ void JitArm64::Break(UGeckoInstruction inst) void JitArm64::Cleanup() { - if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) { gpr.Lock(W0); MOVI2R(X0, (u64)&GPFifo::FastCheckGatherPipe); @@ -404,7 +404,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB js.firstFPInstructionFound = false; js.assumeNoPairedQuantize = false; js.blockStart = em_address; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; js.downcountAmount = 0; js.skipInstructions = 0; @@ -492,10 +492,9 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB bool gatherPipeIntCheck = jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end(); - if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo)) + if (jo.optimizeGatherPipe 
&& (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo)) { - if (js.fifoBytesThisBlock >= 32) - js.fifoBytesThisBlock -= 32; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; gpr.Lock(W30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 34a8638024..00833287db 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -333,7 +333,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } ADD(W0, W0, accessSize >> 3); STR(INDEX_UNSIGNED, W0, X30, count_off); - js.fifoBytesThisBlock += accessSize >> 3; + js.fifoBytesSinceCheck += accessSize >> 3; if (accessSize != 8) gpr.Unlock(WA); @@ -862,6 +862,6 @@ void JitArm64::eieio(UGeckoInstruction inst) // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block, // which is generally safe. However postponing FIFO writes across eieio instructions // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection). 
- if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) js.mustCheckFifo = true; } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 256b724c72..f1ecdaba36 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -442,7 +442,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) ADD(W0, W0, accessSize >> 3); STR(INDEX_UNSIGNED, W0, X30, count_off); - js.fifoBytesThisBlock += accessSize >> 3; + js.fifoBytesSinceCheck += accessSize >> 3; if (update) { diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 489bb5ba38..611395e3b8 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -104,7 +104,7 @@ protected: u8* trampolineExceptionHandler; bool mustCheckFifo; - int fifoBytesThisBlock; + int fifoBytesSinceCheck; PPCAnalyst::BlockStats st; PPCAnalyst::BlockRegStats gpa; @@ -116,6 +116,7 @@ protected: std::unordered_set<u32> fifoWriteAddresses; std::unordered_set<u32> pairedQuantizeAddresses; + std::unordered_set<u32> noSpeculativeConstantsAddresses; }; PPCAnalyst::CodeBlock code_block; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index fb0998f284..f7806cf426 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -475,7 +475,7 @@ void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize) CALL(jit->GetAsmRoutines()->fifoDirectWrite64); break; } - jit->js.fifoBytesThisBlock += accessSize >> 3; + jit->js.fifoBytesSinceCheck += accessSize >> 3; } bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address, diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp
b/Source/Core/Core/PowerPC/JitInterface.cpp index 8ace00aa0e..8b81f07fb5 100644 --- a/Source/Core/Core/PowerPC/JitInterface.cpp +++ b/Source/Core/Core/PowerPC/JitInterface.cpp @@ -260,6 +260,9 @@ void CompileExceptionCheck(ExceptionType type) case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE: exception_addresses = &jit->js.pairedQuantizeAddresses; break; + case ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS: + exception_addresses = &jit->js.noSpeculativeConstantsAddresses; + break; } if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end())) diff --git a/Source/Core/Core/PowerPC/JitInterface.h b/Source/Core/Core/PowerPC/JitInterface.h index 4608f306a1..01fd6bf911 100644 --- a/Source/Core/Core/PowerPC/JitInterface.h +++ b/Source/Core/Core/PowerPC/JitInterface.h @@ -15,7 +15,8 @@ namespace JitInterface enum class ExceptionType { EXCEPTIONS_FIFO_WRITE, - EXCEPTIONS_PAIRED_QUANTIZE + EXCEPTIONS_PAIRED_QUANTIZE, + EXCEPTIONS_SPECULATIVE_CONSTANTS }; void DoState(PointerWrap& p); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 43a5f041fe..ea1493315f 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -849,10 +849,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32 } // Forward scan, for flags that need the other direction for calculation. 
- BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe; + BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe, gprDefined, gprBlockInputs; BitSet8 gqrUsed, gqrModified; for (u32 i = 0; i < block->m_num_instructions; i++) { + gprBlockInputs |= code[i].regsIn & ~gprDefined; + gprDefined |= code[i].regsOut; + code[i].fprIsSingle = fprIsSingle; code[i].fprIsDuplicated = fprIsDuplicated; code[i].fprIsStoreSafe = fprIsStoreSafe; @@ -905,6 +908,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32 } block->m_gqr_used = gqrUsed; block->m_gqr_modified = gqrModified; + block->m_gpr_inputs = gprBlockInputs; return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 2fddaa59f1..42625757b4 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -154,6 +154,9 @@ struct CodeBlock // Which GQRs this block modifies, if any. BitSet8 m_gqr_modified; + + // Which GPRs this block reads from before defining, if any. + BitSet32 m_gpr_inputs; }; class PPCAnalyzer