From 55db7c7a05692518919705359c4e506c12a42285 Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 28 Jul 2018 01:52:22 +0200 Subject: [PATCH 1/4] Jit64: Optimized idle skipping detection. --- .../Interpreter/Interpreter_Tables.cpp | 4 +- Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp | 35 +++---- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 15 ++- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 35 ------- .../Core/PowerPC/JitArm64/JitArm64_Branch.cpp | 35 +++---- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 31 ------ Source/Core/Core/PowerPC/PPCAnalyst.cpp | 96 +++++++++++++++++-- Source/Core/Core/PowerPC/PPCAnalyst.h | 6 +- 8 files changed, 147 insertions(+), 110 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 1ff7a60c8b..c10de1092c 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -29,8 +29,8 @@ static std::array primarytable = {59, Interpreter::RunTable59, {"RunTable59", OpType::Subtable, 0, 0, 0, 0, 0}}, {63, Interpreter::RunTable63, {"RunTable63", OpType::Subtable, 0, 0, 0, 0, 0}}, - {16, Interpreter::bcx, {"bcx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}}, - {18, Interpreter::bx, {"bx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}}, + {16, Interpreter::bcx, {"bcx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}}, + {18, Interpreter::bx, {"bx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}}, {3, Interpreter::twi, {"twi", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}}, {17, Interpreter::sc, {"sc", OpType::System, FL_ENDBLOCK, 2, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp index 7e0ded66c8..3e6ef5e255 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp @@ -87,25 +87,22 @@ void Jit64::bx(UGeckoInstruction inst) gpr.Flush(); fpr.Flush(); - u32 destination; - if (inst.AA) - destination = SignExt26(inst.LI << 2); - else - destination = js.compilerPC + SignExt26(inst.LI << 2); #ifdef ACID_TEST if (inst.LK) AND(32, PPCSTATE(cr), Imm32(~(0xFF000000))); #endif - if (destination == js.compilerPC) + if (js.op->branchIsIdleLoop) { ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(CoreTiming::Idle); ABI_PopRegistersAndAdjustStack({}, 0); - MOV(32, PPCSTATE(pc), Imm32(destination)); + MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo)); WriteExceptionExit(); - return; } - WriteExit(destination, inst.LK, js.compilerPC + 4); + else + { + WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4); + } } // TODO - optimize to hell and beyond @@ -154,18 +151,24 @@ void Jit64::bcx(UGeckoInstruction inst) return; } - u32 destination; - if (inst.AA) - destination = SignExt16(inst.BD << 2); - else - destination = js.compilerPC + SignExt16(inst.BD << 2); - { RCForkGuard gpr_guard = gpr.Fork(); RCForkGuard fpr_guard = fpr.Fork(); gpr.Flush(); fpr.Flush(); - WriteExit(destination, inst.LK, js.compilerPC + 4); + + if (js.op->branchIsIdleLoop) + { + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunction(CoreTiming::Idle); + ABI_PopRegistersAndAdjustStack({}, 0); + MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo)); + WriteExceptionExit(); + } + else + { + WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4); + } } if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index a4ae501936..057d039b8d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -12,6 +12,7 @@ #include "Common/CommonTypes.h" #include "Common/MathUtil.h" #include "Common/x64Emitter.h" +#include "Core/CoreTiming.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/RegCache/JitRegCache.h" #include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h" @@ -361,7 +362,19 @@ void Jit64::DoMergedBranch() // Code that handles successful PPC branching. const UGeckoInstruction& next = js.op[1].inst; const u32 nextPC = js.op[1].address; - if (next.OPCD == 16) // bcx + + if (js.op[1].branchIsIdleLoop) + { + if (next.LK) + MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4)); + + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunction(CoreTiming::Idle); + ABI_PopRegistersAndAdjustStack({}, 0); + MOV(32, PPCSTATE(pc), Imm32(js.op[1].branchTo)); + WriteExceptionExit(); + } + else if (next.OPCD == 16) // bcx { if (next.LK) MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4)); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 3bab9436dd..a1cfe3976b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -119,41 +119,6 @@ void Jit64::lXXx(UGeckoInstruction inst) signExtend = true; } - if (!CPU::IsStepping() && inst.OPCD == 32 && CanMergeNextInstructions(2) && - (inst.hex & 0xFFFF0000) == 0x800D0000 && - (js.op[1].inst.hex == 0x28000000 || - (SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) && - js.op[2].inst.hex == 0x4182fff8) - { - s32 offset = (s32)(s16)inst.SIMM_16; - RCX64Reg Ra = gpr.Bind(a, RCMode::Read); - RCX64Reg Rd = gpr.Bind(d, RCMode::Write); - RegCache::Realize(Ra, Rd); - - SafeLoadToReg(Rd, Ra, accessSize, offset, CallerSavedRegistersInUse(), signExtend); - - // if it's still 0, we can wait until the next event - TEST(32, Rd, Rd); - FixupBranch noIdle = J_CC(CC_NZ); - - BitSet32 registersInUse = CallerSavedRegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, 0); - - ABI_CallFunction(CoreTiming::Idle); - - ABI_PopRegistersAndAdjustStack(registersInUse, 0); - - // ! we must continue executing of the loop after exception handling, maybe there is still 0 in - // r0 - // MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); - WriteExceptionExit(); - - SetJumpTarget(noIdle); - - // js.compilerPC += 8; - return; - } - // Determine whether this instruction updates inst.RA bool update; if (inst.OPCD == 31) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp index 9ac99abf67..08fc9c8aa4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp @@ -76,12 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITBranchOff); - u32 destination; - if (inst.AA) - destination = SignExt26(inst.LI << 2); - else - destination = js.compilerPC + SignExt26(inst.LI << 2); - if (inst.LK) { ARM64Reg WA = gpr.GetReg(); @@ -105,7 +99,7 @@ void JitArm64::bx(UGeckoInstruction inst) gpr.Flush(FlushMode::FLUSH_ALL); fpr.Flush(FlushMode::FLUSH_ALL); - if (destination == js.compilerPC) + if (js.op->branchIsIdleLoop) { // make idle loops go faster ARM64Reg WA = gpr.GetReg(); @@ -115,11 +109,11 @@ void JitArm64::bx(UGeckoInstruction inst) BLR(XA); gpr.Unlock(WA); - WriteExceptionExit(js.compilerPC); + WriteExceptionExit(js.op->branchTo); return; } - WriteExit(destination, inst.LK, js.compilerPC + 4); + WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4); } void JitArm64::bcx(UGeckoInstruction inst) @@ -160,16 +154,25 @@ void JitArm64::bcx(UGeckoInstruction inst) } gpr.Unlock(WA); - u32 destination; - if (inst.AA) - destination = SignExt16(inst.BD << 2); - else - destination = js.compilerPC + SignExt16(inst.BD << 2); - gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE); fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE); - WriteExit(destination, inst.LK, js.compilerPC + 4); + if (js.op->branchIsIdleLoop) + { + // make idle loops go faster + ARM64Reg WA = gpr.GetReg(); + ARM64Reg XA = EncodeRegTo64(WA); + + MOVP2R(XA, &CoreTiming::Idle); + BLR(XA); + gpr.Unlock(WA); + + WriteExceptionExit(js.op->branchTo); + } + else + { + WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4); + } SwitchToNearCode(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index f0e5397f1c..7daad1ef91 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -346,37 +346,6 @@ void JitArm64::lXX(UGeckoInstruction inst) } SafeLoadToReg(d, update ? a : (a ? a : -1), offsetReg, flags, offset, update); - - // LWZ idle skipping - if (inst.OPCD == 32 && CanMergeNextInstructions(2) && - (inst.hex & 0xFFFF0000) == 0x800D0000 && // lwz r0, XXXX(r13) - (js.op[1].inst.hex == 0x28000000 || - (SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) && // cmpXwi r0,0 - js.op[2].inst.hex == 0x4182fff8) // beq -8 - { - ARM64Reg WA = gpr.GetReg(); - ARM64Reg XA = EncodeRegTo64(WA); - - // if it's still 0, we can wait until the next event - FixupBranch noIdle = CBNZ(gpr.R(d)); - - FixupBranch far = B(); - SwitchToFarCode(); - SetJumpTarget(far); - - gpr.Flush(FLUSH_MAINTAIN_STATE); - fpr.Flush(FLUSH_MAINTAIN_STATE); - - MOVP2R(XA, &CoreTiming::Idle); - BLR(XA); - gpr.Unlock(WA); - - WriteExceptionExit(js.compilerPC); - - SwitchToNearCode(); - - SetJumpTarget(noIdle); - } } void JitArm64::stX(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 89c57b9bcb..3866b2e672 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -640,6 +640,90 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, const Gekk code->outputCR0 = true; code->outputCR1 = true; } + + code->branchUsesCtr = false; + code->branchTo = UINT32_MAX; + + // For branch with immediate addresses (bx/bcx), compute the destination. + if (code->inst.OPCD == 18) // bx + { + if (code->inst.AA) // absolute + code->branchTo = SignExt26(code->inst.LI << 2); + else + code->branchTo = code->address + SignExt26(code->inst.LI << 2); + } + else if (code->inst.OPCD == 16) // bcx + { + if (code->inst.AA) // absolute + code->branchTo = SignExt16(code->inst.BD << 2); + else + code->branchTo = code->address + SignExt16(code->inst.BD << 2); + if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG)) + code->branchUsesCtr = true; + } + else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 16) // bclrx + { + if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG)) + code->branchUsesCtr = true; + } + else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 528) // bcctrx + { + if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG)) + code->branchUsesCtr = true; + } +} + +bool PPCAnalyzer::IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions) +{ + // Very basic algorithm to detect busy wait loops: + // * It loops to itself and does not contain any other branches. + // * It does not write to memory. + // * It only reads from registers it wrote to earlier in the loop, or it + // does not write to these registers. + // + // Would benefit a lot from basic inlining support - a lot of the most + // used busy loops are DSP register interactions, which are bl/cmp/bne + // (with the bl target a pure function that follows the above rules). We + // don't detect these at the moment. + std::bitset<32> write_disallowed_regs; + std::bitset<32> written_regs; + for (size_t i = 0; i <= instructions; ++i) + { + if (code[i].opinfo->type == OpType::Branch) + { + if (code[i].branchUsesCtr) + return false; + if (code[i].branchTo == block->m_address && i == instructions) + return true; + } + else if (code[i].opinfo->type != OpType::Integer && code[i].opinfo->type != OpType::Load) + { + // In the future, some subsets of other instruction types might get + // supported. Right now, only try loops that have this very + // restricted instruction set. + return false; + } + else + { + for (int reg : code[i].regsIn) + { + if (reg == -1) + continue; + if (written_regs[reg]) + continue; + write_disallowed_regs[reg] = true; + } + for (int reg : code[i].regsOut) + { + if (reg == -1) + continue; + if (write_disallowed_regs[reg]) + return false; + written_regs[reg] = true; + } + } + } + return false; } u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std::size_t block_size) @@ -692,16 +776,16 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: code[i].opinfo = opinfo; code[i].address = address; code[i].inst = inst; - code[i].branchTo = UINT32_MAX; - code[i].branchToIndex = UINT32_MAX; code[i].skip = false; block->m_stats->numCycles += opinfo->numCycles; block->m_physical_addresses.insert(result.physical_address); SetInstructionStats(block, &code[i], opinfo, static_cast(i)); + code[i].branchIsIdleLoop = + code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i); + bool follow = false; - u32 destination = 0; bool conditional_continue = false; @@ -715,7 +799,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: { // Always follow BX instructions. follow = true; - destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address); if (inst.LK) { found_call = true; @@ -727,7 +810,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: { // Always follow unconditional BCX instructions, but they are very rare. follow = true; - destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address); if (inst.LK) { found_call = true; @@ -744,7 +826,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: // the LR value on the stack as there are no spare registers. So we'd need // to check all store instruction to not alias with the stack. follow = true; - destination = code[caller].address + 4; + code[i].branchTo = code[caller].address + 4; found_call = false; code[i].skip = true; @@ -796,7 +878,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: { // Follow the unconditional branch. numFollows++; - address = destination; + address = code[i].branchTo; } else { diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 18fa3a4fb2..b74d1d0167 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -27,13 +27,14 @@ struct CodeOp // 16B UGeckoInstruction inst; GekkoOPInfo* opinfo; u32 address; - u32 branchTo; // if 0, not a branch - int branchToIndex; // index of target block + u32 branchTo; // if UINT32_MAX, not a branch BitSet32 regsOut; BitSet32 regsIn; BitSet32 fregsIn; s8 fregOut; bool isBranchTarget; + bool branchUsesCtr; + bool branchIsIdleLoop; bool wantsCR0; bool wantsCR1; bool wantsFPRF; @@ -213,6 +214,7 @@ private: void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type); void ReorderInstructions(u32 instructions, CodeOp* code); void SetInstructionStats(CodeBlock* block, CodeOp* code, const GekkoOPInfo* opinfo, u32 index); + bool IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions); // Options u32 m_options = 0; From b8b4b4a3835a54b562499fe077a7fd0d291b7ffc Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 9 Aug 2018 09:40:12 +0200 Subject: [PATCH 2/4] PowerPC: More idle loop detections. --- Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp | 14 ++++++- .../Core/PowerPC/JitArm64/JitArm64_Branch.cpp | 15 ++++++- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 39 ++++++++++--------- 3 files changed, 48 insertions(+), 20 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp index 3e6ef5e255..2ebb8e4175 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp @@ -285,7 +285,19 @@ void Jit64::bclrx(UGeckoInstruction inst) RCForkGuard fpr_guard = fpr.Fork(); gpr.Flush(); fpr.Flush(); - WriteBLRExit(); + + if (js.op->branchIsIdleLoop) + { + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunction(CoreTiming::Idle); + ABI_PopRegistersAndAdjustStack({}, 0); + MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo)); + WriteExceptionExit(); + } + else + { + WriteBLRExit(); + } } if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp index 08fc9c8aa4..97a81df2d4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Branch.cpp @@ -278,7 +278,20 @@ void JitArm64::bclrx(UGeckoInstruction inst) gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL); - WriteBLRExit(WA); + if (js.op->branchIsIdleLoop) + { + // make idle loops go faster + ARM64Reg XA = EncodeRegTo64(WA); + + MOVP2R(XA, &CoreTiming::Idle); + BLR(XA); + + WriteExceptionExit(js.op->branchTo); + } + else + { + WriteBLRExit(WA); + } gpr.Unlock(WA); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 3866b2e672..7b18aa378f 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -782,9 +782,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: SetInstructionStats(block, &code[i], opinfo, static_cast(i)); - code[i].branchIsIdleLoop = - code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i); - bool follow = false; bool conditional_continue = false; @@ -793,7 +790,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: // If it is small, the performance will be down. // If it is big, the size of generated code will be big and // cache clearning will happen many times. - if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD) + if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW)) { if (inst.OPCD == 18 && block_size > 1) { @@ -816,22 +813,25 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: caller = i; } } - else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call && - (inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION)) + else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call) { - // bclrx with unconditional branch = return - // Follow it if we can propagate the LR value of the last CALL instruction. - // Through it would be easy to track the upper level of call/return, - // we can't guarantee the LR value. The PPC ABI forces all functions to push - // the LR value on the stack as there are no spare registers. So we'd need - // to check all store instruction to not alias with the stack. - follow = true; code[i].branchTo = code[caller].address + 4; - found_call = false; - code[i].skip = true; + if ((inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION) && + numFollows < BRANCH_FOLLOWING_THRESHOLD) + { + // bclrx with unconditional branch = return + // Follow it if we can propagate the LR value of the last CALL instruction. + // Through it would be easy to track the upper level of call/return, + // we can't guarantee the LR value. The PPC ABI forces all functions to push + // the LR value on the stack as there are no spare registers. So we'd need + // to check all store instruction to not alias with the stack. + follow = true; + found_call = false; + code[i].skip = true; - // Skip the RET, so also don't generate the stack entry for the BLR optimization. - code[caller].skipLRStack = true; + // Skip the RET, so also don't generate the stack entry for the BLR optimization. + code[caller].skipLRStack = true; + } } else if (inst.OPCD == 31 && inst.SUBOP10 == 467) { @@ -874,7 +874,10 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: } } - if (follow) + code[i].branchIsIdleLoop = + code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i); + + if (follow && numFollows < BRANCH_FOLLOWING_THRESHOLD) { // Follow the unconditional branch. numFollows++; From 55abe1a08573e063f4751257c13ee94aa998e505 Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 16 Apr 2019 08:53:44 +0200 Subject: [PATCH 3/4] Jit64: Refactor WriteIdleExit helper. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 9 +++++++++ Source/Core/Core/PowerPC/Jit64/Jit.h | 1 + Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp | 18 +++--------------- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 6 +----- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 545047c40c..6934f66479 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -646,6 +646,15 @@ void Jit64::WriteRfiExitDestInRSCRATCH() JMP(asm_routines.dispatcher, true); } +void Jit64::WriteIdleExit(u32 destination) +{ + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunction(CoreTiming::Idle); + ABI_PopRegistersAndAdjustStack({}, 0); + MOV(32, PPCSTATE(pc), Imm32(destination)); + WriteExceptionExit(); +} + void Jit64::WriteExceptionExit() { Cleanup(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 452cdbdfc6..d2baa32c8a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -84,6 +84,7 @@ public: void WriteExceptionExit(); void WriteExternalExceptionExit(); void WriteRfiExitDestInRSCRATCH(); + void WriteIdleExit(u32 destination); bool Cleanup(); void GenerateConstantOverflow(bool overflow); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp index 2ebb8e4175..bd257a3977 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp @@ -93,11 +93,7 @@ void Jit64::bx(UGeckoInstruction inst) #endif if (js.op->branchIsIdleLoop) { - ABI_PushRegistersAndAdjustStack({}, 0); - ABI_CallFunction(CoreTiming::Idle); - ABI_PopRegistersAndAdjustStack({}, 0); - MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo)); - WriteExceptionExit(); + WriteIdleExit(js.op->branchTo); } else { @@ -159,11 +155,7 @@ void Jit64::bcx(UGeckoInstruction inst) if (js.op->branchIsIdleLoop) { - ABI_PushRegistersAndAdjustStack({}, 0); - ABI_CallFunction(CoreTiming::Idle); - ABI_PopRegistersAndAdjustStack({}, 0); - MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo)); - WriteExceptionExit(); + WriteIdleExit(js.op->branchTo); } else { @@ -288,11 +280,7 @@ void Jit64::bclrx(UGeckoInstruction inst) if (js.op->branchIsIdleLoop) { - ABI_PushRegistersAndAdjustStack({}, 0); - ABI_CallFunction(CoreTiming::Idle); - ABI_PopRegistersAndAdjustStack({}, 0); - MOV(32, PPCSTATE(pc), Imm32(js.op->branchTo)); - WriteExceptionExit(); + WriteIdleExit(js.op->branchTo); } else { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 057d039b8d..8eadc3d33f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -368,11 +368,7 @@ void Jit64::DoMergedBranch() if (next.LK) MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4)); - ABI_PushRegistersAndAdjustStack({}, 0); - ABI_CallFunction(CoreTiming::Idle); - ABI_PopRegistersAndAdjustStack({}, 0); - MOV(32, PPCSTATE(pc), Imm32(js.op[1].branchTo)); - WriteExceptionExit(); + WriteIdleExit(js.op[1].branchTo); } else if (next.OPCD == 16) // bcx { From 6ec4ade3b6a1131913ab9a006a7472530d8e90af Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 20 Apr 2019 20:51:43 +0200 Subject: [PATCH 4/4] Interpreter: Drop idle skipping in interpreter. And reimplement it in the cached interpreter based on the idle loop detection. --- .../CachedInterpreter/CachedInterpreter.cpp | 12 ++++++++++ .../Interpreter/Interpreter_Branch.cpp | 24 ------------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index 81bd4b9a00..e2e4dd5a75 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -181,6 +181,15 @@ static bool CheckBreakpoint(u32 data) return false; } +static bool CheckIdle(u32 idle_pc) +{ + if (PowerPC::ppcState.npc == idle_pc) + { + CoreTiming::Idle(); + } + return false; +} + bool CachedInterpreter::HandleFunctionHooking(u32 address) { return HLE::ReplaceFunctionIfPossible(address, [&](u32 function, HLE::HookType type) { @@ -242,6 +251,7 @@ void CachedInterpreter::Jit(u32 address) const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound; const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0; const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) && jo.memcheck; + const bool idle_loop = op.branchIsIdleLoop; if (breakpoint) { @@ -261,6 +271,8 @@ void CachedInterpreter::Jit(u32 address) m_code.emplace_back(PPCTables::GetInterpreterOp(op.inst), op.inst); if (memcheck) m_code.emplace_back(CheckDSI, js.downcountAmount); + if (idle_loop) + m_code.emplace_back(CheckIdle, js.blockStart); if (endblock) m_code.emplace_back(EndBlock, js.downcountAmount); } diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Branch.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Branch.cpp index a1920b3eef..3630eb37c7 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Branch.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Branch.cpp @@ -5,7 +5,6 @@ #include "Common/Assert.h" #include "Common/CommonTypes.h" #include "Core/ConfigManager.h" -#include "Core/CoreTiming.h" #include "Core/HLE/HLE.h" #include "Core/PowerPC/Interpreter/ExceptionUtils.h" #include "Core/PowerPC/Interpreter/Interpreter.h" @@ -23,11 +22,6 @@ void Interpreter::bx(UGeckoInstruction inst) NPC = PC + SignExt26(inst.LI << 2); m_end_block = true; - - if (NPC == PC) - { - CoreTiming::Idle(); - } } // bcx - ugly, straight from PPC manual equations :) @@ -56,24 +50,6 @@ void Interpreter::bcx(UGeckoInstruction inst) } m_end_block = true; - - // this code trys to detect the most common idle loop: - // lwz r0, XXXX(r13) - // cmpXwi r0,0 - // beq -8 - if (NPC == PC - 8 && inst.hex == 0x4182fff8 /* beq */) - { - if (PowerPC::HostRead_U32(PC - 8) >> 16 == 0x800D /* lwz */) - { - u32 last_inst = PowerPC::HostRead_U32(PC - 4); - - if (last_inst == 0x28000000 /* cmplwi */ || - (last_inst == 0x2C000000 /* cmpwi */ && SConfig::GetInstance().bWii)) - { - CoreTiming::Idle(); - } - } - } } void Interpreter::bcctrx(UGeckoInstruction inst)