mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2024-11-14 13:27:45 -07:00
Merge pull request #7287 from degasus/idle_skipping
Jit64 / JitArm64: Optimized idle skipping detection.
This commit is contained in:
commit
2abe333ce9
@ -181,6 +181,15 @@ static bool CheckBreakpoint(u32 data)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Cached-interpreter callback for idle-loop skipping.
//
// CachedInterpreter::Jit appends this callback (with the block's start address
// as `idle_pc`) right after an instruction whose branch was flagged as an idle
// loop. If the next PC equals `idle_pc`, the branch jumped back to the start of
// the block, so CoreTiming::Idle() is called instead of spinning.
//
// Always returns false.
// NOTE(review): presumably `false` tells the dispatcher to keep executing the
// current block, like the other Check* callbacks — confirm against the run loop.
static bool CheckIdle(u32 idle_pc)
{
  const bool loops_back_to_start = PowerPC::ppcState.npc == idle_pc;
  if (loops_back_to_start)
    CoreTiming::Idle();

  return false;
}
|
||||||
|
|
||||||
bool CachedInterpreter::HandleFunctionHooking(u32 address)
|
bool CachedInterpreter::HandleFunctionHooking(u32 address)
|
||||||
{
|
{
|
||||||
return HLE::ReplaceFunctionIfPossible(address, [&](u32 function, HLE::HookType type) {
|
return HLE::ReplaceFunctionIfPossible(address, [&](u32 function, HLE::HookType type) {
|
||||||
@ -242,6 +251,7 @@ void CachedInterpreter::Jit(u32 address)
|
|||||||
const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound;
|
const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound;
|
||||||
const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0;
|
const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0;
|
||||||
const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) && jo.memcheck;
|
const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) && jo.memcheck;
|
||||||
|
const bool idle_loop = op.branchIsIdleLoop;
|
||||||
|
|
||||||
if (breakpoint)
|
if (breakpoint)
|
||||||
{
|
{
|
||||||
@ -261,6 +271,8 @@ void CachedInterpreter::Jit(u32 address)
|
|||||||
m_code.emplace_back(PPCTables::GetInterpreterOp(op.inst), op.inst);
|
m_code.emplace_back(PPCTables::GetInterpreterOp(op.inst), op.inst);
|
||||||
if (memcheck)
|
if (memcheck)
|
||||||
m_code.emplace_back(CheckDSI, js.downcountAmount);
|
m_code.emplace_back(CheckDSI, js.downcountAmount);
|
||||||
|
if (idle_loop)
|
||||||
|
m_code.emplace_back(CheckIdle, js.blockStart);
|
||||||
if (endblock)
|
if (endblock)
|
||||||
m_code.emplace_back(EndBlock, js.downcountAmount);
|
m_code.emplace_back(EndBlock, js.downcountAmount);
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,6 @@
|
|||||||
#include "Common/Assert.h"
|
#include "Common/Assert.h"
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
#include "Core/ConfigManager.h"
|
#include "Core/ConfigManager.h"
|
||||||
#include "Core/CoreTiming.h"
|
|
||||||
#include "Core/HLE/HLE.h"
|
#include "Core/HLE/HLE.h"
|
||||||
#include "Core/PowerPC/Interpreter/ExceptionUtils.h"
|
#include "Core/PowerPC/Interpreter/ExceptionUtils.h"
|
||||||
#include "Core/PowerPC/Interpreter/Interpreter.h"
|
#include "Core/PowerPC/Interpreter/Interpreter.h"
|
||||||
@ -23,11 +22,6 @@ void Interpreter::bx(UGeckoInstruction inst)
|
|||||||
NPC = PC + SignExt26(inst.LI << 2);
|
NPC = PC + SignExt26(inst.LI << 2);
|
||||||
|
|
||||||
m_end_block = true;
|
m_end_block = true;
|
||||||
|
|
||||||
if (NPC == PC)
|
|
||||||
{
|
|
||||||
CoreTiming::Idle();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// bcx - ugly, straight from PPC manual equations :)
|
// bcx - ugly, straight from PPC manual equations :)
|
||||||
@ -56,24 +50,6 @@ void Interpreter::bcx(UGeckoInstruction inst)
|
|||||||
}
|
}
|
||||||
|
|
||||||
m_end_block = true;
|
m_end_block = true;
|
||||||
|
|
||||||
// this code tries to detect the most common idle loop:
|
|
||||||
// lwz r0, XXXX(r13)
|
|
||||||
// cmpXwi r0,0
|
|
||||||
// beq -8
|
|
||||||
if (NPC == PC - 8 && inst.hex == 0x4182fff8 /* beq */)
|
|
||||||
{
|
|
||||||
if (PowerPC::HostRead_U32(PC - 8) >> 16 == 0x800D /* lwz */)
|
|
||||||
{
|
|
||||||
u32 last_inst = PowerPC::HostRead_U32(PC - 4);
|
|
||||||
|
|
||||||
if (last_inst == 0x28000000 /* cmplwi */ ||
|
|
||||||
(last_inst == 0x2C000000 /* cmpwi */ && SConfig::GetInstance().bWii))
|
|
||||||
{
|
|
||||||
CoreTiming::Idle();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Interpreter::bcctrx(UGeckoInstruction inst)
|
void Interpreter::bcctrx(UGeckoInstruction inst)
|
||||||
|
@ -29,8 +29,8 @@ static std::array<GekkoOPTemplate, 54> primarytable =
|
|||||||
{59, Interpreter::RunTable59, {"RunTable59", OpType::Subtable, 0, 0, 0, 0, 0}},
|
{59, Interpreter::RunTable59, {"RunTable59", OpType::Subtable, 0, 0, 0, 0, 0}},
|
||||||
{63, Interpreter::RunTable63, {"RunTable63", OpType::Subtable, 0, 0, 0, 0, 0}},
|
{63, Interpreter::RunTable63, {"RunTable63", OpType::Subtable, 0, 0, 0, 0, 0}},
|
||||||
|
|
||||||
{16, Interpreter::bcx, {"bcx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
|
{16, Interpreter::bcx, {"bcx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}},
|
||||||
{18, Interpreter::bx, {"bx", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
|
{18, Interpreter::bx, {"bx", OpType::Branch, FL_ENDBLOCK, 1, 0, 0, 0}},
|
||||||
|
|
||||||
{3, Interpreter::twi, {"twi", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
|
{3, Interpreter::twi, {"twi", OpType::System, FL_ENDBLOCK, 1, 0, 0, 0}},
|
||||||
{17, Interpreter::sc, {"sc", OpType::System, FL_ENDBLOCK, 2, 0, 0, 0}},
|
{17, Interpreter::sc, {"sc", OpType::System, FL_ENDBLOCK, 2, 0, 0, 0}},
|
||||||
|
@ -646,6 +646,15 @@ void Jit64::WriteRfiExitDestInRSCRATCH()
|
|||||||
JMP(asm_routines.dispatcher, true);
|
JMP(asm_routines.dispatcher, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits the block exit used when a branch has been identified as an idle loop.
//
// The generated code calls CoreTiming::Idle(), stores `destination` (the branch
// target) into the emulated PC, and then leaves the block through the exception
// exit path rather than a regular linked exit.
//
// destination: guest address to resume at after idling.
void Jit64::WriteIdleExit(u32 destination)
{
  // An empty register set is passed to the push/pop pair around the call.
  // NOTE(review): presumably safe because callers flush gpr/fpr before calling
  // this — confirm for every call site.
  ABI_PushRegistersAndAdjustStack({}, 0);
  ABI_CallFunction(CoreTiming::Idle);
  ABI_PopRegistersAndAdjustStack({}, 0);

  // Resume at the branch target once the exit path has run.
  MOV(32, PPCSTATE(pc), Imm32(destination));
  WriteExceptionExit();
}
|
||||||
|
|
||||||
void Jit64::WriteExceptionExit()
|
void Jit64::WriteExceptionExit()
|
||||||
{
|
{
|
||||||
Cleanup();
|
Cleanup();
|
||||||
|
@ -84,6 +84,7 @@ public:
|
|||||||
void WriteExceptionExit();
|
void WriteExceptionExit();
|
||||||
void WriteExternalExceptionExit();
|
void WriteExternalExceptionExit();
|
||||||
void WriteRfiExitDestInRSCRATCH();
|
void WriteRfiExitDestInRSCRATCH();
|
||||||
|
void WriteIdleExit(u32 destination);
|
||||||
bool Cleanup();
|
bool Cleanup();
|
||||||
|
|
||||||
void GenerateConstantOverflow(bool overflow);
|
void GenerateConstantOverflow(bool overflow);
|
||||||
|
@ -87,25 +87,18 @@ void Jit64::bx(UGeckoInstruction inst)
|
|||||||
gpr.Flush();
|
gpr.Flush();
|
||||||
fpr.Flush();
|
fpr.Flush();
|
||||||
|
|
||||||
u32 destination;
|
|
||||||
if (inst.AA)
|
|
||||||
destination = SignExt26(inst.LI << 2);
|
|
||||||
else
|
|
||||||
destination = js.compilerPC + SignExt26(inst.LI << 2);
|
|
||||||
#ifdef ACID_TEST
|
#ifdef ACID_TEST
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
AND(32, PPCSTATE(cr), Imm32(~(0xFF000000)));
|
AND(32, PPCSTATE(cr), Imm32(~(0xFF000000)));
|
||||||
#endif
|
#endif
|
||||||
if (destination == js.compilerPC)
|
if (js.op->branchIsIdleLoop)
|
||||||
{
|
{
|
||||||
ABI_PushRegistersAndAdjustStack({}, 0);
|
WriteIdleExit(js.op->branchTo);
|
||||||
ABI_CallFunction(CoreTiming::Idle);
|
}
|
||||||
ABI_PopRegistersAndAdjustStack({}, 0);
|
else
|
||||||
MOV(32, PPCSTATE(pc), Imm32(destination));
|
{
|
||||||
WriteExceptionExit();
|
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO - optimize to hell and beyond
|
// TODO - optimize to hell and beyond
|
||||||
@ -154,18 +147,20 @@ void Jit64::bcx(UGeckoInstruction inst)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 destination;
|
|
||||||
if (inst.AA)
|
|
||||||
destination = SignExt16(inst.BD << 2);
|
|
||||||
else
|
|
||||||
destination = js.compilerPC + SignExt16(inst.BD << 2);
|
|
||||||
|
|
||||||
{
|
{
|
||||||
RCForkGuard gpr_guard = gpr.Fork();
|
RCForkGuard gpr_guard = gpr.Fork();
|
||||||
RCForkGuard fpr_guard = fpr.Fork();
|
RCForkGuard fpr_guard = fpr.Fork();
|
||||||
gpr.Flush();
|
gpr.Flush();
|
||||||
fpr.Flush();
|
fpr.Flush();
|
||||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
|
||||||
|
if (js.op->branchIsIdleLoop)
|
||||||
|
{
|
||||||
|
WriteIdleExit(js.op->branchTo);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
|
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
|
||||||
@ -282,7 +277,15 @@ void Jit64::bclrx(UGeckoInstruction inst)
|
|||||||
RCForkGuard fpr_guard = fpr.Fork();
|
RCForkGuard fpr_guard = fpr.Fork();
|
||||||
gpr.Flush();
|
gpr.Flush();
|
||||||
fpr.Flush();
|
fpr.Flush();
|
||||||
WriteBLRExit();
|
|
||||||
|
if (js.op->branchIsIdleLoop)
|
||||||
|
{
|
||||||
|
WriteIdleExit(js.op->branchTo);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
WriteBLRExit();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
|
if ((inst.BO & BO_DONT_CHECK_CONDITION) == 0)
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
#include "Common/MathUtil.h"
|
#include "Common/MathUtil.h"
|
||||||
#include "Common/x64Emitter.h"
|
#include "Common/x64Emitter.h"
|
||||||
|
#include "Core/CoreTiming.h"
|
||||||
#include "Core/PowerPC/Jit64/Jit.h"
|
#include "Core/PowerPC/Jit64/Jit.h"
|
||||||
#include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
|
#include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
|
||||||
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
|
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
|
||||||
@ -361,7 +362,15 @@ void Jit64::DoMergedBranch()
|
|||||||
// Code that handles successful PPC branching.
|
// Code that handles successful PPC branching.
|
||||||
const UGeckoInstruction& next = js.op[1].inst;
|
const UGeckoInstruction& next = js.op[1].inst;
|
||||||
const u32 nextPC = js.op[1].address;
|
const u32 nextPC = js.op[1].address;
|
||||||
if (next.OPCD == 16) // bcx
|
|
||||||
|
if (js.op[1].branchIsIdleLoop)
|
||||||
|
{
|
||||||
|
if (next.LK)
|
||||||
|
MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4));
|
||||||
|
|
||||||
|
WriteIdleExit(js.op[1].branchTo);
|
||||||
|
}
|
||||||
|
else if (next.OPCD == 16) // bcx
|
||||||
{
|
{
|
||||||
if (next.LK)
|
if (next.LK)
|
||||||
MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4));
|
MOV(32, PPCSTATE(spr[SPR_LR]), Imm32(nextPC + 4));
|
||||||
|
@ -119,41 +119,6 @@ void Jit64::lXXx(UGeckoInstruction inst)
|
|||||||
signExtend = true;
|
signExtend = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!CPU::IsStepping() && inst.OPCD == 32 && CanMergeNextInstructions(2) &&
|
|
||||||
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
|
|
||||||
(js.op[1].inst.hex == 0x28000000 ||
|
|
||||||
(SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) &&
|
|
||||||
js.op[2].inst.hex == 0x4182fff8)
|
|
||||||
{
|
|
||||||
s32 offset = (s32)(s16)inst.SIMM_16;
|
|
||||||
RCX64Reg Ra = gpr.Bind(a, RCMode::Read);
|
|
||||||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
|
||||||
RegCache::Realize(Ra, Rd);
|
|
||||||
|
|
||||||
SafeLoadToReg(Rd, Ra, accessSize, offset, CallerSavedRegistersInUse(), signExtend);
|
|
||||||
|
|
||||||
// if it's still 0, we can wait until the next event
|
|
||||||
TEST(32, Rd, Rd);
|
|
||||||
FixupBranch noIdle = J_CC(CC_NZ);
|
|
||||||
|
|
||||||
BitSet32 registersInUse = CallerSavedRegistersInUse();
|
|
||||||
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
|
|
||||||
|
|
||||||
ABI_CallFunction(CoreTiming::Idle);
|
|
||||||
|
|
||||||
ABI_PopRegistersAndAdjustStack(registersInUse, 0);
|
|
||||||
|
|
||||||
// ! we must continue executing of the loop after exception handling, maybe there is still 0 in
|
|
||||||
// r0
|
|
||||||
// MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
|
|
||||||
WriteExceptionExit();
|
|
||||||
|
|
||||||
SetJumpTarget(noIdle);
|
|
||||||
|
|
||||||
// js.compilerPC += 8;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Determine whether this instruction updates inst.RA
|
// Determine whether this instruction updates inst.RA
|
||||||
bool update;
|
bool update;
|
||||||
if (inst.OPCD == 31)
|
if (inst.OPCD == 31)
|
||||||
|
@ -76,12 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITBranchOff);
|
JITDISABLE(bJITBranchOff);
|
||||||
|
|
||||||
u32 destination;
|
|
||||||
if (inst.AA)
|
|
||||||
destination = SignExt26(inst.LI << 2);
|
|
||||||
else
|
|
||||||
destination = js.compilerPC + SignExt26(inst.LI << 2);
|
|
||||||
|
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
{
|
{
|
||||||
ARM64Reg WA = gpr.GetReg();
|
ARM64Reg WA = gpr.GetReg();
|
||||||
@ -105,7 +99,7 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||||||
gpr.Flush(FlushMode::FLUSH_ALL);
|
gpr.Flush(FlushMode::FLUSH_ALL);
|
||||||
fpr.Flush(FlushMode::FLUSH_ALL);
|
fpr.Flush(FlushMode::FLUSH_ALL);
|
||||||
|
|
||||||
if (destination == js.compilerPC)
|
if (js.op->branchIsIdleLoop)
|
||||||
{
|
{
|
||||||
// make idle loops go faster
|
// make idle loops go faster
|
||||||
ARM64Reg WA = gpr.GetReg();
|
ARM64Reg WA = gpr.GetReg();
|
||||||
@ -115,11 +109,11 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||||||
BLR(XA);
|
BLR(XA);
|
||||||
gpr.Unlock(WA);
|
gpr.Unlock(WA);
|
||||||
|
|
||||||
WriteExceptionExit(js.compilerPC);
|
WriteExceptionExit(js.op->branchTo);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArm64::bcx(UGeckoInstruction inst)
|
void JitArm64::bcx(UGeckoInstruction inst)
|
||||||
@ -160,16 +154,25 @@ void JitArm64::bcx(UGeckoInstruction inst)
|
|||||||
}
|
}
|
||||||
gpr.Unlock(WA);
|
gpr.Unlock(WA);
|
||||||
|
|
||||||
u32 destination;
|
|
||||||
if (inst.AA)
|
|
||||||
destination = SignExt16(inst.BD << 2);
|
|
||||||
else
|
|
||||||
destination = js.compilerPC + SignExt16(inst.BD << 2);
|
|
||||||
|
|
||||||
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
|
gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
|
||||||
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
|
fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
|
||||||
|
|
||||||
WriteExit(destination, inst.LK, js.compilerPC + 4);
|
if (js.op->branchIsIdleLoop)
|
||||||
|
{
|
||||||
|
// make idle loops go faster
|
||||||
|
ARM64Reg WA = gpr.GetReg();
|
||||||
|
ARM64Reg XA = EncodeRegTo64(WA);
|
||||||
|
|
||||||
|
MOVP2R(XA, &CoreTiming::Idle);
|
||||||
|
BLR(XA);
|
||||||
|
gpr.Unlock(WA);
|
||||||
|
|
||||||
|
WriteExceptionExit(js.op->branchTo);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
WriteExit(js.op->branchTo, inst.LK, js.compilerPC + 4);
|
||||||
|
}
|
||||||
|
|
||||||
SwitchToNearCode();
|
SwitchToNearCode();
|
||||||
|
|
||||||
@ -275,7 +278,20 @@ void JitArm64::bclrx(UGeckoInstruction inst)
|
|||||||
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
|
gpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
|
||||||
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
|
fpr.Flush(conditional ? FlushMode::FLUSH_MAINTAIN_STATE : FlushMode::FLUSH_ALL);
|
||||||
|
|
||||||
WriteBLRExit(WA);
|
if (js.op->branchIsIdleLoop)
|
||||||
|
{
|
||||||
|
// make idle loops go faster
|
||||||
|
ARM64Reg XA = EncodeRegTo64(WA);
|
||||||
|
|
||||||
|
MOVP2R(XA, &CoreTiming::Idle);
|
||||||
|
BLR(XA);
|
||||||
|
|
||||||
|
WriteExceptionExit(js.op->branchTo);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
WriteBLRExit(WA);
|
||||||
|
}
|
||||||
|
|
||||||
gpr.Unlock(WA);
|
gpr.Unlock(WA);
|
||||||
|
|
||||||
|
@ -346,37 +346,6 @@ void JitArm64::lXX(UGeckoInstruction inst)
|
|||||||
}
|
}
|
||||||
|
|
||||||
SafeLoadToReg(d, update ? a : (a ? a : -1), offsetReg, flags, offset, update);
|
SafeLoadToReg(d, update ? a : (a ? a : -1), offsetReg, flags, offset, update);
|
||||||
|
|
||||||
// LWZ idle skipping
|
|
||||||
if (inst.OPCD == 32 && CanMergeNextInstructions(2) &&
|
|
||||||
(inst.hex & 0xFFFF0000) == 0x800D0000 && // lwz r0, XXXX(r13)
|
|
||||||
(js.op[1].inst.hex == 0x28000000 ||
|
|
||||||
(SConfig::GetInstance().bWii && js.op[1].inst.hex == 0x2C000000)) && // cmpXwi r0,0
|
|
||||||
js.op[2].inst.hex == 0x4182fff8) // beq -8
|
|
||||||
{
|
|
||||||
ARM64Reg WA = gpr.GetReg();
|
|
||||||
ARM64Reg XA = EncodeRegTo64(WA);
|
|
||||||
|
|
||||||
// if it's still 0, we can wait until the next event
|
|
||||||
FixupBranch noIdle = CBNZ(gpr.R(d));
|
|
||||||
|
|
||||||
FixupBranch far = B();
|
|
||||||
SwitchToFarCode();
|
|
||||||
SetJumpTarget(far);
|
|
||||||
|
|
||||||
gpr.Flush(FLUSH_MAINTAIN_STATE);
|
|
||||||
fpr.Flush(FLUSH_MAINTAIN_STATE);
|
|
||||||
|
|
||||||
MOVP2R(XA, &CoreTiming::Idle);
|
|
||||||
BLR(XA);
|
|
||||||
gpr.Unlock(WA);
|
|
||||||
|
|
||||||
WriteExceptionExit(js.compilerPC);
|
|
||||||
|
|
||||||
SwitchToNearCode();
|
|
||||||
|
|
||||||
SetJumpTarget(noIdle);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArm64::stX(UGeckoInstruction inst)
|
void JitArm64::stX(UGeckoInstruction inst)
|
||||||
|
@ -640,6 +640,90 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, const Gekk
|
|||||||
code->outputCR0 = true;
|
code->outputCR0 = true;
|
||||||
code->outputCR1 = true;
|
code->outputCR1 = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
code->branchUsesCtr = false;
|
||||||
|
code->branchTo = UINT32_MAX;
|
||||||
|
|
||||||
|
// For branch with immediate addresses (bx/bcx), compute the destination.
|
||||||
|
if (code->inst.OPCD == 18) // bx
|
||||||
|
{
|
||||||
|
if (code->inst.AA) // absolute
|
||||||
|
code->branchTo = SignExt26(code->inst.LI << 2);
|
||||||
|
else
|
||||||
|
code->branchTo = code->address + SignExt26(code->inst.LI << 2);
|
||||||
|
}
|
||||||
|
else if (code->inst.OPCD == 16) // bcx
|
||||||
|
{
|
||||||
|
if (code->inst.AA) // absolute
|
||||||
|
code->branchTo = SignExt16(code->inst.BD << 2);
|
||||||
|
else
|
||||||
|
code->branchTo = code->address + SignExt16(code->inst.BD << 2);
|
||||||
|
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
|
||||||
|
code->branchUsesCtr = true;
|
||||||
|
}
|
||||||
|
else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 16) // bclrx
|
||||||
|
{
|
||||||
|
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
|
||||||
|
code->branchUsesCtr = true;
|
||||||
|
}
|
||||||
|
else if (code->inst.OPCD == 19 && code->inst.SUBOP10 == 528) // bcctrx
|
||||||
|
{
|
||||||
|
if (!(code->inst.BO & BO_DONT_DECREMENT_FLAG))
|
||||||
|
code->branchUsesCtr = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decides whether code[0..instructions] forms a busy-wait loop.
//
// block:        the block being analyzed; its m_address is the loop entry.
// code:         the analyzed ops of the block.
// instructions: index of the candidate back-branch (inclusive upper bound).
//
// Returns true only when the op at index `instructions` is a branch back to
// block->m_address and every op before it passes the checks below.
//
// Very basic heuristic:
//   * No branch in the range may use CTR.
//   * Apart from branches, only Integer and Load ops are accepted (so nothing
//     writes to memory).
//   * A register that was read before the loop wrote it must never be written,
//     otherwise the loop would change state from one iteration to the next.
//
// Would benefit a lot from basic inlining support: many of the most common busy
// loops are DSP register interactions of the form bl/cmp/bne, where the bl
// target is a pure function obeying the rules above. Those are not detected.
bool PPCAnalyzer::IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions)
{
  std::bitset<32> read_before_write;  // registers consumed as loop inputs
  std::bitset<32> produced;           // registers already written inside the loop

  for (size_t idx = 0; idx <= instructions; ++idx)
  {
    CodeOp& op = code[idx];
    const auto type = op.opinfo->type;

    if (type == OpType::Branch)
    {
      if (op.branchUsesCtr)
        return false;

      // Only the final op may close the loop by jumping to the block start.
      if (idx == instructions && op.branchTo == block->m_address)
        return true;

      continue;
    }

    // Other instruction classes might be supported in the future; for now
    // only this very restricted instruction set is accepted.
    if (type != OpType::Integer && type != OpType::Load)
      return false;

    for (int reg : op.regsIn)
    {
      if (reg == -1 || produced[reg])
        continue;
      read_before_write[reg] = true;
    }

    for (int reg : op.regsOut)
    {
      if (reg == -1)
        continue;
      if (read_before_write[reg])
        return false;
      produced[reg] = true;
    }
  }

  return false;
}
|
||||||
|
|
||||||
u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std::size_t block_size)
|
u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std::size_t block_size)
|
||||||
@ -692,8 +776,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
|||||||
code[i].opinfo = opinfo;
|
code[i].opinfo = opinfo;
|
||||||
code[i].address = address;
|
code[i].address = address;
|
||||||
code[i].inst = inst;
|
code[i].inst = inst;
|
||||||
code[i].branchTo = UINT32_MAX;
|
|
||||||
code[i].branchToIndex = UINT32_MAX;
|
|
||||||
code[i].skip = false;
|
code[i].skip = false;
|
||||||
block->m_stats->numCycles += opinfo->numCycles;
|
block->m_stats->numCycles += opinfo->numCycles;
|
||||||
block->m_physical_addresses.insert(result.physical_address);
|
block->m_physical_addresses.insert(result.physical_address);
|
||||||
@ -701,7 +783,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
|||||||
SetInstructionStats(block, &code[i], opinfo, static_cast<u32>(i));
|
SetInstructionStats(block, &code[i], opinfo, static_cast<u32>(i));
|
||||||
|
|
||||||
bool follow = false;
|
bool follow = false;
|
||||||
u32 destination = 0;
|
|
||||||
|
|
||||||
bool conditional_continue = false;
|
bool conditional_continue = false;
|
||||||
|
|
||||||
@ -709,13 +790,12 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
|||||||
// If it is small, the performance will be down.
|
// If it is small, the performance will be down.
|
||||||
// If it is big, the size of generated code will be big and
|
// If it is big, the size of generated code will be big and
|
||||||
// cache clearing will happen many times.
|
// cache clearing will happen many times.
|
||||||
if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
|
if (enable_follow && HasOption(OPTION_BRANCH_FOLLOW))
|
||||||
{
|
{
|
||||||
if (inst.OPCD == 18 && block_size > 1)
|
if (inst.OPCD == 18 && block_size > 1)
|
||||||
{
|
{
|
||||||
// Always follow BX instructions.
|
// Always follow BX instructions.
|
||||||
follow = true;
|
follow = true;
|
||||||
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
|
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
{
|
{
|
||||||
found_call = true;
|
found_call = true;
|
||||||
@ -727,29 +807,31 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
|||||||
{
|
{
|
||||||
// Always follow unconditional BCX instructions, but they are very rare.
|
// Always follow unconditional BCX instructions, but they are very rare.
|
||||||
follow = true;
|
follow = true;
|
||||||
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
|
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
{
|
{
|
||||||
found_call = true;
|
found_call = true;
|
||||||
caller = i;
|
caller = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
|
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call)
|
||||||
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
|
|
||||||
{
|
{
|
||||||
// bclrx with unconditional branch = return
|
code[i].branchTo = code[caller].address + 4;
|
||||||
// Follow it if we can propagate the LR value of the last CALL instruction.
|
if ((inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION) &&
|
||||||
// Though it would be easy to track the upper level of call/return,
|
numFollows < BRANCH_FOLLOWING_THRESHOLD)
|
||||||
// we can't guarantee the LR value. The PPC ABI forces all functions to push
|
{
|
||||||
// the LR value on the stack as there are no spare registers. So we'd need
|
// bclrx with unconditional branch = return
|
||||||
// to check all store instruction to not alias with the stack.
|
// Follow it if we can propagate the LR value of the last CALL instruction.
|
||||||
follow = true;
|
// Though it would be easy to track the upper level of call/return,
|
||||||
destination = code[caller].address + 4;
|
// we can't guarantee the LR value. The PPC ABI forces all functions to push
|
||||||
found_call = false;
|
// the LR value on the stack as there are no spare registers. So we'd need
|
||||||
code[i].skip = true;
|
// to check all store instruction to not alias with the stack.
|
||||||
|
follow = true;
|
||||||
|
found_call = false;
|
||||||
|
code[i].skip = true;
|
||||||
|
|
||||||
// Skip the RET, so also don't generate the stack entry for the BLR optimization.
|
// Skip the RET, so also don't generate the stack entry for the BLR optimization.
|
||||||
code[caller].skipLRStack = true;
|
code[caller].skipLRStack = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
|
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
|
||||||
{
|
{
|
||||||
@ -792,11 +874,14 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (follow)
|
code[i].branchIsIdleLoop =
|
||||||
|
code[i].branchTo == block->m_address && IsBusyWaitLoop(block, code, i);
|
||||||
|
|
||||||
|
if (follow && numFollows < BRANCH_FOLLOWING_THRESHOLD)
|
||||||
{
|
{
|
||||||
// Follow the unconditional branch.
|
// Follow the unconditional branch.
|
||||||
numFollows++;
|
numFollows++;
|
||||||
address = destination;
|
address = code[i].branchTo;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -27,13 +27,14 @@ struct CodeOp // 16B
|
|||||||
UGeckoInstruction inst;
|
UGeckoInstruction inst;
|
||||||
GekkoOPInfo* opinfo;
|
GekkoOPInfo* opinfo;
|
||||||
u32 address;
|
u32 address;
|
||||||
u32 branchTo; // if 0, not a branch
|
u32 branchTo; // if UINT32_MAX, not a branch
|
||||||
int branchToIndex; // index of target block
|
|
||||||
BitSet32 regsOut;
|
BitSet32 regsOut;
|
||||||
BitSet32 regsIn;
|
BitSet32 regsIn;
|
||||||
BitSet32 fregsIn;
|
BitSet32 fregsIn;
|
||||||
s8 fregOut;
|
s8 fregOut;
|
||||||
bool isBranchTarget;
|
bool isBranchTarget;
|
||||||
|
bool branchUsesCtr;
|
||||||
|
bool branchIsIdleLoop;
|
||||||
bool wantsCR0;
|
bool wantsCR0;
|
||||||
bool wantsCR1;
|
bool wantsCR1;
|
||||||
bool wantsFPRF;
|
bool wantsFPRF;
|
||||||
@ -213,6 +214,7 @@ private:
|
|||||||
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);
|
||||||
void ReorderInstructions(u32 instructions, CodeOp* code);
|
void ReorderInstructions(u32 instructions, CodeOp* code);
|
||||||
void SetInstructionStats(CodeBlock* block, CodeOp* code, const GekkoOPInfo* opinfo, u32 index);
|
void SetInstructionStats(CodeBlock* block, CodeOp* code, const GekkoOPInfo* opinfo, u32 index);
|
||||||
|
bool IsBusyWaitLoop(CodeBlock* block, CodeOp* code, size_t instructions);
|
||||||
|
|
||||||
// Options
|
// Options
|
||||||
u32 m_options = 0;
|
u32 m_options = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user