From ef05a147578b070fdab6042f68422dfe20f91e23 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 7 Sep 2013 00:19:32 +0000 Subject: [PATCH] [ARM] Clean up FPR cache. Rapid fire floating point instruction implementations. Adds 13 new instructions. --- Source/Core/Core/Src/PowerPC/JitArm32/Jit.h | 13 + .../PowerPC/JitArm32/JitArm_FloatingPoint.cpp | 54 +++- .../Src/PowerPC/JitArm32/JitArm_Paired.cpp | 246 +++++++++++++++++- .../Src/PowerPC/JitArm32/JitArm_Tables.cpp | 26 +- .../Core/Src/PowerPC/JitArm32/JitFPRCache.cpp | 135 ++++++---- .../Core/Src/PowerPC/JitArm32/JitFPRCache.h | 3 + .../Core/Src/PowerPC/JitArm32/JitRegCache.cpp | 4 +- .../Core/Src/PowerPC/JitArm32/JitRegCache.h | 49 ++-- 8 files changed, 412 insertions(+), 118 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h b/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h index 2f9287c80e..072a6d40fa 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h +++ b/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h @@ -186,6 +186,8 @@ public: // Floating point void fabsx(UGeckoInstruction _inst); + void fnabsx(UGeckoInstruction _inst); + void fnegx(UGeckoInstruction _inst); void faddsx(UGeckoInstruction _inst); void faddx(UGeckoInstruction _inst); void fsubsx(UGeckoInstruction _inst); @@ -202,9 +204,20 @@ public: // Paired Singles void ps_add(UGeckoInstruction _inst); void ps_sum0(UGeckoInstruction _inst); + void ps_sum1(UGeckoInstruction _inst); void ps_madd(UGeckoInstruction _inst); void ps_sub(UGeckoInstruction _inst); void ps_mul(UGeckoInstruction _inst); + void ps_muls0(UGeckoInstruction _inst); + void ps_muls1(UGeckoInstruction _inst); + void ps_merge00(UGeckoInstruction _inst); + void ps_merge01(UGeckoInstruction _inst); + void ps_merge10(UGeckoInstruction _inst); + void ps_merge11(UGeckoInstruction _inst); + void ps_mr(UGeckoInstruction _inst); + void ps_neg(UGeckoInstruction _inst); + void ps_abs(UGeckoInstruction _inst); + void ps_nabs(UGeckoInstruction _inst); }; #endif // _JIT64_H diff --git 
a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp index 3dcaf902d1..02a48740ce 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp @@ -43,14 +43,46 @@ void JitArm::fabsx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) - ARMReg vD = fpr.R0(inst.FD); ARMReg vB = fpr.R0(inst.FB); + ARMReg vD = fpr.R0(inst.FD, false); VABS(vD, vB); if (inst.Rc) Helper_UpdateCR1(vD); } +void JitArm::fnabsx(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff) + + ARMReg vB = fpr.R0(inst.FB); + ARMReg vD = fpr.R0(inst.FD, false); + ARMReg V0 = fpr.GetReg(); + + // XXX: Could be done quicker + VABS(vD, vB); + VMOV(V0, vD); + VSUB(vD, vD, V0); + VSUB(vD, vD, V0); + + fpr.Unlock(V0); + if (inst.Rc) Helper_UpdateCR1(vD); +} + +void JitArm::fnegx(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff) + + ARMReg vB = fpr.R0(inst.FB); + ARMReg vD = fpr.R0(inst.FD, false); + + VNEG(vD, vB); + + if (inst.Rc) Helper_UpdateCR1(vD); +} + void JitArm::faddsx(UGeckoInstruction inst) { INSTRUCTION_START @@ -58,8 +90,8 @@ void JitArm::faddsx(UGeckoInstruction inst) ARMReg vA = fpr.R0(inst.FA); ARMReg vB = fpr.R0(inst.FB); - ARMReg vD0 = fpr.R0(inst.FD); - ARMReg vD1 = fpr.R1(inst.FD); + ARMReg vD0 = fpr.R0(inst.FD, false); + ARMReg vD1 = fpr.R1(inst.FD, false); VADD(vD0, vA, vB); VMOV(vD1, vD0); @@ -71,9 +103,9 @@ void JitArm::faddx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) - ARMReg vD = fpr.R0(inst.FD); ARMReg vA = fpr.R0(inst.FA); ARMReg vB = fpr.R0(inst.FB); + ARMReg vD = fpr.R0(inst.FD, false); VADD(vD, vA, vB); if (inst.Rc) Helper_UpdateCR1(vD); @@ -86,8 +118,8 @@ void JitArm::fsubsx(UGeckoInstruction inst) ARMReg vA = fpr.R0(inst.FA); ARMReg vB = fpr.R0(inst.FB); - ARMReg vD0 = fpr.R0(inst.FD); - ARMReg vD1 = 
fpr.R1(inst.FD); + ARMReg vD0 = fpr.R0(inst.FD, false); + ARMReg vD1 = fpr.R1(inst.FD, false); VSUB(vD0, vA, vB); VMOV(vD1, vD0); @@ -99,9 +131,9 @@ void JitArm::fsubx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) - ARMReg vD = fpr.R0(inst.FD); ARMReg vA = fpr.R0(inst.FA); ARMReg vB = fpr.R0(inst.FB); + ARMReg vD = fpr.R0(inst.FD, false); VSUB(vD, vA, vB); if (inst.Rc) Helper_UpdateCR1(vD); @@ -114,8 +146,8 @@ void JitArm::fmulsx(UGeckoInstruction inst) ARMReg vA = fpr.R0(inst.FA); ARMReg vC = fpr.R0(inst.FC); - ARMReg vD0 = fpr.R0(inst.FD); - ARMReg vD1 = fpr.R1(inst.FD); + ARMReg vD0 = fpr.R0(inst.FD, false); + ARMReg vD1 = fpr.R1(inst.FD, false); VMUL(vD0, vA, vC); VMOV(vD1, vD0); @@ -127,9 +159,9 @@ void JitArm::fmulx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) - ARMReg vD0 = fpr.R0(inst.FD); ARMReg vA = fpr.R0(inst.FA); ARMReg vC = fpr.R0(inst.FC); + ARMReg vD0 = fpr.R0(inst.FD, false); VMUL(vD0, vA, vC); if (inst.Rc) Helper_UpdateCR1(vD0); @@ -139,8 +171,8 @@ void JitArm::fmrx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) - ARMReg vD = fpr.R0(inst.FD); ARMReg vB = fpr.R0(inst.FB); + ARMReg vD = fpr.R0(inst.FD, false); VMOV(vD, vB); diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Paired.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Paired.cpp index e6a084716a..4f4751c02f 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Paired.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Paired.cpp @@ -40,12 +40,11 @@ void JitArm::ps_add(UGeckoInstruction inst) ARMReg vA1 = fpr.R1(a); ARMReg vB0 = fpr.R0(b); ARMReg vB1 = fpr.R1(b); - ARMReg vD0 = fpr.R0(d); - ARMReg vD1 = fpr.R1(d); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); VADD(vD0, vA0, vB0); VADD(vD1, vA1, vB1); - fpr.Flush(); } // Wrong, THP videos like SMS and Ikaruga show artifacts @@ -67,8 +66,8 @@ void JitArm::ps_madd(UGeckoInstruction inst) ARMReg vB1 = fpr.R1(b); ARMReg vC0 
= fpr.R0(c); ARMReg vC1 = fpr.R1(c); - ARMReg vD0 = fpr.R0(d); - ARMReg vD1 = fpr.R1(d); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); ARMReg V0 = fpr.GetReg(); ARMReg V1 = fpr.GetReg(); @@ -99,14 +98,35 @@ void JitArm::ps_sum0(UGeckoInstruction inst) ARMReg vA0 = fpr.R0(a); ARMReg vB1 = fpr.R1(b); ARMReg vC1 = fpr.R1(c); - ARMReg vD0 = fpr.R0(d); - ARMReg vD1 = fpr.R1(d); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); VADD(vD0, vA0, vB1); VMOV(vD1, vC1); - fpr.Flush(); + } +void JitArm::ps_sum1(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + + u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; + + if (inst.Rc) { + Default(inst); return; + } + ARMReg vA0 = fpr.R0(a); + ARMReg vB1 = fpr.R1(b); + ARMReg vC0 = fpr.R0(c); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + + VMOV(vD0, vC0); + VADD(vD1, vA0, vB1); +} + + void JitArm::ps_sub(UGeckoInstruction inst) { INSTRUCTION_START @@ -120,12 +140,11 @@ void JitArm::ps_sub(UGeckoInstruction inst) ARMReg vA1 = fpr.R1(a); ARMReg vB0 = fpr.R0(b); ARMReg vB1 = fpr.R1(b); - ARMReg vD0 = fpr.R0(d); - ARMReg vD1 = fpr.R1(d); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); VSUB(vD0, vA0, vB0); VSUB(vD1, vA1, vB1); - fpr.Flush(); } void JitArm::ps_mul(UGeckoInstruction inst) @@ -141,11 +160,210 @@ void JitArm::ps_mul(UGeckoInstruction inst) ARMReg vA1 = fpr.R1(a); ARMReg vC0 = fpr.R0(c); ARMReg vC1 = fpr.R1(c); - ARMReg vD0 = fpr.R0(d); - ARMReg vD1 = fpr.R1(d); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); VMUL(vD0, vA0, vC0); VMUL(vD1, vA1, vC1); - fpr.Flush(); } +void JitArm::ps_muls0(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + + u32 a = inst.FA, c = inst.FC, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + ARMReg vA0 = fpr.R0(a); + ARMReg vA1 = fpr.R1(a); + ARMReg vC0 = fpr.R0(c); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + ARMReg 
V0 = fpr.GetReg(); + ARMReg V1 = fpr.GetReg(); + + + VMUL(V0, vA0, vC0); + VMUL(V1, vA1, vC0); + VMOV(vD0, V0); + VMOV(vD1, V1); + + fpr.Unlock(V0); + fpr.Unlock(V1); +} + +void JitArm::ps_muls1(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + + u32 a = inst.FA, c = inst.FC, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + ARMReg vA0 = fpr.R0(a); + ARMReg vA1 = fpr.R1(a); + ARMReg vC1 = fpr.R1(c); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + ARMReg V0 = fpr.GetReg(); + ARMReg V1 = fpr.GetReg(); + + + VMUL(V0, vA0, vC1); + VMUL(V1, vA1, vC1); + VMOV(vD0, V0); + VMOV(vD1, V1); + + fpr.Unlock(V0); + fpr.Unlock(V1); +} + +void JitArm::ps_merge00(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 a = inst.FA, b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vA0 = fpr.R0(a); + ARMReg vB0 = fpr.R0(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VMOV(vD0, vA0); + VMOV(vD1, vB0); +} + +void JitArm::ps_merge01(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 a = inst.FA, b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vA0 = fpr.R0(a); + ARMReg vB1 = fpr.R1(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VMOV(vD0, vA0); + VMOV(vD1, vB1); +} + +void JitArm::ps_merge10(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 a = inst.FA, b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vA1 = fpr.R1(a); + ARMReg vB0 = fpr.R0(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VMOV(vD0, vA1); + VMOV(vD1, vB0); +} + +void JitArm::ps_merge11(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 a = inst.FA, b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vA1 = fpr.R1(a); + ARMReg vB1 = fpr.R1(b); + ARMReg vD0 = 
fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VMOV(vD0, vA1); + VMOV(vD1, vB1); +} + +void JitArm::ps_mr(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vB0 = fpr.R0(b); + ARMReg vB1 = fpr.R1(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VMOV(vD0, vB0); + VMOV(vD1, vB1); +} + +void JitArm::ps_neg(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vB0 = fpr.R0(b); + ARMReg vB1 = fpr.R1(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VNEG(vD0, vB0); + VNEG(vD1, vB1); +} + +void JitArm::ps_abs(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vB0 = fpr.R0(b); + ARMReg vB1 = fpr.R1(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + VABS(vD0, vB0); + VABS(vD1, vB1); +} + +void JitArm::ps_nabs(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff) + u32 b = inst.FB, d = inst.FD; + if (inst.Rc){ + Default(inst); return; + } + + ARMReg vB0 = fpr.R0(b); + ARMReg vB1 = fpr.R1(b); + ARMReg vD0 = fpr.R0(d, false); + ARMReg vD1 = fpr.R1(d, false); + ARMReg V0 = fpr.GetReg(); + + // XXX: Could be done quicker + VABS(vD0, vB0); + VMOV(V0, vD0); + VSUB(vD0, vD0, V0); + VSUB(vD0, vD0, V0); + VABS(vD1, vB1); + VMOV(V0, vD1); + VSUB(vD1, vD1, V0); + VSUB(vD1, vD1, V0); + + fpr.Unlock(V0); +} diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp index 338de355df..4a1aa9d28f 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp @@ -127,16 +127,16 @@ static GekkoOPTemplate table4[] = { //SUBOP10 {0, 
&JitArm::Default}, //"ps_cmpu0", OPTYPE_PS, FL_SET_CRn}}, {32, &JitArm::Default}, //"ps_cmpo0", OPTYPE_PS, FL_SET_CRn}}, - {40, &JitArm::Default}, //"ps_neg", OPTYPE_PS, FL_RC_BIT}}, - {136, &JitArm::Default}, //"ps_nabs", OPTYPE_PS, FL_RC_BIT}}, - {264, &JitArm::Default}, //"ps_abs", OPTYPE_PS, FL_RC_BIT}}, + {40, &JitArm::ps_neg}, //"ps_neg", OPTYPE_PS, FL_RC_BIT}}, + {136, &JitArm::ps_nabs}, //"ps_nabs", OPTYPE_PS, FL_RC_BIT}}, + {264, &JitArm::ps_abs}, //"ps_abs", OPTYPE_PS, FL_RC_BIT}}, {64, &JitArm::Default}, //"ps_cmpu1", OPTYPE_PS, FL_RC_BIT}}, - {72, &JitArm::Default}, //"ps_mr", OPTYPE_PS, FL_RC_BIT}}, + {72, &JitArm::ps_mr}, //"ps_mr", OPTYPE_PS, FL_RC_BIT}}, {96, &JitArm::Default}, //"ps_cmpo1", OPTYPE_PS, FL_RC_BIT}}, - {528, &JitArm::Default}, //"ps_merge00", OPTYPE_PS, FL_RC_BIT}}, - {560, &JitArm::Default}, //"ps_merge01", OPTYPE_PS, FL_RC_BIT}}, - {592, &JitArm::Default}, //"ps_merge10", OPTYPE_PS, FL_RC_BIT}}, - {624, &JitArm::Default}, //"ps_merge11", OPTYPE_PS, FL_RC_BIT}}, + {528, &JitArm::ps_merge00}, //"ps_merge00", OPTYPE_PS, FL_RC_BIT}}, + {560, &JitArm::ps_merge01}, //"ps_merge01", OPTYPE_PS, FL_RC_BIT}}, + {592, &JitArm::ps_merge10}, //"ps_merge10", OPTYPE_PS, FL_RC_BIT}}, + {624, &JitArm::ps_merge11}, //"ps_merge11", OPTYPE_PS, FL_RC_BIT}}, {1014, &JitArm::Default}, //"dcbz_l", OPTYPE_SYSTEM, 0}}, }; @@ -144,9 +144,9 @@ static GekkoOPTemplate table4[] = static GekkoOPTemplate table4_2[] = { {10, &JitArm::ps_sum0}, //"ps_sum0", OPTYPE_PS, 0}}, - {11, &JitArm::Default}, //"ps_sum1", OPTYPE_PS, 0}}, - {12, &JitArm::Default}, //"ps_muls0", OPTYPE_PS, 0}}, - {13, &JitArm::Default}, //"ps_muls1", OPTYPE_PS, 0}}, + {11, &JitArm::ps_sum1}, //"ps_sum1", OPTYPE_PS, 0}}, + {12, &JitArm::ps_muls0}, //"ps_muls0", OPTYPE_PS, 0}}, + {13, &JitArm::ps_muls1}, //"ps_muls1", OPTYPE_PS, 0}}, {14, &JitArm::Default}, //"ps_madds0", OPTYPE_PS, 0}}, {15, &JitArm::Default}, //"ps_madds1", OPTYPE_PS, 0}}, {18, &JitArm::Default}, //"ps_div", OPTYPE_PS, 0, 16}}, @@ 
-352,8 +352,8 @@ static GekkoOPTemplate table63[] = {14, &JitArm::Default}, //"fctiwx", OPTYPE_FPU, FL_RC_BIT_F}}, {15, &JitArm::Default}, //"fctiwzx", OPTYPE_FPU, FL_RC_BIT_F}}, {72, &JitArm::fmrx}, //"fmrx", OPTYPE_FPU, FL_RC_BIT_F}}, - {136, &JitArm::Default}, //"fnabsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {40, &JitArm::Default}, //"fnegx", OPTYPE_FPU, FL_RC_BIT_F}}, + {136, &JitArm::fnabsx}, //"fnabsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {40, &JitArm::fnegx}, //"fnegx", OPTYPE_FPU, FL_RC_BIT_F}}, {12, &JitArm::Default}, //"frspx", OPTYPE_FPU, FL_RC_BIT_F}}, {64, &JitArm::Default}, //"mcrfs", OPTYPE_SYSTEMFP, 0}}, diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.cpp index 359c497b3c..802f767f72 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.cpp @@ -35,7 +35,6 @@ void ArmFPRCache::Init(ARMXEmitter *emitter) ArmCRegs[a].Reg = PPCRegs[a]; ArmCRegs[a].LastLoad = 0; ArmCRegs[a].PS1 = false; - ArmCRegs[a].Away = true; } for(u8 a = 0; a < NUMARMREG; ++a) { @@ -43,14 +42,11 @@ void ArmFPRCache::Init(ARMXEmitter *emitter) ArmRegs[a].free = true; } } + void ArmFPRCache::Start(PPCAnalyst::BlockRegStats &stats) { - for(u8 a = 0; a < NUMPPCREG; ++a) - { - ArmCRegs[a].PPCReg = 33; - ArmCRegs[a].LastLoad = 0; - } } + ARMReg *ArmFPRCache::GetPPCAllocationOrder(int &count) { // This will return us the allocation order of the registers we can use on @@ -101,59 +97,78 @@ void ArmFPRCache::Unlock(ARMReg V0) } } } -ARMReg ArmFPRCache::GetPPCReg(u32 preg, bool PS1, bool preLoad) +u32 ArmFPRCache::GetLeastUsedRegister(bool increment) { u32 HighestUsed = 0; - u8 Num = 0; + u8 lastRegIndex = 0; for(u8 a = 0; a < NUMPPCREG; ++a){ - ++ArmCRegs[a].LastLoad; + if (increment) + ++ArmCRegs[a].LastLoad; if (ArmCRegs[a].LastLoad > HighestUsed) { HighestUsed = ArmCRegs[a].LastLoad; - Num = a; + lastRegIndex = a; } } - // Check if already Loaded - for(u8 a = 0; a < 
NUMPPCREG; ++a) - if (ArmCRegs[a].PPCReg == preg && ArmCRegs[a].PS1 == PS1) - { - ArmCRegs[a].LastLoad = 0; - // Check if the value is actually in the reg - if (ArmCRegs[a].Away && preLoad) - { - // Load it now since we want it - s16 offset = PPCSTATE_OFF(ps) + (preg * 16) + (PS1 ? 8 : 0); - emit->VLDR(ArmCRegs[a].Reg, R9, offset); - ArmCRegs[a].Away = false; - } - return ArmCRegs[a].Reg; - } - // Check if we have a free register + return lastRegIndex; +} +bool ArmFPRCache::FindFreeRegister(u32 &regindex) +{ for (u8 a = 0; a < NUMPPCREG; ++a) if (ArmCRegs[a].PPCReg == 33) { - s16 offset = PPCSTATE_OFF(ps) + (preg * 16) + (PS1 ? 8 : 0); - if (preLoad) - emit->VLDR(ArmCRegs[a].Reg, R9, offset); - ArmCRegs[a].PPCReg = preg; - ArmCRegs[a].LastLoad = 0; - ArmCRegs[a].PS1 = PS1; - ArmCRegs[a].Away = !preLoad; - return ArmCRegs[a].Reg; + regindex = a; + return true; } - // Alright, we couldn't get a free space, dump that least used register - s16 offsetOld = PPCSTATE_OFF(ps) + (ArmCRegs[Num].PPCReg * 16) + (ArmCRegs[Num].PS1 ? 8 : 0); - emit->VSTR(ArmCRegs[Num].Reg, R9, offsetOld); - - s16 offsetNew = PPCSTATE_OFF(ps) + (preg * 16) + (PS1 ? 8 : 0); - if (preLoad) - emit->VLDR(ArmCRegs[Num].Reg, R9, offsetNew); - ArmCRegs[Num].PPCReg = preg; - ArmCRegs[Num].LastLoad = 0; - ArmCRegs[Num].PS1 = PS1; - ArmCRegs[Num].Away = !preLoad; - return ArmCRegs[Num].Reg; + return false; +} +ARMReg ArmFPRCache::GetPPCReg(u32 preg, bool PS1, bool preLoad) +{ + u32 lastRegIndex = GetLeastUsedRegister(true); + + if (_regs[preg][PS1].GetType() != REG_NOTLOADED) + { + u8 a = _regs[preg][PS1].GetRegIndex(); + ArmCRegs[a].LastLoad = 0; + if (_regs[preg][PS1].GetType() == REG_AWAY && preLoad) + { + s16 offset = PPCSTATE_OFF(ps) + (preg * 16) + (PS1 ? 8 : 0); + emit->VLDR(ArmCRegs[a].Reg, R9, offset); + _regs[preg][PS1].LoadToReg(a); + } + return ArmCRegs[a].Reg; + } + + u32 regindex; + if (FindFreeRegister(regindex)) + { + s16 offset = PPCSTATE_OFF(ps) + (preg * 16) + (PS1 ? 
8 : 0); + emit->VLDR(ArmCRegs[regindex].Reg, R9, offset); + + ArmCRegs[regindex].PPCReg = preg; + ArmCRegs[regindex].LastLoad = 0; + + _regs[preg][PS1].LoadToReg(regindex); + return ArmCRegs[regindex].Reg; + } + + // Alright, we couldn't get a free space, dump that least used register + s16 offsetOld = PPCSTATE_OFF(ps) + (ArmCRegs[lastRegIndex].PPCReg * 16) + (ArmCRegs[lastRegIndex].PS1 ? 8 : 0); + s16 offsetNew = PPCSTATE_OFF(ps) + (preg * 16) + (PS1 ? 8 : 0); + + emit->VSTR(ArmCRegs[lastRegIndex].Reg, R9, offsetOld); + emit->VLDR(ArmCRegs[lastRegIndex].Reg, R9, offsetNew); + + _regs[ArmCRegs[lastRegIndex].PPCReg][PS1].Flush(); + + ArmCRegs[lastRegIndex].PPCReg = preg; + ArmCRegs[lastRegIndex].LastLoad = 0; + ArmCRegs[lastRegIndex].PS1 = PS1; + + _regs[preg][PS1].LoadToReg(lastRegIndex); + + return ArmCRegs[lastRegIndex].Reg; } ARMReg ArmFPRCache::R0(u32 preg, bool preLoad) @@ -168,14 +183,28 @@ ARMReg ArmFPRCache::R1(u32 preg, bool preLoad) void ArmFPRCache::Flush() { - for(u8 a = 0; a < NUMPPCREG; ++a) - if (ArmCRegs[a].PPCReg != 33) + for (u8 a = 0; a < 32; ++a) + { + if (_regs[a][0].GetType() == REG_REG) { - s16 offset = PPCSTATE_OFF(ps) + (ArmCRegs[a].PPCReg * 16) + (ArmCRegs[a].PS1 ? 
8 : 0); - emit->VSTR(ArmCRegs[a].Reg, R9, offset); - ArmCRegs[a].PPCReg = 33; - ArmCRegs[a].LastLoad = 0; - ArmCRegs[a].Away = true; + s16 offset = PPCSTATE_OFF(ps) + (a * 16); + u32 regindex = _regs[a][0].GetRegIndex(); + emit->VSTR(ArmCRegs[regindex].Reg, R9, offset); + + ArmCRegs[regindex].PPCReg = 33; + ArmCRegs[regindex].LastLoad = 0; + _regs[a][0].Flush(); } + if (_regs[a][1].GetType() == REG_REG) + { + s16 offset = PPCSTATE_OFF(ps) + (a * 16) + 8; + u32 regindex = _regs[a][1].GetRegIndex(); + emit->VSTR(ArmCRegs[regindex].Reg, R9, offset); + + ArmCRegs[regindex].PPCReg = 33; + ArmCRegs[regindex].LastLoad = 0; + _regs[a][1].Flush(); + } + } } diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.h b/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.h index b8c17f470f..10a3520d56 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.h +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitFPRCache.h @@ -29,6 +29,7 @@ using namespace ArmGen; class ArmFPRCache { private: + OpArg _regs[32][2]; // One for each FPR reg JRCPPC ArmCRegs[ARMFPUREGS]; JRCReg ArmRegs[ARMFPUREGS]; @@ -40,6 +41,8 @@ private: ARMReg GetPPCReg(u32 preg, bool PS1, bool preLoad); + u32 GetLeastUsedRegister(bool increment); + bool FindFreeRegister(u32 &regindex); protected: ARMXEmitter *emit; diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.cpp index be79bd4b71..562770e367 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.cpp @@ -126,10 +126,8 @@ bool ArmRegCache::FindFreeRegister(u32 &regindex) ARMReg ArmRegCache::R(u32 preg) { if (regs[preg].GetType() == REG_IMM) - { return BindToRegister(preg); - //asm ("bkpt #1;"); - } + u32 lastRegIndex = GetLeastUsedRegister(true); // Check if already Loaded diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.h b/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.h index 9eac9eeee8..d41af42d6e 100644 
--- a/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.h +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitRegCache.h @@ -38,55 +38,57 @@ using namespace ArmGen; enum RegType { REG_NOTLOADED = 0, - REG_REG, - REG_IMM, + REG_REG, // Reg type is register + REG_IMM, // Reg is really a IMM + REG_AWAY, // Bound to a register, but not preloaded }; class OpArg { private: - class Reg{ - public: - RegType m_type; - u8 m_reg; // index to register - u32 m_value; - Reg() - { - m_type = REG_NOTLOADED; - m_reg = 33; - m_value = 0; - } - } Reg; + RegType m_type; // store type + u8 m_reg; // index to register + u32 m_value; // IMM value public: - OpArg(){} + OpArg() + { + m_type = REG_NOTLOADED; + m_reg = 33; + m_value = 0; + } RegType GetType() { - return Reg.m_type; + return m_type; } u8 GetRegIndex() { - return Reg.m_reg; + return m_reg; } u32 GetImm() { - return Reg.m_value; + return m_value; + } + void LoadToAway(u8 reg) + { + m_type = REG_AWAY; + m_reg = reg; } void LoadToReg(u8 reg) { - Reg.m_type = REG_REG; - Reg.m_reg = reg; + m_type = REG_REG; + m_reg = reg; } void LoadToImm(u32 imm) { - Reg.m_type = REG_IMM; - Reg.m_value = imm; + m_type = REG_IMM; + m_value = imm; } void Flush() { - Reg.m_type = REG_NOTLOADED; + m_type = REG_NOTLOADED; } }; @@ -96,7 +98,6 @@ struct JRCPPC bool PS1; ARMReg Reg; // Tied to which ARM Register u32 LastLoad; - bool Away; // Only used in FPR cache }; struct JRCReg {