From a973c0bf5bdc983ee6547f8533ea0ff3d5c750b4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 15 Jun 2024 16:07:36 -0400 Subject: [PATCH] initial implementation of interlock cycles --- src/ARM.cpp | 13 ++- src/ARM.h | 41 ++++++++-- src/ARMInterpreter.cpp | 4 +- src/ARMInterpreter_ALU.cpp | 134 +++++++++++++++---------------- src/ARMInterpreter_LoadStore.cpp | 112 +++++++++++++++----------- 5 files changed, 179 insertions(+), 125 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index bac57879..899fe661 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -190,6 +190,8 @@ void ARM::Reset() BreakReq = false; #endif + memset(InterlockTimestamp, 0, sizeof(InterlockTimestamp)); + // zorp JumpTo(ExceptionBase); } @@ -1314,9 +1316,16 @@ void ARMv4::AddCycles_CD() Cycles += numC + numD; } } - u64 ARMv5::Timestamp() { return NDS.ARM9Timestamp; } - u64 ARMv4::Timestamp() { return NDS.ARM7Timestamp; } +u64& ARMv5::Timestamp() +{ + return NDS.ARM9Timestamp; +} + +u64& ARMv4::Timestamp() +{ + return NDS.ARM7Timestamp; +} u8 ARMv5::BusRead8(u32 addr) { diff --git a/src/ARM.h b/src/ARM.h index 9fb48930..ff857db9 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -30,6 +30,8 @@ #include "debug/GdbStub.h" #endif +#define INTERLOCK + namespace melonDS { inline u32 ROR(u32 x, u32 n) @@ -143,23 +145,46 @@ public: virtual void AddCycles_CDI() = 0; virtual void AddCycles_CD() = 0; - inline void AddCycles_L(const u8 reg1) + inline void AddCycles_L(const u32 delay, const u32 reg1) { - Cycles += InterlockTimestamp[reg1]; + if (InterlockTimestamp[reg1] > Timestamp() + delay) /* advance the clock only when reg1's interlock timestamp is still ahead of now + delay */ + Timestamp() = InterlockTimestamp[reg1]; + } + + inline void AddCycles_L(const u32 delay, const u32 reg1, const u32 reg2) + { + u64 cycles = std::max(InterlockTimestamp[reg1], InterlockTimestamp[reg2]); + if (cycles > Timestamp() + delay) + Timestamp() = cycles; + } + + inline void AddCycles_L(const u32 delay, const u32 reg1, const u32 reg2, const u32 reg3) + { + u64 cycles = 
std::max(InterlockTimestamp[reg1], std::max(InterlockTimestamp[reg2], InterlockTimestamp[reg3])); + if (cycles > Timestamp() + delay) + Timestamp() = cycles; } - inline void AddCycles_L(const u8 reg1, const u8 reg2) + // fetch the value of a register while handling any interlock cycles + inline u32 GetReg(const u32 reg, const u32 delay = 0) { - Cycles += std::max(InterlockTimestamp[reg1], InterlockTimestamp[reg2]); +#ifdef INTERLOCK + if (InterlockTimestamp[reg] > (Timestamp() + delay)) + Timestamp() = InterlockTimestamp[reg] - delay; +#endif + return R[reg]; } // Must be called after all of an instruction's cycles are calculated!!! - inline void SetCycles_L(const u8 reg, const u8 cycles, const u8 type) + inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) { +#ifdef INTERLOCK InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; + //InterlockType[reg] = type; +#endif } - virtual u64 Timestamp() = 0; + virtual u64& Timestamp() = 0; void CheckGdbIncoming(); @@ -326,7 +351,7 @@ public: // Cycles += numC + numD; } - u64 Timestamp() override; + u64& Timestamp() override; void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -443,7 +468,7 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; - u64 Timestamp() override; + u64& Timestamp() override; protected: u8 BusRead8(u32 addr) override; u16 BusRead16(u32 addr) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 5a09d210..5621876a 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -163,7 +163,7 @@ void A_MSR_REG(ARM* cpu) if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; - u32 val = cpu->R[cpu->CurInstr & 0xF]; + u32 val = cpu->GetReg(cpu->CurInstr & 0xF, 1); // bit4 is forced to 1 val |= 0x00000010; @@ -216,7 +216,7 @@ void A_MCR(ARM* cpu) u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; - u32 val = cpu->R[(cpu->CurInstr>>12)&0xF]; + u32 val = 
cpu->GetReg((cpu->CurInstr>>12)&0xF); if (((cpu->CurInstr>>12) & 0xF) == 15) val += 4; if (cpu->Num==0 && cp==15) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 0331aa08..ac18872b 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -160,14 +160,14 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) cpu->SetC(b & 0x80000000); #define A_CALC_OP2_REG_SHIFT_IMM(shiftop) \ - u32 b = cpu->R[cpu->CurInstr&0xF]; \ + u32 b = cpu->GetReg(cpu->CurInstr&0xF); \ u32 s = (cpu->CurInstr>>7)&0x1F; \ shiftop(b, s); #define A_CALC_OP2_REG_SHIFT_REG(shiftop) \ - u32 b = cpu->R[cpu->CurInstr&0xF]; \ + u32 b = cpu->GetReg(cpu->CurInstr&0xF); \ if ((cpu->CurInstr&0xF)==15) b += 4; \ - shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); + shiftop(b, (cpu->GetReg((cpu->CurInstr>>8)&0xF) & 0xFF)); #define A_IMPLEMENT_ALU_OP(x,s) \ @@ -313,7 +313,7 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -326,7 +326,7 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ } #define A_AND_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -344,7 +344,7 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a ^ b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -357,7 +357,7 @@ A_IMPLEMENT_ALU_OP(AND,_S) } #define A_EOR_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -375,7 +375,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB(c) \ - u32 a = 
cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -388,7 +388,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) } #define A_SUB_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -408,7 +408,7 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = b - a; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -421,7 +421,7 @@ A_IMPLEMENT_ALU_OP(SUB,) } #define A_RSB_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = b - a; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -441,7 +441,7 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -454,7 +454,7 @@ A_IMPLEMENT_ALU_OP(RSB,) } #define A_ADD_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -474,7 +474,7 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b + (cpu->CPSR&0x20000000 ? 1:0); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -487,7 +487,7 @@ A_IMPLEMENT_ALU_OP(ADD,) } #define A_ADC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res_tmp = a + b; \ u32 carry = (cpu->CPSR&0x20000000 ? 
1:0); \ u32 res = res_tmp + carry; \ @@ -509,7 +509,7 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -522,7 +522,7 @@ A_IMPLEMENT_ALU_OP(ADC,) } #define A_SBC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res_tmp = a - b; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ u32 res = res_tmp - carry; \ @@ -544,7 +544,7 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = b - a - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -557,7 +557,7 @@ A_IMPLEMENT_ALU_OP(SBC,) } #define A_RSC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res_tmp = b - a; \ u32 carry = (cpu->CPSR&0x20000000 ? 
0:1); \ u32 res = res_tmp - carry; \ @@ -579,7 +579,7 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -589,7 +589,7 @@ A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -599,7 +599,7 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -611,7 +611,7 @@ A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -623,7 +623,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a | b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -636,7 +636,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) } #define A_ORR_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a | b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -699,7 +699,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & ~b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -712,7 +712,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) } #define A_BIC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & ~b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -761,8 +761,8 @@ 
A_IMPLEMENT_ALU_OP(MVN,_S) void A_MUL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF); u32 res = rm * rs; @@ -791,9 +791,9 @@ void A_MUL(ARM* cpu) void A_MLA(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF); + u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF); u32 res = (rm * rs) + rn; @@ -822,8 +822,8 @@ void A_MLA(ARM* cpu) void A_UMULL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); u64 res = (u64)rm * (u64)rs; @@ -848,17 +848,17 @@ void A_UMULL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_UMLAL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); u64 res = (u64)rm * (u64)rs; - u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); + u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); // CHECKME: INTERLOCK? 
res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; @@ -887,8 +887,8 @@ void A_UMLAL(ARM* cpu) void A_SMULL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); s64 res = (s64)(s32)rm * (s64)(s32)rs; @@ -913,17 +913,17 @@ void A_SMULL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); s64 res = (s64)(s32)rm * (s64)(s32)rs; - s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); + s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); // CHECKME: INTERLOCK? 
res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; @@ -947,16 +947,16 @@ void A_SMLAL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1); if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -978,9 +978,9 @@ void A_SMLAWy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1); if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; @@ -1000,8 +1000,8 @@ void A_SMULxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -1019,8 +1019,8 @@ void A_SMULWy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; @@ -1036,8 +1036,8 @@ void A_SMLALxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = 
cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 0); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 0); // yeah this one actually doesn't need two interlock cycles to interlock if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -1053,7 +1053,7 @@ void A_SMLALxy(ARM* cpu) cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); cpu->AddCycles_CI(1); - cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); + cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); } @@ -1062,7 +1062,7 @@ void A_CLZ(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 val = cpu->R[cpu->CurInstr & 0xF]; + u32 val = cpu->GetReg(cpu->CurInstr & 0xF, 1); u32 res = 0; while ((val & 0xFF000000) == 0) @@ -1086,8 +1086,8 @@ void A_QADD(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); u32 res = rm + rn; if (OverflowAdd(rm, rn)) @@ -1105,8 +1105,8 @@ void A_QSUB(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); u32 res = rm - rn; if (OverflowSub(rm, rn)) @@ -1124,8 +1124,8 @@ void A_QDADD(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); if (OverflowAdd(rn, rn)) { @@ -1151,8 +1151,8 @@ void A_QDSUB(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 
16) & 0xF, 1); if (OverflowAdd(rn, rn)) { diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4e93c749..a11e912d 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -53,7 +53,7 @@ namespace melonDS::ARMInterpreter if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_WB_CALC_OFFSET_REG(shiftop) \ - u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ + u32 offset = cpu->GetReg(cpu->CurInstr & 0xF); \ u32 shift = ((cpu->CurInstr>>7)&0x1F); \ shiftop(offset, shift); \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; @@ -61,8 +61,8 @@ namespace melonDS::ARMInterpreter #define A_STR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(offset, storeval); \ @@ -72,8 +72,8 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(addr, storeval); \ @@ -82,8 +82,8 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(offset, storeval); \ cpu->AddCycles_CD(); \ @@ -92,8 +92,8 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STRB_POST \ - u32 addr = 
cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(addr, storeval); \ cpu->AddCycles_CD(); \ @@ -101,7 +101,7 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -120,7 +120,7 @@ namespace melonDS::ARMInterpreter // TODO: user mode #define A_LDR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -138,7 +138,7 @@ namespace melonDS::ARMInterpreter } #define A_LDRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -153,7 +153,7 @@ namespace melonDS::ARMInterpreter // TODO: user mode #define A_LDRB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -242,14 +242,14 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_HD_CALC_OFFSET_REG \ - u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ + u32 offset = cpu->GetReg(cpu->CurInstr & 0xF); \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_STRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) 
& 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(offset, storeval); \ cpu->AddCycles_CD(); \ @@ -257,8 +257,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(addr, storeval); \ cpu->AddCycles_CD(); \ @@ -269,7 +269,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD \ if (cpu->Num != 0) return; \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (offset, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ @@ -287,7 +287,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD_POST \ if (cpu->Num != 0) return; \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (addr, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ @@ -305,11 +305,11 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD \ if (cpu->Num != 0) return; \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ - u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + bool dataabort = !cpu->DataWrite32(offset, cpu->GetReg(r)); /* yes, this data abort behavior is on purpose */ \ + u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if 
(r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ if (dataabort) return; \ @@ -317,18 +317,18 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD_POST \ if (cpu->Num != 0) return; \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ - u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + bool dataabort = !cpu->DataWrite32(addr, cpu->GetReg(r)); \ + u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -342,7 +342,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -356,7 +356,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -371,7 +371,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = 
cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -386,7 +386,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -401,7 +401,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -452,8 +452,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) void A_SWP(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 base = cpu->GetReg((cpu->CurInstr >> 16) & 0xF); if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; @@ -468,9 +468,18 @@ void A_SWP(ARM* cpu) if (rd != 15) { cpu->R[rd] = ROR(val, 8*(base&0x3)); - cpu->SetCycles_L(rd, 1, cpu->ILT_Norm); // TODO: it adds an extra interlock cycle when doing a misaligned load from a non-itcm address + + u32 cycles; + if (base & 3) // add an extra interlock cycle when doing a misaligned load from a non-itcm address (checkme: does it matter whether you're executing from there?) + { + if (cpu->Num == 1) cycles = 2; // checkme + else cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; + } + else cycles = 1; + + cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); } - else if (cpu->Num==1) // for some reason these jumps don't work on the arm 9? + else if (cpu->Num == 1) // for some reason these jumps don't work on the arm 9? 
cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); } else cpu->AddCycles_CDI(); @@ -481,8 +490,8 @@ void A_SWP(ARM* cpu) void A_SWPB(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF] & 0xFF; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1) & 0xFF; + u32 base = cpu->GetReg((cpu->CurInstr >> 16) & 0xF); if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; @@ -497,9 +506,15 @@ void A_SWPB(ARM* cpu) if (rd != 15) { cpu->R[rd] = val; - cpu->SetCycles_L(rd, 1, cpu->ILT_Norm); // TODO: it adds an extra interlock cycle when doing a load from a non-itcm address + + // add an extra interlock cycle when doing a load from a non-itcm address (checkme: does it matter whether you're executing from there?) + u32 cycles; + if (cpu->Num == 1) cycles = 2; // checkme + else cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; + + cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); } - else if (cpu->Num==1)// for some reason these jumps don't work on the arm 9? + else if (cpu->Num == 1)// for some reason these jumps don't work on the arm 9? cpu->JumpTo(val & ~1); } else cpu->AddCycles_CDI(); @@ -513,12 +528,12 @@ void A_SWPB(ARM* cpu) void A_LDM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; - u32 base = cpu->R[baseid]; + u32 base = cpu->GetReg(baseid, 1); u32 wbbase; u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - u8 lastreg = 0; // TODO: this doesn't support 0 reg LDMs (do those even work?) + u32 lastreg = 0; // TODO: this doesn't support 0 reg LDMs (do those even work?) 
if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -554,8 +569,8 @@ void A_LDM(ARM* cpu) } first = false; - if (!preinc) base += 4; lastreg = i; + if (!preinc) base += 4; } } @@ -578,7 +593,12 @@ void A_LDM(ARM* cpu) else { cpu->AddCycles_CDI(); - cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); // TODO: THIS DOESN'T APPLY WHEN LOADING FROM ITCM + + u32 lastbase = base; + if (!preinc) lastbase -= 4; + // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) + if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) + cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); } // switch back to previous regs @@ -628,7 +648,7 @@ void A_LDM(ARM* cpu) void A_STM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; - u32 base = cpu->R[baseid]; + u32 base = cpu->GetReg(baseid, 1); u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; @@ -672,7 +692,7 @@ void A_STM(ARM* cpu) val = oldbase; else val = base; } - else val = cpu->R[i]; + else val = cpu->GetReg(i, 1+cpu->DataCycles); if (i == 15) val+=4;