From 109bbed3d0959b07c03c0bde36118f685497cb6f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:44:38 -0400 Subject: [PATCH] improve ldm timings I believe this also applies to other loads as well, but currently untested. --- src/ARM.cpp | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/ARM.h | 13 +------------ src/CP15.cpp | 19 ++++++++++-------- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 899fe661..cb72dad5 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -302,6 +302,10 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) u32 oldregion = R[15] >> 24; u32 newregion = addr >> 24; + + if (addr < ITCMSize) CodeRegion = Mem9_ITCM; + else if ((addr & DTCMMask) == DTCMBase) CodeRegion = Mem9_DTCM; + else CodeRegion = NDS.ARM9Regions[addr >> 14]; RegionCodeCycles = MemTimings[addr >> 12][0]; @@ -1255,6 +1259,57 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) } +void ARMv5::AddCycles_CDI() +{ + // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numD = DataCycles; + + // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early + s32 early; + switch (DataRegion) + { + case 0: // background region; CHECKME + case Mem9_DTCM: + case Mem9_BIOS: + case Mem9_WRAM: + case Mem9_IO: + case Mem9_Pal: // CHECKME + default: + early = 2; + break; + + case Mem9_OAM: // CHECKME + case Mem9_GBAROM: + case Mem9_GBARAM: + early = 4; + break; + + case Mem9_MainRAM: + early = (CodeRegion == Mem9_MainRAM) ? 0 : 4; + break; + + case Mem9_VRAM: // the dsi can toggle the bus width of vram between 32 and 16 bit + early = (NDS.ConsoleType == 0 || !(((DSi&)NDS).SCFG_EXT[0] & (1<<13))) ? 4 : 2; + break; + + case Mem9_ITCM: // itcm data fetches cannot be done at the same time as a code fetch, it'll even incur a 1 cycle penalty when executing from itcm + early = (CodeRegion == Mem9_ITCM) ? 
-1 : 0; + break; + } + + if (numD > early) + { + numC -= early; + if (numC < 0) numC = 0; + Cycles += numC + numD; + } + else + { + Cycles += numC; + } +} + void ARMv4::AddCycles_C() { // code only. this code fetch is sequential. diff --git a/src/ARM.h b/src/ARM.h index 3ef0d439..25a96ef2 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -325,18 +325,7 @@ public: Cycles += numC + numI; } - void AddCycles_CDI() override - { - // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. - // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; - } + void AddCycles_CDI() override; void AddCycles_CD() override { diff --git a/src/CP15.cpp b/src/CP15.cpp index 7b11696b..319ac9c4 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -815,22 +815,23 @@ bool ARMv5::DataRead8(u32 addr, u32* val) return false; } - DataRegion = addr; - if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } - + *val = BusRead8(addr); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -843,24 +844,25 @@ bool ARMv5::DataRead16(u32 addr, u32* val) return false; } - DataRegion = addr; - addr &= ~1; if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } - + *val = BusRead16(addr); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -873,24 +875,25 @@ bool 
ARMv5::DataRead32(u32 addr, u32* val) return false; } - DataRegion = addr; - addr &= ~3; if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } *val = BusRead32(addr); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][2]; return true; }