new block cache and much more...

- more reliable detection of code invalidation
- blocks are no longer cut at every branch; unconditional branches are
followed where possible to build larger blocks (see the sketch after
this list)
- idle loop recognition
- optimised literal loads, load/store cycle counting,
and loads/stores from constant addresses
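
Roughly what "following a branch" means for the block scanner, as a minimal
self-contained sketch: a direct unconditional branch with a statically known
target just moves the scan cursor instead of terminating the block. The
ScannedInstr type, the Decode() helper and the fake code memory below are
invented for illustration (ARM mode only, 4-byte instructions); they are not
melonDS structures, and the real scanner handles conditions, Thumb and more.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

using u32 = uint32_t;

// Hypothetical pre-decoded instruction record (not melonDS' FetchedInstr).
struct ScannedInstr
{
    u32 Addr;
    bool UncondBranch; // direct unconditional branch: target known statically
    u32 Target;        // only valid if UncondBranch
    bool EndsBlock;    // conditional/indirect control flow: stop scanning
};

// Fake "code memory" keyed by address, standing in for a real fetch+decode.
static std::unordered_map<u32, ScannedInstr> gCode;

static ScannedInstr Decode(u32 addr) { return gCode.at(addr); }

// Instead of cutting the block at the first branch, follow direct
// unconditional branches so one compiled block spans the jump.
static std::vector<ScannedInstr> ScanBlock(u32 pc, size_t maxInstrs)
{
    std::vector<ScannedInstr> block;
    while (block.size() < maxInstrs)
    {
        ScannedInstr instr = Decode(pc);
        block.push_back(instr);
        if (instr.EndsBlock)
            break;                              // can't follow, block ends
        pc = instr.UncondBranch ? instr.Target  // follow the jump
                                : pc + 4;       // fall through (ARM mode)
    }
    return block;
}

int main()
{
    gCode[0x100] = {0x100, false, 0,     false};
    gCode[0x104] = {0x104, true,  0x200, false}; // B 0x200: followed
    gCode[0x200] = {0x200, false, 0,     false};
    gCode[0x204] = {0x204, false, 0,     true};  // e.g. BX LR: block ends

    for (const ScannedInstr& i : ScanBlock(0x100, 32))
        std::printf("block instr at %03X\n", (unsigned)i.Addr);
    // prints 100, 104, 200, 204: the jump at 0x104 didn't end the block
}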
RSDuck committed 2019-10-03 01:10:59 +02:00
parent 5338c28f40
commit a687be9879
19 changed files with 1557 additions and 696 deletions

src/ARMJIT.h

@@ -9,142 +9,67 @@
 namespace ARMJIT
 {
 
-typedef u32 (*CompiledBlock)();
-
-struct FetchedInstr
+enum ExeMemKind
 {
-    u32 A_Reg(int pos) const
-    {
-        return (Instr >> pos) & 0xF;
-    }
-
-    u32 T_Reg(int pos) const
-    {
-        return (Instr >> pos) & 0x7;
-    }
-
-    u32 Cond() const
-    {
-        return Instr >> 28;
-    }
-
-    u8 SetFlags;
-    u32 Instr;
-    u32 NextInstr[2];
-    u32 Addr;
-
-    u8 CodeCycles;
-
-    ARMInstrInfo::Info Info;
+    exeMem_Unmapped = 0,
+    exeMem_ITCM,
+    exeMem_MainRAM,
+    exeMem_SWRAM,
+    exeMem_LCDC,
+    exeMem_ARM9_BIOS,
+    exeMem_ARM7_BIOS,
+    exeMem_ARM7_WRAM,
+    exeMem_ARM7_WVRAM,
+    exeMem_Count
 };
 
-/*
-    Copied from DeSmuME
-    Some names where changed to match the nomenclature of melonDS
+extern const u32 ExeMemRegionOffsets[];
+extern const u32 ExeMemRegionSizes[];
 
-    Since it's nowhere explained and atleast I needed some time to get behind it,
-    here's a summary on how it works:
-        more or less all memory locations from which code can be executed are
-        represented by an array of function pointers, which point to null or
-        a function which executes a block instructions starting from there.
+typedef u32 (*JitBlockEntry)();
 
-        The most significant 4 bits of each address is ignored. This 28 bit space is
-        divided into 0x2000 32 KB for ARM9 and 0x4000 16 KB for ARM7, each of which
-        a pointer to the relevant place inside the afore mentioned arrays. 32 and 16 KB
-        are the sizes of the smallest contigous memory region mapped to the respective CPU.
-        Because ARM addresses are always aligned to 4 bytes and Thumb to a 2 byte boundary,
-        we only need every second half word to be adressable.
+extern u32 AddrTranslate9[0x2000];
+extern u32 AddrTranslate7[0x4000];
 
-        In case a memory write hits mapped memory, the function block at this
-        address is set to null, so it's recompiled the next time it's executed.
-
-        This method has disadvantages, namely that only writing to the
-        first instruction of a block marks it as invalid and that memory remapping
-        (SWRAM and VRAM) isn't taken into account.
-*/
-
-struct BlockCache
-{
-    CompiledBlock* AddrMapping9[0x2000] = {0};
-    CompiledBlock* AddrMapping7[0x4000] = {0};
-
-    CompiledBlock MainRAM[4*1024*1024/2];
-    CompiledBlock SWRAM[0x8000/2]; // Shared working RAM
-    CompiledBlock ARM9_ITCM[0x8000/2];
-    CompiledBlock ARM9_LCDC[0xA4000/2];
-    CompiledBlock ARM9_BIOS[0x8000/2];
-    CompiledBlock ARM7_BIOS[0x4000/2];
-    CompiledBlock ARM7_WRAM[0x10000/2]; // dedicated ARM7 WRAM
-    CompiledBlock ARM7_WVRAM[0x40000/2]; // VRAM allocated as Working RAM
-};
-
-extern BlockCache cache;
+const u32 ExeMemSpaceSize = 0x518000; // I hate you C++, sometimes I really hate you...
+extern JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2];
 
 template <u32 num>
 inline bool IsMapped(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] >= ExeMemRegionSizes[exeMem_Unmapped];
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] >= ExeMemRegionSizes[exeMem_Unmapped];
 }
 
 template <u32 num>
-inline CompiledBlock LookUpBlock(u32 addr)
+inline u32 TranslateAddr(u32 addr)
 {
     if (num == 0)
-        return cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1];
+        return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
     else
-        return cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1];
+        return AddrTranslate7[(addr & 0xFFFFFFF) >> 14] + (addr & 0x3FFF);
 }
 
 template <u32 num>
-inline void Invalidate16(u32 addr)
+inline JitBlockEntry LookUpBlock(u32 addr)
 {
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-            cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = NULL;
-        else
-            cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = NULL;
-    }
-}
-
-template <u32 num>
-inline void Invalidate32(u32 addr)
-{
-    if (IsMapped<num>(addr))
-    {
-        if (num == 0)
-        {
-            CompiledBlock* page = cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15];
-            page[(addr & 0x7FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x7FFF) >> 1] = NULL;
-        }
-        else
-        {
-            CompiledBlock* page = cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14];
-            page[(addr & 0x3FFF) >> 1] = NULL;
-            page[((addr + 2) & 0x3FFF) >> 1] = NULL;
-        }
-    }
-}
-
-template <u32 num>
-inline void InsertBlock(u32 addr, CompiledBlock func)
-{
-    if (num == 0)
-        cache.AddrMapping9[(addr & 0xFFFFFFF) >> 15][(addr & 0x7FFF) >> 1] = func;
-    else
-        cache.AddrMapping7[(addr & 0xFFFFFFF) >> 14][(addr & 0x3FFF) >> 1] = func;
+    return FastBlockAccess[TranslateAddr<num>(addr) / 2];
 }
 
 void Init();
 void DeInit();
 
-CompiledBlock CompileBlock(ARM* cpu);
+void InvalidateByAddr(u32 pseudoPhysical);
+void InvalidateAll();
 
-void InvalidateBlockCache();
+void InvalidateITCM(u32 addr);
+void InvalidateByAddr7(u32 addr);
+
+void CompileBlock(ARM* cpu);
+
+void ResetBlockCache();
 
 }
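
To make the new lookup path above easier to follow, here is a minimal
self-contained sketch of the pseudo-physical scheme, specialised to the ARM9
(num == 0) case. The function bodies mirror TranslateAddr/LookUpBlock from
this header; the table contents in main() are made-up values (the real
region tables are filled in elsewhere in the commit), and this sketch treats
offset 0 as "unmapped" instead of the ExeMemRegionSizes[exeMem_Unmapped]
comparison used by the real IsMapped.

#include <cstdint>
#include <cstdio>

using u32 = uint32_t;
typedef u32 (*JitBlockEntry)();

// Shapes as in the header above; the values below are illustrative only.
const u32 ExeMemSpaceSize = 0x518000;
static u32 AddrTranslate9[0x2000];                         // one entry per 32 KB ARM9 page
static JitBlockEntry FastBlockAccess[ExeMemSpaceSize / 2]; // one slot per halfword

// Assumption: offset 0 marks an unmapped page in this sketch.
static inline bool IsMapped9(u32 addr)
{
    return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] != 0;
}

// Page table entry gives the page's base offset in the linear
// pseudo-physical space; the low 15 bits index within the page.
static inline u32 TranslateAddr9(u32 addr)
{
    return AddrTranslate9[(addr & 0xFFFFFFF) >> 15] + (addr & 0x7FFF);
}

// One flat array lookup per block dispatch, as in the header above.
static inline JitBlockEntry LookUpBlock9(u32 addr)
{
    return FastBlockAccess[TranslateAddr9(addr) / 2];
}

static u32 DummyBlock() { return 42; }

int main()
{
    // Pretend the 32 KB page holding 0x02000000 (main RAM) starts at
    // pseudo-physical offset 0x8000 (a made-up mapping).
    AddrTranslate9[(0x02000000 & 0xFFFFFFF) >> 15] = 0x8000;

    u32 pc = 0x02000074;
    FastBlockAccess[TranslateAddr9(pc) / 2] = DummyBlock; // compile + insert
    // A write to pc would null this slot again (cf. InvalidateByAddr).

    if (IsMapped9(pc))
        std::printf("block at %08X -> %u\n", (unsigned)pc,
                    (unsigned)LookUpBlock9(pc)());
}

The apparent point of the design, as far as this header shows: every
executable region owns a fixed slice of one linear space, so a block is keyed
by the memory it was compiled from rather than the CPU address it was reached
through; mirrored or remapped pages can share translate-table entries, and
InvalidateByAddr() operates directly on the translated pseudo-physical
offset.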