From 8903df7300fd76ca7202e884032107f2fb0b8734 Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 18:34:10 -0800 Subject: [PATCH] MMU: simplify code to restore original data register after failed load Instead of passing the value around constantly, just store it in the regcache, note where it is, and restore it on the exception path. This saves a whole bunch of pushing and popping and gives a ~5% speed boost in Rebel Strike. It's a bit ugly, but it simplifies a lot of code and is faster, too. --- Source/Core/Core/HW/Memmap.h | 7 -- Source/Core/Core/HW/MemmapFunctions.cpp | 93 ++++--------------- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 12 ++- .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 4 +- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 2 +- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 16 +++- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 7 +- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 4 + .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 5 +- .../PowerPC/JitCommon/TrampolineCache.cpp | 41 +++++--- 10 files changed, 89 insertions(+), 102 deletions(-) diff --git a/Source/Core/Core/HW/Memmap.h b/Source/Core/Core/HW/Memmap.h index c22d2b6a67..603f1dc10b 100644 --- a/Source/Core/Core/HW/Memmap.h +++ b/Source/Core/Core/HW/Memmap.h @@ -96,13 +96,6 @@ u16 Read_U16(const u32 address); u32 Read_U32(const u32 address); u64 Read_U64(const u32 address); -u32 Read_S8_Val(const u32 address, u32 var); -u32 Read_U8_Val(const u32 address, u32 var); -u32 Read_S16_Val(const u32 address, u32 var); -u32 Read_U16_Val(const u32 address, u32 var); -u32 Read_U32_Val(const u32 address, u32 var); -u64 Read_U64_Val(const u32 address, u64 var); - // Useful helper functions, used by ARM JIT float Read_F32(const u32 address); double Read_F64(const u32 address); diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 04736a650c..8fbe067a9f 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ 
-92,8 +92,8 @@ static u32 EFB_Read(const u32 addr) static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite); -template -__forceinline void ReadFromHardware(U &_var, const u32 em_address) +template +__forceinline T ReadFromHardware(const u32 em_address) { int segment = em_address >> 28; // Quick check for an address that can't meet any of the following conditions, @@ -104,33 +104,28 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) if ((em_address & 0xC8000000) == 0xC8000000) { if (em_address < 0xcc000000) - _var = EFB_Read(em_address); + return EFB_Read(em_address); else - _var = (T)mmio_mapping->Read::type>(em_address); - return; + return (T)mmio_mapping->Read::type>(em_address); } else if (segment == 0x8 || segment == 0xC || segment == 0x0) { - _var = bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); - return; + return bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); } else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) { - _var = bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); - return; + return bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); } else if (segment == 0xE && (em_address < (0xE0000000 + L1_CACHE_SIZE))) { - _var = bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); - return; + return bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); } } if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) { // fake VMEM - _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); - return; + return bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); } // MMU: Do page table translation @@ -139,7 +134,7 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) { if (flag == FLAG_READ) GenerateDSIException(em_address, false); - return; + return 0; } // Handle loads that cross page boundaries (ewwww) @@ -157,20 +152,20 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) { if (flag == FLAG_READ) 
GenerateDSIException(em_address_next_page, false); - return; + return 0; } - _var = 0; + T var = 0; for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++) { if (addr == em_address_next_page) tlb_addr = tlb_addr_next_page; - _var = (_var << 8) | Memory::base[tlb_addr]; + var = (var << 8) | Memory::base[tlb_addr]; } - return; + return var; } // The easy case! - _var = bswap(*(const T*)&Memory::base[tlb_addr]); + return bswap(*(const T*)&Memory::base[tlb_addr]); } @@ -331,32 +326,28 @@ static __forceinline void Memcheck(u32 address, u32 var, bool write, int size) u8 Read_U8(const u32 address) { - u8 var = 0; - ReadFromHardware(var, address); + u8 var = ReadFromHardware(address); Memcheck(address, var, false, 1); return (u8)var; } u16 Read_U16(const u32 address) { - u16 var = 0; - ReadFromHardware(var, address); + u16 var = ReadFromHardware(address); Memcheck(address, var, false, 2); return (u16)var; } u32 Read_U32(const u32 address) { - u32 var = 0; - ReadFromHardware(var, address); + u32 var = ReadFromHardware(address); Memcheck(address, var, false, 4); return var; } u64 Read_U64(const u32 address) { - u64 var = 0; - ReadFromHardware(var, address); + u64 var = ReadFromHardware(address); Memcheck(address, (u32)var, false, 8); return var; } @@ -385,48 +376,6 @@ float Read_F32(const u32 address) return cvt.d; } -u32 Read_U8_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 1); - return var; -} - -u32 Read_S8_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 1); - return var; -} - -u32 Read_U16_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 2); - return var; -} - -u32 Read_S16_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 2); - return var; -} - -u32 Read_U32_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - 
Memcheck(address, var, false, 4); - return var; -} - -u64 Read_U64_Val(const u32 address, u64 var) -{ - ReadFromHardware(var, address); - Memcheck(address, (u32)var, false, 8); - return var; -} - u32 Read_U8_ZX(const u32 address) { return (u32)Read_U8(address); @@ -489,16 +438,14 @@ void Write_F64(const double var, const u32 address) } u8 ReadUnchecked_U8(const u32 address) { - u8 var = 0; - ReadFromHardware(var, address); + u8 var = ReadFromHardware(address); return var; } u32 ReadUnchecked_U32(const u32 address) { - u32 var = 0; - ReadFromHardware(var, address); + u32 var = ReadFromHardware(address); return var; } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 436364d0de..e10256f491 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -615,6 +615,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.downcountAmount += opinfo->numCycles; js.fastmemLoadStore = NULL; js.fixupExceptionHandler = false; + js.revertGprLoad = -1; + js.revertFprLoad = -1; if (i == (code_block.m_num_instructions - 1)) { @@ -787,8 +789,14 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc exceptionHandlerAtLoc[js.fastmemLoadStore] = GetWritableCodePtr(); } - gpr.Flush(FLUSH_MAINTAIN_STATE); - fpr.Flush(FLUSH_MAINTAIN_STATE); + BitSet32 gprToFlush = BitSet32::AllTrue(32); + BitSet32 fprToFlush = BitSet32::AllTrue(32); + if (js.revertGprLoad >= 0) + gprToFlush[js.revertGprLoad] = false; + if (js.revertFprLoad >= 0) + fprToFlush[js.revertFprLoad] = false; + gpr.Flush(FLUSH_MAINTAIN_STATE, gprToFlush); + fpr.Flush(FLUSH_MAINTAIN_STATE, fprToFlush); // If a memory exception occurs, the exception handler will read // from PC. Update PC with the latest value in case that happens. 
diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index f91694ba9e..334c46379e 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -401,7 +401,7 @@ void FPURegCache::StoreRegister(size_t preg, OpArg newLoc) emit->MOVAPD(newLoc, regs[preg].location.GetSimpleReg()); } -void RegCache::Flush(FlushMode mode) +void RegCache::Flush(FlushMode mode, BitSet32 regsToFlush) { for (unsigned int i = 0; i < xregs.size(); i++) { @@ -409,7 +409,7 @@ void RegCache::Flush(FlushMode mode) PanicAlert("Someone forgot to unlock X64 reg %u", i); } - for (unsigned int i = 0; i < regs.size(); i++) + for (unsigned int i : regsToFlush) { if (regs[i].locked) { diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 3943e83852..0e2f2ea687 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -81,7 +81,7 @@ public: LockX(reg1); LockX(reg2); } - void Flush(FlushMode mode = FLUSH_ALL); + void Flush(FlushMode mode = FLUSH_ALL, BitSet32 regsToFlush = BitSet32::AllTrue(32)); void Flush(PPCAnalyst::CodeOp *op) {Flush();} int SanityCheck() const; void KillImmediate(size_t preg, bool doLoad, bool makeDirty); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 1edd6b3868..c322c2248f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -246,9 +246,23 @@ void Jit64::lXXx(UGeckoInstruction inst) } gpr.Lock(a, b, d); + if (update && storeAddress) gpr.BindToRegister(a, true, true); - gpr.BindToRegister(d, js.memcheck, true); + + // A bit of an evil hack here. 
We need to retain the original value of this register for the + // exception path, but we'd rather not needlessly pass it around if we don't have to, since + // the exception path is very rare. So we store the value in the regcache, let the load path + // clobber it, then restore the value in the exception path. + // TODO: no other load has to do this at the moment, since no other loads go directly to the + // target registers, but if that ever changes, we need to do it there too. + if (js.memcheck) + { + gpr.StoreFromRegister(d); + js.revertGprLoad = d; + } + gpr.BindToRegister(d, false, true); + BitSet32 registersInUse = CallerSavedRegistersInUse(); // We need to save the (usually scratch) address register for the update. if (update && storeAddress) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 4cfbc3b756..bc61136a6c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -66,7 +66,12 @@ void Jit64::lfXXX(UGeckoInstruction inst) } fpr.Lock(d); - fpr.BindToRegister(d, js.memcheck || !single); + if (js.memcheck && single) + { + fpr.StoreFromRegister(d); + js.revertFprLoad = d; + } + fpr.BindToRegister(d, !single); BitSet32 registersInUse = CallerSavedRegistersInUse(); if (update && js.memcheck) registersInUse[RSCRATCH2] = true; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 5a526f8f48..cb79f3f511 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -79,6 +79,10 @@ protected: // so just fixup that branch instead of testing for a DSI again. bool fixupExceptionHandler; Gen::FixupBranch exceptionHandler; + // If these are set (>= 0), they hold the index of a register whose old value we stored before a + // load overwrote it, which lets us revert that register on the exception path. 
+ int revertGprLoad; + int revertFprLoad; bool firstFPInstructionFound; bool isLastInstruction; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index fef3e90677..1209e2bd46 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -302,10 +302,7 @@ FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, B void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags) { - if (!jit->js.memcheck) - { - registersInUse[reg_value] = false; - } + registersInUse[reg_value] = false; if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && !opAddress.IsImm() && !(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM)) diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index b91a0f13ca..f5bbea78dc 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -42,39 +42,58 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B const u8* trampoline = GetCodePtr(); X64Reg addrReg = (X64Reg)info.scaledReg; X64Reg dataReg = (X64Reg)info.regOperandReg; - registersInUse[addrReg] = true; - registersInUse[dataReg] = false; + int stack_offset = 0; + bool push_param1 = registersInUse[ABI_PARAM1]; - ABI_PushRegistersAndAdjustStack(registersInUse, 0); + if (push_param1) + { + PUSH(ABI_PARAM1); + stack_offset = 8; + registersInUse[ABI_PARAM1] = 0; + } int dataRegSize = info.operandSize == 8 ? 
64 : 32; - MOVTwo(dataRegSize, ABI_PARAM1, addrReg, info.displacement, ABI_PARAM2, dataReg); + if (addrReg != ABI_PARAM1 && info.displacement) + LEA(32, ABI_PARAM1, MDisp(addrReg, info.displacement)); + else if (addrReg != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(addrReg)); + else if (info.displacement) + ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); + + ABI_PushRegistersAndAdjustStack(registersInUse, stack_offset); switch (info.operandSize) { case 8: - CALL((void *)&Memory::Read_U64_Val); + CALL((void *)&Memory::Read_U64); break; case 4: - CALL((void *)&Memory::Read_U32_Val); + CALL((void *)&Memory::Read_U32); break; case 2: - CALL(info.signExtend ? (void *)&Memory::Read_S16_Val : (void *)&Memory::Read_U16_Val); + CALL((void *)&Memory::Read_U16); break; case 1: - CALL(info.signExtend ? (void *)&Memory::Read_S8_Val : (void *)&Memory::Read_U8_Val); + CALL((void *)&Memory::Read_U8); break; } - if (dataReg != ABI_RETURN) - MOV(dataRegSize, R(dataReg), R(ABI_RETURN)); + ABI_PopRegistersAndAdjustStack(registersInUse, stack_offset); + + if (push_param1) + POP(ABI_PARAM1); - ABI_PopRegistersAndAdjustStack(registersInUse, 0); if (exceptionHandler) { TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); J_CC(CC_NZ, exceptionHandler); } + + if (info.signExtend) + MOVSX(dataRegSize, info.operandSize * 8, dataReg, R(ABI_RETURN)); + else if (dataReg != ABI_RETURN || info.operandSize < 4) + MOVZX(dataRegSize, info.operandSize * 8, dataReg, R(ABI_RETURN)); + JMP(returnPtr, true); return trampoline; }