diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp index 3b9339bcf0..120e718d6b 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp @@ -405,6 +405,8 @@ void fresx(UGeckoInstruction _inst) if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD)); } +// #define USE_ACCURATE_FRSQRTEX + void frsqrtex(UGeckoInstruction _inst) { double b = rPS0(_inst.FB); @@ -415,6 +417,7 @@ void frsqrtex(UGeckoInstruction _inst) } else { +#ifdef USE_ACCURATE_FRSQRTEX if (b == 0.0) { SetFPException(FPSCR_ZX); riPS0(_inst.FD) = 0x7ff0000000000000; @@ -436,6 +439,11 @@ void frsqrtex(UGeckoInstruction _inst) outa |= frsqrtex_lut[idx] >> 12; riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb; } +#else + if (b == 0.0) + SetFPException(FPSCR_ZX); + rPS0(_inst.FD) = ForceDouble(1.0 / sqrt(b)); +#endif } UpdateFPRF(rPS0(_inst.FD)); if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index c6c82fc08c..374e23bef8 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -443,7 +443,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.blockStart = em_address; js.fifoBytesThisBlock = 0; js.curBlock = b; - js.blockSetsQuantizers = false; js.block_flags = 0; js.cancel = false; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index 40d68e0655..f79c3510bd 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -110,7 +110,6 @@ private: int block_flags; bool isLastInstruction; - bool blockSetsQuantizers; int fifoBytesThisBlock; @@ -247,6 +246,7 @@ public: void ps_muls(UGeckoInstruction inst); void fp_arith_s(UGeckoInstruction inst); + void frsqrtex(UGeckoInstruction inst); void fcmpx(UGeckoInstruction inst); void fmrx(UGeckoInstruction inst); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index 95b06e85b9..663fc67891 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -71,6 +71,9 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEm fpr.UnlockAll(); } + +static const double one_const = 1.0f; + void Jit64::fp_arith_s(UGeckoInstruction inst) { INSTRUCTION_START @@ -79,9 +82,23 @@ void Jit64::fp_arith_s(UGeckoInstruction inst) Default(inst); return; } + if (inst.SUBOP5 == 26) { + // frsqrtex + int d = inst.FD; + int b = inst.FB; + fpr.Lock(b, d); + fpr.LoadToX64(d, true, true); + MOVSD(XMM0, M((void *)&one_const)); + SQRTSD(XMM1, fpr.R(b)); + DIVSD(XMM0, R(XMM1)); + MOVSD(fpr.R(d), XMM0); + fpr.UnlockAll(); + return; + } + if (inst.SUBOP5 != 18 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21 && inst.SUBOP5 != 25) { - Default(inst); return; + Default(inst); return; } // Only the interpreter has "proper" support for (some) FP flags @@ -253,3 +270,5 @@ void Jit64::fcmpx(UGeckoInstruction inst) SetJumpTarget(continue3); fpr.UnlockAll(); } + + diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index ae3d091720..793163cca3 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -275,9 +275,7 @@ void Jit64::stfs(UGeckoInstruction inst) MOV(32, gpr.R(a), R(ABI_PARAM2)); } CVTSD2SS(XMM0, fpr.R(s)); - MOVSS(M(&temp32), XMM0); - MOV(32, R(ABI_PARAM1), M(&temp32)); - SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0); + SafeWriteFloatToReg(XMM0, ABI_PARAM2); gpr.UnlockAll(); gpr.UnlockAllX(); fpr.UnlockAll(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 2238cd627e..650f4d1b86 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -51,9 +51,8 @@ void Jit64::psq_st(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(LoadStorePaired) - js.block_flags |= BLOCK_USE_GQR0 << inst.I; - if (js.blockSetsQuantizers || !inst.RA) + if (!inst.RA) { // TODO: Support these cases if it becomes necessary. Default(inst); @@ -105,12 +104,13 @@ void Jit64::psq_st(UGeckoInstruction inst) MOV(32, gpr.R(a), R(ECX)); MOVZX(32, 16, EAX, M(&PowerPC::ppcState.spr[SPR_GQR0 + inst.I])); MOVZX(32, 8, EDX, R(AL)); - // FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]! + // FIXME: Fix ModR/M encoding to allow [EDX*4+disp32] without a base register! #ifdef _M_IX86 - SHL(32, R(EDX), Imm8(2)); + int addr_shift = 2; #else - SHL(32, R(EDX), Imm8(3)); + int addr_shift = 3; #endif + SHL(32, R(EDX), Imm8(addr_shift)); if (inst.W) { // One value XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions. @@ -130,14 +130,20 @@ void Jit64::psq_l(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(LoadStorePaired) - js.block_flags |= BLOCK_USE_GQR0 << inst.I; - - if (js.blockSetsQuantizers || !inst.RA || inst.W) + if (!inst.RA) { Default(inst); return; } + const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); + + if (inst.W) { + // PanicAlert("Single ps load: %i %i", gqr.ST_TYPE, gqr.ST_SCALE); + Default(inst); + return; + } + bool update = inst.OPCD == 57; int offset = inst.SIMM_12; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp index 8db22dff9e..40990a7f52 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -52,7 +52,6 @@ void Jit64::mtspr(UGeckoInstruction inst) case SPR_GQR0 + 5: case SPR_GQR0 + 6: case SPR_GQR0 + 7: - js.blockSetsQuantizers = true; // Prevent recompiler from compiling in old quantizer values. // If the value changed, destroy all blocks using this quantizer // This will create a little bit of block churn, but hopefully not too bad. diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp index 4f69c56c8f..c8ea9ad9a3 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp @@ -397,8 +397,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB js.blockStart = em_address; js.fifoBytesThisBlock = 0; js.curBlock = b; - js.blockSetsQuantizers = false; - js.block_flags = 0; js.cancel = false; //Analyze the block, collect all instructions it is made of (including inlining, @@ -464,7 +462,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB // Perform actual code generation WriteCode(); - b->flags = js.block_flags; b->codeSize = (u32)(GetCodePtr() - normalEntry); b->originalSize = size; return normalEntry; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h index b6b163a113..de85869993 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h @@ -98,10 +98,8 @@ private: UGeckoInstruction next_inst; // for easy peephole opt. int instructionNumber; int downcountAmount; - int block_flags; bool isLastInstruction; - bool blockSetsQuantizers; bool forceUnsafeLoad; int fifoBytesThisBlock; diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp index 2ffd19100a..3d34917bdd 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp @@ -299,6 +299,9 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { // Easy! const u8* storeSingleFloat = AlignCode4(); + SafeWriteFloatToReg(XMM0, ECX); + RET(); + /* if (cpu_info.bSSSE3) { PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); // TODO: SafeWriteFloat @@ -309,8 +312,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { MOVSS(M(&psTemp[0]), XMM0); MOV(32, R(EAX), M(&psTemp[0])); SafeWriteRegToReg(EAX, ECX, 32, 0, true); - } - RET(); + }*/ const u8* storeSingleU8 = AlignCode4(); // Used by MKWii SHR(32, R(EAX), Imm8(6)); @@ -336,8 +338,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { const u8* storeSingleU16 = AlignCode4(); // Used by MKWii SHR(32, R(EAX), Imm8(6)); MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS)); - PUNPCKLDQ(XMM1, R(XMM1)); - MULPS(XMM0, R(XMM1)); + MULSS(XMM0, R(XMM1)); PXOR(XMM1, R(XMM1)); MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_65535)); diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h index 20f9d759bb..76c2e08290 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitCache.h @@ -42,13 +42,6 @@ #define JIT_ICACHE_INVALID_BYTE 0x14 #define JIT_ICACHE_INVALID_WORD 0x14141414 - -enum BlockFlag -{ - BLOCK_USE_GQR0 = 0x1, BLOCK_USE_GQR1 = 0x2, BLOCK_USE_GQR2 = 0x4, BLOCK_USE_GQR3 = 0x8, - BLOCK_USE_GQR4 = 0x10, BLOCK_USE_GQR5 = 0x20, BLOCK_USE_GQR6 = 0x40, BLOCK_USE_GQR7 = 0x80, -}; - // TODO(ector) - optimize this struct for size struct JitBlock { diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp index 5f9251ece1..265c121fc7 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp @@ -18,6 +18,7 @@ #include "Common.h" #include "Thunk.h" +#include "CPUDetect.h" #include "../PowerPC.h" #include "../../Core.h" #include "../../HW/GPFifo.h" @@ -139,6 +140,36 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce SetJumpTarget(arg2); } +static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static u32 GC_ALIGNED16(float_buffer); + +void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr) +{ + TEST(32, R(reg_addr), Imm32(0x0C000000)); + if (false && cpu_info.bSSSE3) { + // This path should be faster but for some reason it causes errors so I've disabled it. + FixupBranch argh = J_CC(CC_Z); + MOVSS(M(&float_buffer), xmm_value); + MOV(32, R(EAX), M(&float_buffer)); + BSWAP(32, EAX); + ABI_CallFunctionRR(thunks.ProtectFunction(((void *)&Memory::Write_U32), 2), EAX, reg_addr); + FixupBranch arg2 = J(); + SetJumpTarget(argh); + PSHUFB(xmm_value, M((void *)pbswapShuffle1x4)); + #ifdef _M_IX86 + AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK)); + MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value); + #else + MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value); + #endif + SetJumpTarget(arg2); + } else { + MOVSS(M(&float_buffer), xmm_value); + MOV(32, R(EAX), M(&float_buffer)); + SafeWriteRegToReg(EAX, reg_addr, 32, 0, true); + } +} + void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address) { #ifdef _M_X64 diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h index 4fad3db64a..dbc730df17 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.h @@ -29,6 +29,9 @@ public: void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false); void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, bool swap = true); + // Trashes both inputs and EAX. + void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr); + void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address); void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address); void JitClearCA();