From def5666419a04c22c61925f96d2c970a0bb01d72 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 13 Jun 2021 14:22:59 +0200 Subject: [PATCH 1/4] JitArm64: Fix FPRF handling of denormal singles The interpreter was wrong after all. Hardware verified. --- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 0fe91bd34d..0f09176fda 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -455,25 +455,12 @@ void JitArm64::GenerateFPRF(bool single) FixupBranch nan_or_inf = B(CCFlags::CC_EQ); // exp != 0 && exp != EXP_MASK - const u8* normal = GetCodePtr(); emit_write_fprf_and_ret(); // exp == 0 SetJumpTarget(zero_or_denormal); TSTI2R(input_reg, INPUT_FRAC_MASK); - FixupBranch denormal; - if (single) - { - // To match the interpreter, what we output should be based on how the input would be classified - // after conversion to double. Converting a denormal single to a double always results in a - // normal double, so for denormal singles we need to output PPC_FPCLASS_PN/PPC_FPCLASS_NN. - // TODO: Hardware test that the interpreter actually is correct. 
- B(CCFlags::CC_NEQ, normal); - } - else - { - denormal = B(CCFlags::CC_NEQ); - } + FixupBranch denormal = B(CCFlags::CC_NEQ); // exp == 0 && frac == 0 LSR(ARM64Reg::W1, fprf_reg, 3); @@ -483,8 +470,7 @@ void JitArm64::GenerateFPRF(bool single) emit_write_fprf_and_ret(); // exp == 0 && frac != 0 - if (!single) - SetJumpTarget(denormal); + SetJumpTarget(denormal); ORRI2R(fprf_reg, fprf_reg, Common::PPC_FPCLASS_PD & ~OUTPUT_SIGN_MASK); B(write_fprf_and_ret); From d56721ebb9910cd14115f778c57c58a6fd0ca366 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 13 Jun 2021 14:35:04 +0200 Subject: [PATCH 2/4] Interpreter: Fix FPRF handling of denormal singles --- Source/Core/Common/FloatUtils.h | 1 - .../Interpreter/Interpreter_FloatingPoint.cpp | 40 +++++++++---------- .../Interpreter/Interpreter_Paired.cpp | 32 +++++++-------- Source/Core/Core/PowerPC/PowerPC.cpp | 7 +++- Source/Core/Core/PowerPC/PowerPC.h | 3 +- .../UnitTests/Core/PowerPC/JitArm64/FPRF.cpp | 6 +-- 6 files changed, 47 insertions(+), 42 deletions(-) diff --git a/Source/Core/Common/FloatUtils.h b/Source/Core/Common/FloatUtils.h index 7fca042ace..7480cdcfea 100644 --- a/Source/Core/Common/FloatUtils.h +++ b/Source/Core/Common/FloatUtils.h @@ -87,7 +87,6 @@ enum PPCFpClass // Uses PowerPC conventions for the return value, so it can be easily // used directly in CPU emulation. u32 ClassifyDouble(double dvalue); -// More efficient float version. 
u32 ClassifyFloat(float fvalue); struct BaseAndDec diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp index 66ccff82af..d02e8ae71b 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp @@ -302,7 +302,7 @@ void Interpreter::frspx(UGeckoInstruction inst) // round to single if (!is_snan || FPSCR.VE == 0) { rPS(inst.FD).Fill(rounded); - PowerPC::UpdateFPRF(b); + PowerPC::UpdateFPRFSingle(rounded); } FPSCR.ClearFIFR(); @@ -311,7 +311,7 @@ void Interpreter::frspx(UGeckoInstruction inst) // round to single { SetFI(&FPSCR, b != rounded); FPSCR.FR = fabs(rounded) > fabs(b); - PowerPC::UpdateFPRF(rounded); + PowerPC::UpdateFPRFSingle(rounded); rPS(inst.FD).Fill(rounded); } @@ -333,7 +333,7 @@ void Interpreter::fmulx(UGeckoInstruction inst) rPS(inst.FD).SetPS0(result); FPSCR.FI = 0; // are these flags important? 
FPSCR.FR = 0; - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -354,7 +354,7 @@ void Interpreter::fmulsx(UGeckoInstruction inst) rPS(inst.FD).Fill(result); FPSCR.FI = 0; FPSCR.FR = 0; - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -372,7 +372,7 @@ void Interpreter::fmaddx(UGeckoInstruction inst) { const double result = ForceDouble(FPSCR, product.value); rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -395,7 +395,7 @@ void Interpreter::fmaddsx(UGeckoInstruction inst) rPS(inst.FD).Fill(result); FPSCR.FI = d_value.value != result; FPSCR.FR = 0; - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -413,7 +413,7 @@ void Interpreter::faddx(UGeckoInstruction inst) { const double result = ForceDouble(FPSCR, sum.value); rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -430,7 +430,7 @@ void Interpreter::faddsx(UGeckoInstruction inst) { const double result = ForceSingle(FPSCR, sum.value); rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -450,7 +450,7 @@ void Interpreter::fdivx(UGeckoInstruction inst) { const double result = ForceDouble(FPSCR, quotient.value); rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } // FR,FI,OX,UX??? 
@@ -470,7 +470,7 @@ void Interpreter::fdivsx(UGeckoInstruction inst) { const double result = ForceSingle(FPSCR, quotient.value); rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -485,7 +485,7 @@ void Interpreter::fresx(UGeckoInstruction inst) const auto compute_result = [inst](double value) { const double result = Common::ApproximateReciprocal(value); rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); }; if (b == 0.0) @@ -523,7 +523,7 @@ void Interpreter::frsqrtex(UGeckoInstruction inst) const auto compute_result = [inst](double value) { const double result = Common::ApproximateReciprocalSquareRoot(value); rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); }; if (b < 0.0) @@ -574,7 +574,7 @@ void Interpreter::fmsubx(UGeckoInstruction inst) { const double result = ForceDouble(FPSCR, product.value); rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -594,7 +594,7 @@ void Interpreter::fmsubsx(UGeckoInstruction inst) { const double result = ForceSingle(FPSCR, product.value); rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -615,7 +615,7 @@ void Interpreter::fnmaddx(UGeckoInstruction inst) const double result = std::isnan(tmp) ? tmp : -tmp; rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -637,7 +637,7 @@ void Interpreter::fnmaddsx(UGeckoInstruction inst) const double result = std::isnan(tmp) ? tmp : -tmp; rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -658,7 +658,7 @@ void Interpreter::fnmsubx(UGeckoInstruction inst) const double result = std::isnan(tmp) ? 
tmp : -tmp; rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -680,7 +680,7 @@ void Interpreter::fnmsubsx(UGeckoInstruction inst) const double result = std::isnan(tmp) ? tmp : -tmp; rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) @@ -698,7 +698,7 @@ void Interpreter::fsubx(UGeckoInstruction inst) { const double result = ForceDouble(FPSCR, difference.value); rPS(inst.FD).SetPS0(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFDouble(result); } if (inst.Rc) @@ -716,7 +716,7 @@ void Interpreter::fsubsx(UGeckoInstruction inst) { const double result = ForceSingle(FPSCR, difference.value); rPS(inst.FD).Fill(result); - PowerPC::UpdateFPRF(result); + PowerPC::UpdateFPRFSingle(result); } if (inst.Rc) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp index 89d51eabb7..140433892c 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp @@ -117,7 +117,7 @@ void Interpreter::ps_div(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_div(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -145,7 +145,7 @@ void Interpreter::ps_res(UGeckoInstruction inst) const double ps1 = Common::ApproximateReciprocal(b); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -178,7 +178,7 @@ void Interpreter::ps_rsqrte(UGeckoInstruction inst) const double dst_ps1 = ForceSingle(FPSCR, Common::ApproximateReciprocalSquareRoot(ps1)); rPS(inst.FD).SetBoth(dst_ps0, dst_ps1); - PowerPC::UpdateFPRF(dst_ps0); + PowerPC::UpdateFPRFSingle(dst_ps0); if (inst.Rc) 
PowerPC::ppcState.UpdateCR1(); @@ -193,7 +193,7 @@ void Interpreter::ps_sub(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_sub(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -208,7 +208,7 @@ void Interpreter::ps_add(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -226,7 +226,7 @@ void Interpreter::ps_mul(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c1).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -247,7 +247,7 @@ void Interpreter::ps_msub(UGeckoInstruction inst) ForceSingle(FPSCR, NI_msub(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -268,7 +268,7 @@ void Interpreter::ps_madd(UGeckoInstruction inst) ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -292,7 +292,7 @@ void Interpreter::ps_nmsub(UGeckoInstruction inst) const double ps1 = std::isnan(tmp1) ? tmp1 : -tmp1; rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -316,7 +316,7 @@ void Interpreter::ps_nmadd(UGeckoInstruction inst) const double ps1 = std::isnan(tmp1) ? 
tmp1 : -tmp1; rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -332,7 +332,7 @@ void Interpreter::ps_sum0(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, c.PS1AsDouble()); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -348,7 +348,7 @@ void Interpreter::ps_sum1(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps1); + PowerPC::UpdateFPRFSingle(ps1); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -364,7 +364,7 @@ void Interpreter::ps_muls0(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c0).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -380,7 +380,7 @@ void Interpreter::ps_muls1(UGeckoInstruction inst) const double ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c1).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -399,7 +399,7 @@ void Interpreter::ps_madds0(UGeckoInstruction inst) ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c0, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); @@ -418,7 +418,7 @@ void Interpreter::ps_madds1(UGeckoInstruction inst) ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); - PowerPC::UpdateFPRF(ps0); + PowerPC::UpdateFPRFSingle(ps0); if (inst.Rc) PowerPC::ppcState.UpdateCR1(); diff --git a/Source/Core/Core/PowerPC/PowerPC.cpp b/Source/Core/Core/PowerPC/PowerPC.cpp index 
73eac23a94..817276f8d7 100644 --- a/Source/Core/Core/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/PowerPC/PowerPC.cpp @@ -626,11 +626,16 @@ void PowerPCState::SetSR(u32 index, u32 value) // FPSCR update functions -void UpdateFPRF(double dvalue) +void UpdateFPRFDouble(double dvalue) { FPSCR.FPRF = Common::ClassifyDouble(dvalue); } +void UpdateFPRFSingle(float fvalue) +{ + FPSCR.FPRF = Common::ClassifyFloat(fvalue); +} + void RoundingModeUpdated() { // The rounding mode is separate for each thread, so this must run on the CPU thread diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index 73919161f2..fdc0f16a09 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -304,7 +304,8 @@ inline void SetXER_OV(bool value) SetXER_SO(value); } -void UpdateFPRF(double dvalue); +void UpdateFPRFDouble(double dvalue); +void UpdateFPRFSingle(float fvalue); void RoundingModeUpdated(); diff --git a/Source/UnitTests/Core/PowerPC/JitArm64/FPRF.cpp b/Source/UnitTests/Core/PowerPC/JitArm64/FPRF.cpp index 7bbdffdf64..270cc1b05d 100644 --- a/Source/UnitTests/Core/PowerPC/JitArm64/FPRF.cpp +++ b/Source/UnitTests/Core/PowerPC/JitArm64/FPRF.cpp @@ -74,14 +74,14 @@ TEST(JitArm64, FPRF) for (const u64 double_input : double_test_values) { const u32 expected_double = - RunUpdateFPRF([&] { PowerPC::UpdateFPRF(Common::BitCast<double>(double_input)); }); + RunUpdateFPRF([&] { PowerPC::UpdateFPRFDouble(Common::BitCast<double>(double_input)); }); const u32 actual_double = RunUpdateFPRF([&] { test.fprf_double(double_input); }); EXPECT_EQ(expected_double, actual_double); const u32 single_input = ConvertToSingle(double_input); - const u32 expected_single = RunUpdateFPRF( - [&] { PowerPC::UpdateFPRF(Common::BitCast<double>(ConvertToDouble(single_input))); }); + const u32 expected_single = + RunUpdateFPRF([&] { PowerPC::UpdateFPRFSingle(Common::BitCast<float>(single_input)); }); const u32 actual_single = RunUpdateFPRF([&] { test.fprf_single(single_input); }); 
EXPECT_EQ(expected_single, actual_single); } From 8d2c069c3454fee6180483f3bf29fe496d71c5f8 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 13 Jun 2021 14:45:09 +0200 Subject: [PATCH 3/4] Interpreter: Return float from ForceSingle Performance optimization, along with making the code a little neater. Saves us from performing a single -> double -> single conversion when calling UpdateFPRFSingle. --- .../PowerPC/Interpreter/Interpreter_FPUtils.h | 6 +- .../Interpreter/Interpreter_FloatingPoint.cpp | 22 +++--- .../Interpreter/Interpreter_Paired.cpp | 76 +++++++++---------- 3 files changed, 47 insertions(+), 57 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index bedc3085d3..37a355338e 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -36,15 +36,13 @@ inline void SetFPException(UReg_FPSCR* fpscr, u32 mask) fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0; } -inline double ForceSingle(const UReg_FPSCR& fpscr, double value) +inline float ForceSingle(const UReg_FPSCR& fpscr, double value) { - // convert to float... 
- float x = (float)value; + float x = static_cast<float>(value); if (!cpu_info.bFlushToZero && fpscr.NI) { x = Common::FlushToZero(x); } - // ...and back to double: return x; } diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp index d02e8ae71b..1c0ff8f3fd 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FloatingPoint.cpp @@ -290,7 +290,7 @@ void Interpreter::fselx(UGeckoInstruction inst) void Interpreter::frspx(UGeckoInstruction inst) // round to single { const double b = rPS(inst.FB).PS0AsDouble(); - const double rounded = ForceSingle(FPSCR, b); + const float rounded = ForceSingle(FPSCR, b); if (std::isnan(b)) { @@ -349,7 +349,7 @@ void Interpreter::fmulsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || d_value.HasNoInvalidExceptions()) { - const double result = ForceSingle(FPSCR, d_value.value); + const float result = ForceSingle(FPSCR, d_value.value); rPS(inst.FD).Fill(result); FPSCR.FI = 0; @@ -390,7 +390,7 @@ void Interpreter::fmaddsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || d_value.HasNoInvalidExceptions()) { - const double result = ForceSingle(FPSCR, d_value.value); + const float result = ForceSingle(FPSCR, d_value.value); rPS(inst.FD).Fill(result); FPSCR.FI = d_value.value != result; @@ -428,7 +428,7 @@ void Interpreter::faddsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || sum.HasNoInvalidExceptions()) { - const double result = ForceSingle(FPSCR, sum.value); + const float result = ForceSingle(FPSCR, sum.value); rPS(inst.FD).Fill(result); PowerPC::UpdateFPRFSingle(result); } @@ -468,7 +468,7 @@ void Interpreter::fdivsx(UGeckoInstruction inst) if (not_divide_by_zero && not_invalid) { - const double result = ForceSingle(FPSCR, quotient.value); + const float result = ForceSingle(FPSCR, quotient.value); rPS(inst.FD).Fill(result); PowerPC::UpdateFPRFSingle(result); } @@ -592,7 +592,7 @@ 
void Interpreter::fmsubsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || product.HasNoInvalidExceptions()) { - const double result = ForceSingle(FPSCR, product.value); + const float result = ForceSingle(FPSCR, product.value); rPS(inst.FD).Fill(result); PowerPC::UpdateFPRFSingle(result); } @@ -633,8 +633,8 @@ void Interpreter::fnmaddsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || product.HasNoInvalidExceptions()) { - const double tmp = ForceSingle(FPSCR, product.value); - const double result = std::isnan(tmp) ? tmp : -tmp; + const float tmp = ForceSingle(FPSCR, product.value); + const float result = std::isnan(tmp) ? tmp : -tmp; rPS(inst.FD).Fill(result); PowerPC::UpdateFPRFSingle(result); @@ -676,8 +676,8 @@ void Interpreter::fnmsubsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || product.HasNoInvalidExceptions()) { - const double tmp = ForceSingle(FPSCR, product.value); - const double result = std::isnan(tmp) ? tmp : -tmp; + const float tmp = ForceSingle(FPSCR, product.value); + const float result = std::isnan(tmp) ? 
tmp : -tmp; rPS(inst.FD).Fill(result); PowerPC::UpdateFPRFSingle(result); @@ -714,7 +714,7 @@ void Interpreter::fsubsx(UGeckoInstruction inst) if (FPSCR.VE == 0 || difference.HasNoInvalidExceptions()) { - const double result = ForceSingle(FPSCR, difference.value); + const float result = ForceSingle(FPSCR, difference.value); rPS(inst.FD).Fill(result); PowerPC::UpdateFPRFSingle(result); } diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp index 140433892c..20e5405236 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Paired.cpp @@ -113,8 +113,8 @@ void Interpreter::ps_div(UGeckoInstruction inst) const auto& a = rPS(inst.FA); const auto& b = rPS(inst.FB); - const double ps0 = ForceSingle(FPSCR, NI_div(&FPSCR, a.PS0AsDouble(), b.PS0AsDouble()).value); - const double ps1 = ForceSingle(FPSCR, NI_div(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, NI_div(&FPSCR, a.PS0AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, NI_div(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -174,8 +174,8 @@ void Interpreter::ps_rsqrte(UGeckoInstruction inst) if (Common::IsSNAN(ps0) || Common::IsSNAN(ps1)) SetFPException(&FPSCR, FPSCR_VXSNAN); - const double dst_ps0 = ForceSingle(FPSCR, Common::ApproximateReciprocalSquareRoot(ps0)); - const double dst_ps1 = ForceSingle(FPSCR, Common::ApproximateReciprocalSquareRoot(ps1)); + const float dst_ps0 = ForceSingle(FPSCR, Common::ApproximateReciprocalSquareRoot(ps0)); + const float dst_ps1 = ForceSingle(FPSCR, Common::ApproximateReciprocalSquareRoot(ps1)); rPS(inst.FD).SetBoth(dst_ps0, dst_ps1); PowerPC::UpdateFPRFSingle(dst_ps0); @@ -189,8 +189,8 @@ void Interpreter::ps_sub(UGeckoInstruction inst) const auto& a = rPS(inst.FA); const auto& b = rPS(inst.FB); - 
const double ps0 = ForceSingle(FPSCR, NI_sub(&FPSCR, a.PS0AsDouble(), b.PS0AsDouble()).value); - const double ps1 = ForceSingle(FPSCR, NI_sub(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, NI_sub(&FPSCR, a.PS0AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, NI_sub(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -204,8 +204,8 @@ void Interpreter::ps_add(UGeckoInstruction inst) const auto& a = rPS(inst.FA); const auto& b = rPS(inst.FB); - const double ps0 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS0AsDouble()).value); - const double ps1 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS1AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -222,8 +222,8 @@ void Interpreter::ps_mul(UGeckoInstruction inst) const double c0 = Force25Bit(c.PS0AsDouble()); const double c1 = Force25Bit(c.PS1AsDouble()); - const double ps0 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS0AsDouble(), c0).value); - const double ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c1).value); + const float ps0 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS0AsDouble(), c0).value); + const float ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c1).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -241,10 +241,8 @@ void Interpreter::ps_msub(UGeckoInstruction inst) const double c0 = Force25Bit(c.PS0AsDouble()); const double c1 = Force25Bit(c.PS1AsDouble()); - const double ps0 = - ForceSingle(FPSCR, NI_msub(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const double ps1 = - ForceSingle(FPSCR, NI_msub(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, 
NI_msub(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, NI_msub(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -262,10 +260,8 @@ void Interpreter::ps_madd(UGeckoInstruction inst) const double c0 = Force25Bit(c.PS0AsDouble()); const double c1 = Force25Bit(c.PS1AsDouble()); - const double ps0 = - ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const double ps1 = - ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -283,13 +279,13 @@ void Interpreter::ps_nmsub(UGeckoInstruction inst) const double c0 = Force25Bit(c.PS0AsDouble()); const double c1 = Force25Bit(c.PS1AsDouble()); - const double tmp0 = + const float tmp0 = ForceSingle(FPSCR, NI_msub(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const double tmp1 = + const float tmp1 = ForceSingle(FPSCR, NI_msub(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); - const double ps0 = std::isnan(tmp0) ? tmp0 : -tmp0; - const double ps1 = std::isnan(tmp1) ? tmp1 : -tmp1; + const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0; + const float ps1 = std::isnan(tmp1) ? 
tmp1 : -tmp1; rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -307,13 +303,13 @@ void Interpreter::ps_nmadd(UGeckoInstruction inst) const double c0 = Force25Bit(c.PS0AsDouble()); const double c1 = Force25Bit(c.PS1AsDouble()); - const double tmp0 = + const float tmp0 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const double tmp1 = + const float tmp1 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); - const double ps0 = std::isnan(tmp0) ? tmp0 : -tmp0; - const double ps1 = std::isnan(tmp1) ? tmp1 : -tmp1; + const float ps0 = std::isnan(tmp0) ? tmp0 : -tmp0; + const float ps1 = std::isnan(tmp1) ? tmp1 : -tmp1; rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -328,8 +324,8 @@ void Interpreter::ps_sum0(UGeckoInstruction inst) const auto& b = rPS(inst.FB); const auto& c = rPS(inst.FC); - const double ps0 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS1AsDouble()).value); - const double ps1 = ForceSingle(FPSCR, c.PS1AsDouble()); + const float ps0 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS1AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, c.PS1AsDouble()); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -344,8 +340,8 @@ void Interpreter::ps_sum1(UGeckoInstruction inst) const auto& b = rPS(inst.FB); const auto& c = rPS(inst.FC); - const double ps0 = ForceSingle(FPSCR, c.PS0AsDouble()); - const double ps1 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, c.PS0AsDouble()); + const float ps1 = ForceSingle(FPSCR, NI_add(&FPSCR, a.PS0AsDouble(), b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps1); @@ -360,8 +356,8 @@ void Interpreter::ps_muls0(UGeckoInstruction inst) const auto& c = rPS(inst.FC); const double c0 = Force25Bit(c.PS0AsDouble()); - const double ps0 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS0AsDouble(), 
c0).value); - const double ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c0).value); + const float ps0 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS0AsDouble(), c0).value); + const float ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c0).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -376,8 +372,8 @@ void Interpreter::ps_muls1(UGeckoInstruction inst) const auto& c = rPS(inst.FC); const double c1 = Force25Bit(c.PS1AsDouble()); - const double ps0 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS0AsDouble(), c1).value); - const double ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c1).value); + const float ps0 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS0AsDouble(), c1).value); + const float ps1 = ForceSingle(FPSCR, NI_mul(&FPSCR, a.PS1AsDouble(), c1).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -393,10 +389,8 @@ void Interpreter::ps_madds0(UGeckoInstruction inst) const auto& c = rPS(inst.FC); const double c0 = Force25Bit(c.PS0AsDouble()); - const double ps0 = - ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); - const double ps1 = - ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c0, b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c0, b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c0, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); @@ -412,10 +406,8 @@ void Interpreter::ps_madds1(UGeckoInstruction inst) const auto& c = rPS(inst.FC); const double c1 = Force25Bit(c.PS1AsDouble()); - const double ps0 = - ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c1, b.PS0AsDouble()).value); - const double ps1 = - ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); + const float ps0 = ForceSingle(FPSCR, NI_madd(&FPSCR, a.PS0AsDouble(), c1, b.PS0AsDouble()).value); + const float ps1 = ForceSingle(FPSCR, 
NI_madd(&FPSCR, a.PS1AsDouble(), c1, b.PS1AsDouble()).value); rPS(inst.FD).SetBoth(ps0, ps1); PowerPC::UpdateFPRFSingle(ps0); From ccd8233ea378b5737e4c7d086d4d762f8645951c Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 13 Jun 2021 16:03:47 +0200 Subject: [PATCH 4/4] Jit64: Fix FPRF handling of denormal singles --- Source/Core/Core/PowerPC/Jit64/Jit.h | 5 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 76 +++++++++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 12 +- .../Core/PowerPC/Jit64Common/EmuCodeBlock.cpp | 110 +++++++++--------- .../Core/PowerPC/Jit64Common/EmuCodeBlock.h | 4 +- 5 files changed, 128 insertions(+), 79 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 029cef0974..3480c85279 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -121,8 +121,11 @@ public: // Generates a branch that will check if a given bit of a CR register part // is set or not. Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); - void SetFPRFIfNeeded(Gen::X64Reg xmm); + void SetFPRFIfNeeded(const Gen::OpArg& xmm, bool single); + void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, + bool duplicate = false); + void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in, Gen::X64Reg clobber = Gen::XMM0); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index d4ae8ca797..957a0c461f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -33,13 +33,63 @@ alignas(16) static const double half_qnan_and_s32_max[2] = {0x7FFFFFFF, -0x80000 // We can avoid calculating FPRF if it's not needed; every float operation resets it, so // if it's going to be clobbered in a future instruction 
before being read, we can just // not calculate it. -void Jit64::SetFPRFIfNeeded(X64Reg xmm) +void Jit64::SetFPRFIfNeeded(const OpArg& input, bool single) { // As far as we know, the games that use this flag only need FPRF for fmul and fmadd, but // FPRF is fast enough in JIT that we might as well just enable it for every float instruction // if the FPRF flag is set. - if (SConfig::GetInstance().bFPRF && js.op->wantsFPRF) - SetFPRF(xmm); + if (!SConfig::GetInstance().bFPRF || !js.op->wantsFPRF) + return; + + X64Reg xmm = XMM0; + if (input.IsSimpleReg()) + xmm = input.GetSimpleReg(); + else + MOVSD(xmm, input); + + SetFPRF(xmm, single); +} + +void Jit64::FinalizeSingleResult(X64Reg output, const OpArg& input, bool packed, bool duplicate) +{ + // Most games don't need these. Zelda requires it though - some platforms get stuck without them. + if (jo.accurateSinglePrecision) + { + if (packed) + { + CVTPD2PS(output, input); + SetFPRFIfNeeded(R(output), true); + CVTPS2PD(output, R(output)); + } + else + { + CVTSD2SS(output, input); + SetFPRFIfNeeded(R(output), true); + CVTSS2SD(output, R(output)); + if (duplicate) + MOVDDUP(output, R(output)); + } + } + else + { + if (!input.IsSimpleReg(output)) + { + if (duplicate) + MOVDDUP(output, input); + else + MOVAPD(output, input); + } + + SetFPRFIfNeeded(input, true); + } +} + +void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input) +{ + if (!input.IsSimpleReg(output)) + MOVSD(output, input); + + SetFPRFIfNeeded(input, false); } void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Reg clobber) @@ -210,8 +260,9 @@ void Jit64::fp_arith(UGeckoInstruction inst) HandleNaNs(inst, Rd, dest); if (single) - ForceSinglePrecision(Rd, Rd, packed, true); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd, packed, true); + else + FinalizeDoubleResult(Rd, Rd); }; switch (inst.SUBOP5) @@ -452,14 +503,13 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (single) { HandleNaNs(inst, result_reg, result_reg, 
result_reg == XMM1 ? XMM0 : XMM1); - ForceSinglePrecision(Rd, R(result_reg), packed, true); + FinalizeSingleResult(Rd, R(result_reg), packed, true); } else { HandleNaNs(inst, result_reg, result_reg, XMM1); - MOVSD(Rd, R(result_reg)); + FinalizeDoubleResult(Rd, R(result_reg)); } - SetFPRFIfNeeded(Rd); } void Jit64::fsign(UGeckoInstruction inst) @@ -763,12 +813,11 @@ void Jit64::frspx(UGeckoInstruction inst) int d = inst.FD; bool packed = js.op->fprIsDuplicated[b] && !cpu_info.bAtom; - RCOpArg Rb = fpr.Use(b, RCMode::Read); + RCOpArg Rb = fpr.Bind(b, RCMode::Read); RCX64Reg Rd = fpr.Bind(d, RCMode::Write); RegCache::Realize(Rb, Rd); - ForceSinglePrecision(Rd, Rb, packed, true); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rb, packed, true); } void Jit64::frsqrtex(UGeckoInstruction inst) @@ -786,8 +835,7 @@ void Jit64::frsqrtex(UGeckoInstruction inst) MOVAPD(XMM0, Rb); CALL(asm_routines.frsqrte); - MOVSD(Rd, XMM0); - SetFPRFIfNeeded(Rd); + FinalizeDoubleResult(Rd, R(XMM0)); } void Jit64::fresx(UGeckoInstruction inst) @@ -806,5 +854,5 @@ void Jit64::fresx(UGeckoInstruction inst) MOVAPD(XMM0, Rb); CALL(asm_routines.fres); MOVDDUP(Rd, R(XMM0)); - SetFPRFIfNeeded(Rd); + SetFPRFIfNeeded(R(XMM0), true); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index fa5a91bd8c..d07b9e6bc0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -77,8 +77,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) PanicAlertFmt("ps_sum WTF!!!"); } HandleNaNs(inst, Rd, tmp, tmp == XMM1 ? 
XMM0 : XMM1); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_muls(UGeckoInstruction inst) @@ -112,8 +111,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) Force25BitPrecision(XMM1, R(XMM1), XMM0); MULPD(XMM1, Ra); HandleNaNs(inst, Rd, XMM1); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_mergeXX(UGeckoInstruction inst) @@ -171,8 +169,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst) CALL(asm_routines.frsqrte); MOVLHPS(Rd, XMM0); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_res(UGeckoInstruction inst) @@ -196,8 +193,7 @@ void Jit64::ps_res(UGeckoInstruction inst) CALL(asm_routines.fres); MOVLHPS(Rd, XMM0); - ForceSinglePrecision(Rd, Rd); - SetFPRFIfNeeded(Rd); + FinalizeSingleResult(Rd, Rd); } void Jit64::ps_cmpXX(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp index 409b158891..01a5115ba0 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp @@ -727,34 +727,6 @@ void EmuCodeBlock::JitClearCA() MOV(8, PPCSTATE(xer_ca), Imm8(0)); } -void EmuCodeBlock::ForceSinglePrecision(X64Reg output, const OpArg& input, bool packed, - bool duplicate) -{ - // Most games don't need these. Zelda requires it though - some platforms get stuck without them. 
- if (m_jit.jo.accurateSinglePrecision) - { - if (packed) - { - CVTPD2PS(output, input); - CVTPS2PD(output, R(output)); - } - else - { - CVTSD2SS(output, input); - CVTSS2SD(output, R(output)); - if (duplicate) - MOVDDUP(output, R(output)); - } - } - else if (!input.IsSimpleReg(output)) - { - if (duplicate) - MOVDDUP(output, input); - else - MOVAPD(output, input); - } -} - // Abstract between AVX and SSE: automatically handle 3-operand instructions void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&), void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp, @@ -907,30 +879,35 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr MOVDDUP(dst, R(dst)); } -alignas(16) static const u64 psDoubleExp[2] = {0x7FF0000000000000ULL, 0}; -alignas(16) static const u64 psDoubleFrac[2] = {0x000FFFFFFFFFFFFFULL, 0}; -alignas(16) static const u64 psDoubleNoSign[2] = {0x7FFFFFFFFFFFFFFFULL, 0}; +alignas(16) static const u64 psDoubleExp[2] = {Common::DOUBLE_EXP, 0}; +alignas(16) static const u64 psDoubleFrac[2] = {Common::DOUBLE_FRAC, 0}; +alignas(16) static const u64 psDoubleNoSign[2] = {~Common::DOUBLE_SIGN, 0}; + +alignas(16) static const u32 psFloatExp[4] = {Common::FLOAT_EXP, 0, 0, 0}; +alignas(16) static const u32 psFloatFrac[4] = {Common::FLOAT_FRAC, 0, 0, 0}; +alignas(16) static const u32 psFloatNoSign[4] = {~Common::FLOAT_SIGN, 0, 0, 0}; // TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer, -// storing -// the result of each floating point op and calculating it when needed. This is trickier than for -// integers -// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point -// values, -// which makes the whole thing rather trickier. -// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this -// isn't -// quite that necessary. 
-void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) +// storing the result of each floating point op and calculating it when needed. This is trickier +// than for integers though, because there's 32 possible FPRF bit combinations but only 9 categories +// of floating point values. Fortunately, PPCAnalyzer can optimize out a large portion of FPRF +// calculations, so maybe this isn't quite that necessary. +void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm, bool single) { + const int input_size = single ? 32 : 64; + AND(32, PPCSTATE(fpscr), Imm32(~FPRF_MASK)); FixupBranch continue1, continue2, continue3, continue4; if (cpu_info.bSSE4_1) { MOVQ_xmm(R(RSCRATCH), xmm); - SHR(64, R(RSCRATCH), Imm8(63)); // Get the sign bit; almost all the branches need it. - PTEST(xmm, MConst(psDoubleExp)); + // Get the sign bit; almost all the branches need it. + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); + if (single) + PTEST(xmm, MConst(psFloatExp)); + else + PTEST(xmm, MConst(psDoubleExp)); FixupBranch maxExponent = J_CC(CC_C); FixupBranch zeroExponent = J_CC(CC_Z); @@ -940,7 +917,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) continue1 = J(); SetJumpTarget(maxExponent); - PTEST(xmm, MConst(psDoubleFrac)); + if (single) + PTEST(xmm, MConst(psFloatFrac)); + else + PTEST(xmm, MConst(psDoubleFrac)); FixupBranch notNAN = J_CC(CC_Z); // Max exponent + mantissa: PPC_FPCLASS_QNAN @@ -955,7 +935,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) continue3 = J(); SetJumpTarget(zeroExponent); - PTEST(xmm, MConst(psDoubleNoSign)); + if (single) + PTEST(xmm, MConst(psFloatNoSign)); + else + PTEST(xmm, MConst(psDoubleNoSign)); FixupBranch zero = J_CC(CC_Z); // No exponent + mantissa: sign ? 
PPC_FPCLASS_ND : PPC_FPCLASS_PD; @@ -971,37 +954,58 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) else { MOVQ_xmm(R(RSCRATCH), xmm); - TEST(64, R(RSCRATCH), MConst(psDoubleExp)); + if (single) + TEST(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP)); + else + TEST(64, R(RSCRATCH), MConst(psDoubleExp)); FixupBranch zeroExponent = J_CC(CC_Z); - AND(64, R(RSCRATCH), MConst(psDoubleNoSign)); - CMP(64, R(RSCRATCH), MConst(psDoubleExp)); + + if (single) + { + AND(32, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN)); + CMP(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP)); + } + else + { + AND(64, R(RSCRATCH), MConst(psDoubleNoSign)); + CMP(64, R(RSCRATCH), MConst(psDoubleExp)); + } FixupBranch nan = J_CC(CC_G); // This works because if the sign bit is set, RSCRATCH is negative FixupBranch infinity = J_CC(CC_E); + MOVQ_xmm(R(RSCRATCH), xmm); - SHR(64, R(RSCRATCH), Imm8(63)); + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); LEA(32, RSCRATCH, MScaled(RSCRATCH, Common::PPC_FPCLASS_NN - Common::PPC_FPCLASS_PN, Common::PPC_FPCLASS_PN)); continue1 = J(); + SetJumpTarget(nan); MOV(32, R(RSCRATCH), Imm32(Common::PPC_FPCLASS_QNAN)); continue2 = J(); + SetJumpTarget(infinity); MOVQ_xmm(R(RSCRATCH), xmm); - SHR(64, R(RSCRATCH), Imm8(63)); + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); LEA(32, RSCRATCH, MScaled(RSCRATCH, Common::PPC_FPCLASS_NINF - Common::PPC_FPCLASS_PINF, Common::PPC_FPCLASS_PINF)); continue3 = J(); + SetJumpTarget(zeroExponent); - TEST(64, R(RSCRATCH), MConst(psDoubleNoSign)); + if (single) + TEST(input_size, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN)); + else + TEST(input_size, R(RSCRATCH), MConst(psDoubleNoSign)); FixupBranch zero = J_CC(CC_Z); - SHR(64, R(RSCRATCH), Imm8(63)); + + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); LEA(32, RSCRATCH, MScaled(RSCRATCH, Common::PPC_FPCLASS_ND - Common::PPC_FPCLASS_PD, Common::PPC_FPCLASS_PD)); continue4 = J(); + SetJumpTarget(zero); - SHR(64, R(RSCRATCH), Imm8(63)); + SHR(input_size, R(RSCRATCH), Imm8(input_size - 1)); SHL(32, 
R(RSCRATCH), Imm8(4)); ADD(32, R(RSCRATCH), Imm8(Common::PPC_FPCLASS_PZ)); } diff --git a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h index b8a1aae0c9..9f5c373df3 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h +++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.h @@ -117,14 +117,12 @@ public: void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp, const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm); - void ForceSinglePrecision(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true, - bool duplicate = false); void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp); // RSCRATCH might get trashed void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false); void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src); - void SetFPRF(Gen::X64Reg xmm); + void SetFPRF(Gen::X64Reg xmm, bool single); void Clear(); protected: