Jit64: Fix fmaddXX with accurate NaNs

It turns out that XMM0 has to be passed as the clobber register
to HandleNaNs, because HandleNaNs uses BLENDVPD, and BLENDVPD
implicitly uses XMM0. Nobody noticed when I broke this in
2c38d64, because nobody plays the one game that needs accurate NaNs.
This commit is contained in:
JosJuice
2021-07-28 22:56:22 +02:00
parent 2a9742c74c
commit 3bb4a4e344
3 changed files with 49 additions and 37 deletions

View File

@ -126,7 +126,7 @@ public:
bool duplicate = false); bool duplicate = false);
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input); void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in, void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
Gen::X64Reg clobber = Gen::XMM0); Gen::X64Reg clobber);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow); void MultiplyImmediate(u32 imm, int a, int d, bool overflow);

View File

@ -257,7 +257,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible); avx_op(avxOp, sseOp, dest, Rop1, Rop2, packed, reversible);
} }
HandleNaNs(inst, Rd, dest); HandleNaNs(inst, Rd, dest, XMM0);
if (single) if (single)
FinalizeSingleResult(Rd, Rd, packed, true); FinalizeSingleResult(Rd, Rd, packed, true);
else else
@ -345,7 +345,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
RegCache::Realize(Ra, Rb, Rc, Rd); RegCache::Realize(Ra, Rb, Rc, Rd);
} }
X64Reg result_reg = XMM0; X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0;
X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0;
if (software_fma) if (software_fma)
{ {
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i) for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
@ -392,31 +393,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
if (packed) if (packed)
{ {
MOVSD(Rd, XMM0); MOVSD(Rd, XMM0);
result_reg = Rd; result_xmm = Rd;
}
else
{
result_xmm = XMM0;
} }
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits)); XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
} }
else else
{ {
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 14: // madds0 case 14: // madds0
MOVDDUP(XMM0, Rc); MOVDDUP(result_xmm, Rc);
if (round_input) if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
break; break;
case 15: // madds1 case 15: // madds1
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3); avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
if (round_input) if (round_input)
Force25BitPrecision(XMM0, R(XMM0), XMM1); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
break; break;
default: default:
if (single && round_input) if (single && round_input)
Force25BitPrecision(XMM0, Rc, XMM1); Force25BitPrecision(result_xmm, Rc, scratch_xmm);
else else
MOVAPD(XMM0, Rc); MOVAPD(result_xmm, Rc);
break; break;
} }
@ -426,17 +431,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
case 28: // msub case 28: // msub
if (packed) if (packed)
VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra); VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra); VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
case 14: // madds0 case 14: // madds0
case 15: // madds1 case 15: // madds1
case 29: // madd case 29: // madd
if (packed) if (packed)
VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra); VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra); VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
// PowerPC and x86 define NMADD/NMSUB differently // PowerPC and x86 define NMADD/NMSUB differently
// x86: D = -A*C (+/-) B // x86: D = -A*C (+/-) B
@ -444,15 +449,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// so we have to swap them; the ADD/SUB here isn't a typo. // so we have to swap them; the ADD/SUB here isn't a typo.
case 30: // nmsub case 30: // nmsub
if (packed) if (packed)
VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra); VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra); VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
case 31: // nmadd case 31: // nmadd
if (packed) if (packed)
VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra); VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra); VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break; break;
} }
} }
@ -462,52 +467,59 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
// so handle it separately. // so handle it separately.
MOVAPD(XMM1, Rb); MOVAPD(scratch_xmm, Rb);
if (packed) if (packed)
{ {
MULPD(XMM0, Ra); MULPD(result_xmm, Ra);
SUBPD(XMM1, R(XMM0)); SUBPD(scratch_xmm, R(result_xmm));
} }
else else
{ {
MULSD(XMM0, Ra); MULSD(result_xmm, Ra);
SUBSD(XMM1, R(XMM0)); SUBSD(scratch_xmm, R(result_xmm));
} }
result_reg = XMM1; result_xmm = scratch_xmm;
} }
else else
{ {
if (packed) if (packed)
{ {
MULPD(XMM0, Ra); MULPD(result_xmm, Ra);
if (inst.SUBOP5 == 28) // msub if (inst.SUBOP5 == 28) // msub
SUBPD(XMM0, Rb); SUBPD(result_xmm, Rb);
else //(n)madd(s[01]) else //(n)madd(s[01])
ADDPD(XMM0, Rb); ADDPD(result_xmm, Rb);
} }
else else
{ {
MULSD(XMM0, Ra); MULSD(result_xmm, Ra);
if (inst.SUBOP5 == 28) if (inst.SUBOP5 == 28)
SUBSD(XMM0, Rb); SUBSD(result_xmm, Rb);
else else
ADDSD(XMM0, Rb); ADDSD(result_xmm, Rb);
} }
if (inst.SUBOP5 == 31) // nmadd if (inst.SUBOP5 == 31) // nmadd
XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits)); XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
} }
} }
} }
if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
{
// HandleNaNs needs to clobber XMM0
MOVAPD(XMM1, R(result_xmm));
result_xmm = XMM1;
}
if (single) if (single)
{ {
HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1); HandleNaNs(inst, result_xmm, result_xmm, XMM0);
FinalizeSingleResult(Rd, R(result_reg), packed, true); FinalizeSingleResult(Rd, R(result_xmm), packed, true);
} }
else else
{ {
HandleNaNs(inst, result_reg, result_reg, XMM1); HandleNaNs(inst, result_xmm, result_xmm, XMM0);
FinalizeDoubleResult(Rd, R(result_reg)); FinalizeDoubleResult(Rd, R(result_xmm));
} }
} }

View File

@ -109,7 +109,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
if (round_input) if (round_input)
Force25BitPrecision(XMM1, R(XMM1), XMM0); Force25BitPrecision(XMM1, R(XMM1), XMM0);
MULPD(XMM1, Ra); MULPD(XMM1, Ra);
HandleNaNs(inst, Rd, XMM1); HandleNaNs(inst, Rd, XMM1, XMM0);
FinalizeSingleResult(Rd, Rd); FinalizeSingleResult(Rd, Rd);
} }