Jit: Use accurate negation order for FMA instructions

It was believed that the order of negation only mattered when the rounding
mode was set to round toward +/- infinity, which games generally don't do,
but it can also affect the sign of the output when the inputs are all zero.
This commit is contained in:
JosJuice 2021-07-29 11:37:45 +02:00
parent c86c02e46b
commit 93e636abc3
3 changed files with 99 additions and 156 deletions

View File

@ -345,13 +345,18 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
RegCache::Realize(Ra, Rb, Rc, Rd); RegCache::Realize(Ra, Rb, Rc, Rd);
} }
X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0; const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub
X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0; const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd
const bool madds0 = inst.SUBOP5 == 14;
const bool madds1 = inst.SUBOP5 == 15;
X64Reg scratch_xmm = XMM0;
X64Reg result_xmm = XMM1;
if (software_fma) if (software_fma)
{ {
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i) for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
{ {
if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15) // (i == 0 || madds0) && !madds1 if ((i == 0 || madds0) && !madds1)
{ {
if (round_input) if (round_input)
Force25BitPrecision(XMM1, Rc, XMM2); Force25BitPrecision(XMM1, Rc, XMM2);
@ -381,7 +386,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
MOVHLPS(XMM2, Rb.GetSimpleReg()); MOVHLPS(XMM2, Rb.GetSimpleReg());
} }
if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30) // nsub, nmsub if (subtract)
XORPS(XMM2, MConst(psSignBits)); XORPS(XMM2, MConst(psSignBits));
BitSet32 registers_in_use = CallerSavedRegistersInUse(); BitSet32 registers_in_use = CallerSavedRegistersInUse();
@ -399,111 +404,74 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
result_xmm = XMM0; result_xmm = XMM0;
} }
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
} }
else else
{ {
switch (inst.SUBOP5) if (madds0)
{ {
case 14: // madds0
MOVDDUP(result_xmm, Rc); MOVDDUP(result_xmm, Rc);
if (round_input) if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
break; }
case 15: // madds1 else if (madds1)
{
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3); avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
if (round_input) if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
break; }
default: else
{
if (single && round_input) if (single && round_input)
Force25BitPrecision(result_xmm, Rc, scratch_xmm); Force25BitPrecision(result_xmm, Rc, scratch_xmm);
else else
MOVAPD(result_xmm, Rc); MOVAPD(result_xmm, Rc);
break;
} }
if (use_fma) if (use_fma)
{ {
switch (inst.SUBOP5) if (subtract)
{ {
case 28: // msub
if (packed) if (packed)
VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra); VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else else
VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra); VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break;
case 14: // madds0
case 15: // madds1
case 29: // madd
if (packed)
VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else
VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break;
// PowerPC and x86 define NMADD/NMSUB differently
// x86: D = -A*C (+/-) B
// PPC: D = -(A*C (+/-) B)
// so we have to swap them; the ADD/SUB here isn't a typo.
case 30: // nmsub
if (packed)
VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else
VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break;
case 31: // nmadd
if (packed)
VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
else
VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
break;
}
}
else
{
if (inst.SUBOP5 == 30) // nmsub
{
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
// so handle it separately.
MOVAPD(scratch_xmm, Rb);
if (packed)
{
MULPD(result_xmm, Ra);
SUBPD(scratch_xmm, R(result_xmm));
}
else
{
MULSD(result_xmm, Ra);
SUBSD(scratch_xmm, R(result_xmm));
}
result_xmm = scratch_xmm;
} }
else else
{ {
if (packed) if (packed)
{ VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
MULPD(result_xmm, Ra);
if (inst.SUBOP5 == 28) // msub
SUBPD(result_xmm, Rb);
else //(n)madd(s[01])
ADDPD(result_xmm, Rb);
}
else else
{ VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
MULSD(result_xmm, Ra); }
if (inst.SUBOP5 == 28) }
SUBSD(result_xmm, Rb); else
else {
ADDSD(result_xmm, Rb); if (packed)
} {
if (inst.SUBOP5 == 31) // nmadd MULPD(result_xmm, Ra);
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits)); if (subtract)
SUBPD(result_xmm, Rb);
else
ADDPD(result_xmm, Rb);
}
else
{
MULSD(result_xmm, Ra);
if (subtract)
SUBSD(result_xmm, Rb);
else
ADDSD(result_xmm, Rb);
} }
} }
} }
// Using x64's nmadd/nmsub would require us to swap the sign of the addend
// (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
// Also, PowerPC's nmadd/nmsub round before the final negation unlike x64's nmadd/nmsub.
// So, negate using a separate instruction instead of using x64's nmadd/nmsub.
if (negate)
XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0) if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
{ {
// HandleNaNs needs to clobber XMM0 // HandleNaNs needs to clobber XMM0

View File

@ -178,18 +178,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
case 25: case 25:
m_float_emit.FMUL(VD, VA, VC); m_float_emit.FMUL(VD, VA, VC);
break; break;
case 28: case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
m_float_emit.FNMSUB(VD, VA, VC, VB); m_float_emit.FNMSUB(VD, VA, VC, VB);
break; // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm" break;
case 29: case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
m_float_emit.FMADD(VD, VA, VC, VB); m_float_emit.FMADD(VD, VA, VC, VB);
break; // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" break;
case 30: // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
m_float_emit.FMSUB(VD, VA, VC, VB); // the subtly different definitions affect how signed zeroes are handled.
break; // fnmsub: "D = -(A*C - B)" vs "Vd = Va + (-Vn)*Vm" // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
case 31: // So, negate using a separate instruction instead of using AArch64's nmadd/msub.
m_float_emit.FNMADD(VD, VA, VC, VB); case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
break; // fnmadd: "D = -(A*C + B)" vs "Vd = (-Va) + (-Vn)*Vm" m_float_emit.FNMSUB(VD, VA, VC, VB);
m_float_emit.FNEG(VD, VD);
break;
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
m_float_emit.FMADD(VD, VA, VC, VB);
m_float_emit.FNEG(VD, VD);
break;
default: default:
ASSERT_MSG(DYNA_REC, 0, "fp_arith"); ASSERT_MSG(DYNA_REC, 0, "fp_arith");
break; break;

View File

@ -147,26 +147,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
ARM64Reg V0 = ARM64Reg::INVALID_REG; ARM64Reg V0 = ARM64Reg::INVALID_REG;
ARM64Reg V1Q = ARM64Reg::INVALID_REG; ARM64Reg V1Q = ARM64Reg::INVALID_REG;
if (round_c || (d != b && (d == a || d == c))) const auto allocate_v0_if_needed = [&] {
{ if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg(); {
V0 = reg_encoder(V0Q); V0Q = fpr.GetReg();
} V0 = reg_encoder(V0Q);
}
};
if (round_c) if (round_c)
{ {
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
allocate_v0_if_needed();
V1Q = fpr.GetReg(); V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V1Q), VC, V0); Force25BitPrecision(reg_encoder(V1Q), VC, V0);
VC = reg_encoder(V1Q); VC = reg_encoder(V1Q);
} }
ARM64Reg result_reg = VD;
switch (op5) switch (op5)
{ {
case 14: // ps_madds0 case 14: // ps_madds0: d = a * c.ps0 + b
// d = a * c.ps0 + b
if (VD == VB) if (VD == VB)
{ {
m_float_emit.FMLA(size, VD, VA, VC, 0); m_float_emit.FMLA(size, VD, VA, VC, 0);
@ -178,13 +181,13 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
} }
else else
{ {
allocate_v0_if_needed();
m_float_emit.MOV(V0, VB); m_float_emit.MOV(V0, VB);
m_float_emit.FMLA(size, V0, VA, VC, 0); m_float_emit.FMLA(size, V0, VA, VC, 0);
m_float_emit.MOV(VD, V0); result_reg = V0;
} }
break; break;
case 15: // ps_madds1 case 15: // ps_madds1: d = a * c.ps1 + b
// d = a * c.ps1 + b
if (VD == VB) if (VD == VB)
{ {
m_float_emit.FMLA(size, VD, VA, VC, 1); m_float_emit.FMLA(size, VD, VA, VC, 1);
@ -196,34 +199,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
} }
else else
{ {
allocate_v0_if_needed();
m_float_emit.MOV(V0, VB); m_float_emit.MOV(V0, VB);
m_float_emit.FMLA(size, V0, VA, VC, 1); m_float_emit.FMLA(size, V0, VA, VC, 1);
m_float_emit.MOV(VD, V0); result_reg = V0;
} }
break; break;
case 28: // ps_msub case 28: // ps_msub: d = a * c - b
// d = a * c - b case 30: // ps_nmsub: d = -(a * c - b)
if (VD == VB) if (VD != VA && VD != VC)
{
// d = -(-a * c + b)
// rounding is incorrect if the rounding mode is +/- infinity
m_float_emit.FMLS(size, VD, VA, VC);
m_float_emit.FNEG(size, VD, VD);
}
else if (VD != VA && VD != VC)
{ {
m_float_emit.FNEG(size, VD, VB); m_float_emit.FNEG(size, VD, VB);
m_float_emit.FMLA(size, VD, VA, VC); m_float_emit.FMLA(size, VD, VA, VC);
} }
else else
{ {
allocate_v0_if_needed();
m_float_emit.FNEG(size, V0, VB); m_float_emit.FNEG(size, V0, VB);
m_float_emit.FMLA(size, V0, VA, VC); m_float_emit.FMLA(size, V0, VA, VC);
m_float_emit.MOV(VD, V0); result_reg = V0;
} }
break; break;
case 29: // ps_madd case 29: // ps_madd: d = a * c + b
// d = a * c + b case 31: // ps_nmadd: d = -(a * c + b)
if (VD == VB) if (VD == VB)
{ {
m_float_emit.FMLA(size, VD, VA, VC); m_float_emit.FMLA(size, VD, VA, VC);
@ -235,53 +233,10 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
} }
else else
{ {
allocate_v0_if_needed();
m_float_emit.MOV(V0, VB); m_float_emit.MOV(V0, VB);
m_float_emit.FMLA(size, V0, VA, VC); m_float_emit.FMLA(size, V0, VA, VC);
m_float_emit.MOV(VD, V0); result_reg = V0;
}
break;
case 30: // ps_nmsub
// d = -(a * c - b)
// =>
// d = -a * c + b
// Note: PowerPC rounds before the final negation.
// We don't handle this at the moment because it's
// only relevant when rounding to +/- infinity.
if (VD == VB)
{
m_float_emit.FMLS(size, VD, VA, VC);
}
else if (VD != VA && VD != VC)
{
m_float_emit.MOV(VD, VB);
m_float_emit.FMLS(size, VD, VA, VC);
}
else
{
m_float_emit.MOV(V0, VB);
m_float_emit.FMLS(size, V0, VA, VC);
m_float_emit.MOV(VD, V0);
}
break;
case 31: // ps_nmadd
// d = -(a * c + b)
if (VD == VB)
{
m_float_emit.FMLA(size, VD, VA, VC);
m_float_emit.FNEG(size, VD, VD);
}
else if (VD != VA && VD != VC)
{
// d = -a * c - b
// See rounding note at ps_nmsub.
m_float_emit.FNEG(size, VD, VB);
m_float_emit.FMLS(size, VD, VA, VC);
}
else
{
m_float_emit.MOV(V0, VB);
m_float_emit.FMLA(size, V0, VA, VC);
m_float_emit.FNEG(size, VD, V0);
} }
break; break;
default: default:
@ -289,6 +244,20 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
break; break;
} }
switch (op5)
{
case 30: // ps_nmsub
case 31: // ps_nmadd
// PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
// for any of AArch64's FMA instructions, so we negate using a separate instruction.
m_float_emit.FNEG(size, VD, result_reg);
break;
default:
if (result_reg != VD)
m_float_emit.MOV(VD, result_reg);
break;
}
if (V0Q != ARM64Reg::INVALID_REG) if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q); fpr.Unlock(V0Q);
if (V1Q != ARM64Reg::INVALID_REG) if (V1Q != ARM64Reg::INVALID_REG)