Jit: Use accurate negation order for FMA instructions

It was believed that this only mattered when the rounding mode was
set to round to infinity, which games generally don't do, but the
order of negation can also affect the sign of the result when all
of the inputs are zero.
JosJuice 2021-07-29 11:37:45 +02:00
parent c86c02e46b
commit 93e636abc3
3 changed files with 99 additions and 156 deletions
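
To make the commit message's zero-sign claim concrete, here is a standalone C++ sketch (illustrative only, not Dolphin code): PowerPC's fnmsub rounds a*c - b first and negates last, while x86's fused VFNMADD computes -(a*c) + b in a single step, and the two disagree about the sign of a zero result when every input is zero.

#include <cmath>
#include <cstdio>

int main()
{
  const double a = 0.0, c = 0.0, b = 0.0;
  // PowerPC fnmsub: compute a*c - b fused, then negate the rounded result.
  const double ppc_nmsub = -std::fma(a, c, -b);   // -(+0 - +0) = -(+0) = -0.0
  // x86 VFNMADD-style: the negation is folded into the fused operation.
  const double x86_fnmadd = std::fma(-a, c, b);   // (-0 * +0) + +0 = +0.0
  std::printf("%g %g\n", ppc_nmsub, x86_fnmadd);  // prints "-0 0"
}

Round-to-nearest is assumed here; no directed rounding mode is needed to see the difference.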


@@ -345,13 +345,18 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     RegCache::Realize(Ra, Rb, Rc, Rd);
   }
 
-  X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0;
-  X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0;
+  const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30;  // msub, nmsub
+  const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31;  // nmsub, nmadd
+  const bool madds0 = inst.SUBOP5 == 14;
+  const bool madds1 = inst.SUBOP5 == 15;
+
+  X64Reg scratch_xmm = XMM0;
+  X64Reg result_xmm = XMM1;
   if (software_fma)
   {
     for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
     {
-      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      if ((i == 0 || madds0) && !madds1)
       {
         if (round_input)
           Force25BitPrecision(XMM1, Rc, XMM2);
@@ -381,7 +386,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
         MOVHLPS(XMM2, Rb.GetSimpleReg());
       }
 
-      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+      if (subtract)
         XORPS(XMM2, MConst(psSignBits));
 
       BitSet32 registers_in_use = CallerSavedRegistersInUse();
@@ -399,111 +404,74 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     {
       result_xmm = XMM0;
     }
-
-    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
-      XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
   }
   else
   {
-    switch (inst.SUBOP5)
+    if (madds0)
     {
-    case 14:  // madds0
       MOVDDUP(result_xmm, Rc);
       if (round_input)
         Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
-      break;
-    case 15:  // madds1
+    }
+    else if (madds1)
+    {
       avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
       if (round_input)
         Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
-      break;
-    default:
+    }
+    else
+    {
       if (single && round_input)
         Force25BitPrecision(result_xmm, Rc, scratch_xmm);
       else
         MOVAPD(result_xmm, Rc);
-      break;
     }
 
     if (use_fma)
     {
-      switch (inst.SUBOP5)
+      if (subtract)
       {
-      case 28:  // msub
         if (packed)
           VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
         else
           VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      case 14:  // madds0
-      case 15:  // madds1
-      case 29:  // madd
+      }
+      else
+      {
         if (packed)
           VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
         else
           VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      // PowerPC and x86 define NMADD/NMSUB differently
-      // x86: D = -A*C (+/-) B
-      // PPC: D = -(A*C (+/-) B)
-      // so we have to swap them; the ADD/SUB here isn't a typo.
-      case 30:  // nmsub
-        if (packed)
-          VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      case 31:  // nmadd
-        if (packed)
-          VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
       }
     }
     else
     {
-      if (inst.SUBOP5 == 30)  // nmsub
-      {
-        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
-        // so handle it separately.
-        MOVAPD(scratch_xmm, Rb);
-        if (packed)
-        {
-          MULPD(result_xmm, Ra);
-          SUBPD(scratch_xmm, R(result_xmm));
-        }
-        else
-        {
-          MULSD(result_xmm, Ra);
-          SUBSD(scratch_xmm, R(result_xmm));
-        }
-        result_xmm = scratch_xmm;
-      }
-      else
-      {
-        if (packed)
-        {
-          MULPD(result_xmm, Ra);
-          if (inst.SUBOP5 == 28)  // msub
-            SUBPD(result_xmm, Rb);
-          else  // (n)madd(s[01])
-            ADDPD(result_xmm, Rb);
-        }
-        else
-        {
-          MULSD(result_xmm, Ra);
-          if (inst.SUBOP5 == 28)
-            SUBSD(result_xmm, Rb);
-          else
-            ADDSD(result_xmm, Rb);
-        }
-        if (inst.SUBOP5 == 31)  // nmadd
-          XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
-      }
+      if (packed)
+      {
+        MULPD(result_xmm, Ra);
+        if (subtract)
+          SUBPD(result_xmm, Rb);
+        else
+          ADDPD(result_xmm, Rb);
+      }
+      else
+      {
+        MULSD(result_xmm, Ra);
+        if (subtract)
+          SUBSD(result_xmm, Rb);
+        else
+          ADDSD(result_xmm, Rb);
+      }
     }
   }
 
+  // Using x64's nmadd/nmsub would require us to swap the sign of the addend
+  // (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
+  // Also, PowerPC's nmadd/nmsub round before the final negation, unlike x64's nmadd/nmsub.
+  // So, negate using a separate instruction instead of using x64's nmadd/nmsub.
+  if (negate)
+    XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
+
   if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
   {
     // HandleNaNs needs to clobber XMM0

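The comment added above also mentions rounding order. A minimal sketch of that second issue (standalone C++, assuming a toolchain where <cfenv> rounding-mode changes are honored by std::fma; the volatile qualifiers only prevent constant folding): under round-toward-+infinity, negating after rounding does not commute with x64's fused negation.

#include <cfenv>
#include <cmath>
#include <cstdio>

int main()
{
  std::fesetround(FE_UPWARD);
  volatile double a = 1.0, c = 1.0, b = 0x1p-60;  // a*c + b = 1 + 2^-60, inexact
  // PowerPC nmadd: round(a*c + b) up to 1 + 2^-52, then negate.
  const double ppc_nmadd = -std::fma(a, c, b);    // -1.0000000000000002
  // x86 VFNMSUB-style: round(-(a*c) - b) = round(-1 - 2^-60), rounds up to -1.0.
  const double x86_fnmsub = std::fma(-a, c, -b);  // -1.0
  std::printf("%.17g %.17g\n", ppc_nmadd, x86_fnmsub);
}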

@@ -178,18 +178,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     case 25:
       m_float_emit.FMUL(VD, VA, VC);
       break;
-    case 28:
+    case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
       m_float_emit.FNMSUB(VD, VA, VC, VB);
-      break;  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
-    case 29:
+      break;
+    case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
       m_float_emit.FMADD(VD, VA, VC, VB);
-      break;  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
-    case 30:
-      m_float_emit.FMSUB(VD, VA, VC, VB);
-      break;  // fnmsub: "D = -(A*C - B)" vs "Vd = Va + (-Vn)*Vm"
-    case 31:
-      m_float_emit.FNMADD(VD, VA, VC, VB);
-      break;  // fnmadd: "D = -(A*C + B)" vs "Vd = (-Va) + (-Vn)*Vm"
+      break;
+    // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub,
+    // the subtly different definitions affect how signed zeroes are handled.
+    // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
+    // So, negate using a separate instruction instead of using AArch64's nmadd/msub.
+    case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
+      m_float_emit.FNMSUB(VD, VA, VC, VB);
+      m_float_emit.FNEG(VD, VD);
+      break;
+    case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
+      m_float_emit.FMADD(VD, VA, VC, VB);
+      m_float_emit.FNEG(VD, VD);
+      break;
     default:
       ASSERT_MSG(DYNA_REC, 0, "fp_arith");
       break;

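The same signed-zero reasoning applies to the AArch64 hunk above, where fnmadd is now emitted as FMADD followed by FNEG rather than as a single FNMADD. A standalone C++ sketch (illustrative only; it assumes AArch64 FNMADD's documented behavior of computing -(a*c) - b with a single rounding):

#include <cmath>
#include <cstdio>

int main()
{
  const double a = 0.0, c = 0.0, b = -0.0;
  // PowerPC fnmadd: round(a*c + b) = +0.0, then negate.
  const double ppc_fnmadd = -std::fma(a, c, b);    // -(+0 + -0) = -(+0) = -0.0
  // AArch64 FNMADD-style: -(a*c) - b computed fused.
  const double a64_fnmadd = std::fma(-a, c, -b);   // (-0 * +0) + +0 = +0.0
  std::printf("%g %g\n", ppc_fnmadd, a64_fnmadd);  // prints "-0 0"
}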

@@ -147,26 +147,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   ARM64Reg V0 = ARM64Reg::INVALID_REG;
   ARM64Reg V1Q = ARM64Reg::INVALID_REG;
 
-  if (round_c || (d != b && (d == a || d == c)))
-  {
-    V0Q = fpr.GetReg();
-    V0 = reg_encoder(V0Q);
-  }
+  const auto allocate_v0_if_needed = [&] {
+    if (V0Q == ARM64Reg::INVALID_REG)
+    {
+      V0Q = fpr.GetReg();
+      V0 = reg_encoder(V0Q);
+    }
+  };
 
   if (round_c)
   {
     ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
 
+    allocate_v0_if_needed();
     V1Q = fpr.GetReg();
     Force25BitPrecision(reg_encoder(V1Q), VC, V0);
     VC = reg_encoder(V1Q);
   }
 
+  ARM64Reg result_reg = VD;
   switch (op5)
   {
-  case 14:  // ps_madds0
-    // d = a * c.ps0 + b
+  case 14:  // ps_madds0: d = a * c.ps0 + b
     if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 0);
@@ -178,13 +181,13 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.MOV(V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC, 0);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
     }
     break;
-  case 15:  // ps_madds1
-    // d = a * c.ps1 + b
+  case 15:  // ps_madds1: d = a * c.ps1 + b
     if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 1);
@@ -196,34 +199,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.MOV(V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC, 1);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
     }
     break;
-  case 28:  // ps_msub
-    // d = a * c - b
-    if (VD == VB)
-    {
-      // d = -(-a * c + b)
-      // rounding is incorrect if the rounding mode is +/- infinity
-      m_float_emit.FMLS(size, VD, VA, VC);
-      m_float_emit.FNEG(size, VD, VD);
-    }
-    else if (VD != VA && VD != VC)
+  case 28:  // ps_msub: d = a * c - b
+  case 30:  // ps_nmsub: d = -(a * c - b)
+    if (VD != VA && VD != VC)
     {
       m_float_emit.FNEG(size, VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC);
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.FNEG(size, V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
     }
     break;
-  case 29:  // ps_madd
-    // d = a * c + b
+  case 29:  // ps_madd: d = a * c + b
+  case 31:  // ps_nmadd: d = -(a * c + b)
     if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC);
@@ -235,53 +233,10 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.MOV(V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
-    }
-    break;
-  case 30:  // ps_nmsub
-    // d = -(a * c - b)
-    // =>
-    // d = -a * c + b
-    // Note: PowerPC rounds before the final negation.
-    // We don't handle this at the moment because it's
-    // only relevant when rounding to +/- infinity.
-    if (VD == VB)
-    {
-      m_float_emit.FMLS(size, VD, VA, VC);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      m_float_emit.MOV(VD, VB);
-      m_float_emit.FMLS(size, VD, VA, VC);
-    }
-    else
-    {
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLS(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
-    }
-    break;
-  case 31:  // ps_nmadd
-    // d = -(a * c + b)
-    if (VD == VB)
-    {
-      m_float_emit.FMLA(size, VD, VA, VC);
-      m_float_emit.FNEG(size, VD, VD);
-    }
-    else if (VD != VA && VD != VC)
-    {
-      // d = -a * c - b
-      // See rounding note at ps_nmsub.
-      m_float_emit.FNEG(size, VD, VB);
-      m_float_emit.FMLS(size, VD, VA, VC);
-    }
-    else
-    {
-      m_float_emit.MOV(V0, VB);
-      m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.FNEG(size, VD, V0);
-    }
-    break;
+      result_reg = V0;
     }
     break;
   default:
@@ -289,6 +244,20 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     break;
   }
 
+  switch (op5)
+  {
+  case 30:  // ps_nmsub
+  case 31:  // ps_nmadd
+    // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case
+    // for any of AArch64's FMA instructions, so we negate using a separate instruction.
+    m_float_emit.FNEG(size, VD, result_reg);
+    break;
+  default:
+    if (result_reg != VD)
+      m_float_emit.MOV(VD, result_reg);
+    break;
+  }
+
   if (V0Q != ARM64Reg::INVALID_REG)
     fpr.Unlock(V0Q);
   if (V1Q != ARM64Reg::INVALID_REG)
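
As a closing illustration of why the negation is now a separate final step in all three files, here is a hypothetical host-side check (standalone C++, not part of the commit; ppc_nmadd and fused_nmadd are made-up names) that sweeps all signed-zero inputs and reports where a single fused negated multiply-add disagrees with PowerPC's negate-last semantics:

#include <cmath>
#include <cstdio>

// PowerPC fnmadd: round(a*c + b) first, negate last.
static double ppc_nmadd(double a, double c, double b)
{
  return -std::fma(a, c, b);
}

// Single fused operation in the style of x86 VFNMSUB / AArch64 FNMADD: -(a*c) - b.
static double fused_nmadd(double a, double c, double b)
{
  return std::fma(-a, c, -b);
}

int main()
{
  const double zeros[] = {0.0, -0.0};
  for (double a : zeros)
    for (double c : zeros)
      for (double b : zeros)
      {
        const double p = ppc_nmadd(a, c, b);
        const double q = fused_nmadd(a, c, b);
        if (std::signbit(p) != std::signbit(q))
          std::printf("a=%g c=%g b=%g: ppc=%g fused=%g\n", a, c, b, p, q);
      }
}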