mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2024-11-14 21:37:52 -07:00
Merge pull request #9748 from JosJuice/fma-accuracy
Interpreter/Jit64: Emulate FMA accurately in more cases
This commit is contained in:
commit
6c0180fc61
@ -238,7 +238,7 @@ inline FPResult NI_sub(UReg_FPSCR* fpscr, double a, double b)
|
|||||||
// inputs are checked for NaN is still a, b, c.
|
// inputs are checked for NaN is still a, b, c.
|
||||||
inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
|
inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
|
||||||
{
|
{
|
||||||
FPResult result{a * c};
|
FPResult result{std::fma(a, c, b)};
|
||||||
|
|
||||||
if (std::isnan(result.value))
|
if (std::isnan(result.value))
|
||||||
{
|
{
|
||||||
@ -263,27 +263,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
result.SetException(fpscr, FPSCR_VXIMZ);
|
result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
|
||||||
result.value = PPC_NAN;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.value += b;
|
|
||||||
|
|
||||||
if (std::isnan(result.value))
|
|
||||||
{
|
|
||||||
if (Common::IsSNAN(b))
|
|
||||||
result.SetException(fpscr, FPSCR_VXSNAN);
|
|
||||||
|
|
||||||
fpscr->ClearFIFR();
|
|
||||||
|
|
||||||
if (std::isnan(b))
|
|
||||||
{
|
|
||||||
result.value = MakeQuiet(b);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.SetException(fpscr, FPSCR_VXISI);
|
|
||||||
result.value = PPC_NAN;
|
result.value = PPC_NAN;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -296,7 +276,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
|
|||||||
|
|
||||||
inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
|
inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
|
||||||
{
|
{
|
||||||
FPResult result{a * c};
|
FPResult result{std::fma(a, c, -b)};
|
||||||
|
|
||||||
if (std::isnan(result.value))
|
if (std::isnan(result.value))
|
||||||
{
|
{
|
||||||
@ -321,27 +301,7 @@ inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
result.SetException(fpscr, FPSCR_VXIMZ);
|
result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
|
||||||
result.value = PPC_NAN;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.value -= b;
|
|
||||||
|
|
||||||
if (std::isnan(result.value))
|
|
||||||
{
|
|
||||||
if (Common::IsSNAN(b))
|
|
||||||
result.SetException(fpscr, FPSCR_VXSNAN);
|
|
||||||
|
|
||||||
fpscr->ClearFIFR();
|
|
||||||
|
|
||||||
if (std::isnan(b))
|
|
||||||
{
|
|
||||||
result.value = MakeQuiet(b);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.SetException(fpscr, FPSCR_VXISI);
|
|
||||||
result.value = PPC_NAN;
|
result.value = PPC_NAN;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,8 @@
|
|||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "Common/Assert.h"
|
#include "Common/Assert.h"
|
||||||
@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
JITDISABLE(bJITFloatingPointOff);
|
JITDISABLE(bJITFloatingPointOff);
|
||||||
FALLBACK_IF(inst.Rc);
|
FALLBACK_IF(inst.Rc);
|
||||||
|
|
||||||
|
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||||
|
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
||||||
|
// be extra careful and use software FMA on CPUs that don't have hardware FMA.
|
||||||
|
const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
|
||||||
|
|
||||||
int a = inst.FA;
|
int a = inst.FA;
|
||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
int c = inst.FC;
|
int c = inst.FC;
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||||
bool round_input = single && !js.op->fprIsSingle[c];
|
bool round_input = single && !js.op->fprIsSingle[c];
|
||||||
bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
|
bool packed =
|
||||||
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
|
||||||
|
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
||||||
|
|
||||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
RCOpArg Ra;
|
||||||
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
RCOpArg Rb;
|
||||||
// be extra careful and don't use FMA, even if in theory it might be okay.
|
RCOpArg Rc;
|
||||||
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
|
RCX64Reg Rd;
|
||||||
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
|
RCX64Reg scratch_guard;
|
||||||
// instances on different computers giving identical results.
|
if (software_fma)
|
||||||
const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
|
|
||||||
|
|
||||||
// For use_fma == true:
|
|
||||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
|
||||||
// if we have to pick one of a or b to bind, let's make it b.
|
|
||||||
RCOpArg Ra = fpr.Use(a, RCMode::Read);
|
|
||||||
RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
|
||||||
RCOpArg Rc = fpr.Use(c, RCMode::Read);
|
|
||||||
RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
|
||||||
RegCache::Realize(Ra, Rb, Rc, Rd);
|
|
||||||
|
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
{
|
||||||
case 14:
|
scratch_guard = fpr.Scratch(XMM2);
|
||||||
MOVDDUP(XMM1, Rc);
|
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
||||||
if (round_input)
|
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
|
||||||
break;
|
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||||
case 15:
|
RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
|
|
||||||
if (round_input)
|
|
||||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
|
|
||||||
X64Reg tmp1 = special ? XMM0 : XMM1;
|
|
||||||
X64Reg tmp2 = special ? XMM1 : XMM0;
|
|
||||||
if (single && round_input)
|
|
||||||
Force25BitPrecision(tmp1, Rc, tmp2);
|
|
||||||
else
|
|
||||||
MOVAPD(tmp1, Rc);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (use_fma)
|
|
||||||
{
|
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
|
||||||
case 28: // msub
|
|
||||||
if (packed)
|
|
||||||
VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
else
|
|
||||||
VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
break;
|
|
||||||
case 14: // madds0
|
|
||||||
case 15: // madds1
|
|
||||||
case 29: // madd
|
|
||||||
if (packed)
|
|
||||||
VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
else
|
|
||||||
VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
break;
|
|
||||||
// PowerPC and x86 define NMADD/NMSUB differently
|
|
||||||
// x86: D = -A*C (+/-) B
|
|
||||||
// PPC: D = -(A*C (+/-) B)
|
|
||||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
|
||||||
case 30: // nmsub
|
|
||||||
if (packed)
|
|
||||||
VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
else
|
|
||||||
VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
break;
|
|
||||||
case 31: // nmadd
|
|
||||||
if (packed)
|
|
||||||
VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
else
|
|
||||||
VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (inst.SUBOP5 == 30) // nmsub
|
|
||||||
{
|
|
||||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
|
|
||||||
// separately.
|
|
||||||
MOVAPD(XMM1, Rb);
|
|
||||||
if (packed)
|
|
||||||
{
|
|
||||||
MULPD(XMM0, Ra);
|
|
||||||
SUBPD(XMM1, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
MULSD(XMM0, Ra);
|
|
||||||
SUBSD(XMM1, R(XMM0));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// For cpu_info.bFMA == true:
|
||||||
|
// Statistics suggest b is a lot less likely to be unbound in practice, so
|
||||||
|
// if we have to pick one of a or b to bind, let's make it b.
|
||||||
|
Ra = fpr.Use(a, RCMode::Read);
|
||||||
|
Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||||
|
Rc = fpr.Use(c, RCMode::Read);
|
||||||
|
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||||
|
RegCache::Realize(Ra, Rb, Rc, Rd);
|
||||||
|
}
|
||||||
|
|
||||||
|
X64Reg result_reg = XMM0;
|
||||||
|
if (software_fma)
|
||||||
|
{
|
||||||
|
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
||||||
|
{
|
||||||
|
if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15) // (i == 0 || madds0) && !madds1
|
||||||
|
{
|
||||||
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM1, Rc, XMM2);
|
||||||
|
else
|
||||||
|
MOVSD(XMM1, Rc);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVHLPS(XMM1, Rc.GetSimpleReg());
|
||||||
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM1, R(XMM1), XMM2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the result from the previous loop iteration into Rd so we don't lose it.
|
||||||
|
// It's important that this is done after reading Rc above, in case we have madds1 and c == d.
|
||||||
|
if (packed && i == 0)
|
||||||
|
MOVLHPS(Rd, XMM0);
|
||||||
|
|
||||||
|
if (i == 0)
|
||||||
|
{
|
||||||
|
MOVSD(XMM0, Ra);
|
||||||
|
MOVSD(XMM2, Rb);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVHLPS(XMM0, Ra.GetSimpleReg());
|
||||||
|
MOVHLPS(XMM2, Rb.GetSimpleReg());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30) // msub, nmsub
|
||||||
|
XORPS(XMM2, MConst(psSignBits));
|
||||||
|
|
||||||
|
BitSet32 registers_in_use = CallerSavedRegistersInUse();
|
||||||
|
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
|
||||||
|
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
|
||||||
|
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
|
||||||
|
}
|
||||||
|
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM1, Ra);
|
MOVSD(Rd, XMM0);
|
||||||
if (inst.SUBOP5 == 28) // msub
|
result_reg = Rd;
|
||||||
SUBPD(XMM1, Rb);
|
}
|
||||||
else //(n)madd(s[01])
|
|
||||||
ADDPD(XMM1, Rb);
|
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
|
||||||
|
XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
switch (inst.SUBOP5)
|
||||||
|
{
|
||||||
|
case 14: // madds0
|
||||||
|
MOVDDUP(XMM0, Rc);
|
||||||
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
|
break;
|
||||||
|
case 15: // madds1
|
||||||
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
|
||||||
|
if (round_input)
|
||||||
|
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (single && round_input)
|
||||||
|
Force25BitPrecision(XMM0, Rc, XMM1);
|
||||||
|
else
|
||||||
|
MOVAPD(XMM0, Rc);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cpu_info.bFMA)
|
||||||
|
{
|
||||||
|
switch (inst.SUBOP5)
|
||||||
|
{
|
||||||
|
case 28: // msub
|
||||||
|
if (packed)
|
||||||
|
VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
else
|
||||||
|
VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
break;
|
||||||
|
case 14: // madds0
|
||||||
|
case 15: // madds1
|
||||||
|
case 29: // madd
|
||||||
|
if (packed)
|
||||||
|
VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
else
|
||||||
|
VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
break;
|
||||||
|
// PowerPC and x86 define NMADD/NMSUB differently
|
||||||
|
// x86: D = -A*C (+/-) B
|
||||||
|
// PPC: D = -(A*C (+/-) B)
|
||||||
|
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||||
|
case 30: // nmsub
|
||||||
|
if (packed)
|
||||||
|
VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
else
|
||||||
|
VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
break;
|
||||||
|
case 31: // nmadd
|
||||||
|
if (packed)
|
||||||
|
VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
else
|
||||||
|
VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MULSD(XMM1, Ra);
|
// No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
|
||||||
if (inst.SUBOP5 == 28)
|
// do the multiplication and addition/subtraction in two separate operations for performance.
|
||||||
SUBSD(XMM1, Rb);
|
|
||||||
|
if (inst.SUBOP5 == 30) // nmsub
|
||||||
|
{
|
||||||
|
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
|
||||||
|
// so handle it separately.
|
||||||
|
MOVAPD(XMM1, Rb);
|
||||||
|
if (packed)
|
||||||
|
{
|
||||||
|
MULPD(XMM0, Ra);
|
||||||
|
SUBPD(XMM1, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MULSD(XMM0, Ra);
|
||||||
|
SUBSD(XMM1, R(XMM0));
|
||||||
|
}
|
||||||
|
result_reg = XMM1;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
ADDSD(XMM1, Rb);
|
{
|
||||||
|
if (packed)
|
||||||
|
{
|
||||||
|
MULPD(XMM0, Ra);
|
||||||
|
if (inst.SUBOP5 == 28) // msub
|
||||||
|
SUBPD(XMM0, Rb);
|
||||||
|
else //(n)madd(s[01])
|
||||||
|
ADDPD(XMM0, Rb);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MULSD(XMM0, Ra);
|
||||||
|
if (inst.SUBOP5 == 28)
|
||||||
|
SUBSD(XMM0, Rb);
|
||||||
|
else
|
||||||
|
ADDSD(XMM0, Rb);
|
||||||
|
}
|
||||||
|
if (inst.SUBOP5 == 31) // nmadd
|
||||||
|
XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (inst.SUBOP5 == 31) // nmadd
|
|
||||||
XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
HandleNaNs(inst, Rd, XMM1);
|
HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
|
||||||
ForceSinglePrecision(Rd, Rd, packed, true);
|
ForceSinglePrecision(Rd, R(result_reg), packed, true);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
HandleNaNs(inst, XMM1, XMM1);
|
HandleNaNs(inst, result_reg, result_reg, XMM1);
|
||||||
MOVSD(Rd, R(XMM1));
|
MOVSD(Rd, R(result_reg));
|
||||||
}
|
}
|
||||||
SetFPRFIfNeeded(Rd);
|
SetFPRFIfNeeded(Rd);
|
||||||
}
|
}
|
||||||
|
@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
(this->*sseOp)(XMM0, arg2, imm);
|
(this->*sseOp)(XMM0, arg2, imm);
|
||||||
MOVAPD(regOp, R(XMM0));
|
if (regOp != XMM0)
|
||||||
|
MOVAPD(regOp, R(XMM0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
Loading…
Reference in New Issue
Block a user