Merge pull request #9748 from JosJuice/fma-accuracy

Interpreter/Jit64: Emulate FMA accurately in more cases
2025-07-24 14:49:42 -06:00 · 2021-06-06 02:29:42 +01:00
parent 2f8c9a7735 2c38d6419e
commit 6c0180fc61
3 changed files with 190 additions and 152 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@ -238,7 +238,7 @@ inline FPResult NI_sub(UReg_FPSCR* fpscr, double a, double b)
 // inputs are checked for NaN is still a, b, c.
 inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
 {
-  FPResult result{a * c};
+  FPResult result{std::fma(a, c, b)};

  if (std::isnan(result.value))
  {
@ -263,27 +263,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
      return result;
    }

-    result.SetException(fpscr, FPSCR_VXIMZ);
-    result.value = PPC_NAN;
-    return result;
-  }
-
-  result.value += b;
-
-  if (std::isnan(result.value))
-  {
-    if (Common::IsSNAN(b))
-      result.SetException(fpscr, FPSCR_VXSNAN);
-
-    fpscr->ClearFIFR();
-
-    if (std::isnan(b))
-    {
-      result.value = MakeQuiet(b);
-      return result;
-    }
-
-    result.SetException(fpscr, FPSCR_VXISI);
+    result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
@ -296,7 +276,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)

 inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
 {
-  FPResult result{a * c};
+  FPResult result{std::fma(a, c, -b)};

  if (std::isnan(result.value))
  {
@ -321,27 +301,7 @@ inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
      return result;
    }

-    result.SetException(fpscr, FPSCR_VXIMZ);
-    result.value = PPC_NAN;
-    return result;
-  }
-
-  result.value -= b;
-
-  if (std::isnan(result.value))
-  {
-    if (Common::IsSNAN(b))
-      result.SetException(fpscr, FPSCR_VXSNAN);
-
-    fpscr->ClearFIFR();
-
-    if (std::isnan(b))
-    {
-      result.value = MakeQuiet(b);
-      return result;
-    }
-
-    result.SetException(fpscr, FPSCR_VXISI);
+    result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -3,6 +3,8 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <cmath>
+#include <limits>
 #include <vector>

 #include "Common/Assert.h"
@ -239,72 +241,139 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);

+  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
+  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
+
  int a = inst.FA;
  int b = inst.FB;
  int c = inst.FC;
  int d = inst.FD;
  bool single = inst.OPCD == 4 || inst.OPCD == 59;
  bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
+  bool packed =
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);

-  // While we don't know if any games are actually affected (replays seem to work with all the usual
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
-  // be extra careful and don't use FMA, even if in theory it might be okay.
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
-  // instances on different computers giving identical results.
-  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
-
-  // For use_fma == true:
+  RCOpArg Ra;
+  RCOpArg Rb;
+  RCOpArg Rc;
+  RCX64Reg Rd;
+  RCX64Reg scratch_guard;
+  if (software_fma)
+  {
+    scratch_guard = fpr.Scratch(XMM2);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
+  }
+  else
+  {
+    // For cpu_info.bFMA == true:
    //   Statistics suggests b is a lot less likely to be unbound in practice, so
    //   if we have to pick one of a or b to bind, let's make it b.
-  RCOpArg Ra = fpr.Use(a, RCMode::Read);
-  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-  RCOpArg Rc = fpr.Use(c, RCMode::Read);
-  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
+    Ra = fpr.Use(a, RCMode::Read);
+    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
+    Rc = fpr.Use(c, RCMode::Read);
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
    RegCache::Realize(Ra, Rb, Rc, Rd);
+  }

+  X64Reg result_reg = XMM0;
+  if (software_fma)
+  {
+    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
+    {
+      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      {
+        if (round_input)
+          Force25BitPrecision(XMM1, Rc, XMM2);
+        else
+          MOVSD(XMM1, Rc);
+      }
+      else
+      {
+        MOVHLPS(XMM1, Rc.GetSimpleReg());
+        if (round_input)
+          Force25BitPrecision(XMM1, R(XMM1), XMM2);
+      }
+
+      // Write the result from the previous loop iteration into Rd so we don't lose it.
+      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
+      if (packed && i == 0)
+        MOVLHPS(Rd, XMM0);
+
+      if (i == 0)
+      {
+        MOVSD(XMM0, Ra);
+        MOVSD(XMM2, Rb);
+      }
+      else
+      {
+        MOVHLPS(XMM0, Ra.GetSimpleReg());
+        MOVHLPS(XMM2, Rb.GetSimpleReg());
+      }
+
+      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+        XORPS(XMM2, MConst(psSignBits));
+
+      BitSet32 registers_in_use = CallerSavedRegistersInUse();
+      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
+      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
+      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
+    }
+
+    if (packed)
+    {
+      MOVSD(Rd, XMM0);
+      result_reg = Rd;
+    }
+
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
+      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
+  }
+  else
+  {
    switch (inst.SUBOP5)
    {
-  case 14:
-    MOVDDUP(XMM1, Rc);
+    case 14:  // madds0
+      MOVDDUP(XMM0, Rc);
      if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
      break;
-  case 15:
-    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
+    case 15:  // madds1
+      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
      if (round_input)
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
+        Force25BitPrecision(XMM0, R(XMM0), XMM1);
      break;
    default:
-    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
-    X64Reg tmp1 = special ? XMM0 : XMM1;
-    X64Reg tmp2 = special ? XMM1 : XMM0;
      if (single && round_input)
-      Force25BitPrecision(tmp1, Rc, tmp2);
+        Force25BitPrecision(XMM0, Rc, XMM1);
      else
-      MOVAPD(tmp1, Rc);
+        MOVAPD(XMM0, Rc);
      break;
    }

-  if (use_fma)
+    if (cpu_info.bFMA)
    {
      switch (inst.SUBOP5)
      {
      case 28:  // msub
        if (packed)
-        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
-        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      case 14:  // madds0
      case 15:  // madds1
      case 29:  // madd
        if (packed)
-        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
-        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      // PowerPC and x86 define NMADD/NMSUB differently
      // x86: D = -A*C (+/-) B
@ -312,22 +381,27 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
      // so we have to swap them; the ADD/SUB here isn't a typo.
      case 30:  // nmsub
        if (packed)
-        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
-        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      case 31:  // nmadd
        if (packed)
-        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
-        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
+          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      }
    }
-  else if (inst.SUBOP5 == 30)  // nmsub
+    else
    {
-    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
-    // separately.
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
+      // do the multiplication and addition/subtraction in two separate operations for performance.
+
+      if (inst.SUBOP5 == 30)  // nmsub
+      {
+        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
+        // so handle it separately.
        MOVAPD(XMM1, Rb);
        if (packed)
        {
@ -339,38 +413,41 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
          MULSD(XMM0, Ra);
          SUBSD(XMM1, R(XMM0));
        }
+        result_reg = XMM1;
      }
      else
      {
        if (packed)
        {
-      MULPD(XMM1, Ra);
+          MULPD(XMM0, Ra);
          if (inst.SUBOP5 == 28)  // msub
-        SUBPD(XMM1, Rb);
+            SUBPD(XMM0, Rb);
          else  //(n)madd(s[01])
-        ADDPD(XMM1, Rb);
+            ADDPD(XMM0, Rb);
        }
        else
        {
-      MULSD(XMM1, Ra);
+          MULSD(XMM0, Ra);
          if (inst.SUBOP5 == 28)
-        SUBSD(XMM1, Rb);
+            SUBSD(XMM0, Rb);
          else
-        ADDSD(XMM1, Rb);
+            ADDSD(XMM0, Rb);
        }
        if (inst.SUBOP5 == 31)  // nmadd
-      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
+          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
+      }
+    }
  }

  if (single)
  {
-    HandleNaNs(inst, Rd, XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
  }
  else
  {
-    HandleNaNs(inst, XMM1, XMM1);
-    MOVSD(Rd, R(XMM1));
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
+    MOVSD(Rd, R(result_reg));
  }
  SetFPRFIfNeeded(Rd);
 }
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@ -828,6 +828,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
    else
    {
      (this->*sseOp)(XMM0, arg2, imm);
+      if (regOp != XMM0)
        MOVAPD(regOp, R(XMM0));
    }
  }