Merge pull request #9748 from JosJuice/fma-accuracy

Interpreter/Jit64: Emulate FMA accurately in more cases
2025-07-24 14:49:42 -06:00 · 2021-06-06 02:29:42 +01:00
parent 2f8c9a7735 2c38d6419e
commit 6c0180fc61
3 changed files with 190 additions and 152 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@ -238,7 +238,7 @@ inline FPResult NI_sub(UReg_FPSCR* fpscr, double a, double b)
 // inputs are checked for NaN is still a, b, c.
 inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
 {
-  FPResult result{a * c};
+  FPResult result{std::fma(a, c, b)};
  if (std::isnan(result.value))
  {
@ -263,27 +263,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
      return result;
    }
-    result.SetException(fpscr, FPSCR_VXIMZ);
+    result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
  result.value += b;
  if (std::isnan(result.value))
  {
    if (Common::IsSNAN(b))
      result.SetException(fpscr, FPSCR_VXSNAN);
    fpscr->ClearFIFR();
    if (std::isnan(b))
    {
      result.value = MakeQuiet(b);
      return result;
    }
    result.SetException(fpscr, FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
@ -296,7 +276,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
 inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
 {
-  FPResult result{a * c};
+  FPResult result{std::fma(a, c, -b)};
  if (std::isnan(result.value))
  {
@ -321,27 +301,7 @@ inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
      return result;
    }
-    result.SetException(fpscr, FPSCR_VXIMZ);
+    result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
  result.value -= b;
  if (std::isnan(result.value))
  {
    if (Common::IsSNAN(b))
      result.SetException(fpscr, FPSCR_VXSNAN);
    fpscr->ClearFIFR();
    if (std::isnan(b))
    {
      result.value = MakeQuiet(b);
      return result;
    }
    result.SetException(fpscr, FPSCR_VXISI);
    result.value = PPC_NAN;
    return result;
  }
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 #include <algorithm>
 #include <cmath>
 #include <limits>
 #include <vector>
 #include "Common/Assert.h"
@ -239,138 +241,213 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
  JITDISABLE(bJITFloatingPointOff);
  FALLBACK_IF(inst.Rc);
  // While we don't know if any games are actually affected (replays seem to work with all the usual
  // suspects for desyncing), netplay and other applications need absolutely perfect determinism, so
  // be extra careful and use software FMA on CPUs that don't have hardware FMA.
  const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
  int a = inst.FA;
  int b = inst.FB;
  int c = inst.FC;
  int d = inst.FD;
  bool single = inst.OPCD == 4 || inst.OPCD == 59;
  bool round_input = single && !js.op->fprIsSingle[c];
-  bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
+  bool packed =
-                                   js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
+      inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
                         js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
-  // While we don't know if any games are actually affected (replays seem to work with all the usual
+  RCOpArg Ra;
-  // suspects for desyncing), netplay and other applications need absolute perfect determinism, so
+  RCOpArg Rb;
-  // be extra careful and don't use FMA, even if in theory it might be okay.
+  RCOpArg Rc;
-  // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
+  RCX64Reg Rd;
-  // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
+  RCX64Reg scratch_guard;
-  // instances on different computers giving identical results.
+  if (software_fma)
  const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
  // For use_fma == true:
  //   Statistics suggests b is a lot less likely to be unbound in practice, so
  //   if we have to pick one of a or b to bind, let's make it b.
  RCOpArg Ra = fpr.Use(a, RCMode::Read);
  RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
  RCOpArg Rc = fpr.Use(c, RCMode::Read);
  RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
  RegCache::Realize(Ra, Rb, Rc, Rd);
  switch (inst.SUBOP5)
  {
-  case 14:
+    scratch_guard = fpr.Scratch(XMM2);
-    MOVDDUP(XMM1, Rc);
+    Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
-    if (round_input)
+    Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
-      Force25BitPrecision(XMM1, R(XMM1), XMM0);
+    Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
-    break;
+    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
-  case 15:
+    RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
    avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
    if (round_input)
      Force25BitPrecision(XMM1, R(XMM1), XMM0);
    break;
  default:
    bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
    X64Reg tmp1 = special ? XMM0 : XMM1;
    X64Reg tmp2 = special ? XMM1 : XMM0;
    if (single && round_input)
      Force25BitPrecision(tmp1, Rc, tmp2);
    else
      MOVAPD(tmp1, Rc);
    break;
  }
  if (use_fma)
  {
    switch (inst.SUBOP5)
    {
    case 28:  // msub
      if (packed)
        VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
      else
        VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
      break;
    case 14:  // madds0
    case 15:  // madds1
    case 29:  // madd
      if (packed)
        VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
      else
        VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
      break;
    // PowerPC and x86 define NMADD/NMSUB differently
    // x86: D = -A*C (+/-) B
    // PPC: D = -(A*C (+/-) B)
    // so we have to swap them; the ADD/SUB here isn't a typo.
    case 30:  // nmsub
      if (packed)
        VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
      else
        VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
      break;
    case 31:  // nmadd
      if (packed)
        VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
      else
        VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
      break;
    }
  }
  else if (inst.SUBOP5 == 30)  // nmsub
  {
    // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
    // separately.
    MOVAPD(XMM1, Rb);
    if (packed)
    {
      MULPD(XMM0, Ra);
      SUBPD(XMM1, R(XMM0));
    }
    else
    {
      MULSD(XMM0, Ra);
      SUBSD(XMM1, R(XMM0));
    }
  }
  else
  {
    // For cpu_info.bFMA == true:
    //   Statistics suggests b is a lot less likely to be unbound in practice, so
    //   if we have to pick one of a or b to bind, let's make it b.
    Ra = fpr.Use(a, RCMode::Read);
    Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
    Rc = fpr.Use(c, RCMode::Read);
    Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
    RegCache::Realize(Ra, Rb, Rc, Rd);
  }
  X64Reg result_reg = XMM0;
  if (software_fma)
  {
    for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
    {
      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
      {
        if (round_input)
          Force25BitPrecision(XMM1, Rc, XMM2);
        else
          MOVSD(XMM1, Rc);
      }
      else
      {
        MOVHLPS(XMM1, Rc.GetSimpleReg());
        if (round_input)
          Force25BitPrecision(XMM1, R(XMM1), XMM2);
      }
      // Write the result from the previous loop iteration into Rd so we don't lose it.
      // It's important that this is done after reading Rc above, in case we have madds1 and c == d.
      if (packed && i == 0)
        MOVLHPS(Rd, XMM0);
      if (i == 0)
      {
        MOVSD(XMM0, Ra);
        MOVSD(XMM2, Rb);
      }
      else
      {
        MOVHLPS(XMM0, Ra.GetSimpleReg());
        MOVHLPS(XMM2, Rb.GetSimpleReg());
      }
      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
        XORPS(XMM2, MConst(psSignBits));
      BitSet32 registers_in_use = CallerSavedRegistersInUse();
      ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
      ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
      ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
    }
    if (packed)
    {
-      MULPD(XMM1, Ra);
+      MOVSD(Rd, XMM0);
-      if (inst.SUBOP5 == 28)  // msub
+      result_reg = Rd;
-        SUBPD(XMM1, Rb);
+    }
-      else  //(n)madd(s[01])
+
-        ADDPD(XMM1, Rb);
+    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
      XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
  }
  else
  {
    switch (inst.SUBOP5)
    {
    case 14:  // madds0
      MOVDDUP(XMM0, Rc);
      if (round_input)
        Force25BitPrecision(XMM0, R(XMM0), XMM1);
      break;
    case 15:  // madds1
      avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
      if (round_input)
        Force25BitPrecision(XMM0, R(XMM0), XMM1);
      break;
    default:
      if (single && round_input)
        Force25BitPrecision(XMM0, Rc, XMM1);
      else
        MOVAPD(XMM0, Rc);
      break;
    }
    if (cpu_info.bFMA)
    {
      switch (inst.SUBOP5)
      {
      case 28:  // msub
        if (packed)
          VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
          VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      case 14:  // madds0
      case 15:  // madds1
      case 29:  // madd
        if (packed)
          VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
          VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      // PowerPC and x86 define NMADD/NMSUB differently
      // x86: D = -A*C (+/-) B
      // PPC: D = -(A*C (+/-) B)
      // so we have to swap them; the ADD/SUB here isn't a typo.
      case 30:  // nmsub
        if (packed)
          VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
          VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      case 31:  // nmadd
        if (packed)
          VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
        else
          VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
        break;
      }
    }
    else
    {
-      MULSD(XMM1, Ra);
+      // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
-      if (inst.SUBOP5 == 28)
+      // do the multiplication and addition/subtraction in two separate operations for performance.
-        SUBSD(XMM1, Rb);
+
      if (inst.SUBOP5 == 30)  // nmsub
      {
        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
        // so handle it separately.
        MOVAPD(XMM1, Rb);
        if (packed)
        {
          MULPD(XMM0, Ra);
          SUBPD(XMM1, R(XMM0));
        }
        else
        {
          MULSD(XMM0, Ra);
          SUBSD(XMM1, R(XMM0));
        }
        result_reg = XMM1;
      }
      else
-        ADDSD(XMM1, Rb);
+      {
        if (packed)
        {
          MULPD(XMM0, Ra);
          if (inst.SUBOP5 == 28)  // msub
            SUBPD(XMM0, Rb);
          else  //(n)madd(s[01])
            ADDPD(XMM0, Rb);
        }
        else
        {
          MULSD(XMM0, Ra);
          if (inst.SUBOP5 == 28)
            SUBSD(XMM0, Rb);
          else
            ADDSD(XMM0, Rb);
        }
        if (inst.SUBOP5 == 31)  // nmadd
          XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
      }
    }
    if (inst.SUBOP5 == 31)  // nmadd
      XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
  }
  if (single)
  {
-    HandleNaNs(inst, Rd, XMM1);
+    HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
-    ForceSinglePrecision(Rd, Rd, packed, true);
+    ForceSinglePrecision(Rd, R(result_reg), packed, true);
  }
  else
  {
-    HandleNaNs(inst, XMM1, XMM1);
+    HandleNaNs(inst, result_reg, result_reg, XMM1);
-    MOVSD(Rd, R(XMM1));
+    MOVSD(Rd, R(result_reg));
  }
  SetFPRFIfNeeded(Rd);
 }
--- a/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/EmuCodeBlock.cpp
@ -828,7 +828,8 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
    else
    {
      (this->*sseOp)(XMM0, arg2, imm);
-      MOVAPD(regOp, R(XMM0));
+      if (regOp != XMM0)
        MOVAPD(regOp, R(XMM0));
    }
  }
  else