Merge pull request #9751 from JosJuice/jitarm64-fcmpx-ftz

JitArm64: Make fcmpX with flush-to-zero enabled less bad
2025-09-13 06:52:58 -06:00 · 2021-06-05 05:27:52 -04:00
parent 36871c9329 b6cc3c4b6e
commit 638909aec6
4 changed files with 40 additions and 5 deletions
--- a/Source/Core/Common/ArmCPUDetect.cpp
+++ b/Source/Core/Common/ArmCPUDetect.cpp
@ -63,13 +63,13 @@ CPUInfo::CPUInfo()
 void CPUInfo::Detect()
 {
  // Set some defaults here
-  // When ARMv8 CPUs come out, these need to be updated.
  HTT = false;
  OS64bit = true;
  CPU64bit = true;
  Mode64bit = true;
  vendor = CPUVendor::ARM;
  bFlushToZero = true;
+  bAFP = false;

 #ifdef __APPLE__
  num_cores = std::thread::hardware_concurrency();
--- a/Source/Core/Common/ArmFPURoundMode.cpp
+++ b/Source/Core/Common/ArmFPURoundMode.cpp
@ -2,8 +2,10 @@
 // Licensed under GPLv2+
 // Refer to the license.txt file included.

+#include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
 #include "Common/FPURoundMode.h"
+#include "Common/Logging/Log.h"

 #ifdef _MSC_VER
 #include <intrin.h>
@ -45,8 +47,25 @@ void SetPrecisionMode(PrecisionMode mode)

 void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
 {
-  // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
+  // When AH is disabled, FZ controls flush-to-zero for both inputs and outputs. When AH is enabled,
+  // FZ controls flush-to-zero for outputs, and FIZ controls flush-to-zero for inputs.
  constexpr u32 FZ = 1 << 24;
+  constexpr u32 AH = 1 << 1;
+  constexpr u32 FIZ = 1 << 0;
+  constexpr u32 flush_to_zero_mask = FZ | AH | FIZ;
+
+  // On CPUs with FEAT_AFP support, setting AH = 1, FZ = 1, FIZ = 0 emulates the GC/Wii CPU's
+  // "non-IEEE mode". Unfortunately, FEAT_AFP didn't exist until 2020, so we can't count on setting
+  // AH actually doing anything. But flushing both inputs and outputs seems to cause fewer problems
+  // than flushing nothing, so let's just set FZ and AH and roll with whatever behavior we get.
+  const u32 flush_to_zero_bits = (non_ieee_mode ? FZ | AH : 0);
+  static bool afp_warning_shown = false;
+  if (!afp_warning_shown && !cpu_info.bAFP && non_ieee_mode)
+  {
+    afp_warning_shown = true;
+    WARN_LOG_FMT(POWERPC,
+                 "Non-IEEE mode was requested, but host CPU is not known to support FEAT_AFP");
+  }

  // lookup table for FPSCR.RN-to-FPCR.RMode translation
  constexpr u32 rounding_mode_table[] = {
@ -55,9 +74,11 @@ void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
      (1 << 22),  // +inf
      (2 << 22),  // -inf
  };
+  constexpr u32 rounding_mode_mask = 3 << 22;
+  const u32 rounding_mode_bits = rounding_mode_table[rounding_mode];

-  const u64 base = default_fpcr & ~(0b111 << 22);
-  SetFPCR(base | rounding_mode_table[rounding_mode] | (non_ieee_mode ? FZ : 0));
+  const u64 base = default_fpcr & ~(flush_to_zero_mask | rounding_mode_mask);
+  SetFPCR(base | rounding_mode_bits | flush_to_zero_bits);
 }

 void SaveSIMDState()
--- a/Source/Core/Common/CPUDetect.h
+++ b/Source/Core/Common/CPUDetect.h
@ -64,6 +64,7 @@ struct CPUInfo
  bool bCRC32 = false;
  bool bSHA1 = false;
  bool bSHA2 = false;
+  bool bAFP = false;  // Alternate floating-point behavior

  // Call Detect()
  explicit CPUInfo();
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include "Common/Arm64Emitter.h"
+#include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
 #include "Common/StringUtil.h"

@ -374,7 +375,19 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
  const u32 b = inst.FB;
  const int crf = inst.CRFD;

-  const bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true);
+  // On the GC/Wii CPU, outputs are flushed to zero if FPSCR.NI is set, and inputs are never
+  // flushed to zero. Ideally we would emulate FPSCR.NI by setting FPCR.FZ and FPCR.AH, but
+  // unfortunately FPCR.AH is a very new feature that we can't rely on (as of 2021). For CPUs
+  // without FPCR.AH, the best we can do (without killing the performance by explicitly flushing
+  // outputs using bitwise operations) is to only set FPCR.FZ, which flushes both inputs and
+  // outputs. This may cause problems in some cases, and one such case is Pokémon Battle Revolution,
+  // which does not progress past the title screen if a denormal single compares equal to zero.
+  // Workaround: Perform the comparison using a double operation instead. This ensures that denormal
+  // singles behave correctly in comparisons, but we still have a problem with denormal doubles.
+  const bool input_ftz_workaround =
+      !cpu_info.bAFP && (!js.fpr_is_store_safe[a] || !js.fpr_is_store_safe[b]);
+
+  const bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true) && !input_ftz_workaround;
  const RegType type = singles ? RegType::LowerPairSingle : RegType::LowerPair;
  const auto reg_encoder = singles ? EncodeRegToSingle : EncodeRegToDouble;