JitArm64: Skip checking last input for NaN for non-SIMD operations

AArch64's handling of NaNs in arithmetic instructions matches PowerPC's as long as no more than one of the operands is NaN. If we know that all inputs except the last input are non-NaN, we can therefore skip checking the last input. This is an optimization that in principle only works for non-SIMD operations, but ps_sumX effectively is non-SIMD as far as the arithmetic part of it is concerned, so we can use it there too.
2025-07-26 07:39:45 -06:00 · 2023-08-09 19:46:27 +02:00
parent 95f06ef231
commit 8274dcbfe4
2 changed files with 43 additions and 66 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -80,9 +80,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
  const bool fma = use_b && use_c;
  const bool negate_result = (op5 & ~0x1) == 30;

-  // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs
-  const bool can_generate_nan = (op5 & ~0x1) != 20;
-
  const bool output_is_single = inst.OPCD == 59;
  const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
  const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
@@ -203,37 +200,28 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    if (use_c && VA != VC && (!use_b || VB != VC))
      inputs.push_back(VC);

-    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit
-    for (size_t i = 0; i < inputs.size(); ++i)
+    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit.
+    // However, we can skip checking the last input, because if exactly one input is NaN, AArch64
+    // arithmetic instructions automatically pick that NaN and make it quiet, just like we want.
+    for (size_t i = 0; i < inputs.size() - 1; ++i)
    {
-      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
-      // least one NaN input
-      const bool check_input = can_generate_nan || i != inputs.size() - 1;
-
      const ARM64Reg input = inputs[i];
-      FixupBranch skip;
-      if (check_input)
-      {
+
      m_float_emit.FCMP(input);
-        skip = B(CCFlags::CC_VC);
-      }
+      FixupBranch skip = B(CCFlags::CC_VC);

      // Make the NaN quiet
      m_float_emit.FADD(VD, input, input);

      nan_fixups.push_back(B());

-      if (check_input)
      SetJumpTarget(skip);
    }

    std::optional<FixupBranch> nan_early_fixup;
-    if (can_generate_nan)
-    {
-      // There was no NaN in any of the inputs, so the NaN must have been generated by the
-      // arithmetic instruction. In this case, the result is already correct.
    if (negate_result)
    {
+      // If we have a NaN, we must not execute FNEG.
      if (result_reg != VD)
        m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg));
      nan_fixups.push_back(B());
@@ -242,7 +230,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    {
      nan_early_fixup = B();
    }
-    }

    SwitchToNearCode();

--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -384,45 +384,38 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)

  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);

-  FixupBranch a_nan_done, b_nan_done;
+  FixupBranch a_nan_done;
  if (m_accurate_nans)
  {
-    const auto check_nan = [&](ARM64Reg input) {
-      m_float_emit.FCMP(scalar_reg_encoder(input));
-      FixupBranch not_nan = B(CCFlags::CC_VC);
-      FixupBranch nan = B();
-      SetJumpTarget(not_nan);
+    m_float_emit.FCMP(scalar_reg_encoder(VA));
+    FixupBranch a_not_nan = B(CCFlags::CC_VC);
+    FixupBranch a_nan = B();
+    SetJumpTarget(a_not_nan);

    SwitchToFarCode();
-      SetJumpTarget(nan);
+    SetJumpTarget(a_nan);

    if (upper)
    {
-        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
+      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
      m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
    }
    else if (d != c)
    {
-        m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
+      m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
      m_float_emit.INS(size, VD, 1, VC, 1);
    }
    else
    {
-        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
-                          scalar_reg_encoder(input));
+      m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(VA), scalar_reg_encoder(VA));
      m_float_emit.INS(size, VD, 0, V0, 0);
    }

-      FixupBranch nan_done = B();
+    FixupBranch a_nan_done = B();
    SwitchToNearCode();

-      return nan_done;
-    };
-
-    a_nan_done = check_nan(VA);
-    b_nan_done = check_nan(V0);
+    // If exactly one input is NaN, AArch64 arithmetic instructions automatically pick that NaN
+    // and make it quiet, just like we want. So if rA isn't NaN, we can skip checking rB.
  }

  if (upper)
@@ -442,10 +435,7 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
  }

  if (m_accurate_nans)
-  {
    SetJumpTarget(a_nan_done);
-    SetJumpTarget(b_nan_done);
-  }

  fpr.Unlock(V0);
  if (temp_gpr != ARM64Reg::INVALID_REG)