JitArm64: Use one instruction for making NaNs quiet

Instead of materializing the quiet bit in a register and ORing the NaN with it, we can perform an arithmetic operation on the NaN (an FADD of the NaN with itself). This is a cycle or two slower on some CPUs in cases where generating the quiet bit would have pipelined well, but this is far code that rarely runs, so instruction fetch latency is the bigger concern. And for non-SIMD cases, we also save a register.
2025-07-23 14:19:46 -06:00 · 2023-08-09 20:53:10 +02:00
parent 5d9838548b
commit 4ecdb9e57e
3 changed files with 16 additions and 55 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -180,10 +180,6 @@ public:

  void FloatCompare(UGeckoInstruction inst, bool upper = false);

-  // temp_gpr can be INVALID_REG if single is true
-  void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single,
-                               Arm64Gen::ARM64Reg temp_gpr);
-
  bool IsFPRStoreSafe(size_t guest_reg) const;

 protected:
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -137,12 +137,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)

  const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;

-  if (m_accurate_nans)
-  {
-    if (V0Q == ARM64Reg::INVALID_REG)
-      V0Q = fpr.GetReg();
-  }
-
  switch (op5)
  {
  case 18:
@@ -202,10 +196,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    SwitchToFarCode();
    SetJumpTarget(nan);

-    const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);
-
-    EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);
-
    Common::SmallVector<ARM64Reg, 3> inputs;
    inputs.push_back(VA);
    if (use_b && VA != VB)
@@ -213,7 +203,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
    if (use_c && VA != VC && (!use_b || VB != VC))
      inputs.push_back(VC);

-    // If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
+    // If any inputs are NaNs, pick the first NaN of them and set its quiet bit
    for (size_t i = 0; i < inputs.size(); ++i)
    {
      // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
@@ -228,8 +218,9 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
        skip = B(CCFlags::CC_VC);
      }

-      m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
-                       EncodeRegToDouble(quiet_bit_reg));
+      // Make the NaN quiet
+      m_float_emit.FADD(VD, input, input);
+
      nan_fixups.push_back(B());

      if (check_input)
@@ -886,29 +877,6 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
  }
 }

-void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
-{
-  // dest_reg = QNaN & ~SNaN
-  //
-  // (Alternatively, dest_reg = QNaN would also work, but that would take
-  // two instructions to emit even for singles)
-
-  if (single)
-  {
-    m_float_emit.MOVI(32, dest_reg, 0x40, 16);
-  }
-  else
-  {
-    ASSERT(temp_gpr != ARM64Reg::INVALID_REG);
-
-    MOVI2R(EncodeRegTo64(temp_gpr), 0x0008'0000'0000'0000);
-    if (IsQuad(dest_reg))
-      m_float_emit.DUP(64, dest_reg, EncodeRegTo64(temp_gpr));
-    else
-      m_float_emit.FMOV(dest_reg, EncodeRegTo64(temp_gpr));
-  }
-}
-
 bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
 {
  return js.fpr_is_store_safe[guest_reg];
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -272,16 +272,15 @@ void JitArm64::ps_arith(UGeckoInstruction inst)

    // Make the NaNs quiet

-    const ARM64Reg quiet_bit_reg = VD == result_reg ? reg_encoder(V2Q) : VD;
-    EmitQuietNaNBitConstant(quiet_bit_reg, singles, temp_gpr);
+    const ARM64Reg quiet_nan_reg = VD == result_reg ? reg_encoder(V2Q) : VD;

+    m_float_emit.FADD(size, quiet_nan_reg, result_reg, result_reg);
    m_float_emit.FCMEQ(size, nan_temp_reg_paired, result_reg, result_reg);
-    m_float_emit.ORR(quiet_bit_reg, quiet_bit_reg, result_reg);
    if (negate_result)
      m_float_emit.FNEG(size, result_reg, result_reg);
    if (VD == result_reg)
-      m_float_emit.BIF(VD, quiet_bit_reg, nan_temp_reg_paired);
-    else  // quiet_bit_reg == VD
+      m_float_emit.BIF(VD, quiet_nan_reg, nan_temp_reg_paired);
+    else  // quiet_nan_reg == VD
      m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);

    nan_fixup = B();
@@ -381,7 +380,6 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
  const ARM64Reg VC = fpr.R(c, type);
  const ARM64Reg VD = fpr.RW(d, type);
  const ARM64Reg V0 = fpr.GetReg();
-  const ARM64Reg V1 = m_accurate_nans ? fpr.GetReg() : ARM64Reg::INVALID_REG;
  const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;

  m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
@@ -398,22 +396,23 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
      SwitchToFarCode();
      SetJumpTarget(nan);

-      EmitQuietNaNBitConstant(scalar_reg_encoder(V1), singles, temp_gpr);
-
      if (upper)
      {
-        m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
-        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V1));
+        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
+                          scalar_reg_encoder(input));
+        m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
      }
      else if (d != c)
      {
-        m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(V1), EncodeRegToDouble(input));
+        m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
+                          scalar_reg_encoder(input));
        m_float_emit.INS(size, VD, 1, VC, 1);
      }
      else
      {
-        m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
-        m_float_emit.INS(size, VD, 0, V1, 0);
+        m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
+                          scalar_reg_encoder(input));
+        m_float_emit.INS(size, VD, 0, V0, 0);
      }

      FixupBranch nan_done = B();
@@ -449,8 +448,6 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
  }

  fpr.Unlock(V0);
-  if (m_accurate_nans)
-    fpr.Unlock(V1);
  if (temp_gpr != ARM64Reg::INVALID_REG)
    gpr.Unlock(temp_gpr);