From 2f1a8ee1b9cb2c7e78688c2bdf7e08b5a533441f Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sat, 26 Nov 2022 14:32:42 +0100
Subject: [PATCH] Jit64: Skip HandleNaNs for operations that can't generate NaN

Operations that have two operands and can't generate a default NaN,
i.e. addition and subtraction, already have the desired NaN handling
on x86. We just need to make sure not to reverse the operands.

This fixes ps_sum0/ps_sum1 outputting NaNs in cases where they shouldn't.
(HandleNaNs assumes that a NaN in a ps0 input always results in a NaN in
the ps0 output, and correspondingly for ps1.)
---
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp  | 24 ++++++++++++-------
 Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp |  3 ++-
 2 files changed, 17 insertions(+), 10 deletions(-)
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 5daaf9e9b5..fc27ad7bf1 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -236,8 +236,7 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 
   bool single = inst.OPCD == 4 || inst.OPCD == 59;
   // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP
-  // at the end by
-  // using packed arithmetic instead.
+  // at the end by using packed arithmetic instead.
   bool packed = inst.OPCD == 4 ||
                 (inst.OPCD == 59 && js.op->fprIsDuplicated[a] && js.op->fprIsDuplicated[arg2]);
   // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't
@@ -249,10 +248,12 @@ void Jit64::fp_arith(UGeckoInstruction inst)
   void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&) = nullptr;
   void (XEmitter::*sseOp)(X64Reg, const OpArg&) = nullptr;
   bool reversible = false;
-  bool roundRHS = false;
+  bool round_rhs = false;
+  bool preserve_inputs = false;
   switch (inst.SUBOP5)
   {
   case 18:
+    preserve_inputs = m_accurate_nans;
     avxOp = packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD;
     sseOp = packed ? &XEmitter::DIVPD : &XEmitter::DIVSD;
     break;
@@ -261,13 +262,14 @@ void Jit64::fp_arith(UGeckoInstruction inst)
     sseOp = packed ? &XEmitter::SUBPD : &XEmitter::SUBSD;
     break;
   case 21:
-    reversible = true;
+    reversible = !m_accurate_nans;
     avxOp = packed ? &XEmitter::VADDPD : &XEmitter::VADDSD;
     sseOp = packed ? &XEmitter::ADDPD : &XEmitter::ADDSD;
     break;
   case 25:
     reversible = true;
-    roundRHS = single && !js.op->fprIsSingle[c];
+    round_rhs = single && !js.op->fprIsSingle[c];
+    preserve_inputs = m_accurate_nans;
     avxOp = packed ? &XEmitter::VMULPD : &XEmitter::VMULSD;
     sseOp = packed ? &XEmitter::MULPD : &XEmitter::MULSD;
     break;
@@ -280,9 +282,8 @@ void Jit64::fp_arith(UGeckoInstruction inst)
   RCOpArg Rarg2 = fpr.Use(arg2, RCMode::Read);
   RegCache::Realize(Rd, Ra, Rarg2);
 
-  bool preserve_inputs = m_accurate_nans;
   X64Reg dest = preserve_inputs ? XMM1 : static_cast<X64Reg>(Rd);
-  if (roundRHS)
+  if (round_rhs)
   {
     if (a == d && !preserve_inputs)
     {
@@ -300,10 +301,15 @@ void Jit64::fp_arith(UGeckoInstruction inst)
     avx_op(avxOp, sseOp, dest, Ra, Rarg2, packed, reversible);
   }
 
-  if (inst.SUBOP5 != 25)
+  switch (inst.SUBOP5)
+  {
+  case 18:
     HandleNaNs(inst, dest, XMM0, Ra, Rarg2, std::nullopt);
-  else
+    break;
+  case 25:
     HandleNaNs(inst, dest, XMM0, Ra, std::nullopt, Rarg2);
+    break;
+  }
 
   if (single)
     FinalizeSingleResult(Rd, R(dest), packed, true);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
index 160cd77497..ab6a5b7638 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@@ -79,7 +79,8 @@ void Jit64::ps_sum(UGeckoInstruction inst)
   default:
     PanicAlertFmt("ps_sum WTF!!!");
   }
-  HandleNaNs(inst, tmp, tmp == XMM1 ? XMM0 : XMM1, Ra, Rb, Rc);
+  // We're intentionally not calling HandleNaNs here.
+  // For addition and subtraction specifically, x86's NaN behavior matches PPC's.
   FinalizeSingleResult(Rd, R(tmp));
 }