JIT: some paired singles optimizations

2025-07-23 14:19:46 -06:00 · 2014-07-26 23:32:02 -07:00
parent 68b2d86daf
commit 34287b8042
1 changed files with 45 additions and 73 deletions
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@ -3,19 +3,13 @@
 // Refer to the license.txt file included.

 #include "Common/CommonTypes.h"
+#include "Common/CPUDetect.h"

 #include "Core/PowerPC/Jit64/Jit.h"
 #include "Core/PowerPC/Jit64/JitRegCache.h"

 using namespace Gen;

-// TODO
-// ps_madds0
-// ps_muls0
-// ps_madds1
-//   cmppd, andpd, andnpd, or
-//   lfsx, ps_merge01 etc
-
 static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
 static const u64 GC_ALIGNED16(psAbsMask[2])  = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};

@ -36,9 +30,6 @@ void Jit64::ps_mr(UGeckoInstruction inst)

 void Jit64::ps_sel(UGeckoInstruction inst)
 {
-	// we can't use (V)BLENDVPD here because it just looks at the sign bit
-	// but we need -0 = +0
-
 	INSTRUCTION_START
 	JITDISABLE(bJITPairedOff);
 	FALLBACK_IF(inst.Rc);
@ -49,16 +40,26 @@ void Jit64::ps_sel(UGeckoInstruction inst)
 	int c = inst.FC;

 	fpr.Lock(a, b, c, d);
-	MOVAPD(XMM0, fpr.R(a));
-	PXOR(XMM1, R(XMM1));
-	// XMM0 = XMM0 < 0 ? all 1s : all 0s
-	CMPPD(XMM0, R(XMM1), LT);
-	MOVAPD(XMM1, R(XMM0));
-	PAND(XMM0, fpr.R(b));
-	PANDN(XMM1, fpr.R(c));
-	POR(XMM0, R(XMM1));
+	// Per lane: d = (a >= 0) ? c : b. -0 counts as >= 0; a NaN in a selects b. Result is built in XMM1.
+	if (cpu_info.bSSE4_1)
+	{
+		PXOR(XMM0, R(XMM0));
+		CMPPD(XMM0, fpr.R(a), LE); // XMM0 = (0 <= a) ? all 1s : all 0s; a full-width mask, so BLENDVPD's sign-bit test is safe
+		MOVAPD(XMM1, fpr.R(b));    // start from b ...
+		BLENDVPD(XMM1, fpr.R(c));  // ... and take c in every lane whose mask (implicit XMM0) is set
+	}
+	else
+	{
+		MOVAPD(XMM1, fpr.R(a));
+		PXOR(XMM0, R(XMM0));
+		CMPPD(XMM0, R(XMM1), LE); // XMM0 = (0 <= a) ? all 1s : all 0s, same mask as the SSE4.1 path
+		MOVAPD(XMM1, R(XMM0));
+		PAND(XMM1, fpr.R(c));     // XMM1 =  mask & c
+		PANDN(XMM0, fpr.R(b));    // XMM0 = ~mask & b
+		POR(XMM1, R(XMM0));       // merge the two halves of the select
+	}
 	fpr.BindToRegister(d, false);
-	MOVAPD(fpr.RX(d), R(XMM0));
+	MOVAPD(fpr.RX(d), R(XMM1));
 	fpr.UnlockAll();
 }

@ -98,20 +99,6 @@ void Jit64::ps_sign(UGeckoInstruction inst)
 	fpr.UnlockAll();
 }

-//add a, b, c
-
-//mov a, b
-//add a, c
-//we need:
-/*
-psq_l
-psq_stu
-*/
-
-/*
-add a,b,a
-*/
-
 //There's still a little bit more optimization that can be squeezed out of this
 void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
 {
@ -152,7 +139,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6
 			MOVAPD(XMM0, fpr.R(b));
 			fpr.BindToRegister(d, false);
 			MOVAPD(fpr.RX(d), fpr.R(a));
-			(this->*op)(fpr.RX(d), Gen::R(XMM0));
+			(this->*op)(fpr.RX(d), R(XMM0));
 		}
 	}
 	else
@ -204,32 +191,26 @@ void Jit64::ps_sum(UGeckoInstruction inst)
 	int b = inst.FB;
 	int c = inst.FC;
 	fpr.Lock(a,b,c,d);
-	fpr.BindToRegister(d, d == a || d == b || d == c, true);
 	switch (inst.SUBOP5)
 	{
 	case 10:
-		// ps_sum0, do the sum in upper subregisters, merge uppers
-		MOVDDUP(XMM0, fpr.R(a));
-		MOVAPD(XMM1, fpr.R(b));
-		ADDPD(XMM0, R(XMM1));
-		UNPCKHPD(XMM0, fpr.R(c)); //merge
-		MOVAPD(fpr.R(d), XMM0);
+		MOVDDUP(XMM0, fpr.R(a));  // ps_sum0: {a.ps0, a.ps0}
+		ADDPD(XMM0, fpr.R(b));    // {a.ps0 + b.ps0, a.ps0 + b.ps1}
+		UNPCKHPD(XMM0, fpr.R(c)); // {a.ps0 + b.ps1, c.ps1}
 		break;
 	case 11:
-		// ps_sum1, do the sum in lower subregisters, merge lowers
-		MOVAPD(XMM0, fpr.R(a));
-		MOVAPD(XMM1, fpr.R(b));
-		SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower
-		ADDPD(XMM0, R(XMM1)); // sum lowers
-		MOVAPD(XMM1, fpr.R(c));
-		UNPCKLPD(XMM1, R(XMM0)); // merge
-		MOVAPD(fpr.R(d), XMM1);
+		MOVDDUP(XMM1, fpr.R(a));  // ps_sum1: {a.ps0, a.ps0}
+		ADDPD(XMM1, fpr.R(b));    // {a.ps0 + b.ps0, a.ps0 + b.ps1}
+		MOVAPD(XMM0, fpr.R(c));   // {c.ps0, c.ps1}
+		SHUFPD(XMM0, R(XMM1), 2); // {c.ps0, a.ps0 + b.ps1}
 		break;
 	default:
 		PanicAlert("ps_sum WTF!!!");
 	}
-	ForceSinglePrecisionP(fpr.RX(d));
-	SetFPRFIfNeeded(inst, fpr.RX(d));
+	ForceSinglePrecisionP(XMM0); // both cases leave their result in XMM0
+	SetFPRFIfNeeded(inst, XMM0);
+	fpr.BindToRegister(d, false); // d is bound only after a, b and c have been read
+	MOVAPD(fpr.RX(d), R(XMM0));   // copy the finished pair into d
 	fpr.UnlockAll();
 }

@ -244,37 +225,28 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 	int a = inst.FA;
 	int c = inst.FC;
 	fpr.Lock(a, c, d);
-	fpr.BindToRegister(d, d == a || d == c, true);
 	switch (inst.SUBOP5)
 	{
 	case 12:
-		// Single multiply scalar high
-		// TODO - faster version for when regs are different
-		MOVDDUP(XMM1, fpr.R(c));
-		Force25BitPrecision(XMM1, XMM0);
-		MOVAPD(XMM0, fpr.R(a));
-		MULPD(XMM0, R(XMM1));
-		MOVAPD(fpr.R(d), XMM0);
+		MOVDDUP(XMM0, fpr.R(c)); // {c.ps0, c.ps0}
 		break;
 	case 13:
-		// TODO - faster version for when regs are different
-		MOVAPD(XMM1, fpr.R(c));
-		Force25BitPrecision(XMM1, XMM0);
-		MOVAPD(XMM0, fpr.R(a));
-		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
-		MULPD(XMM0, R(XMM1));
-		MOVAPD(fpr.R(d), XMM0);
+		MOVAPD(XMM0, fpr.R(c));
+		SHUFPD(XMM0, R(XMM0), 3); // {c.ps1, c.ps1}
 		break;
 	default:
 		PanicAlert("ps_muls WTF!!!");
 	}
-	ForceSinglePrecisionP(fpr.RX(d));
-	SetFPRFIfNeeded(inst, fpr.RX(d));
+	Force25BitPrecision(XMM0, XMM1); // applied to the broadcast c value; XMM1 is presumably a scratch register - confirm
+	MULPD(XMM0, fpr.R(a));           // both lanes of a times the broadcast scalar
+	ForceSinglePrecisionP(XMM0);
+	SetFPRFIfNeeded(inst, XMM0);
+	fpr.BindToRegister(d, false); // d is bound only after a and c have been read
+	MOVAPD(fpr.RX(d), R(XMM0));   // copy the finished pair into d
 	fpr.UnlockAll();
 }


-//TODO: find easy cases and optimize them, do a breakout like ps_arith
 void Jit64::ps_mergeXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
@ -305,7 +277,7 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
 		_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
 	}
 	fpr.BindToRegister(d, false);
-	MOVAPD(fpr.RX(d), Gen::R(XMM0));
+	MOVAPD(fpr.RX(d), R(XMM0));
 	fpr.UnlockAll();
 }

@ -373,8 +345,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 		return;
 	}
 	fpr.BindToRegister(d, false);
-	MOVAPD(fpr.RX(d), Gen::R(XMM0));
-	ForceSinglePrecisionP(fpr.RX(d));
-	SetFPRFIfNeeded(inst, fpr.RX(d));
+	ForceSinglePrecisionP(XMM0); // post-process the result while it is still in the temporary XMM0
+	SetFPRFIfNeeded(inst, XMM0);
+	MOVAPD(fpr.RX(d), R(XMM0));  // then copy the finished value into d
 	fpr.UnlockAll();
 }