From a52774ca639d01993e38dc4cc7c0010643ccfb60 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Tue, 21 Jul 2020 22:46:10 +0200
Subject: [PATCH 1/4] Jit64: fselx - Add AVX path

AVX has a four-operand VBLENDVPD instruction, which allows the first
input and the destination to be different registers. By taking
advantage of this, we no longer need to copy one of the inputs around;
we can reference it directly, provided it's already in a register (I
have yet to see a case where it isn't). If it is not in a register, it
is still loaded into XMM1 first.

Before:
66 0F 57 C0          xorpd       xmm0,xmm0
F2 41 0F C2 C6 06    cmpnlesd    xmm0,xmm14
41 0F 28 CE          movaps      xmm1,xmm14
66 41 0F 38 15 CA    blendvpd    xmm1,xmm10,xmm0
F2 44 0F 10 F1       movsd       xmm14,xmm1

After:
66 0F 57 C0          xorpd       xmm0,xmm0
F2 41 0F C2 C6 06    cmpnlesd    xmm0,xmm14
C4 C3 09 4B CA 00    vblendvpd   xmm1,xmm14,xmm10,xmm0
F2 44 0F 10 F1       movsd       xmm14,xmm1
---
 .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp     | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index dd41561944..0d6bff0b1a 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -437,7 +437,21 @@ void Jit64::fselx(UGeckoInstruction inst)
   else
     CMPSD(XMM0, Ra, CMP_NLE);
 
-  if (cpu_info.bSSE4_1)
+  if (cpu_info.bAVX)
+  {
+    X64Reg src1 = XMM1;
+    if (Rc.IsSimpleReg())
+    {
+      src1 = Rc.GetSimpleReg();
+    }
+    else
+    {
+      MOVAPD(XMM1, Rc);
+    }
+
+    VBLENDVPD(XMM1, src1, Rb, XMM0);
+  }
+  else if (cpu_info.bSSE4_1)
   {
     MOVAPD(XMM1, Rc);
     BLENDVPD(XMM1, Rb);

From afb86a12abff89116ce012e3d5fbbbd264368be7 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Tue, 28 Jul 2020 23:09:58 +0200
Subject: [PATCH 2/4] Jit64: fselx - Optimize AVX packed

For the packed variant, we can skip the final MOVAPS and write the
result directly into the destination register.

Before:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
C4 C3 09 4B CC 00    vblendvpd   xmm1,xmm14,xmm12,xmm0
44 0F 28 F1          movaps      xmm14,xmm1

After:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
C4 43 09 4B F4 00    vblendvpd   xmm14,xmm14,xmm12,xmm0
---
 Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 0d6bff0b1a..a0895b47c9 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -449,6 +449,12 @@ void Jit64::fselx(UGeckoInstruction inst)
       MOVAPD(XMM1, Rc);
     }
 
+    if (packed)
+    {
+      VBLENDVPD(Rd, src1, Rb, XMM0);
+      return;
+    }
+
     VBLENDVPD(XMM1, src1, Rb, XMM0);
   }
   else if (cpu_info.bSSE4_1)

From 31755bc13a3ebf4a55b3962fa50699db8c9767f4 Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Wed, 29 Jul 2020 17:28:48 +0200
Subject: [PATCH 3/4] Jit64: fselx - Optimize SSE4.1 packed

This is pretty much the same optimization we did for AVX, although it
is slightly more constrained: we're stuck with the two-operand
instruction, where the destination and first source have to match, so
it only applies when registers c and d are the same.

We could also specialize the case where registers b, c, and d are all
distinct, but I decided against it since I couldn't find any game that
does this.

Before:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
41 0F 28 CE          movaps      xmm1,xmm14
66 41 0F 38 15 CC    blendvpd    xmm1,xmm12,xmm0
44 0F 28 F1          movaps      xmm14,xmm1

After:
66 0F 57 C0          xorpd       xmm0,xmm0
66 41 0F C2 C1 06    cmpnlepd    xmm0,xmm9
66 45 0F 38 15 F4    blendvpd    xmm14,xmm12,xmm0
---
 Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index a0895b47c9..f520520601 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -459,6 +459,12 @@ void Jit64::fselx(UGeckoInstruction inst)
   }
   else if (cpu_info.bSSE4_1)
   {
+    if (packed && d == c)
+    {
+      BLENDVPD(Rd, Rb);
+      return;
+    }
+
     MOVAPD(XMM1, Rc);
     BLENDVPD(XMM1, Rb);
   }

From 08bdeefe05e740903634333c506e2ab4dfd6eb4a Mon Sep 17 00:00:00 2001
From: Sintendo <bram.speeckaert@gmail.com>
Date: Sun, 2 Aug 2020 18:07:47 +0200
Subject: [PATCH 4/4] Jit64AsmCommon: Use AVX in ConvertDoubleToSingle

Using AVX we can eliminate another MOVAPS instruction here.

Before:
0F 28 C8                movaps      xmm1,xmm0
66 0F DB 0D CF 2C 00 00 pand        xmm1,xmmword ptr [1F8D283B220h]

After:
C5 F9 DB 0D D2 2C 00 00 vpand       xmm1,xmm0,xmmword ptr [271835FB220h]
---
 Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
index 57fb829688..ef3fcc067a 100644
--- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp
@@ -77,8 +77,7 @@ void CommonAsmRoutines::GenConvertDoubleToSingle()
   else
   {
     // We want bits 0, 1
-    MOVAPD(XMM1, R(XMM0));
-    PAND(XMM1, MConst(double_top_two_bits));
+    avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM1, R(XMM0), MConst(double_top_two_bits));
     PSRLQ(XMM1, 32);
 
     // And 5 through to 34