From a52774ca639d01993e38dc4cc7c0010643ccfb60 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Tue, 21 Jul 2020 22:46:10 +0200 Subject: [PATCH 1/4] Jit64: fselx - Add AVX path AVX has a four-operand VBLENDVPD instruction, which allows for the first input and the destination to be different. By taking advantage of this, we no longer need to copy one of the inputs around and we can just reference it directly, provided it's already in a register (I have yet to see this not be the case). Before: 66 0F 57 C0 xorpd xmm0,xmm0 F2 41 0F C2 C6 06 cmpnlesd xmm0,xmm14 41 0F 28 CE movaps xmm1,xmm14 66 41 0F 38 15 CA blendvpd xmm1,xmm10,xmm0 F2 44 0F 10 F1 movsd xmm14,xmm1 After: 66 0F 57 C0 xorpd xmm0,xmm0 F2 41 0F C2 C6 06 cmpnlesd xmm0,xmm14 C4 C3 09 4B CA 00 vblendvpd xmm1,xmm14,xmm10,xmm0 F2 44 0F 10 F1 movsd xmm14,xmm1 --- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index dd41561944..0d6bff0b1a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -437,7 +437,21 @@ void Jit64::fselx(UGeckoInstruction inst) else CMPSD(XMM0, Ra, CMP_NLE); - if (cpu_info.bSSE4_1) + if (cpu_info.bAVX) + { + X64Reg src1 = XMM1; + if (Rc.IsSimpleReg()) + { + src1 = Rc.GetSimpleReg(); + } + else + { + MOVAPD(XMM1, Rc); + } + + VBLENDVPD(XMM1, src1, Rb, XMM0); + } + else if (cpu_info.bSSE4_1) { MOVAPD(XMM1, Rc); BLENDVPD(XMM1, Rb); From afb86a12abff89116ce012e3d5fbbbd264368be7 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Tue, 28 Jul 2020 23:09:58 +0200 Subject: [PATCH 2/4] Jit64: fselx - Optimize AVX packed For the packed variant, we can skip the final MOVAPS and write the result directly into the destination register. Before: 66 0F 57 C0 xorpd xmm0,xmm0 66 41 0F C2 C1 06 cmpnlepd xmm0,xmm9 C4 C3 09 4B CC 00 vblendvpd xmm1,xmm14,xmm12,xmm0 44 0F 28 F1 movaps xmm14,xmm1 After: 66 0F 57 C0 xorpd xmm0,xmm0 66 41 0F C2 C1 06 cmpnlepd xmm0,xmm9 C4 43 09 4B F4 00 vblendvpd xmm14,xmm14,xmm12,xmm0 --- Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 0d6bff0b1a..a0895b47c9 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -449,6 +449,12 @@ void Jit64::fselx(UGeckoInstruction inst) MOVAPD(XMM1, Rc); } + if (packed) + { + VBLENDVPD(Rd, src1, Rb, XMM0); + return; + } + VBLENDVPD(XMM1, src1, Rb, XMM0); } else if (cpu_info.bSSE4_1) From 31755bc13a3ebf4a55b3962fa50699db8c9767f4 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 29 Jul 2020 17:28:48 +0200 Subject: [PATCH 3/4] Jit64: fselx - Optimize SSE4.1 packed Pretty much the same optimization we did for AVX, although slightly more constrained because we're stuck with the two-operand instruction where destination and source have to match. We could also specialize the case where registers b, c, and d are all distinct, but I decided against it since I couldn't find any game that does this. Before: 66 0F 57 C0 xorpd xmm0,xmm0 66 41 0F C2 C1 06 cmpnlepd xmm0,xmm9 41 0F 28 CE movaps xmm1,xmm14 66 41 0F 38 15 CC blendvpd xmm1,xmm12,xmm0 44 0F 28 F1 movaps xmm14,xmm1 After: 66 0F 57 C0 xorpd xmm0,xmm0 66 41 0F C2 C1 06 cmpnlepd xmm0,xmm9 66 45 0F 38 15 F4 blendvpd xmm14,xmm12,xmm0 --- Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index a0895b47c9..f520520601 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -459,6 +459,12 @@ void Jit64::fselx(UGeckoInstruction inst) } else if (cpu_info.bSSE4_1) { + if (packed && d == c) + { + BLENDVPD(Rd, Rb); + return; + } + MOVAPD(XMM1, Rc); BLENDVPD(XMM1, Rb); } From 08bdeefe05e740903634333c506e2ab4dfd6eb4a Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sun, 2 Aug 2020 18:07:47 +0200 Subject: [PATCH 4/4] Jit64AsmCommon: Use AVX in ConvertDoubleToSingle Using AVX we can eliminate another MOVAPS instruction here. Before: 0F 28 C8 movaps xmm1,xmm0 66 0F DB 0D CF 2C 00 00 pand xmm1,xmmword ptr [1F8D283B220h] After: C5 F9 DB 0D D2 2C 00 00 vpand xmm1,xmm0,xmmword ptr [271835FB220h] --- Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index 57fb829688..ef3fcc067a 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -77,8 +77,7 @@ void CommonAsmRoutines::GenConvertDoubleToSingle() else { // We want bits 0, 1 - MOVAPD(XMM1, R(XMM0)); - PAND(XMM1, MConst(double_top_two_bits)); + avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM1, R(XMM0), MConst(double_top_two_bits)); PSRLQ(XMM1, 32); // And 5 through to 34