From 09cdb076a3938ecce7422f1f77ac24314176263b Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 21 Aug 2021 16:28:51 +0200 Subject: [PATCH 1/6] JitArm64: divwx - Optimize constant dividend When the dividend is known at compile time, we can eliminate some of the branching and precompute the result for the overflow case. --- Source/Core/Common/Arm64Emitter.cpp | 4 ++ Source/Core/Common/Arm64Emitter.h | 1 + .../PowerPC/JitArm64/JitArm64_Integer.cpp | 38 +++++++++++++++++-- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index ce3c0a5103..c0d19f652f 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -1373,6 +1373,10 @@ void ARM64XEmitter::CMP(ARM64Reg Rn, u32 imm, bool shift) { EncodeAddSubImmInst(1, true, shift, imm, Rn, Is64Bit(Rn) ? ARM64Reg::SP : ARM64Reg::WSP); } +void ARM64XEmitter::CMN(ARM64Reg Rn, u32 imm, bool shift) +{ + EncodeAddSubImmInst(0, true, shift, imm, Rn, Is64Bit(Rn) ? 
ARM64Reg::SP : ARM64Reg::WSP); +} // Data Processing (Immediate) void ARM64XEmitter::MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a5a4c03e4e..9093642417 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1006,6 +1006,7 @@ public: void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false); void CMP(ARM64Reg Rn, u32 imm, bool shift = false); + void CMN(ARM64Reg Rn, u32 imm, bool shift = false); // Data Processing (Immediate) void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 6b2f94d80c..7bc7ceb2a7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1327,6 +1327,36 @@ void JitArm64::divwx(UGeckoInstruction inst) if (inst.Rc) ComputeRC0(imm_d); } + else if (gpr.IsImm(a)) + { + const u32 dividend = gpr.GetImm(a); + + gpr.BindToRegister(d, d == b); + + ARM64Reg RB = gpr.R(b); + ARM64Reg RD = gpr.R(d); + + FixupBranch overflow1 = CBZ(RB); + FixupBranch overflow2; + if (dividend == 0x80000000) + { + CMN(RB, 1); + overflow2 = B(CC_EQ); + } + SDIV(RD, gpr.R(a), RB); + FixupBranch done = B(); + + SetJumpTarget(overflow1); + if (dividend == 0x80000000) + SetJumpTarget(overflow2); + + MOVI2R(RD, dividend & 0x80000000 ? 
0xFFFFFFFF : 0); + + SetJumpTarget(done); + + if (inst.Rc) + ComputeRC0(RD); + } else if (gpr.IsImm(b) && gpr.GetImm(b) != 0 && gpr.GetImm(b) != UINT32_C(0xFFFFFFFF)) { ARM64Reg WA = gpr.GetReg(); @@ -1352,16 +1382,16 @@ void JitArm64::divwx(UGeckoInstruction inst) ARM64Reg RB = gpr.R(b); ARM64Reg RD = gpr.R(d); - FixupBranch slow1 = CBZ(RB); + FixupBranch overflow1 = CBZ(RB); MOVI2R(WA, -0x80000000LL); CMP(RA, WA); CCMN(RB, 1, 0, CC_EQ); - FixupBranch slow2 = B(CC_EQ); + FixupBranch overflow2 = B(CC_EQ); SDIV(RD, RA, RB); FixupBranch done = B(); - SetJumpTarget(slow1); - SetJumpTarget(slow2); + SetJumpTarget(overflow1); + SetJumpTarget(overflow2); ASR(RD, RA, 31); From f8e97f5a8af0de3f770abd7aab137933668fef76 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 21 Aug 2021 16:31:25 +0200 Subject: [PATCH 2/6] JitArm64: divwx - Special case dividend == 0 Zero divided by any number is still zero. For whatever reason, this case shows up frequently too. --- Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 7bc7ceb2a7..777281abfd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1327,6 +1327,13 @@ void JitArm64::divwx(UGeckoInstruction inst) if (inst.Rc) ComputeRC0(imm_d); } + else if (gpr.IsImm(a) && gpr.GetImm(a) == 0) + { + // Zero divided by anything is always zero + gpr.SetImmediate(d, 0); + if (inst.Rc) + ComputeRC0(0); + } else if (gpr.IsImm(a)) { const u32 dividend = gpr.GetImm(a); From eb8581c26dd000c496ef6b2181e42b5f17fe439a Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 21 Aug 2021 18:00:51 +0200 Subject: [PATCH 3/6] JitArm64: divwx - Optimize constant divisor Optimize division by a constant into multiplication. This method is also used by GCC and LLVM. 
We also add optimized paths for divisors 0, 1, and -1, because they don't work using this method. They don't occur very often, but are necessary for correctness. --- .../PowerPC/JitArm64/JitArm64_Integer.cpp | 76 +++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 777281abfd..0b29127976 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -10,10 +10,12 @@ #include "Core/CoreTiming.h" #include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h" +#include "Core/PowerPC/JitCommon/DivUtils.h" #include "Core/PowerPC/PPCTables.h" #include "Core/PowerPC/PowerPC.h" using namespace Arm64Gen; +using namespace JitCommon; #define CARRY_IF_NEEDED(inst_without_carry, inst_with_carry, ...) \ do \ @@ -1364,16 +1366,80 @@ void JitArm64::divwx(UGeckoInstruction inst) if (inst.Rc) ComputeRC0(RD); } - else if (gpr.IsImm(b) && gpr.GetImm(b) != 0 && gpr.GetImm(b) != UINT32_C(0xFFFFFFFF)) + else if (gpr.IsImm(b) && gpr.GetImm(b) != UINT32_C(0x80000000)) { - ARM64Reg WA = gpr.GetReg(); - MOVI2R(WA, gpr.GetImm(b)); + const s32 divisor = s32(gpr.GetImm(b)); gpr.BindToRegister(d, d == a); - SDIV(gpr.R(d), gpr.R(a), WA); + // Handle 0, 1, and -1 explicitly + if (divisor == 0) + { + ASR(gpr.R(d), gpr.R(a), 31); + } + else if (divisor == 1) + { + if (d != a) + MOV(gpr.R(d), gpr.R(a)); + } + else if (divisor == -1) + { + ARM64Reg WA = gpr.GetReg(); - gpr.Unlock(WA); + // Rd = (Ra == 0x80000000) ? 
0xFFFFFFFF : -Ra + MOVI2R(WA, 0x80000000); + CMP(gpr.R(a), WA); + NEG(gpr.R(d), gpr.R(a)); + CSINV(gpr.R(d), gpr.R(d), ARM64Reg::WZR, CCFlags::CC_NEQ); + + gpr.Unlock(WA); + } + else + { + // Optimize signed 32-bit integer division by a constant + Magic m = SignedDivisionConstants(divisor); + + ARM64Reg WA = gpr.GetReg(); + ARM64Reg WB = gpr.GetReg(); + ARM64Reg RD = gpr.R(d); + + ARM64Reg XA = EncodeRegTo64(WA); + ARM64Reg XB = EncodeRegTo64(WB); + ARM64Reg XD = EncodeRegTo64(RD); + + SXTW(XA, gpr.R(a)); + MOVI2R(XB, s64(m.multiplier)); + + if (divisor > 0 && m.multiplier < 0) + { + MUL(XD, XA, XB); + ADD(XD, XA, XD, ArithOption(XD, ShiftType::LSR, 32)); + LSR(WA, WA, 31); + ADD(RD, WA, RD, ArithOption(RD, ShiftType::ASR, m.shift)); + } + else if (divisor < 0 && m.multiplier > 0) + { + MNEG(XD, XA, XB); + ADD(XA, XD, XA, ArithOption(XA, ShiftType::LSR, 32)); + LSR(RD, WA, 31); + ADD(RD, RD, WA, ArithOption(WA, ShiftType::ASR, m.shift)); + } + else if (m.multiplier > 0) + { + MUL(XD, XA, XB); + ASR(XD, XD, 32 + m.shift); + ADD(RD, RD, WA, ArithOption(WA, ShiftType::LSR, 31)); + } + else + { + MUL(XD, XA, XB); + LSR(XA, XD, 63); + ASR(XD, XD, 32 + m.shift); + ADD(RD, WA, RD); + } + + gpr.Unlock(WA, WB); + } if (inst.Rc) ComputeRC0(gpr.R(d)); From 9889e7eb33b5732add517f9197c710f9ee56e3fb Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 21 Aug 2021 18:47:31 +0200 Subject: [PATCH 4/6] JitArm64: divwx - Optimize power-of-two divisors Power-of-two divisors can be done more elegantly, so handle them separately. 
--- Source/Core/Common/Arm64Emitter.h | 8 +++++++ .../PowerPC/JitArm64/JitArm64_Integer.cpp | 24 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 9093642417..5ea860a77a 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -896,7 +896,15 @@ public: CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1)); } void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); } + void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option) + { + SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option); + } void NEGS(ARM64Reg Rd, ARM64Reg Rs) { SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); } + void NEGS(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option) + { + SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option); + } // Data-Processing 1 source void RBIT(ARM64Reg Rd, ARM64Reg Rn); void REV16(ARM64Reg Rd, ARM64Reg Rn); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 0b29127976..a6b88a8cb1 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1366,7 +1366,7 @@ void JitArm64::divwx(UGeckoInstruction inst) if (inst.Rc) ComputeRC0(RD); } - else if (gpr.IsImm(b) && gpr.GetImm(b) != UINT32_C(0x80000000)) + else if (gpr.IsImm(b)) { const s32 divisor = s32(gpr.GetImm(b)); @@ -1394,6 +1394,28 @@ void JitArm64::divwx(UGeckoInstruction inst) gpr.Unlock(WA); } + else if (MathUtil::IsPow2(divisor) || MathUtil::IsPow2(-static_cast<s64>(divisor))) + { + const u32 abs_val = static_cast<u32>(std::abs(static_cast<s64>(divisor))); + + ARM64Reg RA = gpr.R(a); + ARM64Reg RD = gpr.R(d); + + const bool allocate_reg = a == d; + ARM64Reg WA = allocate_reg ? 
gpr.GetReg() : RD; + + TST(RA, RA); + ADDI2R(WA, RA, abs_val - 1, WA); + CSEL(WA, RA, WA, CCFlags::CC_PL); + + if (divisor < 0) + NEG(RD, WA, ArithOption(WA, ShiftType::ASR, IntLog2(abs_val))); + else + ASR(RD, WA, IntLog2(abs_val)); + + if (allocate_reg) + gpr.Unlock(WA); + } else { // Optimize signed 32-bit integer division by a constant From 91b112b984ec8d460bd15fc64129411ee633d5a5 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 21 Aug 2021 18:59:55 +0200 Subject: [PATCH 5/6] JitArm64: divwx - Optimize division by 2 ...and let's optimize a divisor of 2 ever so slightly for good measure. Most GameCube games seem to hit this on launch. --- .../Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index a6b88a8cb1..f61fbe49a5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1394,6 +1394,18 @@ void JitArm64::divwx(UGeckoInstruction inst) gpr.Unlock(WA); } + else if (divisor == 2 || divisor == -2) + { + ARM64Reg RA = gpr.R(a); + ARM64Reg RD = gpr.R(d); + + ADD(RD, RA, RA, ArithOption(RA, ShiftType::LSR, 31)); + + if (divisor < 0) + NEG(RD, RD, ArithOption(RD, ShiftType::ASR, 1)); + else + ASR(RD, RD, 1); + } else if (MathUtil::IsPow2(divisor) || MathUtil::IsPow2(-static_cast<s64>(divisor))) { const u32 abs_val = static_cast<u32>(std::abs(static_cast<s64>(divisor))); From feefc17b023cda2d5c012d8b5b1a5ca019a3d717 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 21 Aug 2021 20:28:18 +0200 Subject: [PATCH 6/6] JitArm64: divwx - Optimize comparisons to 0x80000000 --- .../Core/PowerPC/JitArm64/JitArm64_Integer.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 
f61fbe49a5..989a732cee 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1384,15 +1384,9 @@ void JitArm64::divwx(UGeckoInstruction inst) } else if (divisor == -1) { - ARM64Reg WA = gpr.GetReg(); - // Rd = (Ra == 0x80000000) ? 0xFFFFFFFF : -Ra - MOVI2R(WA, 0x80000000); - CMP(gpr.R(a), WA); - NEG(gpr.R(d), gpr.R(a)); - CSINV(gpr.R(d), gpr.R(d), ARM64Reg::WZR, CCFlags::CC_NEQ); - - gpr.Unlock(WA); + NEGS(gpr.R(d), gpr.R(a)); + CSINV(gpr.R(d), gpr.R(d), ARM64Reg::WZR, CCFlags::CC_VC); } else if (divisor == 2 || divisor == -2) { @@ -1484,15 +1478,13 @@ void JitArm64::divwx(UGeckoInstruction inst) gpr.BindToRegister(d, d == a || d == b); - ARM64Reg WA = gpr.GetReg(); ARM64Reg RA = gpr.R(a); ARM64Reg RB = gpr.R(b); ARM64Reg RD = gpr.R(d); FixupBranch overflow1 = CBZ(RB); - MOVI2R(WA, -0x80000000LL); - CMP(RA, WA); - CCMN(RB, 1, 0, CC_EQ); + NEGS(ARM64Reg::WZR, RA); // Is RA 0x80000000? + CCMN(RB, 1, 0, CC_VS); // Is RB -1? FixupBranch overflow2 = B(CC_EQ); SDIV(RD, RA, RB); FixupBranch done = B(); @@ -1504,8 +1496,6 @@ void JitArm64::divwx(UGeckoInstruction inst) SetJumpTarget(done); - gpr.Unlock(WA); - if (inst.Rc) ComputeRC0(RD); }