From 825a10616c58e0dd7a733fed9d684ccadc97826c Mon Sep 17 00:00:00 2001 From: Bram Speeckaert Date: Sat, 23 Mar 2024 13:02:41 +0100 Subject: [PATCH 1/3] DivUtils: Add unsigned division magic function Takes the logic from Jit64 and moves it into DivUtils, so it can be reused by other backends as well. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 2 +- .../PowerPC/JitArm64/JitArm64_Integer.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/DivUtils.cpp | 42 ++++++++++++++++--- Source/Core/Core/PowerPC/JitCommon/DivUtils.h | 25 ++++++++++- .../UnitTests/Core/PowerPC/DivUtilsTest.cpp | 41 +++++++++++++++--- 5 files changed, 97 insertions(+), 15 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index afc1c9a920..671bd9fa9e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1792,7 +1792,7 @@ void Jit64::divwx(UGeckoInstruction inst) else { // Optimize signed 32-bit integer division by a constant - Magic m = SignedDivisionConstants(divisor); + SignedMagic m = SignedDivisionConstants(divisor); MOVSX(64, 32, RSCRATCH, Ra); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 43c90c9827..95f75a3f2a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1675,7 +1675,7 @@ void JitArm64::divwx(UGeckoInstruction inst) else { // Optimize signed 32-bit integer division by a constant - Magic m = SignedDivisionConstants(divisor); + SignedMagic m = SignedDivisionConstants(divisor); ARM64Reg WA = gpr.GetReg(); ARM64Reg WB = gpr.GetReg(); diff --git a/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp b/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp index f1b6a1baf6..4648407e86 100644 --- a/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/DivUtils.cpp @@ -3,16 
+3,18 @@ #include "Core/PowerPC/JitCommon/DivUtils.h" +#include <algorithm> +#include <bit> #include <cstdlib> namespace JitCommon { -Magic SignedDivisionConstants(s32 d) +SignedMagic SignedDivisionConstants(s32 divisor) { const u32 two31 = 2147483648; - const u32 ad = std::abs(d); - const u32 t = two31 - (d < 0); + const u32 ad = std::abs(divisor); + const u32 t = two31 - (divisor < 0); const u32 anc = t - 1 - t % ad; u32 q1 = two31 / anc; u32 r1 = two31 - q1 * anc; @@ -44,13 +46,43 @@ Magic SignedDivisionConstants(s32 d) delta = ad - r2; } while (q1 < delta || (q1 == delta && r1 == 0)); - Magic mag; + SignedMagic mag; mag.multiplier = q2 + 1; - if (d < 0) + if (divisor < 0) mag.multiplier = -mag.multiplier; mag.shift = p - 32; return mag; } +UnsignedMagic UnsignedDivisionConstants(u32 divisor) +{ + u32 shift = 31 - std::countl_zero(divisor); + + u64 magic_dividend = 0x100000000ULL << shift; + u32 multiplier = magic_dividend / divisor; + u32 max_quotient = multiplier >> shift; + + // Test for failure in round-up method + u32 round_up = (u64(multiplier + 1) * (max_quotient * divisor - 1)) >> (shift + 32); + bool fast = round_up == max_quotient - 1; + + if (fast) + { + multiplier++; + + // Use smallest magic number and shift amount possible + u32 trailing_zeroes = std::min(shift, u32(std::countr_zero(multiplier))); + multiplier >>= trailing_zeroes; + shift -= trailing_zeroes; + } + + UnsignedMagic mag; + mag.multiplier = multiplier; + mag.shift = shift; + mag.fast = fast; + + return mag; +} + } // namespace JitCommon diff --git a/Source/Core/Core/PowerPC/JitCommon/DivUtils.h b/Source/Core/Core/PowerPC/JitCommon/DivUtils.h index 73d91426e1..2cc3f2e494 100644 --- a/Source/Core/Core/PowerPC/JitCommon/DivUtils.h +++ b/Source/Core/Core/PowerPC/JitCommon/DivUtils.h @@ -7,7 +7,7 @@ namespace JitCommon { -struct Magic +struct SignedMagic { s32 multiplier; u8 shift; @@ -16,6 +16,27 @@ struct Magic // Calculate the constants required to optimize a signed 32-bit integer division. 
// Taken from The PowerPC Compiler Writer's Guide and LLVM. // Divisor must not be -1, 0, 1 or INT_MIN. -Magic SignedDivisionConstants(s32 divisor); +SignedMagic SignedDivisionConstants(s32 divisor); + +struct UnsignedMagic +{ + u32 multiplier; + u8 shift; + bool fast; +}; + +/// Calculate the constants required to optimize an unsigned 32-bit integer +/// division. +/// Divisor must not be 0, 1, or a power of two. +/// +/// Original implementation by calc84maniac. +/// Results are the same as the approach laid out in Hacker's Delight, with an +/// improvement for so-called uncooperative divisors (e.g. 7), as discovered by +/// ridiculousfish. +/// +/// See also: +/// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html +/// https://rubenvannieuwpoort.nl/posts/division-by-constant-unsigned-integers +UnsignedMagic UnsignedDivisionConstants(u32 divisor); } // namespace JitCommon diff --git a/Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp b/Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp index 894d0f628a..b218745849 100644 --- a/Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp +++ b/Source/UnitTests/Core/PowerPC/DivUtilsTest.cpp @@ -9,12 +9,12 @@ using namespace JitCommon; TEST(DivUtils, Signed) { - Magic m3 = SignedDivisionConstants(3); - Magic m5 = SignedDivisionConstants(5); - Magic m7 = SignedDivisionConstants(7); - Magic minus3 = SignedDivisionConstants(-3); - Magic minus5 = SignedDivisionConstants(-5); - Magic minus7 = SignedDivisionConstants(-7); + SignedMagic m3 = SignedDivisionConstants(3); + SignedMagic m5 = SignedDivisionConstants(5); + SignedMagic m7 = SignedDivisionConstants(7); + SignedMagic minus3 = SignedDivisionConstants(-3); + SignedMagic minus5 = SignedDivisionConstants(-5); + SignedMagic minus7 = SignedDivisionConstants(-7); EXPECT_EQ(0x55555556, m3.multiplier); EXPECT_EQ(0, m3.shift); @@ -30,3 +30,32 @@ TEST(DivUtils, Signed) EXPECT_EQ(0x6DB6DB6D, minus7.multiplier); EXPECT_EQ(2, minus7.shift); } + +TEST(DivUtils, Unsigned) +{ + 
UnsignedMagic m3 = UnsignedDivisionConstants(3); + UnsignedMagic m5 = UnsignedDivisionConstants(5); + UnsignedMagic m7 = UnsignedDivisionConstants(7); + UnsignedMagic m9 = UnsignedDivisionConstants(9); + UnsignedMagic m19 = UnsignedDivisionConstants(19); + + EXPECT_EQ(0xAAAAAAABU, m3.multiplier); + EXPECT_EQ(1, m3.shift); + EXPECT_TRUE(m3.fast); + + EXPECT_EQ(0xCCCCCCCDU, m5.multiplier); + EXPECT_EQ(2, m5.shift); + EXPECT_TRUE(m5.fast); + + EXPECT_EQ(0x92492492U, m7.multiplier); + EXPECT_EQ(2, m7.shift); + EXPECT_FALSE(m7.fast); + + EXPECT_EQ(0x38E38E39U, m9.multiplier); + EXPECT_EQ(1, m9.shift); + EXPECT_TRUE(m9.fast); + + EXPECT_EQ(0xD79435E5U, m19.multiplier); + EXPECT_EQ(4, m19.shift); + EXPECT_FALSE(m19.fast); +} From 749ee2ff5e0d80ea260160403c1a2d33491d1100 Mon Sep 17 00:00:00 2001 From: Bram Speeckaert Date: Sat, 23 Mar 2024 13:05:57 +0100 Subject: [PATCH 2/3] Jit64: Refactor divwux Now that we've moved the logic to DivUtils, refactor the Jit64 code to use it. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 37 ++++++------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 671bd9fa9e..9925fd4c0d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1451,12 +1451,10 @@ void Jit64::divwux(UGeckoInstruction inst) } else { - u32 shift = 31; - while (!(divisor & (1 << shift))) - shift--; - - if (divisor == (u32)(1 << shift)) + if (MathUtil::IsPow2(divisor)) { + u32 shift = MathUtil::IntLog2(divisor); + RCOpArg Ra = gpr.Use(a, RCMode::Read); RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RegCache::Realize(Ra, Rd); @@ -1468,24 +1466,22 @@ void Jit64::divwux(UGeckoInstruction inst) } else { - u64 magic_dividend = 0x100000000ULL << shift; - u32 magic = (u32)(magic_dividend / divisor); - u32 max_quotient = magic >> shift; + UnsignedMagic m = UnsignedDivisionConstants(divisor); // Test for 
failure in round-up method - if (((u64)(magic + 1) * (max_quotient * divisor - 1)) >> (shift + 32) != max_quotient - 1) + if (!m.fast) { // If failed, use slower round-down method RCOpArg Ra = gpr.Use(a, RCMode::Read); RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RegCache::Realize(Ra, Rd); - MOV(32, R(RSCRATCH), Imm32(magic)); + MOV(32, R(RSCRATCH), Imm32(m.multiplier)); if (d != a) MOV(32, Rd, Ra); IMUL(64, Rd, R(RSCRATCH)); ADD(64, Rd, R(RSCRATCH)); - SHR(64, Rd, Imm8(shift + 32)); + SHR(64, Rd, Imm8(m.shift + 32)); } else { @@ -1494,32 +1490,23 @@ void Jit64::divwux(UGeckoInstruction inst) RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RegCache::Realize(Ra, Rd); - magic++; - - // Use smallest magic number and shift amount possible - while ((magic & 1) == 0 && shift > 0) - { - magic >>= 1; - shift--; - } - // Three-operand IMUL sign extends the immediate to 64 bits, so we may only // use it when the magic number has its most significant bit set to 0 - if ((magic & 0x80000000) == 0) + if ((m.multiplier & 0x80000000) == 0) { - IMUL(64, Rd, Ra, Imm32(magic)); + IMUL(64, Rd, Ra, Imm32(m.multiplier)); } else if (d == a) { - MOV(32, R(RSCRATCH), Imm32(magic)); + MOV(32, R(RSCRATCH), Imm32(m.multiplier)); IMUL(64, Rd, R(RSCRATCH)); } else { - MOV(32, Rd, Imm32(magic)); + MOV(32, Rd, Imm32(m.multiplier)); IMUL(64, Rd, Ra); } - SHR(64, Rd, Imm8(shift + 32)); + SHR(64, Rd, Imm8(m.shift + 32)); } } if (inst.OE) From 2580837c60b991ae34e7f7ae4b562e382857c4de Mon Sep 17 00:00:00 2001 From: Bram Speeckaert Date: Sat, 23 Mar 2024 13:08:35 +0100 Subject: [PATCH 3/3] JitArm64: Optimize divwux When the divisor is a constant value, we can emit more efficient code. For powers of two, we can use bit shifts. For other values, we can instead use a multiplication by magic constant method. 
- Example 1 - Division by 16 (power of two) Before: mov w24, #0x10 ; =16 udiv w27, w25, w24 After: lsr w27, w25, #4 - Example 2 - Division by 10 (fast) Before: mov w25, #0xa ; =10 udiv w27, w26, w25 After: mov w27, #0xcccd ; =52429 movk w27, #0xcccc, lsl #16 umull x27, w26, w27 lsr x27, x27, #35 - Example 3 - Division by 127 (slow) Before: mov w26, #0x7f ; =127 udiv w27, w27, w26 After: mov w26, #0x408 ; =1032 movk w26, #0x8102, lsl #16 umaddl x27, w27, w26, x26 lsr x27, x27, #38 --- .../PowerPC/JitArm64/JitArm64_Integer.cpp | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp index 95f75a3f2a..b5ebfc8bf4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp @@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst) if (inst.Rc) ComputeRC0(gpr.GetImm(d)); } + else if (gpr.IsImm(b)) + { + const u32 divisor = gpr.GetImm(b); + + if (divisor == 0) + { + gpr.SetImmediate(d, 0); + if (inst.Rc) + ComputeRC0(0); + } + else + { + const bool allocate_reg = d == a; + gpr.BindToRegister(d, allocate_reg); + + ARM64Reg RD = gpr.R(d); + ARM64Reg RA = gpr.R(a); + + if (MathUtil::IsPow2(divisor)) + { + int shift = MathUtil::IntLog2(divisor); + if (shift) + LSR(RD, RA, shift); + else if (d != a) + MOV(RD, RA); + } + else + { + UnsignedMagic m = UnsignedDivisionConstants(divisor); + + ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD; + ARM64Reg XD = EncodeRegTo64(RD); + + MOVI2R(WI, m.multiplier); + + if (m.fast) + { + UMULL(XD, RA, WI); + } + else + { + UMADDL(XD, RA, WI, EncodeRegTo64(WI)); + } + + LSR(XD, XD, 32 + m.shift); + + if (allocate_reg) + gpr.Unlock(WI); + } + + if (inst.Rc) + ComputeRC0(gpr.R(d)); + } + } else { gpr.BindToRegister(d, d == a || d == b);