Merge pull request #9566 from Sintendo/jit64divwx

Jit64: Optimize divwx
This commit is contained in:
LC 2021-03-22 14:40:02 -04:00 committed by GitHub
commit 15ebb1d9e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 329 additions and 5 deletions

View File

@ -430,6 +430,8 @@ add_library(core
PowerPC/Interpreter/Interpreter_Tables.cpp
PowerPC/Interpreter/Interpreter.cpp
PowerPC/Interpreter/Interpreter.h
PowerPC/JitCommon/DivUtils.cpp
PowerPC/JitCommon/DivUtils.h
PowerPC/JitCommon/JitAsmCommon.cpp
PowerPC/JitCommon/JitAsmCommon.h
PowerPC/JitCommon/JitBase.cpp

View File

@ -96,7 +96,7 @@ public:
void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val);
void GenerateOverflow();
void GenerateOverflow(Gen::CCFlags cond = Gen::CCFlags::CC_NO);
void FinalizeCarryOverflow(bool oe, bool inv = false);
void FinalizeCarry(Gen::CCFlags cond);
void FinalizeCarry(bool ca);

View File

@ -16,10 +16,12 @@
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
#include "Core/PowerPC/JitCommon/DivUtils.h"
#include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PowerPC.h"
using namespace Gen;
using namespace JitCommon;
void Jit64::GenerateConstantOverflow(s64 val)
{
@ -42,9 +44,9 @@ void Jit64::GenerateConstantOverflow(bool overflow)
}
// We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer.
void Jit64::GenerateOverflow()
void Jit64::GenerateOverflow(Gen::CCFlags cond)
{
FixupBranch jno = J_CC(CC_NO);
FixupBranch jno = J_CC(cond);
// XER[OV/SO] = 1
MOV(8, PPCSTATE(xer_so_ov), Imm8(XER_OV_MASK | XER_SO_MASK));
FixupBranch exit = J();
@ -1342,6 +1344,207 @@ void Jit64::divwx(UGeckoInstruction inst)
GenerateConstantOverflow(false);
}
}
else if (gpr.IsImm(a))
{
// Constant dividend
const u32 dividend = gpr.Imm32(a);
if (dividend == 0)
{
if (inst.OE)
{
RCOpArg Rb = gpr.Use(b, RCMode::Read);
RegCache::Realize(Rb);
CMP_or_TEST(32, Rb, Imm32(0));
GenerateOverflow(CC_NZ);
}
// Zero divided by anything is always zero
gpr.SetImmediate32(d, 0);
}
else
{
RCX64Reg Rb = gpr.Bind(b, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
// no register choice
RCX64Reg eax = gpr.Scratch(EAX);
RCX64Reg edx = gpr.Scratch(EDX);
RegCache::Realize(Rb, Rd, eax, edx);
// Check for divisor == 0
TEST(32, Rb, Rb);
FixupBranch normal_path;
if (dividend == 0x80000000)
{
// Divisor is 0, proceed to overflow case
const FixupBranch overflow = J_CC(CC_Z);
// Otherwise, check for divisor == -1
CMP(32, Rb, Imm32(0xFFFFFFFF));
normal_path = J_CC(CC_NE);
SetJumpTarget(overflow);
}
else
{
// Divisor is not 0, take normal path
normal_path = J_CC(CC_NZ);
// Otherwise, proceed to overflow case
}
// Set Rd to all ones or all zeroes
if (dividend & 0x80000000)
MOV(32, Rd, Imm32(0xFFFFFFFF));
else
XOR(32, Rd, Rd);
if (inst.OE)
GenerateConstantOverflow(true);
const FixupBranch done = J();
SetJumpTarget(normal_path);
MOV(32, eax, Imm32(dividend));
CDQ();
IDIV(32, Rb);
MOV(32, Rd, eax);
if (inst.OE)
GenerateConstantOverflow(false);
SetJumpTarget(done);
}
}
else if (gpr.IsImm(b))
{
// Constant divisor
const s32 divisor = gpr.SImm32(b);
RCOpArg Ra = gpr.Use(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd);
// Handle 0, 1, and -1 explicitly
if (divisor == 0)
{
if (d != a)
MOV(32, Rd, Ra);
SAR(32, Rd, Imm8(31));
if (inst.OE)
GenerateConstantOverflow(true);
}
else if (divisor == 1)
{
if (d != a)
MOV(32, Rd, Ra);
if (inst.OE)
GenerateConstantOverflow(false);
}
else if (divisor == -1)
{
if (d != a)
MOV(32, Rd, Ra);
NEG(32, Rd);
const FixupBranch normal = J_CC(CC_NO);
MOV(32, Rd, Imm32(0xFFFFFFFF));
if (inst.OE)
GenerateConstantOverflow(true);
const FixupBranch done = J();
SetJumpTarget(normal);
if (inst.OE)
GenerateConstantOverflow(false);
SetJumpTarget(done);
}
else if (divisor == 2 || divisor == -2)
{
X64Reg tmp = RSCRATCH;
if (Ra.IsSimpleReg() && Ra.GetSimpleReg() != Rd)
tmp = Ra.GetSimpleReg();
else
MOV(32, R(tmp), Ra);
MOV(32, Rd, R(tmp));
SHR(32, Rd, Imm8(31));
ADD(32, Rd, R(tmp));
SAR(32, Rd, Imm8(1));
if (divisor < 0)
NEG(32, Rd);
if (inst.OE)
GenerateConstantOverflow(false);
}
else if (MathUtil::IsPow2(divisor) || MathUtil::IsPow2(-divisor))
{
u32 abs_val = std::abs(divisor);
X64Reg tmp = RSCRATCH;
if (Ra.IsSimpleReg() && Ra.GetSimpleReg() != Rd)
tmp = Ra.GetSimpleReg();
else
MOV(32, R(tmp), Ra);
TEST(32, R(tmp), R(tmp));
LEA(32, Rd, MDisp(tmp, abs_val - 1));
CMOVcc(32, Rd, R(tmp), CC_NS);
SAR(32, Rd, Imm8(IntLog2(abs_val)));
if (divisor < 0)
NEG(32, Rd);
if (inst.OE)
GenerateConstantOverflow(false);
}
else
{
// Optimize signed 32-bit integer division by a constant
Magic m = SignedDivisionConstants(divisor);
MOVSX(64, 32, RSCRATCH, Ra);
if (divisor > 0 && m.multiplier < 0)
{
IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
SHR(64, Rd, Imm8(32));
ADD(32, Rd, R(RSCRATCH));
SHR(32, R(RSCRATCH), Imm8(31));
SAR(32, Rd, Imm8(m.shift));
}
else if (divisor < 0 && m.multiplier > 0)
{
IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
SHR(64, R(RSCRATCH), Imm8(32));
SUB(32, R(RSCRATCH), Rd);
MOV(32, Rd, R(RSCRATCH));
SHR(32, Rd, Imm8(31));
SAR(32, R(RSCRATCH), Imm8(m.shift));
}
else if (m.multiplier > 0)
{
IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
SHR(32, R(RSCRATCH), Imm8(31));
SAR(64, R(Rd), Imm8(32 + m.shift));
}
else
{
IMUL(64, RSCRATCH, R(RSCRATCH), Imm32(m.multiplier));
MOV(64, Rd, R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(63));
SAR(64, R(Rd), Imm8(32 + m.shift));
}
ADD(32, Rd, R(RSCRATCH));
if (inst.OE)
GenerateConstantOverflow(false);
}
}
else
{
RCOpArg Ra = gpr.Use(a, RCMode::Read);
@ -1364,7 +1567,6 @@ void Jit64::divwx(UGeckoInstruction inst)
SetJumpTarget(overflow);
SAR(32, eax, Imm8(31));
MOV(32, Rd, eax);
if (inst.OE)
{
GenerateConstantOverflow(true);
@ -1376,12 +1578,13 @@ void Jit64::divwx(UGeckoInstruction inst)
CDQ();
IDIV(32, Rb);
MOV(32, Rd, eax);
if (inst.OE)
{
GenerateConstantOverflow(false);
}
SetJumpTarget(done);
MOV(32, Rd, eax);
}
if (inst.Rc)
ComputeRC(d);

View File

@ -0,0 +1,57 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <cstdlib>
#include "Core/PowerPC/JitCommon/DivUtils.h"
namespace JitCommon
{
Magic SignedDivisionConstants(s32 d)
{
const u32 two31 = 2147483648;
const u32 ad = std::abs(d);
const u32 t = two31 - (d < 0);
const u32 anc = t - 1 - t % ad;
u32 q1 = two31 / anc;
u32 r1 = two31 - q1 * anc;
u32 q2 = two31 / ad;
u32 r2 = two31 - q2 * ad;
s32 p = 31;
u32 delta;
do
{
p++;
q1 *= 2;
r1 *= 2;
if (r1 >= anc)
{
q1++;
r1 -= anc;
}
q2 *= 2;
r2 *= 2;
if (r2 >= ad)
{
q2++;
r2 -= ad;
}
delta = ad - r2;
} while (q1 < delta || (q1 == delta && r1 == 0));
Magic mag;
mag.multiplier = q2 + 1;
if (d < 0)
mag.multiplier = -mag.multiplier;
mag.shift = p - 32;
return mag;
}
} // namespace JitCommon

View File

@ -0,0 +1,22 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
#include "Common/CommonTypes.h"
namespace JitCommon
{
struct Magic
{
s32 multiplier;
u8 shift;
};
// Calculate the constants required to optimize a signed 32-bit integer division.
// Taken from The PowerPC Compiler Writer's Guide and LLVM.
// Divisor must not be -1, 0, and 1.
Magic SignedDivisionConstants(s32 divisor);
} // namespace JitCommon

View File

@ -27,6 +27,12 @@
<Project>{41279555-f94f-4ebc-99de-af863c10c5c4}</Project>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Core\PowerPC\JitCommon\DivUtils.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="Core\PowerPC\JitCommon\DivUtils.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

View File

@ -0,0 +1,33 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <gtest/gtest.h>
#include "Core/PowerPC/JitCommon/DivUtils.h"
using namespace JitCommon;
TEST(DivUtils, Signed)
{
Magic m3 = SignedDivisionConstants(3);
Magic m5 = SignedDivisionConstants(5);
Magic m7 = SignedDivisionConstants(7);
Magic minus3 = SignedDivisionConstants(-3);
Magic minus5 = SignedDivisionConstants(-5);
Magic minus7 = SignedDivisionConstants(-7);
EXPECT_EQ(0x55555556, m3.multiplier);
EXPECT_EQ(0, m3.shift);
EXPECT_EQ(0x66666667, m5.multiplier);
EXPECT_EQ(1, m5.shift);
EXPECT_EQ(-0x6DB6DB6D, m7.multiplier);
EXPECT_EQ(2, m7.shift);
EXPECT_EQ(-0x55555556, minus3.multiplier);
EXPECT_EQ(0, minus3.shift);
EXPECT_EQ(-0x66666667, minus5.multiplier);
EXPECT_EQ(1, minus5.shift);
EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
EXPECT_EQ(2, minus7.shift);
}

View File

@ -69,6 +69,7 @@
<ClCompile Include="Core\IOS\FS\FileSystemTest.cpp" />
<ClCompile Include="Core\MMIOTest.cpp" />
<ClCompile Include="Core\PageFaultTest.cpp" />
<ClCompile Include="DivUtilsTest.cpp" />
<ClCompile Include="FileUtil.cpp" />
<ClCompile Include="VideoCommon\VertexLoaderTest.cpp" />
<ClCompile Include="StubHost.cpp" />