[AArch64] Implement Fiora's preemptive paired loadstore optimization.

This provides a decent speedup in pretty much everything that touches paired loadstores, since in most cases they are just regular, non-quantizing float loadstores.
Ryan Houdek 2015-09-01 16:22:44 -05:00
parent e01428935f
commit 2c68f6bfc5
4 changed files with 131 additions and 48 deletions
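
The idea, in short: if the only GQR a block uses reads as zero and the block never writes it, every psq_l/psq_st in that block is a plain float load/store, so the JIT can compile it that way and guard the assumption at block entry. A minimal standalone sketch of that decision, with hypothetical names standing in for the analysis fields in the diff below (illustrative C++, not Dolphin's actual API):

#include <cstdint>
#include <set>

// Per-block analysis results (hypothetical stand-ins for
// code_block.m_gqr_used / m_gqr_modified in the diff below).
struct BlockInfo
{
	std::set<int> gqr_used;      // GQRs referenced by psq_l/psq_st
	std::set<int> gqr_modified;  // GQRs written inside the block
};

// A GQR value of 0 means type "single float" with scale 0 for both the
// load and store halves, i.e. no quantization work at all.
bool CanAssumeNoPairedQuantize(const BlockInfo& block, const uint32_t gqr[8])
{
	if (block.gqr_used.size() != 1)
		return false;
	int g = *block.gqr_used.begin();
	return block.gqr_modified.count(g) == 0 && gqr[g] == 0;
}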


@@ -7,6 +7,7 @@
 #include "Common/PerformanceCounter.h"
 #include "Core/PatchEngine.h"
+#include "Core/PowerPC/JitInterface.h"
 #include "Core/PowerPC/Profiler.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
@@ -351,6 +352,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 	js.isLastInstruction = false;
 	js.firstFPInstructionFound = false;
+	js.assumeNoPairedQuantize = false;
 	js.blockStart = em_address;
 	js.fifoBytesThisBlock = 0;
 	js.downcountAmount = 0;
@@ -396,6 +398,30 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 		// get start tic
 		BeginTimeProfile(b);
 	}
+	if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
+	{
+		int gqr = *code_block.m_gqr_used.begin();
+		if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
+		{
+			LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4);
+			FixupBranch no_fail = CBZ(W0);
+			FixupBranch fail = B();
+			SwitchToFarCode();
+			SetJumpTarget(fail);
+			MOVI2R(W0, js.blockStart);
+			STR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(pc));
+			MOVI2R(W0, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
+			MOVI2R(X1, (u64)&JitInterface::CompileExceptionCheck);
+			BLR(X1);
+			MOVI2R(X1, (u64)asm_routines.dispatcher);
+			BR(X1);
+			SwitchToNearCode();
+			SetJumpTarget(no_fail);
+			js.assumeNoPairedQuantize = true;
+		}
+	}
 	const u8 *normalEntry = GetCodePtr();
 	b->normalEntry = normalEntry;
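
Reading the emitted prologue above: the block loads the live GQR value and, if it is still zero, falls straight through into the specialized body; otherwise far code records the block address in pc, requests an EXCEPTIONS_PAIRED_QUANTIZE recompile through JitInterface::CompileExceptionCheck, and jumps back to the dispatcher. A toy model of that control flow (my sketch, hypothetical names only):

#include <cstdint>
#include <cstdio>

// Toy model of the guarded block entry.
static uint32_t g_gqr[8];        // stands in for ppcState.spr[SPR_GQR0 + n]

enum class EntryResult { RanSpecialized, RequestedRecompile };

EntryResult EnterBlock(int gqr)
{
	if (g_gqr[gqr] != 0)
		return EntryResult::RequestedRecompile;  // far-code path: flag + dispatcher
	return EntryResult::RanSpecialized;          // near-code path: fast block body
}

int main()
{
	printf("%d\n", EnterBlock(2) == EntryResult::RanSpecialized);  // 1: GQR2 still 0
	g_gqr[2] = 0x00070007;                                         // game set a quantized type
	printf("%d\n", EnterBlock(2) == EntryResult::RanSpecialized);  // 0: block must recompile
	return 0;
}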


@@ -51,7 +51,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 	{
 		if (flags & BackPatchInfo::FLAG_STORE &&
-		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
+		    flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@@ -64,6 +64,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				m_float_emit.REV32(8, D0, RS);
 				m_float_emit.STR(32, D0, X28, addr);
 			}
+			else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
+			{
+				m_float_emit.FCVTN(32, D0, RS);
+				m_float_emit.REV32(8, D0, D0);
+				m_float_emit.STR(64, Q0, X28, addr);
+			}
 			else
 			{
 				m_float_emit.REV64(8, Q0, RS);
@@ -71,7 +77,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			}
 		}
 		else if (flags & BackPatchInfo::FLAG_LOAD &&
-		         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		         flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@@ -166,7 +172,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 		m_float_emit.ABI_PushRegisters(fprs_to_push, X30);
 		if (flags & BackPatchInfo::FLAG_STORE &&
-		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
+		    flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@@ -181,6 +187,14 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				MOVI2R(X30, (u64)&PowerPC::Write_U32);
 				BLR(X30);
 			}
+			else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
+			{
+				m_float_emit.FCVTN(32, D0, RS);
+				m_float_emit.UMOV(64, X0, D0, 0);
+				ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
+				MOVI2R(X30, (u64)PowerPC::Write_U64);
+				BLR(X30);
+			}
 			else
 			{
 				MOVI2R(X30, (u64)&PowerPC::Write_U64);
@@ -190,7 +204,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 		}
 		else if (flags & BackPatchInfo::FLAG_LOAD &&
-		         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		         flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
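
The new FLAG_SIZE_F32X2 store paths are worth unpacking. FCVTN narrows the two doubles of the paired register into two floats in D0. The fastmem path then REV32s each lane, because a plain STR to big-endian guest memory needs per-float byte swaps. The slowmem path instead UMOVs the 64-bit pair to X0 and rotates it by 32: Write_U64 does the byte swapping itself, so only the word order needs fixing, with ps0 moved into the high word that lands at the lower guest address. A self-contained check of that rotate, as I read the code (hypothetical helper name):

#include <cstdint>
#include <cstdio>
#include <cstring>

// After FCVTN, lane 0 (ps0) sits in the low 32 bits of the vector register,
// so UMOV yields [ps1:ps0]. Write_U64 stores its high word first, so a
// 32-bit rotate puts ps0 where the guest expects it.
uint64_t PackPairForWriteU64(float ps0, float ps1)
{
	uint32_t lo, hi;
	std::memcpy(&lo, &ps0, sizeof(lo));  // lane 0 -> low half after UMOV
	std::memcpy(&hi, &ps1, sizeof(hi));  // lane 1 -> high half
	uint64_t v = ((uint64_t)hi << 32) | lo;
	return (v >> 32) | (v << 32);        // ROR #32: ps0 now in the high word
}

int main()
{
	// 1.0f = 0x3f800000, 2.0f = 0x40000000; prints 3f80000040000000,
	// i.e. ps0 first in guest memory order.
	printf("%016llx\n", (unsigned long long)PackPairForWriteU64(1.0f, 2.0f));
	return 0;
}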


@@ -38,8 +38,7 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 	ARM64Reg scale_reg = W0;
 	ARM64Reg addr_reg = W1;
 	ARM64Reg type_reg = W2;
-	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+	ARM64Reg VS;
 	if (inst.RA || update) // Always uses the register on update
 	{
@@ -53,21 +52,43 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 		MOVI2R(addr_reg, (u32)offset);
 	}
-	UBFM(type_reg, scale_reg, 16, 18); // Type
-	UBFM(scale_reg, scale_reg, 24, 29); // Scale
 	if (update)
 	{
 		gpr.BindToRegister(inst.RA, REG_REG);
 		MOV(arm_addr, addr_reg);
 	}
-	MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
-	LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
-	BLR(X30);
+	if (js.assumeNoPairedQuantize)
+	{
+		VS = fpr.RW(inst.RS, REG_REG);
+		if (!inst.W)
+		{
+			ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
+			m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
+			m_float_emit.REV32(8, VS, VS);
+		}
+		else
+		{
+			m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
+			m_float_emit.REV32(8, VS, VS);
+		}
+		m_float_emit.FCVTL(64, VS, VS);
+	}
+	else
+	{
+		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+		UBFM(type_reg, scale_reg, 16, 18); // Type
+		UBFM(scale_reg, scale_reg, 24, 29); // Scale
+		MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
+		LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
+		BLR(X30);
+		VS = fpr.RW(inst.RS, REG_REG);
+		m_float_emit.FCVTL(64, VS, D0);
+	}
-	ARM64Reg VS = fpr.RW(inst.RS, REG_REG);
-	m_float_emit.FCVTL(64, VS, D0);
 	if (inst.W)
 	{
 		m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
@@ -106,11 +127,9 @@
 	BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
 	// Wipe the registers we are using as temporaries
-	gprs_in_use &= BitSet32(~0x40000007);
+	gprs_in_use &= BitSet32(~7);
 	fprs_in_use &= BitSet32(~3);
-	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
 	if (inst.RA || update) // Always uses the register on update
 	{
 		if (offset >= 0)
@@ -123,40 +142,59 @@
 		MOVI2R(addr_reg, (u32)offset);
 	}
-	UBFM(type_reg, scale_reg, 0, 2); // Type
-	UBFM(scale_reg, scale_reg, 8, 13); // Scale
 	if (update)
 	{
 		gpr.BindToRegister(inst.RA, REG_REG);
 		MOV(arm_addr, addr_reg);
 	}
-	m_float_emit.FCVTN(32, D0, VS);
-	// Inline address check
+	if (js.assumeNoPairedQuantize)
+	{
+		u32 flags = BackPatchInfo::FLAG_STORE;
+		flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
+		EmitBackpatchRoutine(flags,
+			jo.fastmem,
+			jo.fastmem,
+			VS, EncodeRegTo64(addr_reg),
+			gprs_in_use,
+			fprs_in_use);
+	}
+	else
+	{
+		if (inst.W)
+			m_float_emit.FCVT(32, 64, D0, VS);
+		else
+			m_float_emit.FCVTN(32, D0, VS);
+		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+		UBFM(type_reg, scale_reg, 0, 2); // Type
+		UBFM(scale_reg, scale_reg, 8, 13); // Scale
+		// Inline address check
 		TST(addr_reg, 6, 1);
-	FixupBranch argh = B(CC_NEQ);
+		FixupBranch pass = B(CC_EQ);
+		FixupBranch fail = B();
+		SwitchToFarCode();
+		SetJumpTarget(fail);
+		// Slow
+		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
+		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+		ABI_PushRegisters(gprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
+		BLR(EncodeRegTo64(type_reg));
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
+		ABI_PopRegisters(gprs_in_use);
+		FixupBranch continue1 = B();
+		SwitchToNearCode();
+		SetJumpTarget(pass);
 		// Fast
 		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
 		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
 		BLR(EncodeRegTo64(type_reg));
-	FixupBranch continue1 = B();
-	SetJumpTarget(argh);
-	// Slow
-	MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
-	LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
-	ABI_PushRegisters(gprs_in_use);
-	m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
-	BLR(EncodeRegTo64(type_reg));
-	m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
-	ABI_PopRegisters(gprs_in_use);
 		SetJumpTarget(continue1);
+	}
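
Net effect in psq_l/psq_st: with assumeNoPairedQuantize set, psq_l becomes a plain big-endian float (or float-pair) load widened to double, psq_st defers to EmitBackpatchRoutine with the new flag, and the generic quantizer call plus its inline address check survive only on the else path, with the slow case pushed out to far code. What the fast load computes, as a hedged C++ sketch (hypothetical helper names):

#include <cstdint>
#include <cstring>

// Big-endian 32-bit read; plays the role of the emitted LD1/LDR + REV32.
static uint32_t LoadBE32(const uint8_t* p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

// psq_l with GQR == 0: no dequantization, just float loads widened to
// double (the FCVTL), with ps1 forced to 1.0 when W is set.
void PsqLoadFast(const uint8_t* mem, bool W, double out[2])
{
	float f0;
	uint32_t bits0 = LoadBE32(mem);
	std::memcpy(&f0, &bits0, sizeof(f0));
	out[0] = f0;
	if (!W)
	{
		float f1;
		uint32_t bits1 = LoadBE32(mem + 4);
		std::memcpy(&f1, &bits1, sizeof(f1));
		out[1] = f1;
	}
	else
	{
		out[1] = 1.0;  // matches the FMOV(D0, 0x70) that follows
	}
}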


@@ -9,17 +9,22 @@ struct BackPatchInfo
 {
 	enum
 	{
-		FLAG_STORE = (1 << 0),
-		FLAG_LOAD = (1 << 1),
-		FLAG_SIZE_8 = (1 << 2),
-		FLAG_SIZE_16 = (1 << 3),
-		FLAG_SIZE_32 = (1 << 4),
-		FLAG_SIZE_F32 = (1 << 5),
-		FLAG_SIZE_F64 = (1 << 6),
-		FLAG_REVERSE = (1 << 7),
-		FLAG_EXTEND = (1 << 8),
-		FLAG_SIZE_F32I = (1 << 9),
-		FLAG_ZERO_256 = (1 << 10),
+		FLAG_STORE = (1 << 0),
+		FLAG_LOAD = (1 << 1),
+		FLAG_SIZE_8 = (1 << 2),
+		FLAG_SIZE_16 = (1 << 3),
+		FLAG_SIZE_32 = (1 << 4),
+		FLAG_SIZE_F32 = (1 << 5),
+		FLAG_SIZE_F32X2 = (1 << 6),
+		FLAG_SIZE_F64 = (1 << 7),
+		FLAG_REVERSE = (1 << 8),
+		FLAG_EXTEND = (1 << 9),
+		FLAG_SIZE_F32I = (1 << 10),
+		FLAG_ZERO_256 = (1 << 11),
+		FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
+		                  FLAG_SIZE_F32X2 |
+		                  FLAG_SIZE_F64 |
+		                  FLAG_SIZE_F32I,
 	};
 	static u32 GetFlagSize(u32 flags)
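
FLAG_MASK_FLOAT exists so the call sites above can test "any float size?" with one AND instead of an OR-chain that would need touching every time a float flag is added, which is exactly how FLAG_SIZE_F32X2 slots in. Usage sketch mirroring the BackPatch.cpp callers:

// One AND now covers F32, F32X2, F64 and F32I at every dispatch site:
bool is_float_store = (flags & BackPatchInfo::FLAG_STORE) &&
                      (flags & BackPatchInfo::FLAG_MASK_FLOAT);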