[AArch64] Optimize slowmem paired stores.

This came up from an earlier discussion about how dumping half a kilobyte of VFP registers to the stack is insanity.
That behaviour came from me basically copying exactly what I did for ARMv7's paired loadstores, where the impact is smaller since we only use the bottom 64 bits of
the VFP registers.

So I decided to think about how to improve upon this since I got called out on my terrible code.
The solution I came up with: instead of jumping to the common ASM routine and having it check whether it needs to take the fastmem or slowmem
route, inline the check into the JIT block and jump directly to either a fastmem or a slowmem handler.
Fairly simple, and it lets us flush only the registers that are actually needed, and only when taking the slow path. This should give a reasonable performance
increase in games that hit the slowmem path a lot.
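
As a rough C++ sketch of the control flow each psq_st now emits (the function name, the mask value, and the function-pointer typing here are illustrative only; the real emitter sequence is in the diff below):

    #include <cstdint>

    // Conceptual rendering of the emitted dispatch; not actual Dolphin code.
    // 'type' is the GQR store-quantize type and 'W' the single/paired bit, both known to the JIT.
    using StoreHandler = void (*)();
    extern const StoreHandler pairedStoreQuantized[32];  // 16 fast handlers, then their 16 slow twins

    void StorePairedDispatch(uint32_t addr, uint32_t type, uint32_t W)
    {
        const uint32_t kSlowPathMask = 0x0C000000;  // illustrative: address bits that rule out fastmem
        if ((addr & kSlowPathMask) == 0)
        {
            // Fastmem: nothing gets spilled, the handler stores straight into the memory arena.
            pairedStoreQuantized[W * 8 + type]();
        }
        else
        {
            // Slowmem: push only the caller-saved GPRs/FPRs that are live, call the handler
            // that goes through PowerPC::Write_*, then pop them again.
            pairedStoreQuantized[16 + W * 8 + type]();
        }
    }

The register push/pop cost is now only paid on the branch that actually needs it, and only for the registers that are live at that point.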
Ryan Houdek 2015-03-08 12:30:41 -05:00
parent f6511c3ba5
commit 7f50cc0873
2 changed files with 218 additions and 212 deletions


@@ -94,9 +94,18 @@ void JitArm64::psq_st(UGeckoInstruction inst)
fpr.Lock(Q0, Q1);
ARM64Reg arm_addr = gpr.R(inst.RA);
ARM64Reg VS = fpr.R(inst.RS);
ARM64Reg scale_reg = W0;
ARM64Reg addr_reg = W1;
ARM64Reg type_reg = gpr.GetReg();
ARM64Reg type_reg = W2;
BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
// Wipe the registers we are using as temporaries
gprs_in_use &= BitSet32(~0x40000007);
fprs_in_use &= BitSet32(~3);
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
@@ -118,13 +127,35 @@ void JitArm64::psq_st(UGeckoInstruction inst)
if (update)
MOV(arm_addr, addr_reg);
ARM64Reg VS = fpr.R(inst.RS);
m_float_emit.FCVTN(32, D0, VS);
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(X30);
gpr.Unlock(W0, W1, W2, W30, type_reg);
// Inline address check
{
TST(addr_reg, 6, 1);
FixupBranch argh = B(CC_NEQ);
// Fast
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(EncodeRegTo64(type_reg));
FixupBranch continue1 = B();
SetJumpTarget(argh);
// Slow
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
ABI_PushRegisters(gprs_in_use);
m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
BLR(EncodeRegTo64(type_reg));
m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
ABI_PopRegisters(gprs_in_use);
SetJumpTarget(continue1);
}
gpr.Unlock(W0, W1, W2, W30);
fpr.Unlock(Q0, Q1);
}


@@ -107,7 +107,6 @@ void JitArm64AsmRoutineManager::GenerateCommon()
ARM64Reg addr_reg = X1;
ARM64Reg scale_reg = X0;
ARM64FloatEmitter float_emit(this);
const u32 GPR_CALLER_SAVE = 0x6007FFFF;
const u8* loadPairedIllegal = GetCodePtr();
BRK(100);
@@ -263,36 +262,27 @@ void JitArm64AsmRoutineManager::GenerateCommon()
// Stores
const u8* storePairedIllegal = GetCodePtr();
BRK(0x101);
const u8* storePairedFloat = GetCodePtr();
const u8* storePairedFloat;
const u8* storePairedFloatSlow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedFloat = GetCodePtr();
float_emit.REV32(8, D0, D0);
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(64, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedFloatSlow = GetCodePtr();
float_emit.UMOV(64, X0, Q0, 0);
ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
MOVI2R(X30, (u64)PowerPC::Write_U64);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U64);
BR(X2);
}
const u8* storePairedU8 = GetCodePtr();
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
const u8* storePairedU8;
const u8* storePairedU8Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -300,30 +290,26 @@ void JitArm64AsmRoutineManager::GenerateCommon()
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedU8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(16, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedU8Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(16, W0, Q0, 0);
REV16(W0, W0);
MOVI2R(X30, (u64)PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U16);
BR(X2);
}
const u8* storePairedS8 = GetCodePtr();
const u8* storePairedS8;
const u8* storePairedS8Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -331,31 +317,27 @@ void JitArm64AsmRoutineManager::GenerateCommon()
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedS8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(16, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedS8Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(16, W0, Q0, 0);
REV16(W0, W0);
MOVI2R(X30, (u64)PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U16);
BR(X2);
}
const u8* storePairedU16 = GetCodePtr();
const u8* storePairedU16;
const u8* storePairedU16Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -363,29 +345,26 @@ void JitArm64AsmRoutineManager::GenerateCommon()
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.REV16(8, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedU16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(32, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedU16Slow = GetCodePtr();
emit_quantize();
float_emit.REV32(8, D0, D0);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)PowerPC::Write_U32);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U32);
BR(X2);
}
const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie
const u8* storePairedS16; // Used by Viewtiful Joe's intro movie
const u8* storePairedS16Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -393,54 +372,41 @@ void JitArm64AsmRoutineManager::GenerateCommon()
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.REV16(8, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedS16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(32, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedS16Slow = GetCodePtr();
emit_quantize();
float_emit.REV32(8, D0, D0);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)PowerPC::Write_U32);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U32);
BR(X2);
}
const u8* storeSingleFloat = GetCodePtr();
const u8* storeSingleFloat;
const u8* storeSingleFloatSlow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleFloat = GetCodePtr();
float_emit.REV32(8, D0, D0);
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.STR(32, INDEX_UNSIGNED, D0, addr_reg, 0);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storeSingleFloatSlow = GetCodePtr();
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U32);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)&PowerPC::Write_U32);
BR(X2);
}
const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
const u8* storeSingleU8; // Used by MKWii
const u8* storeSingleU8Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -448,28 +414,25 @@ void JitArm64AsmRoutineManager::GenerateCommon()
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleU8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(8, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U8);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleU8Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(8, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U8);
BR(X2);
}
const u8* storeSingleS8 = GetCodePtr();
const u8* storeSingleS8;
const u8* storeSingleS8Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
@@ -477,85 +440,77 @@ void JitArm64AsmRoutineManager::GenerateCommon()
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleS8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(8, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.SMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U8);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleS8Slow = GetCodePtr();
emit_quantize();
float_emit.SMOV(8, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U8);
BR(X2);
}
const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
const u8* storeSingleU16; // Used by MKWii
const u8* storeSingleU16Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleU16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.REV16(8, D0, D0);
float_emit.ST1(16, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleU16Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(16, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U16);
BR(X2);
}
const u8* storeSingleS16 = GetCodePtr();
const u8* storeSingleS16;
const u8* storeSingleS16Slow;
{
auto emit_quantize = [this, &float_emit, scale_reg]()
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
};
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleS16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.REV16(8, D0, D0);
float_emit.ST1(16, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.SMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleS16Slow = GetCodePtr();
emit_quantize();
float_emit.SMOV(16, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U16);
BR(X2);
}
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
ReserveCodeSpace(32 * sizeof(u8*));
// Fast
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
@@ -573,4 +528,24 @@ void JitArm64AsmRoutineManager::GenerateCommon()
pairedStoreQuantized[13] = storeSingleU16;
pairedStoreQuantized[14] = storeSingleS8;
pairedStoreQuantized[15] = storeSingleS16;
// Slow
pairedStoreQuantized[16] = storePairedFloatSlow;
pairedStoreQuantized[17] = storePairedIllegal;
pairedStoreQuantized[18] = storePairedIllegal;
pairedStoreQuantized[19] = storePairedIllegal;
pairedStoreQuantized[20] = storePairedU8Slow;
pairedStoreQuantized[21] = storePairedU16Slow;
pairedStoreQuantized[22] = storePairedS8Slow;
pairedStoreQuantized[23] = storePairedS16Slow;
pairedStoreQuantized[24] = storeSingleFloatSlow;
pairedStoreQuantized[25] = storePairedIllegal;
pairedStoreQuantized[26] = storePairedIllegal;
pairedStoreQuantized[27] = storePairedIllegal;
pairedStoreQuantized[28] = storeSingleU8Slow;
pairedStoreQuantized[29] = storeSingleU16Slow;
pairedStoreQuantized[30] = storeSingleS8Slow;
pairedStoreQuantized[31] = storeSingleS16Slow;
}
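
For reference, a minimal sketch of how the doubled table ends up being indexed (derived from the psq_st changes above; how the type field is pulled out of the GQR is assumed here, not shown in this hunk):

    // 'type' is the GQR store-quantize type (0-7), W picks paired (0) or single (1) stores,
    // and each slow handler sits 16 entries after its fast counterpart.
    unsigned StoreHandlerIndex(bool slow, unsigned W, unsigned type)
    {
        return (slow ? 16u : 0u) + W * 8 + type;
    }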