JitArm64: Move psq_st address check to EmitBackpatchRoutine

This way the address check will take up less icache (since it's
only emitted once for each routine rather than once for each
psq_st instruction), and we also get address checking for psq_l.
Matches Jit64's approach.

The disadvantage: In the slowmem case, the routines have to
push *every* caller-saved register onto the stack, even though
most callers probably don't need it. But as long as the slowmem
case isn't hit frequently, this is fine.
JosJuice 2021-07-09 12:13:58 +02:00
parent cd84339dfd
commit 96760093e9
5 changed files with 182 additions and 251 deletions
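
Before the diff, a minimal standalone sketch of the new shape, assuming the standard GameCube/Wii effective-address layout. The names (is_safe_address, fast_store, slow_store, store_routine) are illustrative stand-ins rather than Dolphin code; the point is that the safety check runs once inside the shared routine instead of at every psq_st site.

#include <cstdint>
#include <cstdio>

// Heuristic from CheckIfSafeAddress in the diff below: an address whose
// 0x0c000000 bits are clear is assumed fastmem-safe (the FIXME about the
// BAT configuration applies here too).
static bool is_safe_address(uint32_t addr)
{
  return (addr & 0x0c000000) == 0;
}

static void fast_store(uint32_t addr, float v) { std::printf("fastmem 0x%08x <- %f\n", addr, v); }
static void slow_store(uint32_t addr, float v) { std::printf("slowmem 0x%08x <- %f\n", addr, v); }

// One shared routine: the check is emitted once here, so each psq_st site
// only branches to the routine instead of carrying its own check.
static void store_routine(uint32_t addr, float v)
{
  if (is_safe_address(addr))
    fast_store(addr, v);
  else
    slow_store(addr, v);  // the real slowmem path must save all caller-saved registers
}

int main()
{
  store_routine(0x80001000, 1.0f);  // typical RAM address: fast path
  store_routine(0xcc008000, 2.0f);  // MMIO-looking address: slow path
}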

@@ -225,10 +225,9 @@ protected:
void DumpCode(const u8* start, const u8* end);
// Backpatching routines
bool DisasmLoadStore(const u8* ptr, u32* flags, Arm64Gen::ARM64Reg* reg);
void EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, Arm64Gen::ARM64Reg RS,
Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0),
BitSet32 fprs_to_push = BitSet32(0));
BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false);
// Loadstore routines
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
@@ -236,6 +235,7 @@ protected:
// jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out.
Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in,
Arm64Gen::ARM64Reg tmp, const void* bat_table);
Arm64Gen::FixupBranch CheckIfSafeAddress(Arm64Gen::ARM64Reg addr);
void DoJit(u32 em_address, JitBlock* b, u32 nextPC);

@ -3,6 +3,7 @@
#include <cinttypes>
#include <cstddef>
#include <optional>
#include <string>
#include "Common/BitSet.h"
@@ -51,13 +52,18 @@ void JitArm64::DoBacktrace(uintptr_t access_address, SContext* ctx)
}
void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, ARM64Reg RS,
ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push)
ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push,
bool emitting_routine)
{
bool in_far_code = false;
const u8* fastmem_start = GetCodePtr();
std::optional<FixupBranch> slowmem_fixup;
if (fastmem)
{
if (do_farcode && emitting_routine)
slowmem_fixup = CheckIfSafeAddress(addr);
if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
{
ARM64Reg temp = ARM64Reg::D0;
@@ -110,34 +116,45 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
{
if (fastmem && do_farcode)
{
SlowmemHandler handler;
handler.dest_reg = RS;
handler.addr_reg = addr;
handler.gprs = gprs_to_push;
handler.fprs = fprs_to_push;
handler.flags = flags;
FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end];
auto handler_loc_iter = m_handler_to_loc.find(handler);
if (handler_loc_iter == m_handler_to_loc.end())
if (emitting_routine)
{
in_far_code = true;
SwitchToFarCode();
const u8* handler_loc = GetCodePtr();
m_handler_to_loc[handler] = handler_loc;
fastmem_area->fastmem_code = fastmem_start;
fastmem_area->slowmem_code = handler_loc;
}
else
{
const u8* handler_loc = handler_loc_iter->second;
fastmem_area->fastmem_code = fastmem_start;
fastmem_area->slowmem_code = handler_loc;
return;
SlowmemHandler handler;
handler.dest_reg = RS;
handler.addr_reg = addr;
handler.gprs = gprs_to_push;
handler.fprs = fprs_to_push;
handler.flags = flags;
FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end];
auto handler_loc_iter = m_handler_to_loc.find(handler);
if (handler_loc_iter == m_handler_to_loc.end())
{
in_far_code = true;
SwitchToFarCode();
const u8* handler_loc = GetCodePtr();
m_handler_to_loc[handler] = handler_loc;
fastmem_area->fastmem_code = fastmem_start;
fastmem_area->slowmem_code = handler_loc;
}
else
{
const u8* handler_loc = handler_loc_iter->second;
fastmem_area->fastmem_code = fastmem_start;
fastmem_area->slowmem_code = handler_loc;
return;
}
}
}
if (slowmem_fixup)
SetJumpTarget(*slowmem_fixup);
ABI_PushRegisters(gprs_to_push);
m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
@@ -229,8 +246,17 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
if (in_far_code)
{
RET(ARM64Reg::X30);
SwitchToNearCode();
if (emitting_routine)
{
FixupBranch done = B();
SwitchToNearCode();
SetJumpTarget(done);
}
else
{
RET(ARM64Reg::X30);
SwitchToNearCode();
}
}
}
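
The far-code exit is the subtle part of the hunk above: with emitting_routine set, the slow path must not RET, because control has to rejoin the routine's own trailing RET in near code. A hedged control-flow model in plain C++, standing in for the emitted ARM64:

#include <cstdio>

static void slowmem_handler() { std::puts("slowmem handler ran"); }

// emitting_routine == false: the far-code handler returns directly to the
// JIT block, so it ends in RET(X30).
static void exit_inline_case()
{
  slowmem_handler();
  return;  // RET
}

// emitting_routine == true: the handler branches back (B done) and the
// routine's single RET at its end does the returning for both paths.
static void exit_routine_case(bool safe)
{
  if (safe)
    std::puts("fastmem access");
  else
    slowmem_handler();  // SetJumpTarget(done) lands here after the branch back
  // shared RET at the end of the routine
}

int main()
{
  exit_inline_case();
  exit_routine_case(false);
}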

@@ -288,6 +288,16 @@ FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM6
return fail;
}
FixupBranch JitArm64::CheckIfSafeAddress(Arm64Gen::ARM64Reg addr)
{
// FIXME: This doesn't correctly account for the BAT configuration.
TST(addr, LogicalImm(0x0c000000, 32));
FixupBranch pass = B(CC_EQ);
FixupBranch fail = B();
SetJumpTarget(pass);
return fail;
}
void JitArm64::lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
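
Two worked examples of the CheckIfSafeAddress mask, assuming the usual GC/Wii effective-address map (0x80000000 cached RAM, 0xCC000000 MMIO); as the FIXME says, unusual BAT mappings can defeat it:

#include <cassert>
#include <cstdint>

int main()
{
  constexpr uint32_t mask = 0x0c000000;
  assert((0x80001000u & mask) == 0);  // typical RAM address: CC_EQ, fast path taken
  assert((0xcc008000u & mask) != 0);  // GX FIFO / MMIO range: falls through to slowmem
}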

@@ -19,10 +19,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(jo.memcheck || !jo.fastmem);
FALLBACK_IF(jo.memcheck);
// The asm routines assume address translation is on.
FALLBACK_IF(!MSR.DR);
// If we have a fastmem arena, the asm routines assume address translation is on.
FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
// X30 is LR
// X0 is the address
@@ -111,10 +111,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff);
FALLBACK_IF(jo.memcheck || !jo.fastmem);
FALLBACK_IF(jo.memcheck);
// The asm routines assume address translation is on.
FALLBACK_IF(!MSR.DR);
// If we have a fastmem arena, the asm routines assume address translation is on.
FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
// X30 is LR
// X0 contains the scale
@@ -213,33 +213,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
UBFM(type_reg, scale_reg, 0, 2); // Type
UBFM(scale_reg, scale_reg, 8, 13); // Scale
// Inline address check
// FIXME: This doesn't correctly account for the BAT configuration.
TST(addr_reg, LogicalImm(0x0c000000, 32));
FixupBranch pass = B(CC_EQ);
FixupBranch fail = B();
SwitchToFarCode();
SetJumpTarget(fail);
// Slow
MOVP2R(ARM64Reg::X30, &paired_store_quantized[16 + w * 8]);
LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
ABI_PushRegisters(gprs_in_use);
m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30);
BLR(EncodeRegTo64(type_reg));
m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30);
ABI_PopRegisters(gprs_in_use);
FixupBranch continue1 = B();
SwitchToNearCode();
SetJumpTarget(pass);
// Fast
MOVP2R(ARM64Reg::X30, &paired_store_quantized[w * 8]);
MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized);
LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(EncodeRegTo64(type_reg));
SetJumpTarget(continue1);
}
if (js.assumeNoPairedQuantize && !have_single)
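
With the inline fast/slow split gone, the psq_st site reduces to one table dispatch: the instruction's W bit picks the table and the GQR type field indexes into it. A simplified, hedged model of that dispatch, with function pointers standing in for the emitted routines and only the float entries fleshed out:

#include <cstdint>
#include <cstdio>

using StoreFn = void (*)();

static void store_paired_float() { std::puts("paired float store"); }
static void store_single_float() { std::puts("single float store"); }
static void store_illegal() { std::puts("illegal quantize type"); }

// Layout mirrors the tables built in GenerateQuantizedStores below:
// 0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16 (integer entries elided here).
static const StoreFn paired_store_quantized[8] = {
    store_paired_float, store_illegal, store_illegal, store_illegal,
    store_illegal,      store_illegal, store_illegal, store_illegal};
static const StoreFn single_store_quantized[8] = {
    store_single_float, store_illegal, store_illegal, store_illegal,
    store_illegal,      store_illegal, store_illegal, store_illegal};

int main()
{
  const uint32_t type = 0;  // GQR STYPE bits, extracted by the UBFM above
  const bool w = false;     // psq_st W bit: true stores one value instead of two
  (w ? single_store_quantized : paired_store_quantized)[type]();
}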

@@ -506,8 +506,9 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1},
fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push & ~BitSet32{1}, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* loadPairedU8Two = GetCodePtr();
@@ -515,7 +516,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -532,7 +534,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -549,7 +552,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -565,7 +569,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -582,8 +587,9 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags =
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1},
fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push & ~BitSet32{1}, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* loadPairedU8One = GetCodePtr();
@@ -591,7 +597,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags =
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -608,7 +615,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags =
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -625,7 +633,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags =
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -641,7 +650,8 @@ void JitArm64::GenerateQuantizedLoads()
constexpr u32 flags =
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
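
Every call site in these hunks forwards jo.fastmem_arena as both the fastmem and do_farcode arguments. A hedged reading of what that pair selects inside EmitBackpatchRoutine, modeled as plain branches rather than emitter calls:

#include <cstdio>

static void emit_backpatch_routine(bool fastmem, bool do_farcode)
{
  if (fastmem)
  {
    std::puts("guarded fast path in near code");
    if (do_farcode)
      std::puts("slowmem fallback in far code");
  }
  else
  {
    std::puts("slowmem path emitted inline");  // no fastmem arena: no fast path at all
  }
}

int main()
{
  emit_backpatch_routine(true, true);    // jo.fastmem_arena == true
  emit_backpatch_routine(false, false);  // jo.fastmem_arena == false
}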
@@ -697,256 +707,181 @@ void JitArm64::GenerateQuantizedStores()
const u8* start = GetCodePtr();
const u8* storePairedIllegal = GetCodePtr();
BRK(0x101);
const u8* storePairedFloat;
const u8* storePairedFloatSlow;
const u8* storePairedFloat = GetCodePtr();
{
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32;
storePairedFloat = GetCodePtr();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storePairedFloatSlow = GetCodePtr();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storePairedU8;
const u8* storePairedU8Slow;
const u8* storePairedU8 = GetCodePtr();
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
storePairedU8 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storePairedU8Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storePairedS8;
const u8* storePairedS8Slow;
const u8* storePairedS8 = GetCodePtr();
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
storePairedS8 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storePairedS8Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storePairedU16;
const u8* storePairedU16Slow;
const u8* storePairedU16 = GetCodePtr();
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
storePairedU16 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storePairedU16Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storePairedS16; // Used by Viewtiful Joe's intro movie
const u8* storePairedS16Slow;
const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
storePairedS16 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storePairedS16Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storeSingleFloat;
const u8* storeSingleFloatSlow;
const u8* storeSingleFloat = GetCodePtr();
{
constexpr u32 flags =
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
storeSingleFloat = GetCodePtr();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push);
storeSingleFloatSlow = GetCodePtr();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storeSingleU8; // Used by MKWii
const u8* storeSingleU8Slow;
const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags =
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
storeSingleU8 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storeSingleU8Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storeSingleS8;
const u8* storeSingleS8Slow;
const u8* storeSingleS8 = GetCodePtr();
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags =
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
storeSingleS8 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storeSingleS8Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storeSingleU16; // Used by MKWii
const u8* storeSingleU16Slow;
const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags =
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
storeSingleU16 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storeSingleU16Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
const u8* storeSingleS16;
const u8* storeSingleS16Slow;
const u8* storeSingleS16 = GetCodePtr();
{
auto emit_quantize = [this, &float_emit, scale_reg]() {
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
};
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags =
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
storeSingleS16 = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
gprs_to_push, fprs_to_push, true);
storeSingleS16Slow = GetCodePtr();
emit_quantize();
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");
paired_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
ReserveCodeSpace(32 * sizeof(u8*));
ReserveCodeSpace(8 * sizeof(u8*));
// Fast
paired_store_quantized[0] = storePairedFloat;
paired_store_quantized[1] = storePairedIllegal;
paired_store_quantized[2] = storePairedIllegal;
@@ -956,31 +891,15 @@ void JitArm64::GenerateQuantizedStores()
paired_store_quantized[6] = storePairedS8;
paired_store_quantized[7] = storePairedS16;
paired_store_quantized[8] = storeSingleFloat;
paired_store_quantized[9] = storePairedIllegal;
paired_store_quantized[10] = storePairedIllegal;
paired_store_quantized[11] = storePairedIllegal;
paired_store_quantized[12] = storeSingleU8;
paired_store_quantized[13] = storeSingleU16;
paired_store_quantized[14] = storeSingleS8;
paired_store_quantized[15] = storeSingleS16;
single_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
ReserveCodeSpace(8 * sizeof(u8*));
// Slow
paired_store_quantized[16] = storePairedFloatSlow;
paired_store_quantized[17] = storePairedIllegal;
paired_store_quantized[18] = storePairedIllegal;
paired_store_quantized[19] = storePairedIllegal;
paired_store_quantized[20] = storePairedU8Slow;
paired_store_quantized[21] = storePairedU16Slow;
paired_store_quantized[22] = storePairedS8Slow;
paired_store_quantized[23] = storePairedS16Slow;
paired_store_quantized[24] = storeSingleFloatSlow;
paired_store_quantized[25] = storePairedIllegal;
paired_store_quantized[26] = storePairedIllegal;
paired_store_quantized[27] = storePairedIllegal;
paired_store_quantized[28] = storeSingleU8Slow;
paired_store_quantized[29] = storeSingleU16Slow;
paired_store_quantized[30] = storeSingleS8Slow;
paired_store_quantized[31] = storeSingleS16Slow;
single_store_quantized[0] = storeSingleFloat;
single_store_quantized[1] = storePairedIllegal;
single_store_quantized[2] = storePairedIllegal;
single_store_quantized[3] = storePairedIllegal;
single_store_quantized[4] = storeSingleU8;
single_store_quantized[5] = storeSingleU16;
single_store_quantized[6] = storeSingleS8;
single_store_quantized[7] = storeSingleS16;
}