mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2024-11-15 05:47:56 -07:00
JitArm64: Move psq_st address check to EmitBackpatchRoutine
This way the address check will take up less icache (since it's only emitted once for each routine rather than once for each psq_st instruction), and we also get address checking for psq_l. Matches Jit64's approach. The disadvantage: In the slowmem case, the routines have to push *every* caller-saved register onto the stack, even though most callers probably don't need it. But as long as the slowmem case isn't hit frequently, this is fine.
This commit is contained in:
parent
cd84339dfd
commit
96760093e9
@ -225,10 +225,9 @@ protected:
|
||||
void DumpCode(const u8* start, const u8* end);
|
||||
|
||||
// Backpatching routines
|
||||
bool DisasmLoadStore(const u8* ptr, u32* flags, Arm64Gen::ARM64Reg* reg);
|
||||
void EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, Arm64Gen::ARM64Reg RS,
|
||||
Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0),
|
||||
BitSet32 fprs_to_push = BitSet32(0));
|
||||
BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false);
|
||||
// Loadstore routines
|
||||
void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
|
||||
void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
|
||||
@ -236,6 +235,7 @@ protected:
|
||||
// jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out.
|
||||
Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in,
|
||||
Arm64Gen::ARM64Reg tmp, const void* bat_table);
|
||||
Arm64Gen::FixupBranch CheckIfSafeAddress(Arm64Gen::ARM64Reg addr);
|
||||
|
||||
void DoJit(u32 em_address, JitBlock* b, u32 nextPC);
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "Common/BitSet.h"
|
||||
@ -51,13 +52,18 @@ void JitArm64::DoBacktrace(uintptr_t access_address, SContext* ctx)
|
||||
}
|
||||
|
||||
void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, ARM64Reg RS,
|
||||
ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push)
|
||||
ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push,
|
||||
bool emitting_routine)
|
||||
{
|
||||
bool in_far_code = false;
|
||||
const u8* fastmem_start = GetCodePtr();
|
||||
std::optional<FixupBranch> slowmem_fixup;
|
||||
|
||||
if (fastmem)
|
||||
{
|
||||
if (do_farcode && emitting_routine)
|
||||
slowmem_fixup = CheckIfSafeAddress(addr);
|
||||
|
||||
if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
|
||||
{
|
||||
ARM64Reg temp = ARM64Reg::D0;
|
||||
@ -110,34 +116,45 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
|
||||
{
|
||||
if (fastmem && do_farcode)
|
||||
{
|
||||
SlowmemHandler handler;
|
||||
handler.dest_reg = RS;
|
||||
handler.addr_reg = addr;
|
||||
handler.gprs = gprs_to_push;
|
||||
handler.fprs = fprs_to_push;
|
||||
handler.flags = flags;
|
||||
|
||||
FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end];
|
||||
auto handler_loc_iter = m_handler_to_loc.find(handler);
|
||||
|
||||
if (handler_loc_iter == m_handler_to_loc.end())
|
||||
if (emitting_routine)
|
||||
{
|
||||
in_far_code = true;
|
||||
SwitchToFarCode();
|
||||
const u8* handler_loc = GetCodePtr();
|
||||
m_handler_to_loc[handler] = handler_loc;
|
||||
fastmem_area->fastmem_code = fastmem_start;
|
||||
fastmem_area->slowmem_code = handler_loc;
|
||||
}
|
||||
else
|
||||
{
|
||||
const u8* handler_loc = handler_loc_iter->second;
|
||||
fastmem_area->fastmem_code = fastmem_start;
|
||||
fastmem_area->slowmem_code = handler_loc;
|
||||
return;
|
||||
SlowmemHandler handler;
|
||||
handler.dest_reg = RS;
|
||||
handler.addr_reg = addr;
|
||||
handler.gprs = gprs_to_push;
|
||||
handler.fprs = fprs_to_push;
|
||||
handler.flags = flags;
|
||||
|
||||
FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end];
|
||||
auto handler_loc_iter = m_handler_to_loc.find(handler);
|
||||
|
||||
if (handler_loc_iter == m_handler_to_loc.end())
|
||||
{
|
||||
in_far_code = true;
|
||||
SwitchToFarCode();
|
||||
const u8* handler_loc = GetCodePtr();
|
||||
m_handler_to_loc[handler] = handler_loc;
|
||||
fastmem_area->fastmem_code = fastmem_start;
|
||||
fastmem_area->slowmem_code = handler_loc;
|
||||
}
|
||||
else
|
||||
{
|
||||
const u8* handler_loc = handler_loc_iter->second;
|
||||
fastmem_area->fastmem_code = fastmem_start;
|
||||
fastmem_area->slowmem_code = handler_loc;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (slowmem_fixup)
|
||||
SetJumpTarget(*slowmem_fixup);
|
||||
|
||||
ABI_PushRegisters(gprs_to_push);
|
||||
m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
|
||||
|
||||
@ -229,8 +246,17 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
|
||||
|
||||
if (in_far_code)
|
||||
{
|
||||
RET(ARM64Reg::X30);
|
||||
SwitchToNearCode();
|
||||
if (emitting_routine)
|
||||
{
|
||||
FixupBranch done = B();
|
||||
SwitchToNearCode();
|
||||
SetJumpTarget(done);
|
||||
}
|
||||
else
|
||||
{
|
||||
RET(ARM64Reg::X30);
|
||||
SwitchToNearCode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -288,6 +288,16 @@ FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM6
|
||||
return fail;
|
||||
}
|
||||
|
||||
FixupBranch JitArm64::CheckIfSafeAddress(Arm64Gen::ARM64Reg addr)
|
||||
{
|
||||
// FIXME: This doesn't correctly account for the BAT configuration.
|
||||
TST(addr, LogicalImm(0x0c000000, 32));
|
||||
FixupBranch pass = B(CC_EQ);
|
||||
FixupBranch fail = B();
|
||||
SetJumpTarget(pass);
|
||||
return fail;
|
||||
}
|
||||
|
||||
void JitArm64::lXX(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
|
@ -19,10 +19,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
JITDISABLE(bJITLoadStorePairedOff);
|
||||
FALLBACK_IF(jo.memcheck || !jo.fastmem);
|
||||
FALLBACK_IF(jo.memcheck);
|
||||
|
||||
// The asm routines assume address translation is on.
|
||||
FALLBACK_IF(!MSR.DR);
|
||||
// If we have a fastmem arena, the asm routines assume address translation is on.
|
||||
FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
|
||||
|
||||
// X30 is LR
|
||||
// X0 is the address
|
||||
@ -111,10 +111,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
JITDISABLE(bJITLoadStorePairedOff);
|
||||
FALLBACK_IF(jo.memcheck || !jo.fastmem);
|
||||
FALLBACK_IF(jo.memcheck);
|
||||
|
||||
// The asm routines assume address translation is on.
|
||||
FALLBACK_IF(!MSR.DR);
|
||||
// If we have a fastmem arena, the asm routines assume address translation is on.
|
||||
FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
|
||||
|
||||
// X30 is LR
|
||||
// X0 contains the scale
|
||||
@ -213,33 +213,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
|
||||
UBFM(type_reg, scale_reg, 0, 2); // Type
|
||||
UBFM(scale_reg, scale_reg, 8, 13); // Scale
|
||||
|
||||
// Inline address check
|
||||
// FIXME: This doesn't correctly account for the BAT configuration.
|
||||
TST(addr_reg, LogicalImm(0x0c000000, 32));
|
||||
FixupBranch pass = B(CC_EQ);
|
||||
FixupBranch fail = B();
|
||||
|
||||
SwitchToFarCode();
|
||||
SetJumpTarget(fail);
|
||||
// Slow
|
||||
MOVP2R(ARM64Reg::X30, &paired_store_quantized[16 + w * 8]);
|
||||
LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
|
||||
ABI_PushRegisters(gprs_in_use);
|
||||
m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30);
|
||||
BLR(EncodeRegTo64(type_reg));
|
||||
m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30);
|
||||
ABI_PopRegisters(gprs_in_use);
|
||||
FixupBranch continue1 = B();
|
||||
SwitchToNearCode();
|
||||
SetJumpTarget(pass);
|
||||
|
||||
// Fast
|
||||
MOVP2R(ARM64Reg::X30, &paired_store_quantized[w * 8]);
|
||||
MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized);
|
||||
LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
|
||||
BLR(EncodeRegTo64(type_reg));
|
||||
|
||||
SetJumpTarget(continue1);
|
||||
}
|
||||
|
||||
if (js.assumeNoPairedQuantize && !have_single)
|
||||
|
@ -506,8 +506,9 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1},
|
||||
fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push & ~BitSet32{1}, fprs_to_push, true);
|
||||
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* loadPairedU8Two = GetCodePtr();
|
||||
@ -515,7 +516,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -532,7 +534,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -549,7 +552,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -565,7 +569,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -582,8 +587,9 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1},
|
||||
fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push & ~BitSet32{1}, fprs_to_push, true);
|
||||
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* loadPairedU8One = GetCodePtr();
|
||||
@ -591,7 +597,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -608,7 +615,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -625,7 +633,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -641,7 +650,8 @@ void JitArm64::GenerateQuantizedLoads()
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
@ -697,256 +707,181 @@ void JitArm64::GenerateQuantizedStores()
|
||||
const u8* start = GetCodePtr();
|
||||
const u8* storePairedIllegal = GetCodePtr();
|
||||
BRK(0x101);
|
||||
const u8* storePairedFloat;
|
||||
const u8* storePairedFloatSlow;
|
||||
const u8* storePairedFloat = GetCodePtr();
|
||||
{
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32;
|
||||
|
||||
storePairedFloat = GetCodePtr();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storePairedFloatSlow = GetCodePtr();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
|
||||
const u8* storePairedU8;
|
||||
const u8* storePairedU8Slow;
|
||||
const u8* storePairedU8 = GetCodePtr();
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
storePairedU8 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storePairedU8Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* storePairedS8;
|
||||
const u8* storePairedS8Slow;
|
||||
const u8* storePairedS8 = GetCodePtr();
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
storePairedS8 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storePairedS8Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
|
||||
const u8* storePairedU16;
|
||||
const u8* storePairedU16Slow;
|
||||
const u8* storePairedU16 = GetCodePtr();
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
storePairedU16 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storePairedU16Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* storePairedS16; // Used by Viewtiful Joe's intro movie
|
||||
const u8* storePairedS16Slow;
|
||||
const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
|
||||
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
|
||||
BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
storePairedS16 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storePairedS16Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
|
||||
const u8* storeSingleFloat;
|
||||
const u8* storeSingleFloatSlow;
|
||||
const u8* storeSingleFloat = GetCodePtr();
|
||||
{
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
|
||||
|
||||
storeSingleFloat = GetCodePtr();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storeSingleFloatSlow = GetCodePtr();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* storeSingleU8; // Used by MKWii
|
||||
const u8* storeSingleU8Slow;
|
||||
const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
storeSingleU8 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storeSingleU8Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* storeSingleS8;
|
||||
const u8* storeSingleS8Slow;
|
||||
const u8* storeSingleS8 = GetCodePtr();
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
|
||||
|
||||
storeSingleS8 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storeSingleS8Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* storeSingleU16; // Used by MKWii
|
||||
const u8* storeSingleU16Slow;
|
||||
const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
storeSingleU16 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storeSingleU16Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
const u8* storeSingleS16;
|
||||
const u8* storeSingleS16Slow;
|
||||
const u8* storeSingleS16 = GetCodePtr();
|
||||
{
|
||||
auto emit_quantize = [this, &float_emit, scale_reg]() {
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
|
||||
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
|
||||
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
|
||||
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
|
||||
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
};
|
||||
float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
|
||||
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
|
||||
|
||||
constexpr u32 flags =
|
||||
BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
|
||||
|
||||
storeSingleS16 = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
|
||||
gprs_to_push, fprs_to_push, true);
|
||||
|
||||
storeSingleS16Slow = GetCodePtr();
|
||||
emit_quantize();
|
||||
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
|
||||
RET(ARM64Reg::X30);
|
||||
}
|
||||
|
||||
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");
|
||||
|
||||
paired_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
|
||||
ReserveCodeSpace(32 * sizeof(u8*));
|
||||
ReserveCodeSpace(8 * sizeof(u8*));
|
||||
|
||||
// Fast
|
||||
paired_store_quantized[0] = storePairedFloat;
|
||||
paired_store_quantized[1] = storePairedIllegal;
|
||||
paired_store_quantized[2] = storePairedIllegal;
|
||||
@ -956,31 +891,15 @@ void JitArm64::GenerateQuantizedStores()
|
||||
paired_store_quantized[6] = storePairedS8;
|
||||
paired_store_quantized[7] = storePairedS16;
|
||||
|
||||
paired_store_quantized[8] = storeSingleFloat;
|
||||
paired_store_quantized[9] = storePairedIllegal;
|
||||
paired_store_quantized[10] = storePairedIllegal;
|
||||
paired_store_quantized[11] = storePairedIllegal;
|
||||
paired_store_quantized[12] = storeSingleU8;
|
||||
paired_store_quantized[13] = storeSingleU16;
|
||||
paired_store_quantized[14] = storeSingleS8;
|
||||
paired_store_quantized[15] = storeSingleS16;
|
||||
single_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
|
||||
ReserveCodeSpace(8 * sizeof(u8*));
|
||||
|
||||
// Slow
|
||||
paired_store_quantized[16] = storePairedFloatSlow;
|
||||
paired_store_quantized[17] = storePairedIllegal;
|
||||
paired_store_quantized[18] = storePairedIllegal;
|
||||
paired_store_quantized[19] = storePairedIllegal;
|
||||
paired_store_quantized[20] = storePairedU8Slow;
|
||||
paired_store_quantized[21] = storePairedU16Slow;
|
||||
paired_store_quantized[22] = storePairedS8Slow;
|
||||
paired_store_quantized[23] = storePairedS16Slow;
|
||||
|
||||
paired_store_quantized[24] = storeSingleFloatSlow;
|
||||
paired_store_quantized[25] = storePairedIllegal;
|
||||
paired_store_quantized[26] = storePairedIllegal;
|
||||
paired_store_quantized[27] = storePairedIllegal;
|
||||
paired_store_quantized[28] = storeSingleU8Slow;
|
||||
paired_store_quantized[29] = storeSingleU16Slow;
|
||||
paired_store_quantized[30] = storeSingleS8Slow;
|
||||
paired_store_quantized[31] = storeSingleS16Slow;
|
||||
single_store_quantized[0] = storeSingleFloat;
|
||||
single_store_quantized[1] = storePairedIllegal;
|
||||
single_store_quantized[2] = storePairedIllegal;
|
||||
single_store_quantized[3] = storePairedIllegal;
|
||||
single_store_quantized[4] = storeSingleU8;
|
||||
single_store_quantized[5] = storeSingleU16;
|
||||
single_store_quantized[6] = storeSingleS8;
|
||||
single_store_quantized[7] = storeSingleS16;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user