From 36b3d515231725421cce03ac6e5be890b053075e Mon Sep 17 00:00:00 2001 From: JosJuice Date: Thu, 8 Jul 2021 17:53:01 +0200 Subject: [PATCH 1/7] JitArm64: Restructure the BackPatchInfo flags enum This makes it possible to construct flag combinations like "load 16 bits into an FPR". --- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 20 ++++---- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 46 +++++++++---------- .../JitArm64/JitArm64_LoadStorePaired.cpp | 6 +-- .../Core/PowerPC/JitArmCommon/BackPatch.h | 31 +++++++------ 4 files changed, 52 insertions(+), 51 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index ef2ec7164d..cf5712eaab 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -58,14 +58,14 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR if (fastmem) { - if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT) + if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if (flags & BackPatchInfo::FLAG_SIZE_F32) + if ((flags & BackPatchInfo::FLAG_SIZE_32) && !(flags & BackPatchInfo::FLAG_PAIR)) { m_float_emit.REV32(8, ARM64Reg::D0, RS); m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr); } - else if (flags & BackPatchInfo::FLAG_SIZE_F32X2) + else if ((flags & BackPatchInfo::FLAG_SIZE_32) && (flags & BackPatchInfo::FLAG_PAIR)) { m_float_emit.REV32(8, ARM64Reg::D0, RS); m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); @@ -76,9 +76,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); } } - else if (flags & BackPatchInfo::FLAG_LOAD && flags & BackPatchInfo::FLAG_MASK_FLOAT) + else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if (flags & BackPatchInfo::FLAG_SIZE_F32) + if (flags & BackPatchInfo::FLAG_SIZE_32) { m_float_emit.LDR(32, EncodeRegToDouble(RS), MEM_REG, addr); m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); @@ -158,15 +158,15 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); - if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT) + if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if (flags & BackPatchInfo::FLAG_SIZE_F32) + if ((flags & BackPatchInfo::FLAG_SIZE_32) && !(flags & BackPatchInfo::FLAG_PAIR)) { m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0); MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); BLR(ARM64Reg::X8); } - else if (flags & BackPatchInfo::FLAG_SIZE_F32X2) + else if ((flags & BackPatchInfo::FLAG_SIZE_32) && (flags & BackPatchInfo::FLAG_PAIR)) { m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); @@ -180,9 +180,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR BLR(ARM64Reg::X8); } } - else if (flags & BackPatchInfo::FLAG_LOAD && flags & BackPatchInfo::FLAG_MASK_FLOAT) + else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if (flags & BackPatchInfo::FLAG_SIZE_F32) + if (flags & BackPatchInfo::FLAG_SIZE_32) { MOVP2R(ARM64Reg::X8, &PowerPC::Read_U32); BLR(ARM64Reg::X8); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp 
b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 727c0268e2..79add2f227 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -26,7 +26,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) u32 a = inst.RA, b = inst.RB; s32 offset = inst.SIMM_16; - u32 flags = BackPatchInfo::FLAG_LOAD; + u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT; bool update = false; s32 offset_reg = -1; @@ -36,38 +36,38 @@ void JitArm64::lfXX(UGeckoInstruction inst) switch (inst.SUBOP10) { case 567: // lfsux - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; update = true; offset_reg = b; break; case 535: // lfsx - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; offset_reg = b; break; case 631: // lfdux - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; update = true; offset_reg = b; break; case 599: // lfdx - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; offset_reg = b; break; } break; case 49: // lfsu - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; update = true; break; case 48: // lfs - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; break; case 51: // lfdu - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; update = true; break; case 50: // lfd - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; break; } @@ -75,7 +75,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) bool is_immediate = false; const RegType type = - (flags & BackPatchInfo::FLAG_SIZE_F64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle; + (flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle; gpr.Lock(ARM64Reg::W0, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0); @@ -190,7 +190,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) bool want_single = false; s32 offset = inst.SIMM_16; - u32 flags = BackPatchInfo::FLAG_STORE; + u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT; bool update = false; s32 offset_reg = -1; @@ -201,46 +201,46 @@ void JitArm64::stfXX(UGeckoInstruction inst) { case 663: // stfsx want_single = true; - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; offset_reg = b; break; case 695: // stfsux want_single = true; - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; update = true; offset_reg = b; break; case 727: // stfdx - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; offset_reg = b; break; case 759: // stfdux - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; update = true; offset_reg = b; break; case 983: // stfiwx // This instruction writes the lower 32 bits of a double. 
want_single must be false - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; offset_reg = b; break; } break; case 53: // stfsu want_single = true; - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; update = true; break; case 52: // stfs want_single = true; - flags |= BackPatchInfo::FLAG_SIZE_F32; + flags |= BackPatchInfo::FLAG_SIZE_32; break; case 55: // stfdu - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; update = true; break; case 54: // stfd - flags |= BackPatchInfo::FLAG_SIZE_F64; + flags |= BackPatchInfo::FLAG_SIZE_64; break; } @@ -361,16 +361,16 @@ void JitArm64::stfXX(UGeckoInstruction inst) if (jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr)) { int accessSize; - if (flags & BackPatchInfo::FLAG_SIZE_F64) + if (flags & BackPatchInfo::FLAG_SIZE_64) accessSize = 64; else accessSize = 32; LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr)); - if (flags & BackPatchInfo::FLAG_SIZE_F64) + if (flags & BackPatchInfo::FLAG_SIZE_64) m_float_emit.REV64(8, ARM64Reg::Q0, V0); - else if (flags & BackPatchInfo::FLAG_SIZE_F32) + else if (flags & BackPatchInfo::FLAG_SIZE_32) m_float_emit.REV32(8, ARM64Reg::D0, V0); m_float_emit.STR(accessSize, IndexType::Post, accessSize == 64 ? ARM64Reg::Q0 : ARM64Reg::D0, diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index aafda1845b..76d73edd50 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -198,9 +198,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { - u32 flags = BackPatchInfo::FLAG_STORE; - - flags |= (w ? 
BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2); + u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; + if (!w) + flags |= BackPatchInfo::FLAG_PAIR; EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use, fprs_in_use); diff --git a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h index c1a4596500..0e42d2a9f9 100644 --- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h +++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h @@ -13,31 +13,32 @@ struct BackPatchInfo FLAG_SIZE_8 = (1 << 2), FLAG_SIZE_16 = (1 << 3), FLAG_SIZE_32 = (1 << 4), - FLAG_SIZE_F32 = (1 << 5), - FLAG_SIZE_F32X2 = (1 << 6), - FLAG_SIZE_F64 = (1 << 7), + FLAG_SIZE_64 = (1 << 5), + FLAG_FLOAT = (1 << 6), + FLAG_PAIR = (1 << 7), FLAG_REVERSE = (1 << 8), FLAG_EXTEND = (1 << 9), FLAG_ZERO_256 = (1 << 10), - FLAG_MASK_FLOAT = FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F64, }; static u32 GetFlagSize(u32 flags) { + u32 size = 0; + if (flags & FLAG_SIZE_8) - return 8; + size = 8; if (flags & FLAG_SIZE_16) - return 16; + size = 16; if (flags & FLAG_SIZE_32) - return 32; - if (flags & FLAG_SIZE_F32) - return 32; - if (flags & FLAG_SIZE_F32X2) - return 64; - if (flags & FLAG_SIZE_F64) - return 64; + size = 32; + if (flags & FLAG_SIZE_64) + size = 64; if (flags & FLAG_ZERO_256) - return 256; - return 0; + size = 256; + + if (flags & FLAG_PAIR) + size *= 2; + + return size; } }; From 6c3141e0cc813383675fc0536a1d78ed2f483d08 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Thu, 8 Jul 2021 18:20:38 +0200 Subject: [PATCH 2/7] JitArm64: Change parameter order for quantized load routines EmitBackpatchRoutine (see the upcoming commits) expects the address to be in W0 for loads and W1 for stores. 
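Reviewer's note: the restructured flags from PATCH 1 are what the rest of this series builds on, so here is a minimal standalone C++ sketch (not Dolphin code) of how the new scheme composes. The FLAG_STORE/FLAG_LOAD bit values are assumed, since that part of the enum is outside the quoted hunk; the size logic mirrors the reworked BackPatchInfo::GetFlagSize.

#include <cstdint>
#include <cstdio>

using u32 = std::uint32_t;

constexpr u32 FLAG_STORE = 1 << 0;    // assumed value, not shown in the hunk
constexpr u32 FLAG_LOAD = 1 << 1;     // assumed value, not shown in the hunk
constexpr u32 FLAG_SIZE_16 = 1 << 3;  // values below are from the new enum
constexpr u32 FLAG_SIZE_32 = 1 << 4;
constexpr u32 FLAG_FLOAT = 1 << 6;
constexpr u32 FLAG_PAIR = 1 << 7;

// Mirrors the reworked GetFlagSize: pick the one size bit, double it for pairs.
constexpr u32 GetFlagSize(u32 flags)
{
  u32 size = 0;
  if (flags & FLAG_SIZE_16)
    size = 16;
  if (flags & FLAG_SIZE_32)
    size = 32;
  if (flags & FLAG_PAIR)
    size *= 2;
  return size;
}

int main()
{
  // "Load 16 bits into an FPR" is now just a flag combination...
  constexpr u32 load16_fpr = FLAG_LOAD | FLAG_FLOAT | FLAG_SIZE_16;
  // ...and so is a paired-single store such as psq_st (the old FLAG_SIZE_F32X2).
  constexpr u32 paired_store = FLAG_STORE | FLAG_FLOAT | FLAG_PAIR | FLAG_SIZE_32;

  std::printf("%u %u\n", GetFlagSize(load16_fpr), GetFlagSize(paired_store));  // 16 64
  return 0;
}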
--- Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 +- .../JitArm64/JitArm64_LoadStorePaired.cpp | 8 ++--- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 33 ++++++++++++------- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 35375f8302..bf266acfd3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -253,7 +253,8 @@ protected: void GenerateConvertDoubleToSingle(); void GenerateConvertSingleToDouble(); void GenerateFPRF(bool single); - void GenerateQuantizedLoadStores(); + void GenerateQuantizedLoads(); + void GenerateQuantizedStores(); // Profiling void BeginTimeProfile(JitBlock* b); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 76d73edd50..da6af010d9 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -25,8 +25,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) FALLBACK_IF(!MSR.DR); // X30 is LR - // X0 contains the scale - // X1 is the address + // X0 is the address + // X1 contains the scale // X2 is a temporary // Q0 is the return register // Q1 is a temporary @@ -39,8 +39,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); - constexpr ARM64Reg scale_reg = ARM64Reg::W0; - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + constexpr ARM64Reg addr_reg = ARM64Reg::W0; + constexpr ARM64Reg scale_reg = ARM64Reg::W1; constexpr ARM64Reg type_reg = ARM64Reg::W2; ARM64Reg VS; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index fd415e8da2..40129ce202 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -223,7 +223,8 @@ void JitArm64::GenerateCommonAsm() GenerateFPRF(false); JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF"); - GenerateQuantizedLoadStores(); + GenerateQuantizedLoads(); + GenerateQuantizedStores(); } // Input: X1 contains input, and D0 contains result of running the input through AArch64 FRECPE. 
@@ -483,17 +484,15 @@ void JitArm64::GenerateFPRF(bool single) B(write_fprf_and_ret); } -void JitArm64::GenerateQuantizedLoadStores() +void JitArm64::GenerateQuantizedLoads() { - // X0 is the scale - // X1 is address - // X2 is a temporary on stores + // X0 is the address + // X1 is the scale // X30 is LR - // Q0 is the return for loads - // is the register for stores + // Q0 is the return // Q1 is a temporary - ARM64Reg addr_reg = ARM64Reg::X1; - ARM64Reg scale_reg = ARM64Reg::X0; + ARM64Reg addr_reg = ARM64Reg::X0; + ARM64Reg scale_reg = ARM64Reg::X1; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -652,9 +651,21 @@ void JitArm64::GenerateQuantizedLoadStores() single_load_quantized[5] = loadPairedU16One; single_load_quantized[6] = loadPairedS8One; single_load_quantized[7] = loadPairedS16One; +} - // Stores - start = GetCodePtr(); +void JitArm64::GenerateQuantizedStores() +{ + // X0 is the scale + // X1 is the address + // X2 is a temporary + // X30 is LR + // Q0 is the register + // Q1 is a temporary + ARM64Reg scale_reg = ARM64Reg::X0; + ARM64Reg addr_reg = ARM64Reg::X1; + ARM64FloatEmitter float_emit(this); + + const u8* start = GetCodePtr(); const u8* storePairedIllegal = GetCodePtr(); BRK(0x101); const u8* storePairedFloat; From de21dc5fd932148184a4a9028905c54767a7c942 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Thu, 8 Jul 2021 18:38:22 +0200 Subject: [PATCH 3/7] JitArm64: Add bitset constants for caller saved registers --- Source/Core/Common/Arm64Emitter.h | 2 ++ .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 23 +++++-------------- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 4 ++-- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a5a4c03e4e..b2d6fda34d 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1146,6 +1146,8 @@ public: bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm); // ABI related + static constexpr BitSet32 CALLER_SAVED_GPRS = BitSet32(0x4007FFFF); + static constexpr BitSet32 CALLER_SAVED_FPRS = BitSet32(0xFFFF00FF); void ABI_PushRegisters(BitSet32 registers); void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0)); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 98242769b4..45249d92ed 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -149,15 +149,9 @@ void Arm64GPRCache::Start(PPCAnalyst::BlockRegStats& stats) { } -bool Arm64GPRCache::IsCalleeSaved(ARM64Reg reg) const +bool Arm64GPRCache::IsCallerSaved(ARM64Reg reg) const { - static constexpr auto callee_regs = { - ARM64Reg::X28, ARM64Reg::X27, ARM64Reg::X26, ARM64Reg::X25, - ARM64Reg::X24, ARM64Reg::X23, ARM64Reg::X22, ARM64Reg::X21, - ARM64Reg::X20, ARM64Reg::X19, ARM64Reg::INVALID_REG, - }; - - return std::find(callee_regs.begin(), callee_regs.end(), EncodeRegTo64(reg)) != callee_regs.end(); + return ARM64XEmitter::CALLER_SAVED_GPRS[DecodeReg(reg)]; } const OpArg& Arm64GPRCache::GetGuestGPROpArg(size_t preg) const @@ -416,7 +410,7 @@ BitSet32 Arm64GPRCache::GetCallerSavedUsed() const BitSet32 registers(0); for (const auto& it : m_host_registers) { - if (it.IsLocked() && !IsCalleeSaved(it.GetReg())) + if (it.IsLocked() && IsCallerSaved(it.GetReg())) registers[DecodeReg(it.GetReg())] = true; } return registers; @@ -716,14 +710,9 @@ void Arm64FPRCache::FlushByHost(ARM64Reg 
host_reg, ARM64Reg tmp_reg) } } -bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg) const +bool Arm64FPRCache::IsCallerSaved(ARM64Reg reg) const { - static constexpr auto callee_regs = { - ARM64Reg::Q8, ARM64Reg::Q9, ARM64Reg::Q10, ARM64Reg::Q11, ARM64Reg::Q12, - ARM64Reg::Q13, ARM64Reg::Q14, ARM64Reg::Q15, ARM64Reg::INVALID_REG, - }; - - return std::find(callee_regs.begin(), callee_regs.end(), reg) != callee_regs.end(); + return ARM64XEmitter::CALLER_SAVED_FPRS[DecodeReg(reg)]; } bool Arm64FPRCache::IsTopHalfUsed(ARM64Reg reg) const @@ -841,7 +830,7 @@ BitSet32 Arm64FPRCache::GetCallerSavedUsed() const BitSet32 registers(0); for (const auto& it : m_host_registers) { - if (it.IsLocked() && (!IsCalleeSaved(it.GetReg()) || IsTopHalfUsed(it.GetReg()))) + if (it.IsLocked() && (IsCallerSaved(it.GetReg()) || IsTopHalfUsed(it.GetReg()))) registers[DecodeReg(it.GetReg())] = true; } return registers; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index 9be43a3a6e..2ecbcbbffa 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -291,7 +291,7 @@ protected: void FlushRegister(size_t index, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg) override; private: - bool IsCalleeSaved(Arm64Gen::ARM64Reg reg) const; + bool IsCallerSaved(Arm64Gen::ARM64Reg reg) const; struct GuestRegInfo { @@ -350,7 +350,7 @@ protected: void FlushRegister(size_t preg, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg) override; private: - bool IsCalleeSaved(Arm64Gen::ARM64Reg reg) const; + bool IsCallerSaved(Arm64Gen::ARM64Reg reg) const; bool IsTopHalfUsed(Arm64Gen::ARM64Reg reg) const; void FlushRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg); From cd84339dfd3044c663a1eceeda82aaaf4a292bf5 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 9 Jul 2021 10:59:53 +0200 Subject: [PATCH 4/7] JitArm64: Use EmitBackpatchRoutine more for psq_l/psq_st In the case of the JitAsm routines, we can't actually use backpatching. Still, I would like to gather all the load and store instructions in one place to make future changes easier. 
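Reviewer's note: one detail the consolidated EmitBackpatchRoutine has to get right (see the new SwapPairs helper and the "Compensate for the Write_ functions swapping the whole write instead of each pair" comments below) is that the PowerPC::Read_/Write_ slowmem helpers byteswap the whole value, while a paired access needs each 32-bit element swapped on its own. Rotating by half the width before the whole-value swap composes into exactly that. A standalone C++ model of the arithmetic, with plain integers standing in for the emitted ROR/REV instructions (__builtin_bswap64 is a GCC/Clang builtin):

#include <cstdint>
#include <cstdio>

// Stands in for the whole-value byteswap done by Write_U64.
static std::uint64_t Bswap64(std::uint64_t v)
{
  return __builtin_bswap64(v);
}

// Stands in for SwapPairs' ROR(dst_reg, src_reg, 32) in the FLAG_SIZE_32 case.
static std::uint64_t Rotr64(std::uint64_t v, unsigned r)
{
  return (v >> r) | (v << (64 - r));
}

int main()
{
  // A pair of 32-bit elements packed into 64 bits.
  const std::uint64_t pair = 0x1122334455667788;
  // What memory should hold: each element byteswapped, element order preserved.
  const std::uint64_t want = 0x4433221188776655;

  // The whole-value swap alone also exchanges the two elements...
  const std::uint64_t naive = Bswap64(pair);  // 0x8877665544332211, wrong
  // ...so rotating by 32 first cancels the exchange, leaving per-element swaps.
  const std::uint64_t fixed = Bswap64(Rotr64(pair, 32));

  std::printf("naive == want: %d, fixed == want: %d\n", naive == want, fixed == want);  // 0, 1
  return 0;
}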
--- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 130 ++++++----- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 2 +- .../JitArm64/JitArm64_LoadStorePaired.cpp | 26 +-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 202 ++++++++++-------- .../Core/Core/PowerPC/JitArm64/Jit_Util.cpp | 80 +++++-- Source/Core/Core/PowerPC/JitArm64/Jit_Util.h | 14 +- 6 files changed, 269 insertions(+), 185 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index cf5712eaab..fa5ed3e67d 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -60,39 +60,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR { if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if ((flags & BackPatchInfo::FLAG_SIZE_32) && !(flags & BackPatchInfo::FLAG_PAIR)) - { - m_float_emit.REV32(8, ARM64Reg::D0, RS); - m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr); - } - else if ((flags & BackPatchInfo::FLAG_SIZE_32) && (flags & BackPatchInfo::FLAG_PAIR)) - { - m_float_emit.REV32(8, ARM64Reg::D0, RS); - m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); - } - else - { - m_float_emit.REV64(8, ARM64Reg::Q0, RS); - m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); - } + ARM64Reg temp = ARM64Reg::D0; + temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true); + + m_float_emit.STR(BackPatchInfo::GetFlagSize(flags), temp, MEM_REG, addr); } else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if (flags & BackPatchInfo::FLAG_SIZE_32) - { - m_float_emit.LDR(32, EncodeRegToDouble(RS), MEM_REG, addr); - m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); - } - else - { - m_float_emit.LDR(64, EncodeRegToDouble(RS), MEM_REG, addr); - m_float_emit.REV64(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); - } + m_float_emit.LDR(BackPatchInfo::GetFlagSize(flags), EncodeRegToDouble(RS), MEM_REG, addr); + + ByteswapAfterLoad(this, &m_float_emit, EncodeRegToDouble(RS), EncodeRegToDouble(RS), flags, + true, false); } else if (flags & BackPatchInfo::FLAG_STORE) { ARM64Reg temp = ARM64Reg::W0; - temp = ByteswapBeforeStore(this, temp, RS, flags, true); + temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true); if (flags & BackPatchInfo::FLAG_SIZE_32) STR(temp, MEM_REG, addr); @@ -118,7 +101,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR else if (flags & BackPatchInfo::FLAG_SIZE_8) LDRB(RS, MEM_REG, addr); - ByteswapAfterLoad(this, RS, RS, flags, true, false); + ByteswapAfterLoad(this, &m_float_emit, RS, RS, flags, true, false); } } const u8* fastmem_end = GetCodePtr(); @@ -158,52 +141,39 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); - if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) + if (flags & BackPatchInfo::FLAG_STORE) { - if ((flags & BackPatchInfo::FLAG_SIZE_32) && !(flags & BackPatchInfo::FLAG_PAIR)) + const u32 access_size = BackPatchInfo::GetFlagSize(flags); + ARM64Reg src_reg = RS; + const ARM64Reg dst_reg = access_size == 64 ? 
ARM64Reg::X0 : ARM64Reg::W0; + + if (flags & BackPatchInfo::FLAG_FLOAT) { - m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); - BLR(ARM64Reg::X8); + if (access_size == 64) + m_float_emit.FMOV(dst_reg, EncodeRegToDouble(RS)); + else + m_float_emit.FMOV(dst_reg, EncodeRegToSingle(RS)); + + src_reg = dst_reg; } - else if ((flags & BackPatchInfo::FLAG_SIZE_32) && (flags & BackPatchInfo::FLAG_PAIR)) + + if (flags & BackPatchInfo::FLAG_PAIR) { - m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); - ROR(ARM64Reg::X0, ARM64Reg::X0, 32); - BLR(ARM64Reg::X8); + // Compensate for the Write_ functions swapping the whole write instead of each pair + SwapPairs(this, dst_reg, src_reg, flags); + src_reg = dst_reg; } - else - { - m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); - BLR(ARM64Reg::X8); - } - } - else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT)) - { - if (flags & BackPatchInfo::FLAG_SIZE_32) - { - MOVP2R(ARM64Reg::X8, &PowerPC::Read_U32); - BLR(ARM64Reg::X8); - m_float_emit.INS(32, RS, 0, ARM64Reg::X0); - } - else - { - MOVP2R(ARM64Reg::X8, &PowerPC::Read_F64); - BLR(ARM64Reg::X8); - m_float_emit.INS(64, RS, 0, ARM64Reg::X0); - } - } - else if (flags & BackPatchInfo::FLAG_STORE) - { - MOV(ARM64Reg::W0, RS); + + if (dst_reg != src_reg) + MOV(dst_reg, src_reg); const bool reverse = (flags & BackPatchInfo::FLAG_REVERSE) != 0; - if (flags & BackPatchInfo::FLAG_SIZE_32) + if (access_size == 64) + MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U64_Swap : &PowerPC::Write_U64); + else if (access_size == 32) MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U32_Swap : &PowerPC::Write_U32); - else if (flags & BackPatchInfo::FLAG_SIZE_16) + else if (access_size == 16) MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U16_Swap : &PowerPC::Write_U16); else MOVP2R(ARM64Reg::X8, &PowerPC::Write_U8); @@ -217,16 +187,40 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR } else { - if (flags & BackPatchInfo::FLAG_SIZE_32) + const u32 access_size = BackPatchInfo::GetFlagSize(flags); + + if (access_size == 64) + MOVP2R(ARM64Reg::X8, &PowerPC::Read_U64); + else if (access_size == 32) MOVP2R(ARM64Reg::X8, &PowerPC::Read_U32); - else if (flags & BackPatchInfo::FLAG_SIZE_16) + else if (access_size == 16) MOVP2R(ARM64Reg::X8, &PowerPC::Read_U16); - else if (flags & BackPatchInfo::FLAG_SIZE_8) + else MOVP2R(ARM64Reg::X8, &PowerPC::Read_U8); BLR(ARM64Reg::X8); - ByteswapAfterLoad(this, RS, ARM64Reg::W0, flags, false, false); + ARM64Reg src_reg = access_size == 64 ? ARM64Reg::X0 : ARM64Reg::W0; + + if (flags & BackPatchInfo::FLAG_PAIR) + { + // Compensate for the Read_ functions swapping the whole read instead of each pair + const ARM64Reg dst_reg = flags & BackPatchInfo::FLAG_FLOAT ? 
src_reg : RS; + SwapPairs(this, dst_reg, src_reg, flags); + src_reg = dst_reg; + } + + if (flags & BackPatchInfo::FLAG_FLOAT) + { + if (access_size == 64) + m_float_emit.FMOV(EncodeRegToDouble(RS), src_reg); + else + m_float_emit.FMOV(EncodeRegToSingle(RS), src_reg); + + src_reg = RS; + } + + ByteswapAfterLoad(this, &m_float_emit, RS, src_reg, flags, false, false); } m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index fc4603811f..0cedefac45 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -240,7 +240,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr)); ARM64Reg temp = ARM64Reg::W1; - temp = ByteswapBeforeStore(this, temp, RS, flags, true); + temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true); if (accessSize == 32) STR(IndexType::Post, temp, ARM64Reg::X0, 4); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index da6af010d9..305eb8a9f2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -42,7 +42,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) constexpr ARM64Reg addr_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W1; constexpr ARM64Reg type_reg = ARM64Reg::W2; - ARM64Reg VS; + ARM64Reg VS = fpr.RW(inst.RS, RegType::Single); if (inst.RA || update) // Always uses the register on update { @@ -69,17 +69,20 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { - VS = fpr.RW(inst.RS, RegType::Single); + BitSet32 gprs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + + // Wipe the registers we are using as temporaries + gprs_in_use &= BitSet32(~7); + fprs_in_use &= BitSet32(~3); + fprs_in_use[DecodeReg(VS)] = 0; + + u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) - { - ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), MEM_REG); - m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg)); - } - else - { - m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), MEM_REG); - } - m_float_emit.REV32(8, EncodeRegToDouble(VS), EncodeRegToDouble(VS)); + flags |= BackPatchInfo::FLAG_PAIR; + + EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use, + fprs_in_use); } else { @@ -91,7 +94,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true)); BLR(EncodeRegTo64(type_reg)); - VS = fpr.RW(inst.RS, RegType::Single); m_float_emit.ORR(EncodeRegToDouble(VS), ARM64Reg::D0, ARM64Reg::D0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 40129ce202..f083ba3dee 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -488,11 +488,14 @@ void JitArm64::GenerateQuantizedLoads() { // X0 is the address // X1 is the scale + // X2 is a temporary // X30 is LR // Q0 is the return // Q1 is a temporary ARM64Reg addr_reg = ARM64Reg::X0; ARM64Reg scale_reg = ARM64Reg::X1; + BitSet32 gprs_to_push = 
CALLER_SAVED_GPRS & ~BitSet32{0, 2}; + BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -500,15 +503,20 @@ void JitArm64::GenerateQuantizedLoads() BRK(100); const u8* loadPairedFloatTwo = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LD1(32, 1, ARM64Reg::D0, addr_reg); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1}, + fprs_to_push); RET(ARM64Reg::X30); } const u8* loadPairedU8Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -521,8 +529,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS8Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -535,9 +546,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedU16Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LD1(16, 1, ARM64Reg::D0, addr_reg); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -549,9 +562,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS16Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LD1(16, 1, ARM64Reg::D0, addr_reg); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -564,15 +579,20 @@ void JitArm64::GenerateQuantizedLoads() const u8* loadPairedFloatOne = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1}, + fprs_to_push); RET(ARM64Reg::X30); } const u8* loadPairedU8One = GetCodePtr(); { - 
ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(8, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -585,8 +605,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS8One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(8, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -599,9 +622,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedU16One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -613,9 +638,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS16One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -663,6 +690,8 @@ void JitArm64::GenerateQuantizedStores() // Q1 is a temporary ARM64Reg scale_reg = ARM64Reg::X0; ARM64Reg addr_reg = ARM64Reg::X1; + BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1, 2}; + BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -671,17 +700,16 @@ void JitArm64::GenerateQuantizedStores() const u8* storePairedFloat; const u8* storePairedFloatSlow; { + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; + storePairedFloat = GetCodePtr(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(64, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedFloatSlow = GetCodePtr(); - float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::Q0, 0); - ROR(ARM64Reg::X0, ARM64Reg::X0, 32); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U64); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedU8; @@ -698,18 +726,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + 
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + storePairedU8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedU8Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - REV16(ARM64Reg::W0, ARM64Reg::W0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedS8; const u8* storePairedS8Slow; @@ -725,18 +753,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + storePairedS8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedS8Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - REV16(ARM64Reg::W0, ARM64Reg::W0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedU16; @@ -750,21 +778,20 @@ void JitArm64::GenerateQuantizedStores() float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + storePairedU16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(32, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedU16Slow = GetCodePtr(); emit_quantize(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U32); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedS16; // Used by Viewtiful Joe's intro movie const u8* storePairedS16Slow; @@ -777,36 +804,35 @@ void JitArm64::GenerateQuantizedStores() float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + storePairedS16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(32, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedS16Slow = GetCodePtr(); emit_quantize(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U32); - BR(ARM64Reg::X2); + 
EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleFloat; const u8* storeSingleFloatSlow; { + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; + storeSingleFloat = GetCodePtr(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleFloatSlow = GetCodePtr(); - float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U32); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleU8; // Used by MKWii const u8* storeSingleU8Slow; @@ -822,17 +848,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + storeSingleU8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(8, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleU8Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(8, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U8); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleS8; const u8* storeSingleS8Slow; @@ -848,17 +875,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + storeSingleS8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(8, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleS8Slow = GetCodePtr(); emit_quantize(); - float_emit.SMOV(8, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U8); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleU16; // Used by MKWii const u8* storeSingleU16Slow; @@ -873,18 +901,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + storeSingleU16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleU16Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleS16; const u8* storeSingleS16Slow; @@ -899,18 +927,18 @@ void JitArm64::GenerateQuantizedStores() 
float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + storeSingleS16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleS16Slow = GetCodePtr(); emit_quantize(); - float_emit.SMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore"); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp index 8c1237c061..f8aceb5615 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp @@ -191,19 +191,47 @@ private: bool m_sign_extend; }; -void ByteswapAfterLoad(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags, - bool is_reversed, bool is_extended) +void SwapPairs(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags) +{ + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->ROR(dst_reg, src_reg, 32); + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->ROR(dst_reg, src_reg, 16); + else + emit->REV16(dst_reg, src_reg); +} + +void ByteswapAfterLoad(ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit, + ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags, bool is_reversed, + bool is_extended) { if (is_reversed == !(flags & BackPatchInfo::FLAG_REVERSE)) { - if (flags & BackPatchInfo::FLAG_SIZE_32) + if (flags & BackPatchInfo::FLAG_SIZE_64) { - emit->REV32(dst_reg, src_reg); + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV64(8, dst_reg, src_reg); + else + emit->REV64(dst_reg, src_reg); + + src_reg = dst_reg; + } + else if (flags & BackPatchInfo::FLAG_SIZE_32) + { + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV32(8, dst_reg, src_reg); + else + emit->REV32(dst_reg, src_reg); + src_reg = dst_reg; } else if (flags & BackPatchInfo::FLAG_SIZE_16) { - emit->REV16(dst_reg, src_reg); + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV16(8, dst_reg, src_reg); + else + emit->REV16(dst_reg, src_reg); + src_reg = dst_reg; } } @@ -215,25 +243,47 @@ void ByteswapAfterLoad(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, } if (dst_reg != src_reg) - emit->MOV(dst_reg, src_reg); + { + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->ORR(dst_reg, src_reg, src_reg); + else + emit->MOV(dst_reg, src_reg); + } } -ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, ARM64Reg tmp_reg, ARM64Reg src_reg, u32 flags, - bool want_reversed) +ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit, + ARM64Reg tmp_reg, ARM64Reg src_reg, u32 flags, bool want_reversed) { ARM64Reg dst_reg = src_reg; if (want_reversed == !(flags & BackPatchInfo::FLAG_REVERSE)) { - if (flags & BackPatchInfo::FLAG_SIZE_32) + if (flags & BackPatchInfo::FLAG_SIZE_64) { dst_reg = tmp_reg; - emit->REV32(dst_reg, src_reg); + + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV64(8, dst_reg, src_reg); + else + emit->REV64(dst_reg, src_reg); + } + else if (flags & BackPatchInfo::FLAG_SIZE_32) + { + dst_reg = tmp_reg; + + if (flags & BackPatchInfo::FLAG_FLOAT) + 
float_emit->REV32(8, dst_reg, src_reg); + else + emit->REV32(dst_reg, src_reg); } else if (flags & BackPatchInfo::FLAG_SIZE_16) { dst_reg = tmp_reg; - emit->REV16(dst_reg, src_reg); + + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV16(8, dst_reg, src_reg); + else + emit->REV16(dst_reg, src_reg); } } @@ -243,6 +293,8 @@ ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, ARM64Reg tmp_reg, ARM64Reg src void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use, BitSet32 fprs_in_use, ARM64Reg dst_reg, u32 address, u32 flags) { + ASSERT(!(flags & BackPatchInfo::FLAG_FLOAT)); + if (flags & BackPatchInfo::FLAG_SIZE_8) { MMIOReadCodeGenerator gen(emit, gprs_in_use, fprs_in_use, dst_reg, address, @@ -262,13 +314,15 @@ void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 mmio->GetHandlerForRead(address).Visit(gen); } - ByteswapAfterLoad(emit, dst_reg, dst_reg, flags, false, true); + ByteswapAfterLoad(emit, nullptr, dst_reg, dst_reg, flags, false, true); } void MMIOWriteRegToAddr(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use, BitSet32 fprs_in_use, ARM64Reg src_reg, u32 address, u32 flags) { - src_reg = ByteswapBeforeStore(emit, ARM64Reg::W1, src_reg, flags, false); + ASSERT(!(flags & BackPatchInfo::FLAG_FLOAT)); + + src_reg = ByteswapBeforeStore(emit, nullptr, ARM64Reg::W1, src_reg, flags, false); if (flags & BackPatchInfo::FLAG_SIZE_8) { diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h index 47d4b5ce95..9f1b8f8436 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h @@ -8,11 +8,17 @@ #include "Core/HW/MMIO.h" -void ByteswapAfterLoad(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg dst_reg, - Arm64Gen::ARM64Reg src_reg, u32 flags, bool is_reversed, bool is_extended); +void SwapPairs(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg dst_reg, + Arm64Gen::ARM64Reg src_reg, u32 flags); -Arm64Gen::ARM64Reg ByteswapBeforeStore(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg tmp_reg, - Arm64Gen::ARM64Reg src_reg, u32 flags, bool want_reversed); +void ByteswapAfterLoad(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit, + Arm64Gen::ARM64Reg dst_reg, Arm64Gen::ARM64Reg src_reg, u32 flags, + bool is_reversed, bool is_extended); + +Arm64Gen::ARM64Reg ByteswapBeforeStore(Arm64Gen::ARM64XEmitter* emit, + Arm64Gen::ARM64FloatEmitter* float_emit, + Arm64Gen::ARM64Reg tmp_reg, Arm64Gen::ARM64Reg src_reg, + u32 flags, bool want_reversed); void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use, BitSet32 fprs_in_use, Arm64Gen::ARM64Reg dst_reg, u32 address, u32 flags); From 96760093e9cdce17fad57c9efa0370a14b908b1e Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 9 Jul 2021 12:13:58 +0200 Subject: [PATCH 5/7] JitArm64: Move psq_st address check to EmitBackpatchRoutine This way the address check will take up less icache (since it's only emitted once for each routine rather than once for each psq_st instruction), and we also get address checking for psq_l. Matches Jit64's approach. The disadvantage: In the slowmem case, the routines have to push *every* caller-saved register onto the stack, even though most callers probably don't need it. But as long as the slowmem case isn't hit frequently, this is fine.
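Reviewer's note: the fastmem-vs-slowmem dispatch this patch moves into EmitBackpatchRoutine hinges on the CheckIfSafeAddress test added below, which is now emitted once per routine rather than per psq_st site. A plain C++ sketch of the assumed semantics (not emitter code), matching the emitted "TST addr, #0x0c000000; B.EQ"; as the in-tree FIXME says, the mask doesn't correctly account for the BAT configuration:

#include <cstdint>
#include <cstdio>

static bool IsSafeAddress(std::uint32_t addr)
{
  // Stay on the fastmem path only when these effective-address bits are clear.
  return (addr & 0x0c000000) == 0;
}

int main()
{
  // 0x80001234 is a typical cached-RAM address; 0xcc008000 falls in MMIO space.
  std::printf("%s %s\n", IsSafeAddress(0x80001234) ? "fastmem" : "slowmem",
              IsSafeAddress(0xcc008000) ? "fastmem" : "slowmem");  // fastmem slowmem
  return 0;
}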
--- Source/Core/Core/PowerPC/JitArm64/Jit.h | 4 +- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 70 ++-- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 10 + .../JitArm64/JitArm64_LoadStorePaired.cpp | 38 +-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 311 +++++++----------- 5 files changed, 182 insertions(+), 251 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index bf266acfd3..34f25c04c3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -225,10 +225,9 @@ protected: void DumpCode(const u8* start, const u8* end); // Backpatching routines - bool DisasmLoadStore(const u8* ptr, u32* flags, Arm64Gen::ARM64Reg* reg); void EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, Arm64Gen::ARM64Reg RS, Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0), - BitSet32 fprs_to_push = BitSet32(0)); + BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false); // Loadstore routines void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update); void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset); @@ -236,6 +235,7 @@ protected: // jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out. Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in, Arm64Gen::ARM64Reg tmp, const void* bat_table); + Arm64Gen::FixupBranch CheckIfSafeAddress(Arm64Gen::ARM64Reg addr); void DoJit(u32 em_address, JitBlock* b, u32 nextPC); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index fa5ed3e67d..de42f5811f 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include "Common/BitSet.h" @@ -51,13 +52,18 @@ void JitArm64::DoBacktrace(uintptr_t access_address, SContext* ctx) } void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, ARM64Reg RS, - ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push) + ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push, + bool emitting_routine) { bool in_far_code = false; const u8* fastmem_start = GetCodePtr(); + std::optional slowmem_fixup; if (fastmem) { + if (do_farcode && emitting_routine) + slowmem_fixup = CheckIfSafeAddress(addr); + if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { ARM64Reg temp = ARM64Reg::D0; @@ -110,34 +116,45 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR { if (fastmem && do_farcode) { - SlowmemHandler handler; - handler.dest_reg = RS; - handler.addr_reg = addr; - handler.gprs = gprs_to_push; - handler.fprs = fprs_to_push; - handler.flags = flags; - - FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end]; - auto handler_loc_iter = m_handler_to_loc.find(handler); - - if (handler_loc_iter == m_handler_to_loc.end()) + if (emitting_routine) { in_far_code = true; SwitchToFarCode(); - const u8* handler_loc = GetCodePtr(); - m_handler_to_loc[handler] = handler_loc; - fastmem_area->fastmem_code = fastmem_start; - fastmem_area->slowmem_code = handler_loc; } else { - const u8* handler_loc = handler_loc_iter->second; - fastmem_area->fastmem_code = fastmem_start; - fastmem_area->slowmem_code = handler_loc; - return; + SlowmemHandler handler; + handler.dest_reg = RS; + handler.addr_reg = addr; + handler.gprs 
= gprs_to_push; + handler.fprs = fprs_to_push; + handler.flags = flags; + + FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end]; + auto handler_loc_iter = m_handler_to_loc.find(handler); + + if (handler_loc_iter == m_handler_to_loc.end()) + { + in_far_code = true; + SwitchToFarCode(); + const u8* handler_loc = GetCodePtr(); + m_handler_to_loc[handler] = handler_loc; + fastmem_area->fastmem_code = fastmem_start; + fastmem_area->slowmem_code = handler_loc; + } + else + { + const u8* handler_loc = handler_loc_iter->second; + fastmem_area->fastmem_code = fastmem_start; + fastmem_area->slowmem_code = handler_loc; + return; + } } } + if (slowmem_fixup) + SetJumpTarget(*slowmem_fixup); + ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); @@ -229,8 +246,17 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR if (in_far_code) { - RET(ARM64Reg::X30); - SwitchToNearCode(); + if (emitting_routine) + { + FixupBranch done = B(); + SwitchToNearCode(); + SetJumpTarget(done); + } + else + { + RET(ARM64Reg::X30); + SwitchToNearCode(); + } } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 0cedefac45..3d18b977bb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -288,6 +288,16 @@ FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM6 return fail; } +FixupBranch JitArm64::CheckIfSafeAddress(Arm64Gen::ARM64Reg addr) +{ + // FIXME: This doesn't correctly account for the BAT configuration. + TST(addr, LogicalImm(0x0c000000, 32)); + FixupBranch pass = B(CC_EQ); + FixupBranch fail = B(); + SetJumpTarget(pass); + return fail; +} + void JitArm64::lXX(UGeckoInstruction inst) { INSTRUCTION_START diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 305eb8a9f2..60e86225a4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -19,10 +19,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStorePairedOff); - FALLBACK_IF(jo.memcheck || !jo.fastmem); + FALLBACK_IF(jo.memcheck); - // The asm routines assume address translation is on. - FALLBACK_IF(!MSR.DR); + // If we have a fastmem arena, the asm routines assume address translation is on. + FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR); // X30 is LR // X0 is the address @@ -111,10 +111,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStorePairedOff); - FALLBACK_IF(jo.memcheck || !jo.fastmem); + FALLBACK_IF(jo.memcheck); - // The asm routines assume address translation is on. - FALLBACK_IF(!MSR.DR); + // If we have a fastmem arena, the asm routines assume address translation is on. + FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR); // X30 is LR // X0 contains the scale @@ -213,33 +213,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) UBFM(type_reg, scale_reg, 0, 2); // Type UBFM(scale_reg, scale_reg, 8, 13); // Scale - // Inline address check - // FIXME: This doesn't correctly account for the BAT configuration. 
- TST(addr_reg, LogicalImm(0x0c000000, 32)); - FixupBranch pass = B(CC_EQ); - FixupBranch fail = B(); - - SwitchToFarCode(); - SetJumpTarget(fail); - // Slow - MOVP2R(ARM64Reg::X30, &paired_store_quantized[16 + w * 8]); - LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true)); - - ABI_PushRegisters(gprs_in_use); - m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30); - BLR(EncodeRegTo64(type_reg)); - m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30); - ABI_PopRegisters(gprs_in_use); - FixupBranch continue1 = B(); - SwitchToNearCode(); - SetJumpTarget(pass); - - // Fast - MOVP2R(ARM64Reg::X30, &paired_store_quantized[w * 8]); + MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized); LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true)); BLR(EncodeRegTo64(type_reg)); - - SetJumpTarget(continue1); } if (js.assumeNoPairedQuantize && !have_single) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index f083ba3dee..095fe14aff 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -506,8 +506,9 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1}, - fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push & ~BitSet32{1}, fprs_to_push); + RET(ARM64Reg::X30); } const u8* loadPairedU8Two = GetCodePtr(); @@ -515,7 +516,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -532,7 +534,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -549,7 +552,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -565,7 +569,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + 
EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -582,8 +587,9 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1}, - fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push & ~BitSet32{1}, fprs_to_push); + RET(ARM64Reg::X30); } const u8* loadPairedU8One = GetCodePtr(); @@ -591,7 +597,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -608,7 +615,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -625,7 +633,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -641,7 +650,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -697,256 +707,181 @@ void JitArm64::GenerateQuantizedStores() const u8* start = GetCodePtr(); const u8* storePairedIllegal = GetCodePtr(); BRK(0x101); - const u8* storePairedFloat; - const u8* storePairedFloatSlow; + const u8* storePairedFloat = GetCodePtr(); { constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; - storePairedFloat = GetCodePtr(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storePairedFloatSlow = GetCodePtr(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - - const u8* storePairedU8; - const u8* storePairedU8Slow; + 
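These routines now call EmitBackpatchRoutine with do_farcode enabled, which relies on the fault-handler bookkeeping introduced at the top of this patch: one far-code handler is emitted per distinct register/flag combination and reused by every access site that would fault the same way. A standalone sketch of that de-duplication, with simplified stand-in types rather than Dolphin's exact definitions:

#include <cstdint>
#include <map>
#include <tuple>

using u8 = std::uint8_t;
using u32 = std::uint32_t;

// Simplified stand-in for the key: everything the emitted handler code
// depends on. Two fastmem access sites with equal keys can share one handler.
struct SlowmemHandler
{
  u32 dest_reg;
  u32 addr_reg;
  u32 gprs;   // caller-saved GPRs to preserve around the call (BitSet32 in Dolphin)
  u32 fprs;   // caller-saved FPRs to preserve around the call
  u32 flags;  // BackPatchInfo flags: load/store, size, float, pair

  bool operator<(const SlowmemHandler& h) const
  {
    return std::tie(dest_reg, addr_reg, gprs, fprs, flags) <
           std::tie(h.dest_reg, h.addr_reg, h.gprs, h.fprs, h.flags);
  }
};

static std::map<SlowmemHandler, const u8*> handler_to_loc;

// Returns the already-emitted far-code handler for this key, or null if the
// caller must emit one and record it afterwards.
static const u8* FindHandler(const SlowmemHandler& key)
{
  const auto it = handler_to_loc.find(key);
  return it != handler_to_loc.end() ? it->second : nullptr;
}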
const u8* storePairedU8 = GetCodePtr(); { - auto emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); + MOVP2R(ARM64Reg::X2, &m_quantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); + float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); - float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); - }; + float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); + float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); + float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - storePairedU8 = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storePairedU8Slow = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - const u8* storePairedS8; - const u8* storePairedS8Slow; + const u8* storePairedS8 = GetCodePtr(); { - auto emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); + MOVP2R(ARM64Reg::X2, &m_quantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); + float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); - float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); - float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); - }; + float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); + float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); + float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - storePairedS8 = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storePairedS8Slow = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - - const u8* storePairedU16; - const u8* storePairedU16Slow; + const u8* storePairedU16 = GetCodePtr(); { - auto emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - 
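The NEON sequences in these store routines (LDR of the scale, FMUL, FCVTZU/FCVTZS, UQXTN/SQXTN) implement paired-single quantization. A scalar C++ model of the unsigned-8-bit case, assuming only that m_quantizeTableS holds per-scale multipliers:

#include <algorithm>
#include <cstdint>

// Scalar model of the storePairedU8 data path: scale by the GQR entry loaded
// from m_quantizeTableS (FMUL), then convert to integer and saturate to u8
// (FCVTZU + UQXTN). Clamping before the truncating conversion yields the
// same result as the NEON saturating narrows.
std::uint8_t QuantizeU8(float value, float scale)
{
  const float scaled = value * scale;
  const float clamped = std::clamp(scaled, 0.0f, 255.0f);
  return static_cast<std::uint8_t>(clamped);
}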
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); + MOVP2R(ARM64Reg::X2, &m_quantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); + float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); - float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - }; + float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); + float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - storePairedU16 = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storePairedU16Slow = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - const u8* storePairedS16; // Used by Viewtiful Joe's intro movie - const u8* storePairedS16Slow; + const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie { - auto emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); + MOVP2R(ARM64Reg::X2, &m_quantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); + float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); - float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); - float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - }; + float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); + float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - storePairedS16 = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storePairedS16Slow = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - const u8* storeSingleFloat; - const u8* storeSingleFloatSlow; + const u8* storeSingleFloat = GetCodePtr(); { constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; - storeSingleFloat = GetCodePtr(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push); - storeSingleFloatSlow = GetCodePtr(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - const u8* storeSingleU8; // Used by MKWii - const u8* storeSingleU8Slow; + const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii { - auto 
emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1); + MOVP2R(ARM64Reg::X2, &m_quantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); + float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1); - float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); - }; + float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); + float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); + float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - storeSingleU8 = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storeSingleU8Slow = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - const u8* storeSingleS8; - const u8* storeSingleS8Slow; + const u8* storeSingleS8 = GetCodePtr(); { - auto emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1); + MOVP2R(ARM64Reg::X2, &m_quantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); + float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1); - float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); - float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); - }; + float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); + float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); + float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - storeSingleS8 = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); - RET(ARM64Reg::X30); + EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg, + gprs_to_push, fprs_to_push, true); - storeSingleS8Slow = GetCodePtr(); - emit_quantize(); - EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); } - const u8* storeSingleU16; // Used by MKWii - const u8* storeSingleU16Slow; + const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii { - auto emit_quantize = [this, &float_emit, scale_reg]() { - MOVP2R(ARM64Reg::X2, &m_quantizeTableS); - ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); - float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1); + MOVP2R(ARM64Reg::X2, 
&m_quantizeTableS);
+ ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+ float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+ float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
- float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
- float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
- };
+ float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
+ float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_SIZE_16;
- storeSingleU16 = GetCodePtr();
- emit_quantize();
- EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
- RET(ARM64Reg::X30);
+ EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+ gprs_to_push, fprs_to_push, true);
- storeSingleU16Slow = GetCodePtr();
- emit_quantize();
- EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
- const u8* storeSingleS16;
- const u8* storeSingleS16Slow;
+ const u8* storeSingleS16 = GetCodePtr();
{
- auto emit_quantize = [this, &float_emit, scale_reg]() {
- MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
- ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
- float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
- float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
+ MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+ ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+ float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+ float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
- float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
- float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
- };
+ float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
+ float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
BackPatchInfo::FLAG_SIZE_16;
- storeSingleS16 = GetCodePtr();
- emit_quantize();
- EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
- RET(ARM64Reg::X30);
+ EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+ gprs_to_push, fprs_to_push, true);
- storeSingleS16Slow = GetCodePtr();
- emit_quantize();
- EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
RET(ARM64Reg::X30);
}
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");
paired_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
- ReserveCodeSpace(32 * sizeof(u8*));
+ ReserveCodeSpace(8 * sizeof(u8*));
- // Fast
paired_store_quantized[0] = storePairedFloat;
paired_store_quantized[1] = storePairedIllegal;
paired_store_quantized[2] = storePairedIllegal;
@@ -956,31 +891,15 @@ void JitArm64::GenerateQuantizedStores()
paired_store_quantized[6] = storePairedS8;
paired_store_quantized[7] = storePairedS16;
- paired_store_quantized[8] = storeSingleFloat;
- paired_store_quantized[9] = storePairedIllegal;
- paired_store_quantized[10] = storePairedIllegal;
- paired_store_quantized[11] = storePairedIllegal;
- paired_store_quantized[12] = storeSingleU8;
- paired_store_quantized[13] = storeSingleU16;
- paired_store_quantized[14] = storeSingleS8;
- paired_store_quantized[15] = storeSingleS16;
+ single_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
+ ReserveCodeSpace(8 * sizeof(u8*));
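For reference, the index into these tables is the GQR store-type field, which is why entries 1-3 are filled with storePairedIllegal (a BRK trap) in the assignments around here. The encoding, with descriptive enumerator names rather than Dolphin's:

// GQR quantization types as used to index paired_store_quantized and
// single_store_quantized.
enum QuantizeType : unsigned
{
  QUANTIZE_FLOAT = 0,  // store unquantized singles
  // 1-3 are reserved encodings and dispatch to storePairedIllegal
  QUANTIZE_U8 = 4,
  QUANTIZE_U16 = 5,
  QUANTIZE_S8 = 6,
  QUANTIZE_S16 = 7,
};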
- // Slow
- paired_store_quantized[16] = storePairedFloatSlow;
- paired_store_quantized[17] = storePairedIllegal;
- paired_store_quantized[18] = storePairedIllegal;
- paired_store_quantized[19] = storePairedIllegal;
- paired_store_quantized[20] = storePairedU8Slow;
- paired_store_quantized[21] = storePairedU16Slow;
- paired_store_quantized[22] = storePairedS8Slow;
- paired_store_quantized[23] = storePairedS16Slow;
-
- paired_store_quantized[24] = storeSingleFloatSlow;
- paired_store_quantized[25] = storePairedIllegal;
- paired_store_quantized[26] = storePairedIllegal;
- paired_store_quantized[27] = storePairedIllegal;
- paired_store_quantized[28] = storeSingleU8Slow;
- paired_store_quantized[29] = storeSingleU16Slow;
- paired_store_quantized[30] = storeSingleS8Slow;
- paired_store_quantized[31] = storeSingleS16Slow;
+ single_store_quantized[0] = storeSingleFloat;
+ single_store_quantized[1] = storePairedIllegal;
+ single_store_quantized[2] = storePairedIllegal;
+ single_store_quantized[3] = storePairedIllegal;
+ single_store_quantized[4] = storeSingleU8;
+ single_store_quantized[5] = storeSingleU16;
+ single_store_quantized[6] = storeSingleS8;
+ single_store_quantized[7] = storeSingleS16;
}

From 93e968208e920605369010cc401563eb4c6e1b8f Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 10 Jul 2021 13:21:27 +0200
Subject: [PATCH 6/7] JitArm64: Lock fewer registers when assumeNoPairedQuantize

---
 .../JitArm64/JitArm64_LoadStorePaired.cpp | 51 ++++++++++++-------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index 60e86225a4..dcbbd1b686 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -36,8 +36,13 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
const int i = indexed ? inst.Ix : inst.I;
const int w = indexed ? inst.Wx : inst.W;
- gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
- fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
+ gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+ fpr.Lock(ARM64Reg::Q0);
+ if (!js.assumeNoPairedQuantize)
+ {
+ gpr.Lock(ARM64Reg::W1, ARM64Reg::W2);
+ fpr.Lock(ARM64Reg::Q1);
+ }
constexpr ARM64Reg addr_reg = ARM64Reg::W0;
constexpr ARM64Reg scale_reg = ARM64Reg::W1;
@@ -73,8 +78,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
// Wipe the registers we are using as temporaries
- gprs_in_use &= BitSet32(~7);
- fprs_in_use &= BitSet32(~3);
+ gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
+ fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false;
fprs_in_use[DecodeReg(VS)] = 0;
u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
@@ -103,8 +108,13 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
m_float_emit.INS(32, VS, 1, ARM64Reg::Q0, 0);
}
- gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
- fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
+ gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
+ fpr.Unlock(ARM64Reg::Q0);
+ if (!js.assumeNoPairedQuantize)
+ {
+ gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2);
+ fpr.Unlock(ARM64Reg::Q1);
+ }
}
void JitArm64::psq_stXX(UGeckoInstruction inst)
@@ -127,7 +137,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
const int i = indexed ? inst.Ix : inst.I;
const int w = indexed ?
inst.Wx : inst.W; - fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); + if (!js.assumeNoPairedQuantize) + fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); const bool have_single = fpr.IsSingle(inst.RS); @@ -162,7 +173,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) } } - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); + if (!js.assumeNoPairedQuantize) + gpr.Lock(ARM64Reg::W2); constexpr ARM64Reg scale_reg = ARM64Reg::W0; constexpr ARM64Reg addr_reg = ARM64Reg::W1; @@ -191,15 +204,15 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - BitSet32 gprs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); - - // Wipe the registers we are using as temporaries - gprs_in_use &= BitSet32(~7); - fprs_in_use &= BitSet32(~3); - if (js.assumeNoPairedQuantize) { + BitSet32 gprs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + + // Wipe the registers we are using as temporaries + gprs_in_use[DecodeReg(ARM64Reg::W0)] = false; + gprs_in_use[DecodeReg(ARM64Reg::W1)] = false; + u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) flags |= BackPatchInfo::FLAG_PAIR; @@ -221,6 +234,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) if (js.assumeNoPairedQuantize && !have_single) fpr.Unlock(VS); - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); + if (!js.assumeNoPairedQuantize) + { + gpr.Unlock(ARM64Reg::W2); + fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); + } } From 15eb56142a646ef943652964c26c3c2810b296b5 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 9 Jul 2021 13:21:16 +0200 Subject: [PATCH 7/7] JitArm64: Read BATs in psq_l/psq_st address check Same approach as Jit64 here as well. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 ++- .../Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp | 8 +++++++- .../Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp | 12 ++++++++---- .../PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp | 4 ++-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 3 ++- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 34f25c04c3..b029f545cc 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -235,7 +235,8 @@ protected: // jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out. 
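The switch above from blanket BitSet32 masks to per-register clears is not just cosmetic: with the new conditional locking, W1/W2 and Q1 may legitimately hold guest values on the assumeNoPairedQuantize path, and wiping bits 0-2 wholesale would wrongly drop them from the set preserved around the slowmem call. A stand-in sketch of the difference:

#include <cstdint>

using BitSet = std::uint32_t;  // stand-in for Dolphin's BitSet32

// Old approach: unconditionally wipe bits 0-2 (W0-W2), even when some of
// those registers were never locked as temporaries and hold live values.
inline BitSet WipeBlunt(BitSet in_use)
{
  return in_use & ~7u;
}

// New approach: clear exactly the temporary this code path actually uses.
inline BitSet WipeOne(BitSet in_use, unsigned reg)
{
  return in_use & ~(1u << reg);
}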
Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in, Arm64Gen::ARM64Reg tmp, const void* bat_table); - Arm64Gen::FixupBranch CheckIfSafeAddress(Arm64Gen::ARM64Reg addr); + Arm64Gen::FixupBranch CheckIfSafeAddress(Arm64Gen::ARM64Reg addr, Arm64Gen::ARM64Reg tmp1, + Arm64Gen::ARM64Reg tmp2); void DoJit(u32 em_address, JitBlock* b, u32 nextPC); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index de42f5811f..11da7bb923 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -10,6 +10,7 @@ #include "Common/CommonFuncs.h" #include "Common/CommonTypes.h" #include "Common/Logging/Log.h" +#include "Common/MathUtil.h" #include "Common/StringUtil.h" #include "Common/Swap.h" @@ -62,7 +63,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR if (fastmem) { if (do_farcode && emitting_routine) - slowmem_fixup = CheckIfSafeAddress(addr); + { + const ARM64Reg temp1 = flags & BackPatchInfo::FLAG_STORE ? ARM64Reg::W0 : ARM64Reg::W3; + const ARM64Reg temp2 = ARM64Reg::W2; + + slowmem_fixup = CheckIfSafeAddress(addr, temp1, temp2); + } if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 3d18b977bb..50a727d926 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -288,11 +288,15 @@ FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM6 return fail; } -FixupBranch JitArm64::CheckIfSafeAddress(Arm64Gen::ARM64Reg addr) +FixupBranch JitArm64::CheckIfSafeAddress(Arm64Gen::ARM64Reg addr, Arm64Gen::ARM64Reg tmp1, + Arm64Gen::ARM64Reg tmp2) { - // FIXME: This doesn't correctly account for the BAT configuration. 
- TST(addr, LogicalImm(0x0c000000, 32)); - FixupBranch pass = B(CC_EQ); + tmp2 = EncodeRegTo64(tmp2); + + MOVP2R(tmp2, PowerPC::dbat_table.data()); + LSR(tmp1, addr, PowerPC::BAT_INDEX_SHIFT); + LDR(tmp1, tmp2, ArithOption(tmp1, true)); + FixupBranch pass = TBNZ(tmp1, IntLog2(PowerPC::BAT_PHYSICAL_BIT)); FixupBranch fail = B(); SetJumpTarget(pass); return fail; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index dcbbd1b686..141db6f487 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -40,7 +40,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) fpr.Lock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W1, ARM64Reg::W2); + gpr.Lock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Lock(ARM64Reg::Q1); } @@ -112,7 +112,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) fpr.Unlock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2); + gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q1); } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 095fe14aff..6a427a1ab8 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -489,12 +489,13 @@ void JitArm64::GenerateQuantizedLoads() // X0 is the address // X1 is the scale // X2 is a temporary + // X3 is a temporary (used in EmitBackpatchRoutine) // X30 is LR // Q0 is the return // Q1 is a temporary ARM64Reg addr_reg = ARM64Reg::X0; ARM64Reg scale_reg = ARM64Reg::X1; - BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2}; + BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2, 3}; BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this);
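Unlike the old 0x0c000000 heuristic, the rewritten CheckIfSafeAddress above consults the actual BAT configuration: LSR derives the table index, a scaled LDR fetches the dbat_table entry, and TBNZ tests its "physical" bit. A plain C++ model of the same test; the two constants are assumptions for this sketch (the real values live in Dolphin's PowerPC/MMU code):

#include <array>
#include <cstdint>

constexpr std::uint32_t BAT_INDEX_SHIFT = 17;        // assumed translation granularity
constexpr std::uint32_t BAT_PHYSICAL_BIT = 1u << 0;  // assumed bit position

std::array<std::uint32_t, 1u << (32 - BAT_INDEX_SHIFT)> dbat_table{};

// Equivalent of: LSR tmp1, addr, #BAT_INDEX_SHIFT
//                LDR tmp1, [dbat_table, tmp1, LSL #2]
//                TBNZ tmp1, #IntLog2(BAT_PHYSICAL_BIT), pass
bool IsSafeAddress(std::uint32_t addr)
{
  return (dbat_table[addr >> BAT_INDEX_SHIFT] & BAT_PHYSICAL_BIT) != 0;
}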