Merge pull request #9458 from JosJuice/arm-fpu-round

JitArm64: Set flush-to-zero/rounding mode and improve float/double conversion accuracy
2024-11-14 21:37:52 -07:00 · 2021-04-25 10:23:19 -04:00 · 2021-04-25 10:23:19 -04:00 · 5da85f3a25
commit 5da85f3a25
parent aa3a96f048 69c14d6ec3
25 changed files with 819 additions and 120 deletions
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
 {
  Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
 }
+// Emits the three-register FACGE instruction through EmitThreeSame (opcode argument 0x1D).
+// Only `size >> 6` reaches the encoding: 0 for a size of 32, 1 for a size of 64.
+void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(1, size >> 6, 0x1D, Rd, Rn, Rm);
+}
+// Emits the three-register FACGT instruction through EmitThreeSame. Uses the same opcode argument
+// as FACGE (0x1D) but additionally sets bit 1 of the second argument.
+void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+  EmitThreeSame(1, 2 | (size >> 6), 0x1D, Rd, Rn, Rm);
+}

 void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
 {
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@ -1094,6 +1094,8 @@ public:
  void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+  void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+  void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Conditional select
  void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
--- a/Source/Core/Common/ArmCPUDetect.cpp
+++ b/Source/Core/Common/ArmCPUDetect.cpp
@ -69,6 +69,7 @@ void CPUInfo::Detect()
  CPU64bit = true;
  Mode64bit = true;
  vendor = CPUVendor::ARM;
+  bFlushToZero = true;

 #ifdef _WIN32
  num_cores = std::thread::hardware_concurrency();
--- a/Source/Core/Common/ArmFPURoundMode.cpp
+++ b/Source/Core/Common/ArmFPURoundMode.cpp
@ -0,0 +1,78 @@
+// Copyright 2021 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "Common/CommonTypes.h"
+#include "Common/FPURoundMode.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+// Returns the current value of the AArch64 FPCR register.
+// MSVC builds use the _ReadStatusReg intrinsic; other compilers use an inline-asm MRS.
+static u64 GetFPCR()
+{
+#ifdef _MSC_VER
+  return _ReadStatusReg(ARM64_FPCR);
+#else
+  u64 fpcr;
+  __asm__ __volatile__("mrs %0, fpcr" : "=r"(fpcr));
+  return fpcr;
+#endif
+}
+
+// Writes fpcr to the AArch64 FPCR register.
+// MSVC builds use the _WriteStatusReg intrinsic; other compilers use an inline-asm MSR.
+static void SetFPCR(u64 fpcr)
+{
+#ifdef _MSC_VER
+  _WriteStatusReg(ARM64_FPCR, fpcr);
+#else
+  // The operand must be a general-purpose register: MSR to a system register such as FPCR has
+  // no immediate form, so an "i" alternative could yield an operand that does not assemble.
+  __asm__ __volatile__("msr fpcr, %0" : : "r"(fpcr));
+#endif
+}
+
+namespace FPURoundMode
+{
+// FPCR value read when this file's static variables are initialized. SetSIMDMode derives its
+// output from it and LoadDefaultSIMDState writes it back.
+static const u64 default_fpcr = GetFPCR();
+// Snapshot taken by SaveSIMDState and written back by LoadSIMDState.
+static u64 saved_fpcr = default_fpcr;
+
+// Intentionally empty in this implementation; mode is ignored.
+void SetRoundMode(int mode)
+{
+  // We don't need to do anything here since SetSIMDMode is always called after calling this
+}
+
+// Empty in this implementation; mode is ignored.
+void SetPrecisionMode(PrecisionMode mode)
+{
+}
+
+// Writes FPCR with the rounding mode selected by rounding_mode and with flush-to-zero enabled
+// when non_ieee_mode is true. rounding_mode indexes a four-entry table without a range check,
+// so it must be in 0..3.
+void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
+{
+  // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
+  constexpr u32 FZ = 1 << 24;
+
+  // lookup table for FPSCR.RN-to-FPCR.RMode translation
+  constexpr u32 rounding_mode_table[] = {
+      (0 << 22),  // nearest
+      (3 << 22),  // zero
+      (1 << 22),  // +inf
+      (2 << 22),  // -inf
+  };
+
+  // Clear bits 22-24 of the default value (the two rounding-mode bits plus FZ) before OR-ing in
+  // the requested settings.
+  const u64 base = default_fpcr & ~(0b111 << 22);
+  SetFPCR(base | rounding_mode_table[rounding_mode] | (non_ieee_mode ? FZ : 0));
+}
+
+// Remembers the current FPCR value so that LoadSIMDState can restore it.
+void SaveSIMDState()
+{
+  saved_fpcr = GetFPCR();
+}
+
+// Writes the value remembered by SaveSIMDState (initially default_fpcr) back to FPCR.
+void LoadSIMDState()
+{
+  SetFPCR(saved_fpcr);
+}
+
+// Writes default_fpcr back to FPCR.
+void LoadDefaultSIMDState()
+{
+  SetFPCR(default_fpcr);
+}
+
+}  // namespace FPURoundMode
--- a/Source/Core/Common/CMakeLists.txt
+++ b/Source/Core/Common/CMakeLists.txt
@ -199,7 +199,7 @@ if(_M_ARM_64)
    Arm64Emitter.h
    ArmCommon.h
    ArmCPUDetect.cpp
-    GenericFPURoundMode.cpp
+    ArmFPURoundMode.cpp
  )
 else()
  if(_M_X86) #X86
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@ -982,6 +982,7 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

    js.compilerPC = op.address;
    js.op = &op;
+    js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst;
    js.instructionNumber = i;
    js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
    const GekkoOPInfo* opinfo = op.opinfo;
@ -1118,6 +1119,8 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

      CompileInstruction(op);

+      js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst;
+
      if (jo.memcheck && (opinfo->flags & FL_LOADSTORE))
      {
        // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@ -105,7 +105,7 @@ void Jit64::stfXXX(UGeckoInstruction inst)

  if (single)
  {
-    if (js.op->fprIsStoreSafe[s])
+    if (js.fpr_is_store_safe[s])
    {
      RCOpArg Rs = fpr.Use(s, RCMode::Read);
      RegCache::Realize(Rs);
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@ -695,6 +695,7 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

    js.compilerPC = op.address;
    js.op = &op;
+    js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst;
    js.instructionNumber = i;
    js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
    const GekkoOPInfo* opinfo = op.opinfo;
@ -830,6 +831,9 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
      }

      CompileInstruction(op);
+
+      js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst;
+
      if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer)
        FlushCarry();

--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@ -152,7 +152,20 @@ public:
  void psq_l(UGeckoInstruction inst);
  void psq_st(UGeckoInstruction inst);

-private:
+  void ConvertDoubleToSingleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
+                                  Arm64Gen::ARM64Reg src_reg);
+  void ConvertDoubleToSinglePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
+                                 Arm64Gen::ARM64Reg src_reg);
+  void ConvertSingleToDoubleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
+                                  Arm64Gen::ARM64Reg src_reg,
+                                  Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
+  void ConvertSingleToDoublePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
+                                 Arm64Gen::ARM64Reg src_reg,
+                                 Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
+
+  bool IsFPRStoreSafe(size_t guest_reg) const;
+
+protected:
  struct SlowmemHandler
  {
    Arm64Gen::ARM64Reg dest_reg;
@ -184,14 +197,18 @@ private:
    nearcode = GetWritableCodePtr();
    SetCodePtrUnsafe(farcode.GetWritableCodePtr());
    AlignCode16();
+    m_in_farcode = true;
  }

  void SwitchToNearCode()
  {
+    // Hand the current write position back to farcode, resume emitting at the saved near-code
+    // pointer, and record that we are no longer in far code.
    farcode.SetCodePtrUnsafe(GetWritableCodePtr());
    SetCodePtrUnsafe(nearcode);
+    m_in_farcode = false;
  }

+  // True after SwitchToFarCode and until the matching SwitchToNearCode.
+  bool IsInFarCode() const { return m_in_farcode; }
+
  // Dump a memory range of code
  void DumpCode(const u8* start, const u8* end);

@ -215,6 +232,9 @@ private:
  // AsmRoutines
  void GenerateAsm();
  void GenerateCommonAsm();
+  void GenerateConvertDoubleToSingle();
+  void GenerateConvertSingleToDouble();
+  void GenerateQuantizedLoadStores();

  // Profiling
  void BeginTimeProfile(JitBlock* b);
@ -254,6 +274,7 @@ private:

  Arm64Gen::ARM64CodeBlock farcode;
  u8* nearcode;  // Backed up when we switch to far code.
+  bool m_in_farcode = false;

  bool m_enable_blr_optimization;
  bool m_cleanup_after_stackfault = false;
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@ -61,23 +61,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
    if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT)
    {
      if (flags & BackPatchInfo::FLAG_SIZE_F32)
-      {
-        m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS);
-        m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
-        m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr);
-      }
-      else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
      {
        m_float_emit.REV32(8, ARM64Reg::D0, RS);
        m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr);
      }
      else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
-      {
-        m_float_emit.FCVTN(32, ARM64Reg::D0, RS);
-        m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
-        m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr);
-      }
-      else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
      {
        m_float_emit.REV32(8, ARM64Reg::D0, RS);
        m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr);
@ -184,37 +172,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
    if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT)
    {
      if (flags & BackPatchInfo::FLAG_SIZE_F32)
-      {
-        m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS);
-        m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0);
-        MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32);
-        BLR(ARM64Reg::X8);
-      }
-      else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
      {
        m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0);
        MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32);
        BLR(ARM64Reg::X8);
      }
      else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
-      {
-        m_float_emit.FCVTN(32, ARM64Reg::D0, RS);
-        m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
-        ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
-        MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
-        BLR(ARM64Reg::X8);
-      }
-      else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
      {
        m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0);
-        ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
        MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
+        ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
        BLR(ARM64Reg::X8);
      }
      else
      {
-        MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
        m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0);
+        MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
        BLR(ARM64Reg::X8);
      }
    }
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@ -220,30 +220,28 @@ void JitArm64::fselx(UGeckoInstruction inst)
  const u32 c = inst.FC;
  const u32 d = inst.FD;

-  const bool a_single = fpr.IsSingle(a, true);
-  if (a_single)
-  {
-    const ARM64Reg VA = fpr.R(a, RegType::LowerPairSingle);
-    m_float_emit.FCMPE(EncodeRegToSingle(VA));
-  }
-  else
-  {
-    const ARM64Reg VA = fpr.R(a, RegType::LowerPair);
-    m_float_emit.FCMPE(EncodeRegToDouble(VA));
-  }
+  const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
+  const RegType b_and_c_type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair;
+  const auto b_and_c_reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble;

+  const bool a_single = fpr.IsSingle(a, true) && (b_and_c_singles || (a != b && a != c));
+  const RegType a_type = a_single ? RegType::LowerPairSingle : RegType::LowerPair;
+  const auto a_reg_encoder = a_single ? EncodeRegToSingle : EncodeRegToDouble;
+
+  const ARM64Reg VA = fpr.R(a, a_type);
+  const ARM64Reg VB = fpr.R(b, b_and_c_type);
+  const ARM64Reg VC = fpr.R(c, b_and_c_type);
+
+  // If a == d, the RW call below may change the type of a to double. This is okay, because the
+  // actual value in the register is not altered by RW. So let's just assert before calling RW.
  ASSERT_MSG(DYNA_REC, a_single == fpr.IsSingle(a, true),
             "Register allocation turned singles into doubles in the middle of fselx");

-  const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
-  const RegType type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair;
-  const auto reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble;
+  const ARM64Reg VD = fpr.RW(d, b_and_c_type);

-  const ARM64Reg VB = fpr.R(b, type);
-  const ARM64Reg VC = fpr.R(c, type);
-  const ARM64Reg VD = fpr.RW(d, type);
-
-  m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE);
+  m_float_emit.FCMPE(a_reg_encoder(VA));
+  m_float_emit.FCSEL(b_and_c_reg_encoder(VD), b_and_c_reg_encoder(VC), b_and_c_reg_encoder(VB),
+                     CC_GE);

  ASSERT_MSG(DYNA_REC, b_and_c_singles == (fpr.IsSingle(b, true) && fpr.IsSingle(c, true)),
             "Register allocation turned singles into doubles in the middle of fselx");
@ -260,7 +258,7 @@ void JitArm64::frspx(UGeckoInstruction inst)
  const u32 d = inst.FD;

  const bool single = fpr.IsSingle(b, true);
-  if (single)
+  if (single && js.fpr_is_store_safe[b])
  {
    // Source is already in single precision, so no need to do anything but to copy to PSR1.
    const ARM64Reg VB = fpr.R(b, RegType::LowerPairSingle);
@ -268,6 +266,9 @@ void JitArm64::frspx(UGeckoInstruction inst)

    if (b != d)
      m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB));
+
+    ASSERT_MSG(DYNA_REC, fpr.IsSingle(b, true),
+               "Register allocation turned singles into doubles in the middle of frspx");
  }
  else
  {
@ -276,9 +277,6 @@ void JitArm64::frspx(UGeckoInstruction inst)

    m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
  }
-
-  ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true),
-             "Register allocation turned singles into doubles in the middle of frspx");
 }

 void JitArm64::fcmpX(UGeckoInstruction inst)
@ -386,3 +384,196 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
  ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true),
             "Register allocation turned singles into doubles in the middle of fctiwzx");
 }
+
+// Since the following float conversion functions are used in non-arithmetic PPC float
+// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
+// into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN.
+
+// Emits code converting the double in lane 0 of src_reg to a single in lane 0 of dest_reg.
+// guest_reg selects the js.fpr_is_store_safe entry that decides between a plain FCVT and a call
+// to the cdts routine.
+void JitArm64::ConvertDoubleToSingleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg)
+{
+  if (js.fpr_is_store_safe[guest_reg])
+  {
+    m_float_emit.FCVT(32, 64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+    return;
+  }
+
+  // NOTE(review): presumably required because the cdts routine clobbers the flags - confirm.
+  FlushCarry();
+
+  // Of X0-X3 and X30, preserve those the GPR cache reports as in use around the call.
+  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
+  ABI_PushRegisters(gpr_saved);
+
+  // The routine takes the double's bits in X0 and leaves the single's bits in W1.
+  m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0);
+  BL(cdts);
+  m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1);
+
+  ABI_PopRegisters(gpr_saved);
+}
+
+// Emits code converting both doubles in src_reg to singles in 32-bit lanes 0 and 1 of dest_reg.
+// guest_reg selects the js.fpr_is_store_safe entry that decides between a plain FCVTN and two
+// calls to the cdts routine (one per lane).
+void JitArm64::ConvertDoubleToSinglePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg)
+{
+  if (js.fpr_is_store_safe[guest_reg])
+  {
+    m_float_emit.FCVTN(32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+    return;
+  }
+
+  // NOTE(review): presumably required because the cdts routine clobbers the flags - confirm.
+  FlushCarry();
+
+  // Of X0-X3 and X30, preserve those the GPR cache reports as in use around the calls.
+  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
+  ABI_PushRegisters(gpr_saved);
+
+  // Lower element: double bits in X0, single bits returned in W1.
+  m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0);
+  BL(cdts);
+  m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1);
+
+  // Upper element: read from 64-bit lane 1, result written to 32-bit lane 1.
+  m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 1);
+  BL(cdts);
+  m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W1);
+
+  ABI_PopRegisters(gpr_saved);
+}
+
+// Emits code converting the single in lane 0 of src_reg to a double in lane 0 of dest_reg.
+// Store-safe guest registers (per js.fpr_is_store_safe) get a plain FCVT. Otherwise the cstd
+// routine is called; if scratch_reg is provided, a runtime check is emitted first that picks
+// between FCVT and the routine. scratch_reg must differ from src_reg.
+void JitArm64::ConvertSingleToDoubleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg,
+                                          ARM64Reg scratch_reg)
+{
+  ASSERT(scratch_reg != src_reg);
+
+  if (js.fpr_is_store_safe[guest_reg])
+  {
+    m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+    return;
+  }
+
+  // Only move the slow path to far code if the caller is not already emitting there.
+  const bool switch_to_farcode = !IsInFarCode();
+
+  FlushCarry();
+
+  // Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set?
+  // (This check unfortunately also catches zeroes)
+
+  FixupBranch fast;
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
+    m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
+    fast = B(CCFlags::CC_GT);
+
+    if (switch_to_farcode)
+    {
+      // Near code keeps only a branch; the slow path itself is emitted in far code.
+      FixupBranch slow = B();
+
+      SwitchToFarCode();
+      SetJumpTarget(slow);
+    }
+  }
+
+  // If no (or if we don't have a scratch register), call the bit-exact routine
+
+  // Of X0-X4 and X30, preserve those the GPR cache reports as in use around the call.
+  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
+  ABI_PushRegisters(gpr_saved);
+
+  // The routine takes the single's bits in W0 and leaves the double's bits in X0.
+  m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
+  BL(cstd);
+  m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
+
+  ABI_PopRegisters(gpr_saved);
+
+  // If yes, do a fast conversion with FCVT
+
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    // The slow path jumps over the FCVT emitted below.
+    FixupBranch continue1 = B();
+
+    if (switch_to_farcode)
+      SwitchToNearCode();
+
+    SetJumpTarget(fast);
+
+    m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+
+    SetJumpTarget(continue1);
+  }
+}
+
+// Emits code converting the singles in 32-bit lanes 0 and 1 of src_reg to doubles in 64-bit
+// lanes 0 and 1 of dest_reg. Store-safe guest registers (per js.fpr_is_store_safe) get a plain
+// FCVTL. Otherwise the cstd routine is called once per lane; if scratch_reg is provided, a
+// runtime check is emitted first that picks between FCVTL and the routine calls.
+// scratch_reg must differ from src_reg.
+void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg,
+                                         ARM64Reg scratch_reg)
+{
+  ASSERT(scratch_reg != src_reg);
+
+  if (js.fpr_is_store_safe[guest_reg])
+  {
+    m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+    return;
+  }
+
+  // Only move the slow path to far code if the caller is not already emitting there.
+  const bool switch_to_farcode = !IsInFarCode();
+
+  FlushCarry();
+
+  // Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set?
+  // (This check unfortunately also catches zeroes)
+
+  FixupBranch fast;
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    // Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether
+    // the absolute value of the corresponding element in src_reg compares greater than 0
+    m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0);
+    m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
+                       EncodeRegToDouble(scratch_reg));
+
+    // 0x0000'0000'0000'0000 (zero)     -> 0x0000'0000'0000'0000 (zero)
+    // 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
+    // 0xFFFF'FFFF'0000'0000 (NaN)      -> 0x00FF'FFFF'0000'0000 (normal)
+    // 0xFFFF'FFFF'FFFF'FFFF (NaN)      -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
+    m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);
+
+    // Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)?
+    m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
+    fast = B(CCFlags::CC_VS);
+
+    if (switch_to_farcode)
+    {
+      // Near code keeps only a branch; the slow path itself is emitted in far code.
+      FixupBranch slow = B();
+
+      SwitchToFarCode();
+      SetJumpTarget(slow);
+    }
+  }
+
+  // If no (or if we don't have a scratch register), call the bit-exact routine
+
+  // Save X0-X4 and X30 if they're in use
+  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
+  ABI_PushRegisters(gpr_saved);
+
+  // Lane 1 goes first: its 64-bit result lands in bits 64-127, so the lane-0 single in bits 0-31
+  // is still intact for the second call even when dest_reg and src_reg are the same register.
+  m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1);
+  BL(cstd);
+  m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X0);
+
+  m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
+  BL(cstd);
+  m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
+
+  ABI_PopRegisters(gpr_saved);
+
+  // If yes, do a fast conversion with FCVTL
+
+  if (scratch_reg != ARM64Reg::INVALID_REG)
+  {
+    // The slow path jumps over the FCVTL emitted below.
+    FixupBranch continue1 = B();
+
+    if (switch_to_farcode)
+      SwitchToNearCode();
+
+    SetJumpTarget(fast);
+    m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
+
+    SetJumpTarget(continue1);
+  }
+}
+
+// Returns js.fpr_is_store_safe[guest_reg], the same flag the Convert* helpers in this file test
+// before choosing the plain FCVT/FCVTL/FCVTN conversion.
+bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
+{
+  return js.fpr_is_store_safe[guest_reg];
+}
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
@ -189,6 +189,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)

  u32 a = inst.RA, b = inst.RB;

+  bool want_single = false;
  s32 offset = inst.SIMM_16;
  u32 flags = BackPatchInfo::FLAG_STORE;
  bool update = false;
@ -200,10 +201,12 @@ void JitArm64::stfXX(UGeckoInstruction inst)
    switch (inst.SUBOP10)
    {
    case 663:  // stfsx
+      want_single = true;
      flags |= BackPatchInfo::FLAG_SIZE_F32;
      offset_reg = b;
      break;
    case 695:  // stfsux
+      want_single = true;
      flags |= BackPatchInfo::FLAG_SIZE_F32;
      update = true;
      offset_reg = b;
@ -218,16 +221,19 @@ void JitArm64::stfXX(UGeckoInstruction inst)
      offset_reg = b;
      break;
    case 983:  // stfiwx
-      flags |= BackPatchInfo::FLAG_SIZE_F32I;
+      // This instruction writes the lower 32 bits of a double. want_single must be false
+      flags |= BackPatchInfo::FLAG_SIZE_F32;
      offset_reg = b;
      break;
    }
    break;
  case 53:  // stfsu
+    want_single = true;
    flags |= BackPatchInfo::FLAG_SIZE_F32;
    update = true;
    break;
  case 52:  // stfs
+    want_single = true;
    flags |= BackPatchInfo::FLAG_SIZE_F32;
    break;
  case 55:  // stfdu
@ -242,19 +248,22 @@ void JitArm64::stfXX(UGeckoInstruction inst)
  u32 imm_addr = 0;
  bool is_immediate = false;

-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
  fpr.Lock(ARM64Reg::Q0);

-  const bool single = (flags & BackPatchInfo::FLAG_SIZE_F32) && fpr.IsSingle(inst.FS, true);
+  const bool have_single = fpr.IsSingle(inst.FS, true);

-  const ARM64Reg V0 = fpr.R(inst.FS, single ? RegType::LowerPairSingle : RegType::LowerPair);
+  ARM64Reg V0 =
+      fpr.R(inst.FS, want_single && have_single ? RegType::LowerPairSingle : RegType::LowerPair);

-  if (single)
+  if (want_single && !have_single)
  {
-    flags &= ~BackPatchInfo::FLAG_SIZE_F32;
-    flags |= BackPatchInfo::FLAG_SIZE_F32I;
+    const ARM64Reg single_reg = fpr.GetReg();
+    ConvertDoubleToSingleLower(inst.FS, single_reg, V0);
+    V0 = single_reg;
  }

+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
+
  ARM64Reg addr_reg = ARM64Reg::W1;

  if (update)
@ -359,19 +368,11 @@ void JitArm64::stfXX(UGeckoInstruction inst)
        accessSize = 32;

      LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
+
      if (flags & BackPatchInfo::FLAG_SIZE_F64)
-      {
        m_float_emit.REV64(8, ARM64Reg::Q0, V0);
-      }
      else if (flags & BackPatchInfo::FLAG_SIZE_F32)
-      {
-        m_float_emit.FCVT(32, 64, ARM64Reg::D0, EncodeRegToDouble(V0));
-        m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
-      }
-      else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
-      {
        m_float_emit.REV32(8, ARM64Reg::D0, V0);
-      }

      m_float_emit.STR(accessSize, IndexType::Post, accessSize == 64 ? ARM64Reg::Q0 : ARM64Reg::D0,
                       ARM64Reg::X0, accessSize >> 3);
@ -399,6 +400,10 @@ void JitArm64::stfXX(UGeckoInstruction inst)
  {
    EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, V0, XA, regs_in_use, fprs_in_use);
  }
+
+  if (want_single && !have_single)
+    fpr.Unlock(V0);
+
  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
  fpr.Unlock(ARM64Reg::Q0);
 }
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@ -116,13 +116,44 @@ void JitArm64::psq_st(UGeckoInstruction inst)
  const bool update = inst.OPCD == 61;
  const s32 offset = inst.SIMM_12;

-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
  fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);

-  const bool single = fpr.IsSingle(inst.RS);
+  const bool have_single = fpr.IsSingle(inst.RS);
+
+  ARM64Reg VS = fpr.R(inst.RS, have_single ? RegType::Single : RegType::Register);
+
+  if (js.assumeNoPairedQuantize)
+  {
+    if (!have_single)
+    {
+      const ARM64Reg single_reg = fpr.GetReg();
+
+      if (inst.W)
+        m_float_emit.FCVT(32, 64, EncodeRegToDouble(single_reg), EncodeRegToDouble(VS));
+      else
+        m_float_emit.FCVTN(32, EncodeRegToDouble(single_reg), EncodeRegToDouble(VS));
+
+      VS = single_reg;
+    }
+  }
+  else
+  {
+    if (have_single)
+    {
+      m_float_emit.ORR(ARM64Reg::D0, VS, VS);
+    }
+    else
+    {
+      if (inst.W)
+        m_float_emit.FCVT(32, 64, ARM64Reg::D0, VS);
+      else
+        m_float_emit.FCVTN(32, ARM64Reg::D0, VS);
+    }
+  }
+
+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);

  const ARM64Reg arm_addr = gpr.R(inst.RA);
-  const ARM64Reg VS = fpr.R(inst.RS, single ? RegType::Single : RegType::Register);

  constexpr ARM64Reg scale_reg = ARM64Reg::W0;
  constexpr ARM64Reg addr_reg = ARM64Reg::W1;
@ -157,28 +188,13 @@ void JitArm64::psq_st(UGeckoInstruction inst)
  {
    u32 flags = BackPatchInfo::FLAG_STORE;

-    if (single)
-      flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32I : BackPatchInfo::FLAG_SIZE_F32X2I);
-    else
-      flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
+    flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);

    EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use,
                         fprs_in_use);
  }
  else
  {
-    if (single)
-    {
-      m_float_emit.ORR(ARM64Reg::D0, VS, VS);
-    }
-    else
-    {
-      if (inst.W)
-        m_float_emit.FCVT(32, 64, ARM64Reg::D0, VS);
-      else
-        m_float_emit.FCVTN(32, ARM64Reg::D0, VS);
-    }
-
    LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + inst.I));
    UBFM(type_reg, scale_reg, 0, 2);    // Type
    UBFM(scale_reg, scale_reg, 8, 13);  // Scale
@ -212,6 +228,9 @@ void JitArm64::psq_st(UGeckoInstruction inst)
    SetJumpTarget(continue1);
  }

+  if (js.assumeNoPairedQuantize && !have_single)
+    fpr.Unlock(VS);
+
  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
  fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
 }
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@ -17,9 +17,10 @@

 using namespace Arm64Gen;

-void Arm64RegCache::Init(ARM64XEmitter* emitter)
+void Arm64RegCache::Init(JitArm64* jit)
 {
-  m_emit = emitter;
+  m_jit = jit;
+  m_emit = jit;
  m_float_emit.reset(new ARM64FloatEmitter(m_emit));
  GetAllocationOrder();
 }
@ -467,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
      return host_reg;

    // Else convert this register back to doubles.
-    m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+    const ARM64Reg tmp_reg = GetReg();
+    m_jit->ConvertSingleToDoublePair(preg, host_reg, host_reg, tmp_reg);
+    UnlockRegister(tmp_reg);
+
    reg.Load(host_reg, RegType::Register);
    [[fallthrough]];
  }
@ -482,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
      return host_reg;

    // Else convert this register back to a double.
-    m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+    const ARM64Reg tmp_reg = GetReg();
+    m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg);
+    UnlockRegister(tmp_reg);
+
    reg.Load(host_reg, RegType::LowerPair);
    [[fallthrough]];
  }
@ -516,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
      return host_reg;
    }

-    m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+    const ARM64Reg tmp_reg = GetReg();
+    m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg);
+    UnlockRegister(tmp_reg);
+
    reg.Load(host_reg, RegType::Duplicated);
    [[fallthrough]];
  }
@ -584,7 +594,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
  if ((type == RegType::LowerPair || type == RegType::LowerPairSingle) && was_dirty)
  {
    // We must *not* change host_reg as this register might still be in use. So it's fine to
-    // store this register, but it's *not* fine to convert it to double. So for double convertion,
+    // store this register, but it's *not* fine to convert it to double. So for double conversion,
    // a temporary register needs to be used.
    ARM64Reg host_reg = reg.GetReg();
    ARM64Reg flush_reg = host_reg;
@ -592,9 +602,27 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
    switch (reg.GetType())
    {
    case RegType::Single:
+      // For a store-safe register, conversion is just one instruction regardless of whether
+      // we're converting a pair, so ConvertSingleToDoublePair followed by a
+      // 128-bit store is faster than INS followed by ConvertSingleToDoubleLower and a
+      // 64-bit store. But for registers which are not store-safe, the latter is better.
      flush_reg = GetReg();
-      m_float_emit->FCVTL(64, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg));
-      [[fallthrough]];
+      if (!m_jit->IsFPRStoreSafe(preg))
+      {
+        ARM64Reg scratch_reg = GetReg();
+        m_float_emit->INS(32, flush_reg, 0, host_reg, 1);
+        m_jit->ConvertSingleToDoubleLower(preg, flush_reg, flush_reg, scratch_reg);
+        m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG, u32(PPCSTATE_OFF_PS1(preg)));
+        Unlock(scratch_reg);
+        break;
+      }
+      else
+      {
+        m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg);
+        m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG,
+                          u32(PPCSTATE_OFF_PS0(preg)));
+      }
+      break;
    case RegType::Register:
      // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
      // store.
@ -604,7 +632,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
      break;
    case RegType::DuplicatedSingle:
      flush_reg = GetReg();
-      m_float_emit->FCVT(64, 32, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg));
+      m_jit->ConvertSingleToDoubleLower(preg, flush_reg, host_reg, flush_reg);
      [[fallthrough]];
    case RegType::Duplicated:
      // Store PSR1 (which is equal to PSR0) in memory.
@ -708,17 +736,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
  const bool dirty = reg.IsDirty();
  RegType type = reg.GetType();

+  // If FlushRegister calls GetReg with all registers locked, we can get infinite recursion
+  const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG;
+
  // If we're in single mode, just convert it back to a double.
  if (type == RegType::Single)
  {
    if (dirty)
-      m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+      m_jit->ConvertSingleToDoublePair(preg, host_reg, host_reg, tmp_reg);
    type = RegType::Register;
  }
  if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
  {
    if (dirty)
-      m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+      m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg);

    if (type == RegType::DuplicatedSingle)
      type = RegType::Duplicated;
@ -770,6 +801,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
      reg.Flush();
    }
  }
+
+  if (tmp_reg != ARM64Reg::INVALID_REG)
+    UnlockRegister(tmp_reg);
 }

 void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
@ -806,7 +840,7 @@ void Arm64FPRCache::FixSinglePrecision(size_t preg)
    m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
    reg.Load(host_reg, RegType::DuplicatedSingle);
    break;
-  case RegType::Register:  // PS0 and PS1 needs to be converted
+  case RegType::Register:  // PS0 and PS1 need to be converted
    m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
    reg.Load(host_reg, RegType::Single);
    break;
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
@ -15,6 +15,8 @@
 #include "Core/PowerPC/PPCAnalyst.h"
 #include "Core/PowerPC/PowerPC.h"

+class JitArm64;
+
 // Dedicated host registers

 // memory base register
@ -150,7 +152,7 @@ public:
  explicit Arm64RegCache(size_t guest_reg_count) : m_guest_registers(guest_reg_count) {}
  virtual ~Arm64RegCache() = default;

-  void Init(Arm64Gen::ARM64XEmitter* emitter);
+  void Init(JitArm64* jit);

  virtual void Start(PPCAnalyst::BlockRegStats& stats) {}
  void DiscardRegisters(BitSet32 regs);
@ -166,6 +168,9 @@ public:

  void UpdateLastUsed(BitSet32 regs_used);

+  // Get available host registers
+  u32 GetUnlockedRegisterCount() const;
+
  // Locks a register so a cache cannot use it
  // Useful for function calls
  template <typename T = Arm64Gen::ARM64Reg, typename... Args>
@ -209,15 +214,14 @@ protected:
  void DiscardRegister(size_t preg);
  virtual void FlushRegister(size_t preg, bool maintain_state) = 0;

-  // Get available host registers
-  u32 GetUnlockedRegisterCount() const;
-
  void IncrementAllUsed()
  {
    for (auto& reg : m_guest_registers)
      reg.IncrementLastUsed();
  }

+  JitArm64* m_jit = nullptr;
+
  // Code emitter
  Arm64Gen::ARM64XEmitter* m_emit = nullptr;

--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@ -194,6 +194,85 @@ void JitArm64::GenerateAsm()
 }

 void JitArm64::GenerateCommonAsm()
+{
+  GetAsmRoutines()->cdts = GetCodePtr();  // start of the double-to-single routine
+  GenerateConvertDoubleToSingle();
+  JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts");
+
+  GetAsmRoutines()->cstd = GetCodePtr();  // start of the single-to-double routine
+  GenerateConvertSingleToDouble();
+  JitRegister::Register(GetAsmRoutines()->cstd, GetCodePtr(), "JIT_cstd");  // range starts at cstd
+
+  GenerateQuantizedLoadStores();
+}
+
+// Input (double bits) in X0, output (single bits) in W1, clobbers X0-X3 and flags.
+void JitArm64::GenerateConvertDoubleToSingle()
+{
+  UBFX(ARM64Reg::X2, ARM64Reg::X0, 52, 11);  // X2 = input bits 52-62 (biased exponent)
+  SUB(ARM64Reg::W3, ARM64Reg::W2, 874);      // W3 = exponent - 874
+  CMP(ARM64Reg::W3, 896 - 874);
+  LSR(ARM64Reg::X1, ARM64Reg::X0, 32);       // X1 = upper 32 bits of the input
+  FixupBranch denormal = B(CCFlags::CC_LS);  // unsigned <=: taken when 874 <= exponent <= 896
+
+  ANDI2R(ARM64Reg::X1, ARM64Reg::X1, 0xc0000000);  // keep bits 30-31 of the upper word
+  BFXIL(ARM64Reg::X1, ARM64Reg::X0, 29, 30);       // low 30 bits = input bits 29-58
+  RET();
+
+  SetJumpTarget(denormal);
+  LSR(ARM64Reg::X3, ARM64Reg::X0, 21);             // X3 = input >> 21
+  MOVZ(ARM64Reg::X0, 905);                         // X0 = 905; the input is not read again
+  ORRI2R(ARM64Reg::W3, ARM64Reg::W3, 0x80000000);  // force bit 31 on
+  SUB(ARM64Reg::W2, ARM64Reg::W0, ARM64Reg::W2);   // W2 = 905 - exponent
+  LSRV(ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W2);  // W2 = W3 >> W2
+  ANDI2R(ARM64Reg::X3, ARM64Reg::X1, 0x80000000);  // X3 = bit 31 of the upper word (sign)
+  ORR(ARM64Reg::X1, ARM64Reg::X3, ARM64Reg::X2);   // result = sign | shifted value
+  RET();
+}
+
+// Input (single bits) in W0, output (double bits) in X0, clobbers X0-X4 and flags.
+void JitArm64::GenerateConvertSingleToDouble()
+{
+  UBFX(ARM64Reg::W1, ARM64Reg::W0, 23, 8);  // W1 = input bits 23-30 (biased exponent)
+  FixupBranch normal_or_nan = CBNZ(ARM64Reg::W1);
+
+  ANDI2R(ARM64Reg::W1, ARM64Reg::W0, 0x007fffff);  // W1 = input bits 0-22 (fraction)
+  FixupBranch denormal = CBNZ(ARM64Reg::W1);
+
+  // Zero
+  LSL(ARM64Reg::X0, ARM64Reg::X0, 32);  // only bit 31 can be set here; move it to bit 63
+  RET();
+
+  SetJumpTarget(denormal);
+  ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x80000000);  // W2 = sign bit
+  CLZ(ARM64Reg::X3, ARM64Reg::X1);                 // X3 = leading zeros of the 64-bit fraction
+  LSL(ARM64Reg::X2, ARM64Reg::X2, 32);             // sign bit moved to bit 63
+  ORRI2R(ARM64Reg::X4, ARM64Reg::X3, 0xffffffffffffffc0);  // X4 = X3 - 64 (X3 < 64 as X1 != 0)
+  SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52));
+  ADD(ARM64Reg::X3, ARM64Reg::X4, 23);             // X3 = leading zeros - 41
+  LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3);  // shift the fraction left by X3
+  BFI(ARM64Reg::X2, ARM64Reg::X1, 30, 22);         // low 22 bits of X1 -> bits 30-51 of X2
+  MOVI2R(ARM64Reg::X1, 0x3a90000000000000);
+  ADD(ARM64Reg::X0, ARM64Reg::X2, ARM64Reg::X1);   // add the constant to the exponent field
+  RET();
+
+  SetJumpTarget(normal_or_nan);
+  CMP(ARM64Reg::W1, 0xff);
+  ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x40000000);  // W2 = bit 30 (top exponent bit)
+  CSET(ARM64Reg::W4, CCFlags::CC_NEQ);             // W4 = 1 when the exponent is not 0xff
+  ANDI2R(ARM64Reg::W3, ARM64Reg::W0, 0xc0000000);  // W3 = bits 30-31
+  EOR(ARM64Reg::W2, ARM64Reg::W4, ARM64Reg::W2, ArithOption(ARM64Reg::W2, ShiftType::LSR, 30));
+  MOVI2R(ARM64Reg::X1, 0x3800000000000000);
+  ANDI2R(ARM64Reg::W4, ARM64Reg::W0, 0x3fffffff);  // W4 = input bits 0-29
+  LSL(ARM64Reg::X3, ARM64Reg::X3, 32);             // bits 30-31 moved to bits 62-63
+  CMP(ARM64Reg::W2, 0);
+  CSEL(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::ZR, CCFlags::CC_NEQ);  // keep constant if W2 != 0
+  BFI(ARM64Reg::X3, ARM64Reg::X4, 29, 30);         // low 30 bits of X4 -> bits 29-58 of X3
+  ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X1);
+  RET();
+}
+
+void JitArm64::GenerateQuantizedLoadStores()
 {
  // X0 is the scale
  // X1 is address
@ -654,6 +733,4 @@ void JitArm64::GenerateCommonAsm()
  paired_store_quantized[29] = storeSingleU16Slow;
  paired_store_quantized[30] = storeSingleS8Slow;
  paired_store_quantized[31] = storeSingleS16Slow;
-
-  GetAsmRoutines()->mfcr = nullptr;
 }
--- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
+++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
@ -16,14 +16,11 @@ struct BackPatchInfo
    FLAG_SIZE_32 = (1 << 4),
    FLAG_SIZE_F32 = (1 << 5),
    FLAG_SIZE_F32X2 = (1 << 6),
-    FLAG_SIZE_F32X2I = (1 << 7),
-    FLAG_SIZE_F64 = (1 << 8),
-    FLAG_REVERSE = (1 << 9),
-    FLAG_EXTEND = (1 << 10),
-    FLAG_SIZE_F32I = (1 << 11),
-    FLAG_ZERO_256 = (1 << 12),
-    FLAG_MASK_FLOAT =
-        FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F32X2I | FLAG_SIZE_F64 | FLAG_SIZE_F32I,
+    FLAG_SIZE_F64 = (1 << 7),
+    FLAG_REVERSE = (1 << 8),
+    FLAG_EXTEND = (1 << 9),
+    FLAG_ZERO_256 = (1 << 10),
+    FLAG_MASK_FLOAT = FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F64,
  };

  static u32 GetFlagSize(u32 flags)
@ -34,8 +31,10 @@ struct BackPatchInfo
      return 16;
    if (flags & FLAG_SIZE_32)
      return 32;
-    if (flags & FLAG_SIZE_F32 || flags & FLAG_SIZE_F32I)
+    if (flags & FLAG_SIZE_F32)
      return 32;
+    if (flags & FLAG_SIZE_F32X2)
+      return 64;
    if (flags & FLAG_SIZE_F64)
      return 64;
    if (flags & FLAG_ZERO_256)
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@ -26,6 +26,7 @@ struct CommonAsmRoutinesBase
  const u8* fres;
  const u8* mfcr;
  const u8* cdts;
+  const u8* cstd;

  // In: array index: GQR to use.
  // In: ECX: Address to read from.
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@ -8,6 +8,7 @@
 #include <map>
 #include <unordered_set>

+#include "Common/BitSet.h"
 #include "Common/CommonTypes.h"
 #include "Common/x64Emitter.h"
 #include "Core/ConfigManager.h"
@ -98,6 +99,7 @@ protected:
    PPCAnalyst::BlockRegStats gpa;
    PPCAnalyst::BlockRegStats fpa;
    PPCAnalyst::CodeOp* op;
+    BitSet32 fpr_is_store_safe;

    JitBlock* curBlock;

--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -976,7 +976,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:

    op.fprIsSingle = fprIsSingle;
    op.fprIsDuplicated = fprIsDuplicated;
-    op.fprIsStoreSafe = fprIsStoreSafe;
+    op.fprIsStoreSafeBeforeInst = fprIsStoreSafe;
    if (op.fregOut >= 0)
    {
      if (op.opinfo->type == OpType::SingleFP)
@ -1036,6 +1036,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
            (op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS);
      }
    }
+    op.fprIsStoreSafeAfterInst = fprIsStoreSafe;

    if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS)
    {
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -66,7 +66,8 @@ struct CodeOp  // 16B
  // convert between single and double formats by just using the host machine's instruction for it.
  // (The reason why we can't always do this is because some games rely on the exact bits of
  // denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.)
-  BitSet32 fprIsStoreSafe;
+  BitSet32 fprIsStoreSafeBeforeInst;
+  BitSet32 fprIsStoreSafeAfterInst;

  BitSet32 GetFregsOut() const
  {
--- a/Source/Core/DolphinLib.ARM64.props
+++ b/Source/Core/DolphinLib.ARM64.props
@ -13,7 +13,7 @@
  <ItemGroup>
    <ClCompile Include="Common\Arm64Emitter.cpp" />
    <ClCompile Include="Common\ArmCPUDetect.cpp" />
-    <ClCompile Include="Common\GenericFPURoundMode.cpp" />
+    <ClCompile Include="Common\ArmFPURoundMode.cpp" />
    <ClCompile Include="Core\PowerPC\JitArm64\Jit_Util.cpp" />
    <ClCompile Include="Core\PowerPC\JitArm64\Jit.cpp" />
    <ClCompile Include="Core\PowerPC\JitArm64\JitArm64_BackPatch.cpp" />
--- a/Source/UnitTests/Core/CMakeLists.txt
+++ b/Source/UnitTests/Core/CMakeLists.txt
@ -21,6 +21,7 @@ if(_M_X86)
  )
 elseif(_M_ARM_64)
  add_dolphin_test(PowerPCTest
+    PowerPC/JitArm64/ConvertSingleDouble.cpp
    PowerPC/JitArm64/MovI2R.cpp
  )
 endif()
--- a/Source/UnitTests/Core/PowerPC/JitArm64/ConvertSingleDouble.cpp
+++ b/Source/UnitTests/Core/PowerPC/JitArm64/ConvertSingleDouble.cpp
@ -0,0 +1,273 @@
+// Copyright 2021 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <functional>
+#include <vector>
+
+#include "Common/Arm64Emitter.h"
+#include "Common/BitUtils.h"
+#include "Common/CommonTypes.h"
+#include "Common/FPURoundMode.h"
+#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
+#include "Core/PowerPC/JitArm64/Jit.h"
+
+#include <fmt/format.h>
+#include <gtest/gtest.h>
+
+namespace
+{
+using namespace Arm64Gen;
+
+// The ABI situation for returning a std::tuple seems annoying. Let's use this struct instead.
+template <typename T>
+struct Pair
+{
+  T value1;  // conversion result for the first input
+  T value2;  // conversion result for the second input
+};
+
+class TestConversion : private JitArm64
+{
+public:
+  TestConversion()
+  {
+    AllocCodeSpace(4096);
+    AddChildCodeSpace(&farcode, 2048);
+
+    gpr.Init(this);
+    fpr.Init(this);
+
+    js.fpr_is_store_safe = BitSet32(0);  // no guest FPR is flagged as store-safe
+
+    GetAsmRoutines()->cdts = GetCodePtr();  // emit the shared conversion routines first
+    GenerateConvertDoubleToSingle();
+    GetAsmRoutines()->cstd = GetCodePtr();
+    GenerateConvertSingleToDouble();
+
+    gpr.Lock(ARM64Reg::W30);  // held until all four stubs below are emitted
+    fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
+
+    convert_single_to_double_lower = Common::BitCast<u64 (*)(u32)>(GetCodePtr());
+    m_float_emit.INS(32, ARM64Reg::S0, 0, ARM64Reg::W0);  // argument W0 -> lane 0
+    ConvertSingleToDoubleLower(0, ARM64Reg::D0, ARM64Reg::S0, ARM64Reg::Q1);
+    m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);  // lane 0 -> X0
+    RET();
+
+    convert_single_to_double_pair = Common::BitCast<Pair<u64> (*)(u32, u32)>(GetCodePtr());
+    m_float_emit.INS(32, ARM64Reg::D0, 0, ARM64Reg::W0);  // first argument -> lane 0
+    m_float_emit.INS(32, ARM64Reg::D0, 1, ARM64Reg::W1);  // second argument -> lane 1
+    ConvertSingleToDoublePair(0, ARM64Reg::Q0, ARM64Reg::D0, ARM64Reg::Q1);
+    m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::Q0, 0);  // lane 0 -> X0
+    m_float_emit.UMOV(64, ARM64Reg::X1, ARM64Reg::Q0, 1);  // lane 1 -> X1
+    RET();
+
+    convert_double_to_single_lower = Common::BitCast<u32 (*)(u64)>(GetCodePtr());
+    m_float_emit.INS(64, ARM64Reg::D0, 0, ARM64Reg::X0);  // argument X0 -> lane 0
+    ConvertDoubleToSingleLower(0, ARM64Reg::S0, ARM64Reg::D0);
+    m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::S0, 0);  // lane 0 -> W0
+    RET();
+
+    convert_double_to_single_pair = Common::BitCast<Pair<u32> (*)(u64, u64)>(GetCodePtr());
+    m_float_emit.INS(64, ARM64Reg::Q0, 0, ARM64Reg::X0);  // first argument -> lane 0
+    m_float_emit.INS(64, ARM64Reg::Q0, 1, ARM64Reg::X1);  // second argument -> lane 1
+    ConvertDoubleToSinglePair(0, ARM64Reg::D0, ARM64Reg::Q0);
+    m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);  // 64-bit lane 0 (both singles) -> X0
+    RET();
+
+    gpr.Unlock(ARM64Reg::W30);
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
+
+    FlushIcache();
+
+    // Set the rounding mode to something that's as annoying as possible to handle
+    // (flush-to-zero enabled, and rounding not symmetric about the origin)
+    FPURoundMode::SetSIMDMode(FPURoundMode::RoundMode::ROUND_UP, true);
+  }
+
+  ~TestConversion() override
+  {
+    FPURoundMode::LoadDefaultSIMDState();  // undo the SetSIMDMode call from the constructor
+
+    FreeCodeSpace();
+  }
+
+  u64 ConvertSingleToDouble(u32 value) { return convert_single_to_double_lower(value); }
+
+  Pair<u64> ConvertSingleToDouble(u32 value1, u32 value2)
+  {
+    return convert_single_to_double_pair(value1, value2);
+  }
+
+  u32 ConvertDoubleToSingle(u64 value) { return convert_double_to_single_lower(value); }
+
+  Pair<u32> ConvertDoubleToSingle(u64 value1, u64 value2)
+  {
+    return convert_double_to_single_pair(value1, value2);
+  }
+
+private:
+  std::function<u64(u32)> convert_single_to_double_lower;  // stubs emitted by the constructor
+  std::function<Pair<u64>(u32, u32)> convert_single_to_double_pair;
+  std::function<u32(u64)> convert_double_to_single_lower;
+  std::function<Pair<u32>(u64, u64)> convert_double_to_single_pair;
+};
+
+}  // namespace
+
+TEST(JitArm64, ConvertDoubleToSingle)
+{
+  TestConversion test;
+
+  const std::vector<u64> input_values{
+      // Special values
+      0x0000'0000'0000'0000,  // positive zero
+      0x0000'0000'0000'0001,  // smallest positive denormal
+      0x0000'0000'0100'0000,
+      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
+      0x0010'0000'0000'0000,  // smallest positive normal
+      0x0010'0000'0000'0002,
+      0x3FF0'0000'0000'0000,  // 1.0
+      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
+      0x7FF0'0000'0000'0000,  // positive infinity
+      0x7FF0'0000'0000'0001,  // first positive SNaN
+      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
+      0x7FF8'0000'0000'0000,  // first positive QNaN
+      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
+      0x8000'0000'0000'0000,  // negative zero
+      0x8000'0000'0000'0001,  // smallest negative denormal
+      0x8000'0000'0100'0000,
+      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
+      0x8010'0000'0000'0000,  // smallest negative normal
+      0x8010'0000'0000'0002,
+      0xBFF0'0000'0000'0000,  // -1.0
+      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
+      0xFFF0'0000'0000'0000,  // negative infinity
+      0xFFF0'0000'0000'0001,  // first negative SNaN
+      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
+      0xFFF8'0000'0000'0000,  // first negative QNaN
+      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
+
+      // (exp > 896) Boundary Case
+      0x3800'0000'0000'0000,  // 2^(-127) = Denormal in single-prec
+      0x3810'0000'0000'0000,  // 2^(-126) = Smallest single-prec normal
+      0xB800'0000'0000'0000,  // -2^(-127) = Denormal in single-prec
+      0xB810'0000'0000'0000,  // -2^(-126) = Smallest single-prec normal
+      0x3800'1234'5678'9ABC, 0x3810'1234'5678'9ABC, 0xB800'1234'5678'9ABC, 0xB810'1234'5678'9ABC,
+
+      // (exp >= 874) Boundary Case
+      0x3680'0000'0000'0000,  // 2^(-151) = Unrepresentable in single-prec
+      0x36A0'0000'0000'0000,  // 2^(-149) = Smallest single-prec denormal
+      0x36B0'0000'0000'0000,  // 2^(-148) = Single-prec denormal
+      0xB680'0000'0000'0000,  // -2^(-151) = Unrepresentable in single-prec
+      0xB6A0'0000'0000'0000,  // -2^(-149) = Smallest single-prec denormal
+      0xB6B0'0000'0000'0000,  // -2^(-148) = Single-prec denormal
+      0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC,
+      0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC,
+
+      // Some typical numbers
+      0x3FF8'0000'0000'0000,  // 1.5
+      0x408F'4000'0000'0000,  // 1000
+      0xC008'0000'0000'0000,  // -3
+  };
+
+  for (const u64 input : input_values)  // scalar stub against the ConvertToSingle reference
+  {
+    const u32 expected = ConvertToSingle(input);
+    const u32 actual = test.ConvertDoubleToSingle(input);
+
+    if (expected != actual)
+      fmt::print("{:016x} -> {:08x} == {:08x}\n", input, actual, expected);
+
+    EXPECT_EQ(expected, actual);
+  }
+
+  for (const u64 input1 : input_values)  // paired stub, every combination of two inputs
+  {
+    for (const u64 input2 : input_values)
+    {
+      const u32 expected1 = ConvertToSingle(input1);
+      const u32 expected2 = ConvertToSingle(input2);
+      const auto [actual1, actual2] = test.ConvertDoubleToSingle(input1, input2);
+
+      if (expected1 != actual1 || expected2 != actual2)
+      {
+        fmt::print("{:016x} -> {:08x} == {:08x},\n", input1, actual1, expected1);
+        fmt::print("{:016x} -> {:08x} == {:08x}\n", input2, actual2, expected2);
+      }
+
+      EXPECT_EQ(expected1, actual1);
+      EXPECT_EQ(expected2, actual2);
+    }
+  }
+}
+
+TEST(JitArm64, ConvertSingleToDouble)
+{
+  TestConversion test;
+
+  const std::vector<u32> input_values{
+      // Special values
+      0x0000'0000,  // positive zero
+      0x0000'0001,  // smallest positive denormal
+      0x0000'1000,
+      0x007F'FFFF,  // largest positive denormal
+      0x0080'0000,  // smallest positive normal
+      0x0080'0002,
+      0x3F80'0000,  // 1.0
+      0x7F7F'FFFF,  // largest positive normal
+      0x7F80'0000,  // positive infinity
+      0x7F80'0001,  // first positive SNaN
+      0x7FBF'FFFF,  // last positive SNaN
+      0x7FC0'0000,  // first positive QNaN
+      0x7FFF'FFFF,  // last positive QNaN
+      0x8000'0000,  // negative zero
+      0x8000'0001,  // smallest negative denormal
+      0x8000'1000,
+      0x807F'FFFF,  // largest negative denormal
+      0x8080'0000,  // smallest negative normal
+      0x8080'0002,
+      0xBF80'0000,  // -1.0
+      0xFF7F'FFFF,  // largest negative normal
+      0xFF80'0000,  // negative infinity
+      0xFF80'0001,  // first negative SNaN
+      0xFFBF'FFFF,  // last negative SNaN
+      0xFFC0'0000,  // first negative QNaN
+      0xFFFF'FFFF,  // last negative QNaN
+
+      // Some typical numbers
+      0x3FC0'0000,  // 1.5
+      0x447A'0000,  // 1000
+      0xC040'0000,  // -3
+  };
+
+  for (const u32 input : input_values)  // scalar stub against the ConvertToDouble reference
+  {
+    const u64 expected = ConvertToDouble(input);
+    const u64 actual = test.ConvertSingleToDouble(input);
+
+    if (expected != actual)
+      fmt::print("{:08x} -> {:016x} == {:016x}\n", input, actual, expected);
+
+    EXPECT_EQ(expected, actual);
+  }
+
+  for (const u32 input1 : input_values)  // paired stub, every combination of two inputs
+  {
+    for (const u32 input2 : input_values)
+    {
+      const u64 expected1 = ConvertToDouble(input1);
+      const u64 expected2 = ConvertToDouble(input2);
+      const auto [actual1, actual2] = test.ConvertSingleToDouble(input1, input2);
+
+      if (expected1 != actual1 || expected2 != actual2)
+      {
+        fmt::print("{:08x} -> {:016x} == {:016x},\n", input1, actual1, expected1);
+        fmt::print("{:08x} -> {:016x} == {:016x}\n", input2, actual2, expected2);
+      }
+
+      EXPECT_EQ(expected1, actual1);
+      EXPECT_EQ(expected2, actual2);
+    }
+  }
+}
--- a/Source/UnitTests/UnitTests.vcxproj
+++ b/Source/UnitTests/UnitTests.vcxproj
@ -81,6 +81,7 @@
    <ClCompile Include="Core\PowerPC\Jit64Common\Frsqrte.cpp" />
  </ItemGroup>
  <ItemGroup Condition="'$(Platform)'=='ARM64'">
+    <ClCompile Include="Core\PowerPC\JitArm64\ConvertSingleDouble.cpp" />
    <ClCompile Include="Core\PowerPC\JitArm64\MovI2R.cpp" />
  </ItemGroup>
  <ItemGroup>