[AArch64] Implement Fiora's preemptive paired loadstore optimization.

This provides a decent speedup in pretty much everything that uses paired loadstores, because in most cases they turn out to be just regular non-quantizing float loadstores.
2025-07-23 22:29:39 -06:00 · 2015-09-01 16:22:44 -05:00
parent e01428935f
commit 2c68f6bfc5
4 changed files with 131 additions and 48 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@ -7,6 +7,7 @@
 #include "Common/PerformanceCounter.h"

 #include "Core/PatchEngine.h"
+#include "Core/PowerPC/JitInterface.h"
 #include "Core/PowerPC/Profiler.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
@ -351,6 +352,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB

 	js.isLastInstruction = false;
 	js.firstFPInstructionFound = false;
+	js.assumeNoPairedQuantize = false;
 	js.blockStart = em_address;
 	js.fifoBytesThisBlock = 0;
 	js.downcountAmount = 0;
@ -396,6 +398,30 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB
 		// get start tic
 		BeginTimeProfile(b);
 	}
+
+	if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
+	{
+		int gqr = *code_block.m_gqr_used.begin();
+		if (!code_block.m_gqr_modified[gqr] && !GQR(gqr)) // block never modifies this GQR and GQR(gqr) is zero at compile time
+		{
+			LDR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(spr[SPR_GQR0]) + gqr * 4); // runtime re-check of the same GQR
+			FixupBranch no_fail = CBZ(W0); // LDR does not set flags, so branch on the loaded value itself
+			FixupBranch fail = B();
+			SwitchToFarCode();
+				SetJumpTarget(fail);
+				MOVI2R(W0, js.blockStart);
+				STR(INDEX_UNSIGNED, W0, X29, PPCSTATE_OFF(pc)); // pc = start of this block
+				MOVI2R(W0, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
+				MOVI2R(X1, (u64)&JitInterface::CompileExceptionCheck);
+				BLR(X1);
+				MOVI2R(X1, (u64)asm_routines.dispatcher);
+				BR(X1); // leave the block through the dispatcher
+			SwitchToNearCode();
+			SetJumpTarget(no_fail);
+			js.assumeNoPairedQuantize = true; // psq_l/psq_st check this flag to pick the non-quantizing path
+		}
+	}
+
 	const u8 *normalEntry = GetCodePtr();
 	b->normalEntry = normalEntry;

--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@ -51,7 +51,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 	{

 		if (flags & BackPatchInfo::FLAG_STORE &&
-		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
+		    flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@ -64,6 +64,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				m_float_emit.REV32(8, D0, RS);
 				m_float_emit.STR(32, D0, X28, addr);
 			}
+			else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
+			{
+				m_float_emit.FCVTN(32, D0, RS);
+				m_float_emit.REV32(8, D0, D0);
+				m_float_emit.STR(64, Q0, X28, addr);
+			}
 			else
 			{
 				m_float_emit.REV64(8, Q0, RS);
@ -71,7 +77,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			}
 		}
 		else if (flags & BackPatchInfo::FLAG_LOAD &&
-		         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		         flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@ -166,7 +172,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 		m_float_emit.ABI_PushRegisters(fprs_to_push, X30);

 		if (flags & BackPatchInfo::FLAG_STORE &&
-		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64 | BackPatchInfo::FLAG_SIZE_F32I))
+		    flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
@ -181,6 +187,14 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				MOVI2R(X30, (u64)&PowerPC::Write_U32);
 				BLR(X30);
 			}
+			else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
+			{
+				m_float_emit.FCVTN(32, D0, RS);
+				m_float_emit.UMOV(64, X0, D0, 0);
+				ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
+				MOVI2R(X30, (u64)PowerPC::Write_U64);
+				BLR(X30);
+			}
 			else
 			{
 				MOVI2R(X30, (u64)&PowerPC::Write_U64);
@ -190,7 +204,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,

 		}
 		else if (flags & BackPatchInfo::FLAG_LOAD &&
-			   flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+			   flags & BackPatchInfo::FLAG_MASK_FLOAT)
 		{
 			if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@ -38,8 +38,7 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 	ARM64Reg scale_reg = W0;
 	ARM64Reg addr_reg = W1;
 	ARM64Reg type_reg = W2;
-
-	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+	ARM64Reg VS;

 	if (inst.RA || update) // Always uses the register on update
 	{
@ -53,21 +52,43 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 		MOVI2R(addr_reg, (u32)offset);
 	}

-	UBFM(type_reg, scale_reg, 16, 18); // Type
-	UBFM(scale_reg, scale_reg, 24, 29); // Scale
-
 	if (update)
 	{
 		gpr.BindToRegister(inst.RA, REG_REG);
 		MOV(arm_addr, addr_reg);
 	}

-	MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
-	LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
-	BLR(X30);
+	if (js.assumeNoPairedQuantize)
+	{
+		VS = fpr.RW(inst.RS, REG_REG);
+		if (!inst.W)
+		{
+			ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
+			m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
+			m_float_emit.REV32(8, VS, VS);
+		}
+		else
+		{
+			m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
+			m_float_emit.REV32(8, VS, VS);
+
+		}
+		m_float_emit.FCVTL(64, VS, VS);
+	}
+	else
+	{
+		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+		UBFM(type_reg, scale_reg, 16, 18); // Type
+		UBFM(scale_reg, scale_reg, 24, 29); // Scale
+
+		MOVI2R(X30, (u64)&asm_routines.pairedLoadQuantized[inst.W * 8]);
+		LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
+		BLR(X30);
+
+		VS = fpr.RW(inst.RS, REG_REG);
+		m_float_emit.FCVTL(64, VS, D0);
+	}

-	ARM64Reg VS = fpr.RW(inst.RS, REG_REG);
-	m_float_emit.FCVTL(64, VS, D0);
 	if (inst.W)
 	{
 		m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
@ -106,11 +127,9 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 	BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();

 	// Wipe the registers we are using as temporaries
-	gprs_in_use &= BitSet32(~0x40000007);
+	gprs_in_use &= BitSet32(~7);
 	fprs_in_use &= BitSet32(~3);

-	LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
-
 	if (inst.RA || update) // Always uses the register on update
 	{
 		if (offset >= 0)
@ -123,40 +142,59 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 		MOVI2R(addr_reg, (u32)offset);
 	}

-	UBFM(type_reg, scale_reg, 0, 2); // Type
-	UBFM(scale_reg, scale_reg, 8, 13); // Scale
-
 	if (update)
 	{
 		gpr.BindToRegister(inst.RA, REG_REG);
 		MOV(arm_addr, addr_reg);
 	}

-	m_float_emit.FCVTN(32, D0, VS);
-
-	// Inline address check
+	if (js.assumeNoPairedQuantize)
 	{
+		u32 flags = BackPatchInfo::FLAG_STORE;
+		flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
+		EmitBackpatchRoutine(flags,
+			jo.fastmem,
+			jo.fastmem,
+			VS, EncodeRegTo64(addr_reg),
+			gprs_in_use,
+			fprs_in_use);
+	}
+	else
+	{
+		if (inst.W)
+			m_float_emit.FCVT(32, 64, D0, VS);
+		else
+			m_float_emit.FCVTN(32, D0, VS);
+
+		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+		UBFM(type_reg, scale_reg, 0, 2); // Type
+		UBFM(scale_reg, scale_reg, 8, 13); // Scale
+
+		// Inline address check
 		TST(addr_reg, 6, 1);
-		FixupBranch argh = B(CC_NEQ);
+		FixupBranch pass = B(CC_EQ);
+		FixupBranch fail = B();
+
+		SwitchToFarCode();
+			SetJumpTarget(fail);
+			// Slow
+			MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
+			LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
+
+			ABI_PushRegisters(gprs_in_use);
+			m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
+			BLR(EncodeRegTo64(type_reg));
+			m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
+			ABI_PopRegisters(gprs_in_use);
+			FixupBranch continue1 = B();
+		SwitchToNearCode();
+		SetJumpTarget(pass);

 		// Fast
 		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
 		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
 		BLR(EncodeRegTo64(type_reg));

-		FixupBranch continue1 = B();
-		SetJumpTarget(argh);
-
-		// Slow
-		MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
-		LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
-
-		ABI_PushRegisters(gprs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
-		BLR(EncodeRegTo64(type_reg));
-		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
-		ABI_PopRegisters(gprs_in_use);
-
 		SetJumpTarget(continue1);
 	}

--- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
+++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
@ -9,17 +9,22 @@ struct BackPatchInfo
 {
 	enum
 	{
-		FLAG_STORE    = (1 << 0),
-		FLAG_LOAD     = (1 << 1),
-		FLAG_SIZE_8   = (1 << 2),
-		FLAG_SIZE_16  = (1 << 3),
-		FLAG_SIZE_32  = (1 << 4),
-		FLAG_SIZE_F32 = (1 << 5),
-		FLAG_SIZE_F64 = (1 << 6),
-		FLAG_REVERSE  = (1 << 7),
-		FLAG_EXTEND   = (1 << 8),
-		FLAG_SIZE_F32I = (1 << 9),
-		FLAG_ZERO_256  = (1 << 10),
+		FLAG_STORE      = (1 << 0),
+		FLAG_LOAD       = (1 << 1),
+		FLAG_SIZE_8     = (1 << 2),
+		FLAG_SIZE_16    = (1 << 3),
+		FLAG_SIZE_32    = (1 << 4),
+		FLAG_SIZE_F32   = (1 << 5),
+		FLAG_SIZE_F32X2 = (1 << 6),
+		FLAG_SIZE_F64   = (1 << 7),
+		FLAG_REVERSE    = (1 << 8),
+		FLAG_EXTEND     = (1 << 9),
+		FLAG_SIZE_F32I  = (1 << 10),
+		FLAG_ZERO_256   = (1 << 11),
+		FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
+		                  FLAG_SIZE_F32X2 |
+		                  FLAG_SIZE_F64 |
+		                  FLAG_SIZE_F32I,
 	};

 	static u32 GetFlagSize(u32 flags)