From 2c3fa8da288878f0664499997cd503de06b71e80 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Fri, 28 Aug 2015 14:36:14 -0500
Subject: [PATCH 1/2] [AArch64] Fix a bug in the register caches.

This fixes a bug that crops up if BindToRegister() is called multiple
times in a row without an R() call between them.

How to reproduce the bug:
1) Have a completely filled cache with no host register remaining
2) Call BindToRegister() with different guest registers
3) Don't call R() between the BindToRegister() calls

This issue typically wouldn't be seen, for a couple of reasons. We
usually have /plenty/ of registers in the cache, and in most cases we
only call BindToRegister() once per instruction. On the off chance that
it was called multiple times, it wouldn't update the last-used counts,
so it would flush the same host register as the previous call did.
---
 Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
index 27b021cf8a..c58a3cee46 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@@ -206,6 +206,8 @@ void Arm64GPRCache::BindToRegister(u32 preg, bool do_load)
 {
 	OpArg& reg = m_guest_registers[preg];
 
+	reg.ResetLastUsed();
+
 	reg.SetDirty(true);
 	if (reg.GetType() == REG_NOTLOADED)
 	{
@@ -331,6 +333,9 @@ void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, bool only_lower)
 	OpArg& reg = m_guest_registers[preg];
 
 	bool was_dirty = reg.IsDirty();
+
+	reg.ResetLastUsed();
+
 	reg.SetDirty(true);
 	switch (reg.GetType())
 	{

From 8d617064403cd4df2aeaa639eca6595aac289bb8 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Fri, 28 Aug 2015 14:40:30 -0500
Subject: [PATCH 2/2] [AArch64] Optimize lmw.

This instruction is fairly heavily used by Ikaruga to load a bunch of
registers from the stack. In particular, at the start of the second
stage there is a block that takes up ~20% of CPU time and includes an
lmw that loads half of the guest registers.

The basic optimization here is replacing one 32-bit LDR per guest
register with, at best, a single load that fills 128 bits (a 64-bit
LDP). A lone 32-bit LDR is fairly slow, so we optimize in a few ways:
If we have four or more registers to load, do a 64-bit LDP into two
host registers, byteswap each, and then move the high 32 bits of each
host register into the correctly mapped guest register. If we have two
registers to load, do a 32-bit LDP, which loads two guest registers in
a single instruction. If only one register is left, load it as before.

This saves quite a few cycles, since the LDR instruction on the
Cortex-A57 and A72 is not cheap: each 32-bit LDR takes 4 cycles of
latency, plus 1 cycle for the post-index write-back (which typically
happens in parallel). The 32-bit and 64-bit LDP both have the same
latency, so we are improving latency and reducing code bloat here.
---
 .../PowerPC/JitArm64/JitArm64_LoadStore.cpp   | 43 ++++++++++++++++---
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 0312a55d93..6894498f92 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -543,15 +543,46 @@ void JitArm64::lmw(UGeckoInstruction inst)
 		MOVI2R(WA, (u32)(s32)(s16)inst.SIMM_16);
 	}
 
-	u8* base = UReg_MSR(MSR).DR ? Memory::logical_base : Memory::physical_base;
-	MOVK(XA, ((u64)base >> 32) & 0xFFFF, SHIFT_32);
+	ADD(XA, XA, X28);
 
 	for (int i = inst.RD; i < 32; i++)
 	{
-		gpr.BindToRegister(i, false);
-		ARM64Reg RX = gpr.R(i);
-		LDR(INDEX_UNSIGNED, RX, XA, (i - inst.RD) * 4);
-		REV32(RX, RX);
+		int remaining = 32 - i;
+		if (remaining >= 4)
+		{
+			gpr.BindToRegister(i + 3, false);
+			gpr.BindToRegister(i + 2, false);
+			gpr.BindToRegister(i + 1, false);
+			gpr.BindToRegister(i, false);
+			ARM64Reg RX4 = gpr.R(i + 3);
+			ARM64Reg RX3 = gpr.R(i + 2);
+			ARM64Reg RX2 = gpr.R(i + 1);
+			ARM64Reg RX1 = gpr.R(i);
+			LDP(INDEX_POST, EncodeRegTo64(RX1), EncodeRegTo64(RX3), XA, 16);
+			REV32(EncodeRegTo64(RX1), EncodeRegTo64(RX1));
+			REV32(EncodeRegTo64(RX3), EncodeRegTo64(RX3));
+			ORR(EncodeRegTo64(RX2), ZR, EncodeRegTo64(RX1), ArithOption(EncodeRegTo64(RX1), ST_LSR, 32));
+			ORR(EncodeRegTo64(RX4), ZR, EncodeRegTo64(RX3), ArithOption(EncodeRegTo64(RX3), ST_LSR, 32));
+			i += 3;
+		}
+		else if (remaining >= 2)
+		{
+			gpr.BindToRegister(i + 1, false);
+			gpr.BindToRegister(i, false);
+			ARM64Reg RX2 = gpr.R(i + 1);
+			ARM64Reg RX1 = gpr.R(i);
+			LDP(INDEX_POST, RX1, RX2, XA, 8);
+			REV32(RX1, RX1);
+			REV32(RX2, RX2);
+			++i;
+		}
+		else
+		{
+			gpr.BindToRegister(i, false);
+			ARM64Reg RX = gpr.R(i);
+			LDR(INDEX_POST, RX, XA, 4);
+			REV32(RX, RX);
+		}
 	}
 
 	gpr.Unlock(WA);
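
As a sanity check on the four-register fast path, here is a minimal
scalar sketch, not part of the patch, that models on a little-endian
host what the emitted LDP/REV32/ORR+LSR sequence computes. The names
rev32 and guest_mem are illustrative only, and __builtin_bswap32
assumes a GCC/Clang host compiler.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Models AArch64 REV32 (Xd, Xn): reverse the bytes within each 32-bit half.
static uint64_t rev32(uint64_t x)
{
	uint64_t lo = __builtin_bswap32(static_cast<uint32_t>(x));
	uint64_t hi = __builtin_bswap32(static_cast<uint32_t>(x >> 32));
	return (hi << 32) | lo;
}

int main()
{
	// Four big-endian guest words (1, 2, 3, 4) as they sit in PowerPC memory.
	const uint8_t guest_mem[16] = {0, 0, 0, 1, 0, 0, 0, 2,
	                               0, 0, 0, 3, 0, 0, 0, 4};

	// LDP: a single 128-bit load fills two 64-bit "host registers".
	uint64_t rx1, rx3;
	std::memcpy(&rx1, guest_mem + 0, 8);
	std::memcpy(&rx3, guest_mem + 8, 8);

	// REV32 byteswaps both 32-bit halves in place: the low half is now the
	// first guest word, and the shift by 32 models the ORR+LSR that moves
	// the high half down into the next guest register.
	rx1 = rev32(rx1);
	rx3 = rev32(rx3);
	uint32_t r1 = static_cast<uint32_t>(rx1);        // guest rD+0
	uint32_t r2 = static_cast<uint32_t>(rx1 >> 32);  // guest rD+1
	uint32_t r3 = static_cast<uint32_t>(rx3);        // guest rD+2
	uint32_t r4 = static_cast<uint32_t>(rx3 >> 32);  // guest rD+3

	std::printf("%u %u %u %u\n", r1, r2, r3, r4);    // prints: 1 2 3 4
	return 0;
}

The ORR with a logical-shift-right of 32 in the patch is just a
register-to-register move of the upper half, which is why each
byteswapped host register yields two correctly ordered guest registers.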