From e358696d46bb78c38a93ce3885aa14a190f04e29 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 02:11:25 +0000 Subject: [PATCH 01/12] [ARM32] Removes conditional execution from ARMv7's Jit function. --- Source/Core/Core/PowerPC/JitArm32/Jit.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp index 59ddf184e9..9143885fc3 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp @@ -385,13 +385,13 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo // Downcount flag check, Only valid for linked blocks { - SetCC(CC_MI); + FixupBranch no_downcount = B_CC(CC_PL); ARMReg rA = gpr.GetReg(false); MOVI2R(rA, js.blockStart); STR(rA, R9, PPCSTATE_OFF(pc)); MOVI2R(rA, (u32)asm_routines.doTiming); B(rA); - SetCC(); + SetJumpTarget(no_downcount); } const u8 *normalEntry = GetCodePtr(); @@ -409,7 +409,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo MOVI2R(C, js.blockStart); // R3 LDR(A, R9, PPCSTATE_OFF(msr)); TST(A, Shift); - SetCC(CC_EQ); + FixupBranch no_fpe = B_CC(CC_NEQ); STR(C, R9, PPCSTATE_OFF(pc)); LDR(A, R9, PPCSTATE_OFF(Exceptions)); @@ -422,7 +422,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo MOVI2R(A, (u32)asm_routines.dispatcher); B(A); - SetCC(); + SetJumpTarget(no_fpe); gpr.Unlock(A, C); } From 6c399ce9aef5b2a379bc35289ef6e01b272d5bdd Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 02:20:31 +0000 Subject: [PATCH 02/12] [ARM32] Removes a block of conditional execution in the dispatcher. 
--- Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp index bb19c300c4..9e630810d0 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp @@ -134,7 +134,7 @@ void JitArmAsmRoutineManager::Generate() // R12 Confirmed this is the correct iCache Location loaded. TST(R12, 0x80); // Test to see if it is a JIT block. - SetCC(CC_EQ); + FixupBranch no_block = B_CC(CC_NEQ); // Success, it is our Jitblock. MOVI2R(R14, (u32)jit->GetBlockCache()->GetCodePointers()); // LDR R14 right here to get CodePointers()[0] pointer. @@ -143,7 +143,7 @@ void JitArmAsmRoutineManager::Generate() B(R14); // No need to jump anywhere after here, the block will go back to dispatcher start - SetCC(); + SetJumpTarget(no_block); // If we get to this point, that means that we don't have the block cached to execute // So call ArmJit to compile the block and then execute it. From ec3d6da7b5ffb462d4ad646a9161852d86d58312 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 02:44:17 +0000 Subject: [PATCH 03/12] [ARM32] Remove conditional execution from store instructions. 
--- Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp index 1f50c7fb49..95623d7639 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp @@ -227,7 +227,7 @@ void JitArm::stX(UGeckoInstruction inst) // Check for DSI exception prior to writing back address LDR(rA, R9, PPCSTATE_OFF(Exceptions)); TST(rA, EXCEPTION_DSI); - SetCC(CC_EQ); + FixupBranch has_exception = B_CC(CC_NEQ); if (regOffset == -1) { MOVI2R(rA, offset); @@ -237,7 +237,7 @@ void JitArm::stX(UGeckoInstruction inst) { ADD(RA, RA, RB); } - SetCC(); + SetJumpTarget(has_exception); gpr.Unlock(rA); } } From fca0fd9dd5ee3535a435e8a4fe92033849041466 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 03:32:53 +0000 Subject: [PATCH 04/12] [ARM32] rlwimix and rlwinmx optimizations. 
--- .../Core/PowerPC/JitArm32/JitArm_Integer.cpp | 98 +++++++++++++++++-- 1 file changed, 92 insertions(+), 6 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp index 5c897227b3..5e939edb22 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp @@ -870,13 +870,50 @@ void JitArm::rlwimix(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); u32 mask = Helper_Mask(inst.MB,inst.ME); - ARMReg RA = gpr.R(inst.RA); - ARMReg RS = gpr.R(inst.RS); + int a = inst.RA, s = inst.RS; + if (gpr.IsImm(s) && inst.MB <= inst.ME) + { + u32 imm = _rotl(gpr.GetImm(s), inst.SH) & mask; + imm >>= 31 - inst.ME; + ARMReg rA = gpr.GetReg(); + + MOVI2R(rA, imm); + BFI(gpr.R(a), rA, 31 - inst.ME, inst.ME - inst.MB + 1); + if (inst.Rc) + ComputeRC(gpr.R(a)); + + gpr.Unlock(rA); + return; + } + + ARMReg RA = gpr.R(a); + ARMReg RS = gpr.R(s); + + if (inst.SH == 0 && inst.MB <= inst.ME) + { + if (inst.ME != 31) + { + ARMReg rA = gpr.GetReg(); + LSR(rA, RS, 31 - inst.ME); + BFI(RA, rA, 31 - inst.ME, inst.ME - inst.MB + 1); + gpr.Unlock(rA); + } + else + { + BFI(RA, RS, 0, inst.ME - inst.MB + 1); + } + if (inst.Rc) + ComputeRC(RA); + + return; + } + ARMReg rA = gpr.GetReg(); ARMReg rB = gpr.GetReg(); + Operand2 Shift(RS, ST_ROR, 32 - inst.SH); // This rotates left, while ARM has only rotate right, so swap it. + MOVI2R(rA, mask); - Operand2 Shift(RS, ST_ROR, 32 - inst.SH); // This rotates left, while ARM has only rotate right, so swap it. 
BIC (rB, RA, rA); // RA & ~mask AND (rA, rA, Shift); ORR(RA, rB, rA); @@ -892,13 +929,62 @@ void JitArm::rlwinmx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); u32 mask = Helper_Mask(inst.MB,inst.ME); + if (gpr.IsImm(inst.RS)) + { + gpr.SetImmediate(inst.RA, _rotl(gpr.GetImm(inst.RS), inst.SH) & mask); + if (inst.Rc) + ComputeRC(gpr.GetImm(inst.RA), 0); + return; + } + + gpr.BindToRegister(inst.RA, inst.RA == inst.RS); ARMReg RA = gpr.R(inst.RA); ARMReg RS = gpr.R(inst.RS); ARMReg rA = gpr.GetReg(); - MOVI2R(rA, mask); + bool inverse = false; + bool fit_op = false; + Operand2 op2; + fit_op = TryMakeOperand2_AllowInverse(mask, op2, &inverse); - Operand2 Shift(RS, ST_ROR, 32 - inst.SH); // This rotates left, while ARM has only rotate right, so swap it. - AND(RA, rA, Shift); + if (!inst.SH && fit_op) + { + if (inverse) + BIC(RA, RS, op2); + else + AND(RA, RS, op2); + } + else if (!inst.SH && inst.ME == 31) + { + UBFX(RA, RS, 0, inst.ME - inst.MB + 1); + } + else if (!inst.SH && inst.MB == 0) + { + LSR(RA, RS, 31 - inst.ME); + LSL(RA, RA, 31 - inst.ME); + } + else if (inst.SH == 16 && inst.MB >= 16 && inst.ME == 31) + { + UBFX(RA, RS, 16, 32 - inst.MB); + } + else if (inst.SH == 16 && inst.MB == 0 && inst.ME == 15) + { + LSL(RA, RS, 16); + } + else if (fit_op) + { + Operand2 Shift(RS, ST_ROR, 32 - inst.SH); // This rotates left, while ARM has only rotate right, so swap it. + MOV(RA, Shift); + if (inverse) + BIC(RA, RA, op2); + else + AND(RA, RA, op2); + } + else + { + MOVI2R(rA, mask); + Operand2 Shift(RS, ST_ROR, 32 - inst.SH); // This rotates left, while ARM has only rotate right, so swap it. + AND(RA, rA, Shift); + } if (inst.Rc) ComputeRC(RA); From 1dbb39f791e089d67c2e4464f1b563c0eb889fd6 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 03:55:53 +0000 Subject: [PATCH 05/12] [ARM32] srawix optimization when source is an immediate. 
--- .../Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp index 5e939edb22..a7379ca20b 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp @@ -1025,9 +1025,19 @@ void JitArm::srawix(UGeckoInstruction inst) int s = inst.RS; int amount = inst.SH; - gpr.BindToRegister(a, a == s); - if (amount != 0) + if (gpr.IsImm(s)) { + s32 imm = (s32)gpr.GetImm(s); + gpr.SetImmediate(a, imm >> amount); + + if (amount != 0 && (imm < 0) && ((u32)imm << (32 - amount))) /* shift as unsigned: left-shifting a negative s32 is undefined */ + ComputeCarry(true); + else + ComputeCarry(false); + } + else if (amount != 0) + { + gpr.BindToRegister(a, a == s); ARMReg RA = gpr.R(a); ARMReg RS = gpr.R(s); ARMReg tmp = gpr.GetReg(); @@ -1049,6 +1059,7 @@ void JitArm::srawix(UGeckoInstruction inst) } else { + gpr.BindToRegister(a, a == s); ARMReg RA = gpr.R(a); ARMReg RS = gpr.R(s); MOV(RA, RS); From 9c82adb14f7915f4c020a64bd6d69e2e37b1d509 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 04:02:57 +0000 Subject: [PATCH 06/12] [ARM32] negx optimization if source is immediate --- Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp index a7379ca20b..f4157941fa 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp @@ -850,6 +850,14 @@ void JitArm::negx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITIntegerOff); + if (gpr.IsImm(inst.RA)) + { + gpr.SetImmediate(inst.RD, ~gpr.GetImm(inst.RA) + 1); + if (inst.Rc) + ComputeRC(gpr.GetImm(inst.RD), 0); + return; + } + gpr.BindToRegister(inst.RD, inst.RD == inst.RA); ARMReg RD = gpr.R(inst.RD); ARMReg RA = 
gpr.R(inst.RA); From 581ab9edecf6bcb85446ba860aac55470772d68d Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 04:21:13 +0000 Subject: [PATCH 07/12] [ARM32] cntlzwx optimization if source is immediate --- Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp index f4157941fa..262c145006 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp @@ -723,6 +723,14 @@ void JitArm::cntlzwx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); u32 a = inst.RA, s = inst.RS; + if (gpr.IsImm(s)) + { + gpr.SetImmediate(a, gpr.GetImm(s) ? __builtin_clz(gpr.GetImm(s)) : 32); /* cntlzw of 0 is 32; __builtin_clz(0) is undefined */ + if (inst.Rc) + ComputeRC(gpr.GetImm(a), 0); + return; + } + gpr.BindToRegister(a, a == s); ARMReg RA = gpr.R(a); ARMReg RS = gpr.R(s); From e2f8286415cc5c0b7e6da92321963fab5abf99aa Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 04:47:04 +0000 Subject: [PATCH 08/12] [ARM32] Adds unsigned compares. Supersedes PR #1131. Optimizes cmpi a bit. 
--- Source/Core/Core/PowerPC/JitArm32/Jit.h | 2 + .../Core/PowerPC/JitArm32/JitArm_Integer.cpp | 55 ++++++++++++++++++- .../Core/PowerPC/JitArm32/JitArm_Tables.cpp | 4 +- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.h b/Source/Core/Core/PowerPC/JitArm32/Jit.h index 1f8684bcc9..111ad8b787 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.h @@ -178,7 +178,9 @@ public: void subfic(UGeckoInstruction _inst); void cntlzwx(UGeckoInstruction _inst); void cmp (UGeckoInstruction _inst); + void cmpl(UGeckoInstruction _inst); void cmpi(UGeckoInstruction _inst); + void cmpli(UGeckoInstruction _inst); void negx(UGeckoInstruction _inst); void mulhwux(UGeckoInstruction _inst); void rlwimix(UGeckoInstruction _inst); diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp index 262c145006..0c420d5ce6 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Integer.cpp @@ -825,10 +825,34 @@ void JitArm::cmp (UGeckoInstruction inst) gpr.Unlock(rA); } + +void JitArm::cmpl(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITIntegerOff); + + int crf = inst.CRFD; + u32 a = inst.RA, b = inst.RB; + + if (gpr.IsImm(a) && gpr.IsImm(b)) + { + ComputeRC(gpr.GetImm(a) - gpr.GetImm(b), crf); + return; + } + else if (gpr.IsImm(b) && !gpr.GetImm(b)) + { + ComputeRC(gpr.R(a), crf); + return; + } + + FALLBACK_IF(true); +} + void JitArm::cmpi(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITIntegerOff); + u32 a = inst.RA; int crf = inst.CRFD; if (gpr.IsImm(a)) @@ -838,10 +862,15 @@ void JitArm::cmpi(UGeckoInstruction inst) } ARMReg rA = gpr.GetReg(); ARMReg RA = gpr.R(a); + bool negated = false; + Operand2 off; - if (inst.SIMM_16 >= 0 && inst.SIMM_16 < 256) + if (TryMakeOperand2_AllowNegation(inst.SIMM_16, off, &negated)) { - SUB(rA, RA, inst.SIMM_16); + if 
(negated) + ADD(rA, RA, off); + else + SUB(rA, RA, off); } else { @@ -853,6 +882,28 @@ void JitArm::cmpi(UGeckoInstruction inst) gpr.Unlock(rA); } +void JitArm::cmpli(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITIntegerOff); + u32 a = inst.RA; + int crf = inst.CRFD; + + if (gpr.IsImm(a)) + { + ComputeRC(gpr.GetImm(a) - inst.UIMM, crf); + return; + } + + if (!inst.UIMM) + { + ComputeRC(gpr.R(a), crf); + return; + } + + FALLBACK_IF(true); +} + void JitArm::negx(UGeckoInstruction inst) { INSTRUCTION_START diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp index 198a73fba3..79a7399d62 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_Tables.cpp @@ -47,7 +47,7 @@ static GekkoOPTemplate primarytable[] = {7, &JitArm::arith}, //"mulli", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 2}}, {8, &JitArm::subfic}, //"subfic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}}, - {10, &JitArm::FallBackToInterpreter}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, + {10, &JitArm::cmpli}, //"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, {11, &JitArm::cmpi}, //"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn}}, {12, &JitArm::arith}, //"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA}}, {13, &JitArm::arith}, //"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0}}, @@ -190,7 +190,7 @@ static GekkoOPTemplate table31[] = {476, &JitArm::arith}, //"nandx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_SB | FL_RC_BIT}}, {284, &JitArm::arith}, //"eqvx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_SB | FL_RC_BIT}}, {0, &JitArm::cmp}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, - {32, &JitArm::FallBackToInterpreter}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, + {32, &JitArm::cmpl}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {26, &JitArm::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {922, &JitArm::extshx}, 
//"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &JitArm::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, From 32dc105aa3647e3424164d252212ad67b356d699 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 05:27:39 +0000 Subject: [PATCH 09/12] [ARM32] Eat a register to store our memory base. This saves at least two instructions per fastmem operation. --- .../PowerPC/JitArm32/JitArm_BackPatch.cpp | 5 +-- .../PowerPC/JitArm32/JitArm_LoadStore.cpp | 22 ++++------ Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp | 43 +++++++------------ .../Core/PowerPC/JitArm32/JitRegCache.cpp | 2 +- 4 files changed, 27 insertions(+), 45 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp index c703e5a88d..f095d1c145 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp @@ -225,9 +225,8 @@ u32 JitArm::EmitBackpatchRoutine(ARMXEmitter* emit, u32 flags, bool fastmem, boo { ARMReg temp2 = R10; Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - emit->BIC(temp, addr, mask); // 1 - emit->MOVI2R(temp2, (u32)Memory::base); // 2-3 - emit->ADD(temp, temp, temp2); // 4 + emit->BIC(temp, addr, mask); + emit->ADD(temp, temp, R8); if (flags & BackPatchInfo::FLAG_STORE && flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64)) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp index 95623d7639..1682fe5b57 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp @@ -148,7 +148,7 @@ void JitArm::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, int accessSize else if (Memory::IsRAMAddress(imm_addr)) { MOVI2R(rA, imm_addr); - EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, false, RS); + 
EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, true, RS); } else { @@ -487,14 +487,12 @@ void JitArm::lmw(UGeckoInstruction inst) u32 a = inst.RA; ARMReg rA = gpr.GetReg(); - ARMReg rB = gpr.GetReg(); MOVI2R(rA, inst.SIMM_16); if (a) ADD(rA, rA, gpr.R(a)); Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - BIC(rA, rA, mask); // 3 - MOVI2R(rB, (u32)Memory::base, false); // 4-5 - ADD(rA, rA, rB); // 6 + BIC(rA, rA, mask); + ADD(rA, rA, R8); for (int i = inst.RD; i < 32; i++) { @@ -502,7 +500,7 @@ void JitArm::lmw(UGeckoInstruction inst) LDR(RX, rA, (i - inst.RD) * 4); REV(RX, RX); } - gpr.Unlock(rA, rB); + gpr.Unlock(rA); } void JitArm::stmw(UGeckoInstruction inst) @@ -514,22 +512,20 @@ void JitArm::stmw(UGeckoInstruction inst) u32 a = inst.RA; ARMReg rA = gpr.GetReg(); ARMReg rB = gpr.GetReg(); - ARMReg rC = gpr.GetReg(); MOVI2R(rA, inst.SIMM_16); if (a) ADD(rA, rA, gpr.R(a)); Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK) - BIC(rA, rA, mask); // 3 - MOVI2R(rB, (u32)Memory::base, false); // 4-5 - ADD(rA, rA, rB); // 6 + BIC(rA, rA, mask); + ADD(rA, rA, R8); for (int i = inst.RD; i < 32; i++) { ARMReg RX = gpr.R(i); - REV(rC, RX); - STR(rC, rA, (i - inst.RD) * 4); + REV(rB, RX); + STR(rB, rA, (i - inst.RD) * 4); } - gpr.Unlock(rA, rB, rC); + gpr.Unlock(rA, rB); } void JitArm::dcbst(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp index 9e630810d0..4ece379575 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp @@ -96,6 +96,7 @@ void JitArmAsmRoutineManager::Generate() SUB(_SP, _SP, 4); MOVI2R(R9, (u32)&PowerPC::ppcState.spr[0]); + MOVI2R(R8, (u32)Memory::base); FixupBranch skipToRealDispatcher = B(); dispatcher = GetCodePtr(); @@ -203,8 +204,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedFloatTwo = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, 
(u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); nemit.VLD1(I_32, D0, R10); nemit.VREV32(I_8, D0, D0); @@ -214,8 +214,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedFloatOne = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); nemit.VLD1(I_32, D0, R10); nemit.VREV32(I_8, D0, D0); @@ -225,8 +224,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedU8Two = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRH(R12, R10); SXTB(R12, R12); @@ -251,8 +249,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedU8One = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRB(R12, R10); SXTB(R12, R12); @@ -271,8 +268,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedS8Two = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRH(R12, R10); SXTB(R12, R12); @@ -297,8 +293,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedS8One = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRB(R12, R10); SXTB(R12, R12); @@ -317,8 +312,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedU16Two = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRH(R12, R10); REV16(R12, R12); @@ -345,8 +339,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedU16One = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRH(R12, R10); REV16(R12, R12); @@ -364,8 +357,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedS16Two = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); 
- ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRH(R12, R10); REV16(R12, R12); @@ -392,8 +384,7 @@ void JitArmAsmRoutineManager::GenerateCommon() const u8* loadPairedS16One = GetCodePtr(); { BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); LDRH(R12, R10); @@ -439,8 +430,7 @@ void JitArmAsmRoutineManager::GenerateCommon() TST(R10, arghmask); FixupBranch argh = B_CC(CC_NEQ); BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); nemit.VREV32(I_8, D0, D0); nemit.VST1(I_32, D0, R10); @@ -511,8 +501,7 @@ void JitArmAsmRoutineManager::GenerateCommon() TST(R10, arghmask); FixupBranch argh = B_CC(CC_NEQ); BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); VMOV(R12, S0); REV(R12, R12); @@ -540,8 +529,7 @@ void JitArmAsmRoutineManager::GenerateCommon() TST(R10, arghmask); FixupBranch argh = B_CC(CC_NEQ); BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); VCVT(S0, S0, TO_INT | ROUND_TO_ZERO); VMOV(R12, S0); @@ -568,8 +556,7 @@ void JitArmAsmRoutineManager::GenerateCommon() TST(R10, arghmask); FixupBranch argh = B_CC(CC_NEQ); BIC(R10, R10, mask); - MOVI2R(R12, (u32)Memory::base); - ADD(R10, R10, R12); + ADD(R10, R10, R8); VCVT(S0, S0, TO_INT | ROUND_TO_ZERO); VMOV(R12, S0); diff --git a/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp b/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp index 8a7c2990a9..8379b6aa9a 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitRegCache.cpp @@ -54,7 +54,7 @@ ARMReg *ArmRegCache::GetPPCAllocationOrder(int &count) // the ppc side. 
static ARMReg allocationOrder[] = { - R0, R1, R2, R3, R4, R5, R6, R7, R8 + R0, R1, R2, R3, R4, R5, R6, R7 }; count = sizeof(allocationOrder) / sizeof(const int); return allocationOrder; From 52c6fb180b3445dd039c8f5a9d0382a1dab8ad93 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 06:47:07 +0000 Subject: [PATCH 10/12] [ARM32] Moves loadstore extend flag to backpatch code. --- Source/Core/Core/PowerPC/JitArm32/Jit.h | 1 + .../PowerPC/JitArm32/JitArm_BackPatch.cpp | 25 ++++++++++++++++++- .../PowerPC/JitArm32/JitArm_LoadStore.cpp | 6 ++--- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.h b/Source/Core/Core/PowerPC/JitArm32/Jit.h index 111ad8b787..98f6884e1c 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.h @@ -60,6 +60,7 @@ private: FLAG_SIZE_F32 = (1 << 5), FLAG_SIZE_F64 = (1 << 6), FLAG_REVERSE = (1 << 7), + FLAG_EXTEND = (1 << 8), }; u32 m_fastmem_size; diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp index f095d1c145..3351f2a8fe 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_BackPatch.cpp @@ -300,6 +300,9 @@ u32 JitArm::EmitBackpatchRoutine(ARMXEmitter* emit, u32 flags, bool fastmem, boo else if (flags & BackPatchInfo::FLAG_SIZE_16) emit->REV16(RS, RS); } + + if (flags & BackPatchInfo::FLAG_EXTEND) + emit->SXTH(RS, RS); } } else @@ -590,7 +593,6 @@ void JitArm::InitBackpatch() m_backpatch_info[flags] = info; } - // 16bit - reverse { flags = @@ -612,6 +614,27 @@ void JitArm::InitBackpatch() m_backpatch_info[flags] = info; } + // 16bit - sign extend + { + flags = + BackPatchInfo::FLAG_LOAD | + BackPatchInfo::FLAG_SIZE_16 | + BackPatchInfo::FLAG_EXTEND; + EmitBackpatchRoutine(this, flags, false, false, R0); + code_end = GetWritableCodePtr(); + info.m_slowmem_size = (code_end - code_base) / 4; + + 
SetCodePtr(code_base); + + info.m_fastmem_trouble_inst_offset = + EmitBackpatchRoutine(this, flags, true, false, R0); + code_end = GetWritableCodePtr(); + info.m_fastmem_size = (code_end - code_base) / 4; + + SetCodePtr(code_base); + + m_backpatch_info[flags] = info; + } // 32bit - reverse { flags = diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp index 1682fe5b57..8cd32c561c 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStore.cpp @@ -347,13 +347,13 @@ void JitArm::SafeLoadToReg(ARMReg dest, s32 addr, s32 offsetReg, int accessSize, if (reverse) flags |= BackPatchInfo::FLAG_REVERSE; + if (signExtend) + flags |= BackPatchInfo::FLAG_EXTEND; + EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, !(is_immediate && Memory::IsRAMAddress(imm_addr)), dest); - if (signExtend) // Only on 16 loads - SXTH(dest, dest); - if (update) MOV(gpr.R(addr), rA); } From b848365f780f945149135b581a6c20a3d8cbecd6 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 06:51:35 +0000 Subject: [PATCH 11/12] [ARM32] Minor optimization in paired loadstores. When the offset can fit in the instruction encoding make sure to do so. 
--- .../JitArm32/JitArm_LoadStorePaired.cpp | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp index 6742e1ccc5..e0e485903a 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp @@ -35,9 +35,21 @@ void JitArm::psq_l(UGeckoInstruction inst) UBFX(R11, R11, 24, 6); // Scale LSL(R11, R11, 2); - MOVI2R(R10, (u32)offset); - if (inst.RA || update) // Always uses the register on update - ADD(R10, R10, gpr.R(inst.RA)); + Operand2 off; + if (TryMakeOperand2(offset, off)) + { + if (inst.RA || update) + ADD(R10, gpr.R(inst.RA), off); + else + MOV(R10, off); + } + else + { + MOVI2R(R10, (u32)offset); + if (inst.RA || update) // Always uses the register on update + ADD(R10, R10, gpr.R(inst.RA)); + } + if (update) MOV(gpr.R(inst.RA), R10); MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized); @@ -126,14 +138,19 @@ void JitArm::psq_st(UGeckoInstruction inst) UBFX(R11, R11, 8, 6); // Scale LSL(R11, R11, 2); - if (inst.RA || update) // Always uses the register on update + Operand2 off; + if (TryMakeOperand2(offset, off)) { - MOVI2R(R14, offset); - ADD(R10, gpr.R(inst.RA), R14); + if (inst.RA || update) + ADD(R10, gpr.R(inst.RA), off); + else + MOV(R10, off); } else { MOVI2R(R10, (u32)offset); + if (inst.RA || update) // Always uses the register on update + ADD(R10, R10, gpr.R(inst.RA)); } if (update) From cd13d2d66f6e1a5fa39a69e79ca2aee3724d1ce4 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 29 Nov 2014 06:52:09 +0000 Subject: [PATCH 12/12] [ARM32] Fix a couple bugs in the paired loadstore routines. This code was obviously wrong, we were sign extending 8 bit unsigned values and loading from the wrong offset as well. This fixes a bug in Muramasa where some colours were going insane. 
--- Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp index 4ece379575..b0966d4f58 100644 --- a/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/JitAsm.cpp @@ -226,12 +226,10 @@ void JitArmAsmRoutineManager::GenerateCommon() BIC(R10, R10, mask); ADD(R10, R10, R8); - LDRH(R12, R10); - SXTB(R12, R12); + LDRB(R12, R10); VMOV(S0, R12); - LDRH(R12, R10, 2); - SXTB(R12, R12); + LDRB(R12, R10, 1); VMOV(S1, R12); MOVI2R(R10, (u32)&m_dequantizeTableS); @@ -252,7 +250,6 @@ void JitArmAsmRoutineManager::GenerateCommon() ADD(R10, R10, R8); LDRB(R12, R10); - SXTB(R12, R12); VMOV(S0, R12); MOVI2R(R10, (u32)&m_dequantizeTableS); @@ -270,12 +267,10 @@ void JitArmAsmRoutineManager::GenerateCommon() BIC(R10, R10, mask); ADD(R10, R10, R8); - LDRH(R12, R10); - SXTB(R12, R12); + LDRSB(R12, R10); VMOV(S0, R12); - LDRH(R12, R10, 2); - SXTB(R12, R12); + LDRSB(R12, R10, 1); VMOV(S1, R12); MOVI2R(R10, (u32)&m_dequantizeTableS); @@ -295,8 +290,7 @@ void JitArmAsmRoutineManager::GenerateCommon() BIC(R10, R10, mask); ADD(R10, R10, R8); - LDRB(R12, R10); - SXTB(R12, R12); + LDRSB(R12, R10); VMOV(S0, R12); MOVI2R(R10, (u32)&m_dequantizeTableS); @@ -316,12 +310,10 @@ void JitArmAsmRoutineManager::GenerateCommon() LDRH(R12, R10); REV16(R12, R12); - SXTH(R12, R12); VMOV(S0, R12); LDRH(R12, R10, 2); REV16(R12, R12); - SXTH(R12, R12); VMOV(S1, R12); MOVI2R(R10, (u32)&m_dequantizeTableS);