From 217c2c9d6acfe5b13e7afc08b42272788d44507b Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 20 Jan 2015 16:34:32 -0600 Subject: [PATCH 1/2] [AArch64] Add some more scalar VFP ops to the emitter. --- Source/Core/Common/Arm64Emitter.cpp | 28 ++++++++++++++++++++++++++++ Source/Core/Common/Arm64Emitter.h | 7 +++++++ 2 files changed, 35 insertions(+) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 39f62d21eb..5801e5ea57 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -1840,6 +1840,17 @@ void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opc (encoded_size << 10) | (Rn << 5) | Rt); } +void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + + Write32((M << 31) | (S << 29) | (0b11110001 << 21) | (type << 22) | \ + (opcode << 15) | (1 << 14) | (Rn << 5) | Rd); +} + void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); @@ -2082,12 +2093,29 @@ void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) opcode = 0b0010; EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); } +// Scalar - 1 Source +void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn); +} +void ARM64FloatEmitter::FNEG(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar1Source(0, 0, IsDouble(Rd), 0b000010, Rd, Rn); +} // Scalar - 2 Source +void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + Emit2Source(0, 0, IsDouble(Rd), 0b0010, Rd, Rn, Rm); +} void ARM64FloatEmitter::FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { Emit2Source(0, 0, IsDouble(Rd), 0, Rd, Rn, Rm); } +void ARM64FloatEmitter::FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + 
Emit2Source(0, 0, IsDouble(Rd), 0b0011, Rd, Rn, Rm); +} // Scalar floating point immediate void ARM64FloatEmitter::FMOV(ARM64Reg Rd, u32 imm) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 086c813aee..7cc1d90361 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -617,8 +617,14 @@ public: // Loadstore multiple structure void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + // Scalar - 1 Source + void FABS(ARM64Reg Rd, ARM64Reg Rn); + void FNEG(ARM64Reg Rd, ARM64Reg Rn); + // Scalar - 2 Source + void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); // Scalar floating point immediate void FMOV(ARM64Reg Rd, u32 imm); @@ -718,6 +724,7 @@ private: void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm); void EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); + void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); }; class ARM64CodeBlock : public CodeBlock From 67f2ff2e18b38efeaabecba48423cde06c0e8df0 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Tue, 20 Jan 2015 16:35:08 -0600 Subject: [PATCH 2/2] [AArch64] Move the 64bit floating point instructions to scalar. Instead of doing vector operations and throwing away the top 64bits of each operation, let's instead use scalar operations. On Cortex-A57 this saves us three cycles per vector operation changed to scalar, so this saves 3-9cycles per instruction emulated. Also puts one less micro-op in to the vector pipeline there. On the Nvidia Denver I couldn't see any noticeable performance difference, but it's a quirky architecture so it may be noticing we are throwing away the top bits anyway and optimizing it. The world may never know what's truly happening there. 
--- .../JitArm64/JitArm64_FloatingPoint.cpp | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index a670edccc8..305f30d217 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -27,7 +27,7 @@ void JitArm64::fabsx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(inst.FD); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FABS(64, V0, VB); + m_float_emit.FABS(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); @@ -44,7 +44,7 @@ void JitArm64::faddsx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(inst.FB); ARM64Reg VD = fpr.R(inst.FD); - m_float_emit.FADD(64, VD, VA, VB); + m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 1, VD, 0); } @@ -60,7 +60,7 @@ void JitArm64::faddx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(inst.FD); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FADD(64, V0, VA, VB); + m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); @@ -81,9 +81,9 @@ void JitArm64::fmaddsx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FADD(64, V0, V0, VB); - m_float_emit.DUP(64, VD, V0, 0); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 1, VD, 0); fpr.Unlock(V0); } @@ -102,8 +102,8 @@ void JitArm64::fmaddx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FADD(64, V0, V0, VB); + m_float_emit.FMUL(EncodeRegToDouble(V0), 
EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); } @@ -136,9 +136,9 @@ void JitArm64::fmsubsx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FSUB(64, V0, V0, VB); - m_float_emit.DUP(64, VD, V0, 0); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.INS(64, VD, 1, VD, 0); fpr.Unlock(V0); } @@ -157,8 +157,8 @@ void JitArm64::fmsubx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FSUB(64, V0, V0, VB); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); } @@ -174,7 +174,7 @@ void JitArm64::fmulsx(UGeckoInstruction inst) ARM64Reg VC = fpr.R(inst.FC); ARM64Reg VD = fpr.R(inst.FD); - m_float_emit.FMUL(64, VD, VA, VC); + m_float_emit.FMUL(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.INS(64, VD, 1, VD, 0); } @@ -190,7 +190,7 @@ void JitArm64::fmulx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(inst.FD); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); @@ -207,8 +207,8 @@ void JitArm64::fnabsx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(inst.FD); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FABS(64, V0, VB); - m_float_emit.FNEG(64, V0, V0); + m_float_emit.FABS(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); 
m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); @@ -225,7 +225,7 @@ void JitArm64::fnegx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(inst.FD); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FNEG(64, V0, VB); + m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); @@ -246,10 +246,10 @@ void JitArm64::fnmaddsx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FADD(64, V0, V0, VB); - m_float_emit.FNEG(64, V0, V0); - m_float_emit.DUP(64, VD, V0, 0); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); + m_float_emit.INS(64, VD, 1, VD, 0); fpr.Unlock(V0); } @@ -268,9 +268,9 @@ void JitArm64::fnmaddx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FADD(64, V0, V0, VB); - m_float_emit.FNEG(64, V0, V0); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FADD(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); } @@ -290,10 +290,10 @@ void JitArm64::fnmsubsx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FSUB(64, V0, V0, VB); - m_float_emit.FNEG(64, V0, V0); - m_float_emit.DUP(64, VD, V0, 0); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); + m_float_emit.INS(64, VD, 1, VD, 0); fpr.Unlock(V0); } @@ -312,9 
+312,9 @@ void JitArm64::fnmsubx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FMUL(64, V0, VA, VC); - m_float_emit.FSUB(64, V0, V0, VB); - m_float_emit.FNEG(64, V0, V0); + m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); + m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); + m_float_emit.FNEG(EncodeRegToDouble(V0), EncodeRegToDouble(V0)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0); } @@ -353,7 +353,7 @@ void JitArm64::fsubsx(UGeckoInstruction inst) ARM64Reg VB = fpr.R(inst.FB); ARM64Reg VD = fpr.R(inst.FD); - m_float_emit.FSUB(64, VD, VA, VB); + m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 1, VD, 0); } @@ -369,7 +369,7 @@ void JitArm64::fsubx(UGeckoInstruction inst) ARM64Reg VD = fpr.R(inst.FD); ARM64Reg V0 = fpr.GetReg(); - m_float_emit.FSUB(64, V0, VA, VB); + m_float_emit.FSUB(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); m_float_emit.INS(64, VD, 0, V0, 0); fpr.Unlock(V0);