From 304e601ad349f192406eaa356eb160a9a5e95f5a Mon Sep 17 00:00:00 2001 From: degasus Date: Sun, 13 Aug 2017 01:51:37 +0200 Subject: [PATCH] JitArm64: Reimplement aarch64 cycle counters. CNTVCT_EL0 is force-enabled on all Linux platforms. Windows is untested, but as this is the best way to get *any* low overhead performance counters, they likely use it as well. --- Source/Core/Common/Arm64Emitter.cpp | 8 +++ Source/Core/Common/Arm64Emitter.h | 2 +- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 85 +++++++---------------- Source/Core/Core/PowerPC/JitArm64/Jit.h | 6 -- Source/Core/Core/PowerPC/Profiler.cpp | 2 +- 5 files changed, 34 insertions(+), 69 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index bd72df2e94..ed1fa2b626 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -1218,6 +1218,14 @@ void ARM64XEmitter::MRS(ARM64Reg Rt, PStateField field) EncodeSystemInst(o0 | 4, op1, CRn, CRm, op2, DecodeReg(Rt)); } +void ARM64XEmitter::CNTVCT(Arm64Gen::ARM64Reg Rt) +{ + _assert_msg_(DYNA_REC, Is64Bit(Rt), "CNTVCT: Rt must be 64-bit"); + + // MRS Xt, CNTVCT_EL0 ; Read CNTVCT_EL0 into Xt + EncodeSystemInst(3 | 4, 3, 0xe, 0, 2, DecodeReg(Rt)); +} + void ARM64XEmitter::HINT(SystemHint op) { EncodeSystemInst(0, 3, 2, 0, op, WSP); diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 4b5bc9f137..2f49795519 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -603,9 +603,9 @@ public: // System void _MSR(PStateField field, u8 imm); - void _MSR(PStateField field, ARM64Reg Rt); void MRS(ARM64Reg Rt, PStateField field); + void CNTVCT(ARM64Reg Rt); void HINT(SystemHint op); void CLREX(); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index b7dbf5a161..7cc2c476c2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ 
-36,15 +36,6 @@ constexpr size_t SAFE_STACK_SIZE = 512 * 1024; constexpr size_t GUARD_SIZE = 0x10000; // two guards - bottom (permanent) and middle (see above) constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE; -static bool HasCycleCounters() -{ - // Bit needs to be set to support cycle counters - const u32 PMUSERENR_CR = 0x4; - u32 reg; - asm("mrs %[val], PMUSERENR_EL0" : [val] "=r"(reg)); - return !!(reg & PMUSERENR_CR); -} - void JitArm64::Init() { InitializeInstructionTables(); @@ -72,8 +63,6 @@ void JitArm64::Init() AllocStack(); GenerateAsm(); - - m_supports_cycle_counter = HasCycleCounters(); } bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx) @@ -514,73 +503,47 @@ void JitArm64::DumpCode(const u8* start, const u8* end) WARN_LOG(DYNA_REC, "Code dump from %p to %p:\n%s", start, end, output.c_str()); } -void JitArm64::EmitResetCycleCounters() -{ - const u32 PMCR_EL0_E = 1; - const u32 PMCR_EL0_P = 2; - const u32 PMCR_EL0_C = 4; - const u32 PMCR_EL0_LC = 0x40; - _MSR(FIELD_PMCR_EL0, X10); - MOVI2R(X11, PMCR_EL0_E | PMCR_EL0_P | PMCR_EL0_C | PMCR_EL0_LC); - ORR(X10, X10, X11); - MRS(X10, FIELD_PMCR_EL0); -} - -void JitArm64::EmitGetCycles(Arm64Gen::ARM64Reg reg) -{ - _MSR(FIELD_PMCCNTR_EL0, reg); -} - void JitArm64::BeginTimeProfile(JitBlock* b) { MOVP2R(X0, &b->profile_data); LDR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount)); ADD(X1, X1, 1); - if (m_supports_cycle_counter) - { - EmitResetCycleCounters(); - EmitGetCycles(X2); + // Fetch the current counter register + CNTVCT(X2); - // stores runCount and ticStart - STP(INDEX_UNSIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount)); - } - else - { - STR(INDEX_UNSIGNED, X1, X0, offsetof(JitBlock::ProfileData, runCount)); - - MOVP2R(X1, &QueryPerformanceCounter); - ADD(X0, X0, offsetof(JitBlock::ProfileData, ticStart)); - BLR(X1); - } + // stores runCount and ticStart + STP(INDEX_SIGNED, X1, X2, X0, offsetof(JitBlock::ProfileData, runCount)); } 
void JitArm64::EndTimeProfile(JitBlock* b) { - MOVP2R(X20, &b->profile_data); - if (m_supports_cycle_counter) - { - EmitGetCycles(X2); - } - else - { - MOVP2R(X1, &QueryPerformanceCounter); - ADD(X0, X20, offsetof(JitBlock::ProfileData, ticStop)); - BLR(X1); + ARM64Reg WA = gpr.GetReg(); + ARM64Reg XA = EncodeRegTo64(WA); + ARM64Reg WB = gpr.GetReg(); + ARM64Reg XB = EncodeRegTo64(WB); + ARM64Reg WC = gpr.GetReg(); + ARM64Reg XC = EncodeRegTo64(WC); + ARM64Reg WD = gpr.GetReg(); + ARM64Reg XD = EncodeRegTo64(WD); - LDR(INDEX_UNSIGNED, X2, X20, offsetof(JitBlock::ProfileData, ticStop)); - } + // Fetch the current counter register + CNTVCT(XB); - LDR(INDEX_UNSIGNED, X1, X20, offsetof(JitBlock::ProfileData, ticStart)); + MOVP2R(XA, &b->profile_data); + + LDR(INDEX_UNSIGNED, XC, XA, offsetof(JitBlock::ProfileData, ticStart)); + SUB(XB, XB, XC); // loads ticCounter and downcountCounter - LDP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter)); - SUB(X2, X2, X1); - ADD(X3, X3, X2); - ADDI2R(X4, X4, js.downcountAmount); + LDP(INDEX_SIGNED, XC, XD, XA, offsetof(JitBlock::ProfileData, ticCounter)); + ADD(XC, XC, XB); + ADDI2R(XD, XD, js.downcountAmount); // stores ticCounter and downcountCounter - STP(INDEX_UNSIGNED, X3, X4, X20, offsetof(JitBlock::ProfileData, ticCounter)); + STP(INDEX_SIGNED, XC, XD, XA, offsetof(JitBlock::ProfileData, ticCounter)); + + gpr.Unlock(WA, WB, WC, WD); } void JitArm64::Run() diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 5685440ca1..918ceb678c 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -174,9 +174,6 @@ private: static void InitializeInstructionTables(); void CompileInstruction(PPCAnalyst::CodeOp& op); - void EmitResetCycleCounters(); - void EmitGetCycles(Arm64Gen::ARM64Reg reg); - // Simple functions to switch between near and far code emitting void SwitchToFarCode() { @@ -253,9 +250,6 @@ private: 
Arm64Gen::ARM64CodeBlock farcode; u8* nearcode; // Backed up when we switch to far code. - // Do we support cycle counter profiling? - bool m_supports_cycle_counter; - bool m_enable_blr_optimization; bool m_cleanup_after_stackfault = false; u8* m_stack_base = nullptr; diff --git a/Source/Core/Core/PowerPC/Profiler.cpp b/Source/Core/Core/PowerPC/Profiler.cpp index 0e1a176727..0378c4a5b5 100644 --- a/Source/Core/Core/PowerPC/Profiler.cpp +++ b/Source/Core/Core/PowerPC/Profiler.cpp @@ -10,7 +10,7 @@ namespace Profiler { -bool g_ProfileBlocks; +bool g_ProfileBlocks = false; void WriteProfileResults(const std::string& filename) {