From 9e2f4dd7da306a011d3f34bad7c1808186a9bbd0 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Mon, 1 Jun 2015 19:43:35 +0200 Subject: [PATCH 1/3] VertexLoaderX64: revert 9da86092aeb1fda7470a661a36 I can't reproduce that it's actually faster and it will definitely be slower with position caching for zfreeze. --- Source/Core/VideoCommon/VertexLoaderX64.cpp | 42 +++++++++++---------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index 9abde62b8b..ba29a7497a 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -77,7 +77,7 @@ OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute) int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format) { - static const __m128i shuffle_lut[4][3] = { + static const __m128i shuffle_lut[5][3] = { {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L), // 1x u8 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L), // 2x u8 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L)}, // 3x u8 @@ -90,6 +90,9 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x0001FFFFL), // 1x s16 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL), // 2x s16 _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL)}, // 3x s16 + {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L), // 1x float + _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L), // 2x float + _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)}, // 3x float }; static const __m128 scale_factors[32] = { _mm_set_ps1(1./(1u<< 0)), _mm_set_ps1(1./(1u<< 1)), _mm_set_ps1(1./(1u<< 2)), _mm_set_ps1(1./(1u<< 3)), @@ -119,21 +122,6 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count if (attribute == DIRECT) m_src_ofs += load_bytes; - if (format == FORMAT_FLOAT) - { - // Floats don't need to be scaled or converted, - // so we can just load/swap/store them directly - // and return early. - for (int i = 0; i < count_in; i++) - { - LoadAndSwap(32, scratch3, data); - MOV(32, dest, R(scratch3)); - data.AddMemOffset(sizeof(float)); - dest.AddMemOffset(sizeof(float)); - } - return load_bytes; - } - if (cpu_info.bSSSE3) { if (load_bytes > 8) @@ -194,13 +182,29 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count else PSRLD(coords, 16); break; + case FORMAT_FLOAT: + // Floats don't need to be scaled or converted, + // so we can just load/swap/store them directly + // and return early. + // (In SSSE3 we still need to store them.) + for (int i = 0; i < count_in; i++) + { + LoadAndSwap(32, scratch3, data); + MOV(32, dest, R(scratch3)); + data.AddMemOffset(sizeof(float)); + dest.AddMemOffset(sizeof(float)); + } + return load_bytes; } } - CVTDQ2PS(coords, R(coords)); + if (format != FORMAT_FLOAT) + { + CVTDQ2PS(coords, R(coords)); - if (dequantize && scaling_exponent) - MULPS(coords, MPIC(&scale_factors[scaling_exponent])); + if (dequantize && scaling_exponent) + MULPS(coords, MPIC(&scale_factors[scaling_exponent])); + } switch (count_out) { From 5ddd2cef6c5cc8e5414636d73365691de726b3d0 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Mon, 1 Jun 2015 19:58:27 +0200 Subject: [PATCH 2/3] zfreeze: cache vertex positions Suggested by degasus. --- Source/Core/Common/x64Emitter.cpp | 1 + Source/Core/Common/x64Emitter.h | 1 + Source/Core/VideoCommon/VertexLoader.cpp | 5 +- Source/Core/VideoCommon/VertexLoader.h | 1 + .../Core/VideoCommon/VertexLoaderManager.cpp | 3 ++ Source/Core/VideoCommon/VertexLoaderManager.h | 5 ++ Source/Core/VideoCommon/VertexLoaderX64.cpp | 47 +++++++++++++++++++ .../VideoCommon/VertexLoader_Position.cpp | 14 +++++- Source/Core/VideoCommon/VertexManagerBase.cpp | 20 +++----- 9 files changed, 80 insertions(+), 17 deletions(-) diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index c1b703b174..1c1a8b9754 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -1823,6 +1823,7 @@ void XEmitter::PCMPGTD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0x66, d void XEmitter::PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSEOp(0x66, 0xC5, dest, arg); Write8(subreg);} void XEmitter::PINSRW(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSEOp(0x66, 0xC4, dest, arg); Write8(subreg);} +void XEmitter::PINSRD(X64Reg dest, const OpArg& arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A22, dest, arg); Write8(subreg);} void XEmitter::PMADDWD(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF5, dest, arg); } void XEmitter::PSADBW(X64Reg dest, const OpArg& arg) {WriteSSEOp(0x66, 0xF6, dest, arg);} diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 7b6c9be703..c62254aec4 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -711,6 +711,7 @@ public: void PEXTRW(X64Reg dest, const OpArg& arg, u8 subreg); void PINSRW(X64Reg dest, const OpArg& arg, u8 subreg); + void PINSRD(X64Reg dest, const OpArg& arg, u8 subreg); void PMADDWD(X64Reg dest, const OpArg& arg); void PSADBW(X64Reg dest, const OpArg& arg); diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 72aad88d7f..b8c8fab4de 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -14,6 +14,7 @@ #include "VideoCommon/VertexLoader_Normal.h" #include "VideoCommon/VertexLoader_Position.h" #include "VideoCommon/VertexLoader_TextCoord.h" +#include "VideoCommon/VertexLoaderManager.h" #include "VideoCommon/VideoCommon.h" #include "VideoCommon/VideoConfig.h" @@ -24,6 +25,8 @@ u8* g_vertex_manager_write_ptr; static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader) { u32 posmtx = DataReadU8() & 0x3f; + if (loader->m_counter < 3) + VertexLoaderManager::position_matrix_index[loader->m_counter] = posmtx; DataWrite(posmtx); PRIM_LOG("posmtx: %d, ", posmtx); } @@ -316,7 +319,7 @@ int VertexLoader::RunVertices(DataReader src, DataReader dst, int count) m_numLoadedVertices += count; m_skippedVertices = 0; - for (int s = 0; s < count; s++) + for (m_counter = count - 1; m_counter >= 0; m_counter--) { m_tcIndex = 0; m_colIndex = 0; diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index b53532a592..50ece53095 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -49,6 +49,7 @@ public: int m_texmtxread; bool m_vertexSkip; int m_skippedVertices; + int m_counter; private: // Pipeline. diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index dbb421991c..46a0eeaab0 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -26,6 +26,9 @@ namespace VertexLoaderManager { +float position_cache[3][4]; +u32 position_matrix_index[3]; + typedef std::unordered_map> NativeVertexFormatMap; static NativeVertexFormatMap s_native_vertex_map; static NativeVertexFormat* s_current_vtx_fmt; diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h index 99336fc333..92d7e6da8f 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.h +++ b/Source/Core/VideoCommon/VertexLoaderManager.h @@ -28,5 +28,10 @@ namespace VertexLoaderManager // Resolved pointers to array bases. Used by vertex loaders. extern u8 *cached_arraybases[12]; void UpdateVertexArrayPointers(); + + // Position cache for zfreeze (3 vertices, 4 floats each to allow SIMD overwrite). + // These arrays are in reverse order. + extern float position_cache[3][4]; + extern u32 position_matrix_index[3]; } diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index ba29a7497a..a298d7e1dd 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -23,6 +23,11 @@ static const X64Reg base_reg = RBX; static const u8* memory_base_ptr = (u8*)&g_main_cp_state.array_strides; +static OpArg MPIC(const void* ptr, X64Reg scale_reg, int scale = SCALE_1) +{ + return MComplex(base_reg, scale_reg, scale, (s32)((u8*)ptr - memory_base_ptr)); +} + static OpArg MPIC(const void* ptr) { return MDisp(base_reg, (s32)((u8*)ptr - memory_base_ptr)); @@ -193,6 +198,31 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count MOV(32, dest, R(scratch3)); data.AddMemOffset(sizeof(float)); dest.AddMemOffset(sizeof(float)); + + // zfreeze + if (native_format == &m_native_vtx_decl.position) + { + if (cpu_info.bSSE4_1) + { + PINSRD(coords, R(scratch3), i); + } + else + { + PINSRW(coords, R(scratch3), 2 * i + 0); + SHR(32, R(scratch3), Imm8(16)); + PINSRW(coords, R(scratch3), 2 * i + 1); + } + } + } + + // zfreeze + if (native_format == &m_native_vtx_decl.position) + { + CMP(32, R(count_reg), Imm8(3)); + FixupBranch dont_store = J_CC(CC_A); + LEA(32, scratch3, MScaled(count_reg, SCALE_4, -4)); + MOVUPS(MPIC(VertexLoaderManager::position_cache, scratch3, SCALE_4), coords); + SetJumpTarget(dont_store); } return load_bytes; } @@ -213,6 +243,16 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count case 3: MOVUPS(dest, coords); break; } + // zfreeze + if (native_format == &m_native_vtx_decl.position) + { + CMP(32, R(count_reg), Imm8(3)); + FixupBranch dont_store = J_CC(CC_A); + LEA(32, scratch3, MScaled(count_reg, SCALE_4, -4)); + MOVUPS(MPIC(VertexLoaderManager::position_cache, scratch3, SCALE_4), coords); + SetJumpTarget(dont_store); + } + return load_bytes; } @@ -388,6 +428,13 @@ void VertexLoaderX64::GenerateVertexLoader() MOVZX(32, 8, scratch1, MDisp(src_reg, m_src_ofs)); AND(32, R(scratch1), Imm8(0x3F)); MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1)); + + // zfreeze + CMP(32, R(count_reg), Imm8(3)); + FixupBranch dont_store = J_CC(CC_A); + MOV(32, MPIC(VertexLoaderManager::position_matrix_index - 1, count_reg, SCALE_4), R(scratch1)); + SetJumpTarget(dont_store); + m_native_components |= VB_HAS_POSMTXIDX; m_native_vtx_decl.posmtx.components = 4; m_native_vtx_decl.posmtx.enable = true; diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index ed6b4587b3..fd17fbf4d7 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -32,7 +32,12 @@ void LOADERDECL Pos_ReadDirect(VertexLoader* loader) DataReader src(g_video_buffer_read_ptr, nullptr); for (int i = 0; i < N; ++i) - dst.Write(PosScale(src.Read(), scale)); + { + float value = PosScale(src.Read(), scale); + if (loader->m_counter < 3) + VertexLoaderManager::position_cache[loader->m_counter][i] = value; + dst.Write(value); + } g_vertex_manager_write_ptr = dst.GetPointer(); g_video_buffer_read_ptr = src.GetPointer(); @@ -52,7 +57,12 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader) DataReader dst(g_vertex_manager_write_ptr, nullptr); for (int i = 0; i < N; ++i) - dst.Write(PosScale(Common::FromBigEndian(data[i]), scale)); + { + float value = PosScale(Common::FromBigEndian(data[i]), scale); + if (loader->m_counter < 3) + VertexLoaderManager::position_cache[loader->m_counter][i] = value; + dst.Write(value); + } g_vertex_manager_write_ptr = dst.GetPointer(); LOG_VTX(); diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index dcef793cec..e9e8ebf901 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -279,7 +279,6 @@ void VertexManager::DoState(PointerWrap& p) void VertexManager::CalculateZSlope(NativeVertexFormat* format) { - float vtx[9]; float out[12]; float viewOffset[2] = { xfmem.viewport.xOrig - bpmem.scissorOffset.x * 2, xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2}; @@ -290,31 +289,24 @@ void VertexManager::CalculateZSlope(NativeVertexFormat* format) // Global matrix ID. u32 mtxIdx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx; const PortableVertexDeclaration vert_decl = format->GetVertexDeclaration(); - size_t posOff = vert_decl.position.offset; - size_t mtxOff = vert_decl.posmtx.offset; // Make sure the buffer contains at least 3 vertices. if ((s_pCurBufferPointer - s_pBaseBufferPointer) < (vert_decl.stride * 3)) return; // Lookup vertices of the last rendered triangle and software-transform them - // This allows us to determine the depth slope, which will be used if z--freeze + // This allows us to determine the depth slope, which will be used if z-freeze // is enabled in the following flush. for (unsigned int i = 0; i < 3; ++i) { - u8* vtx_ptr = s_pCurBufferPointer - vert_decl.stride * (3 - i); - vtx[0 + i * 3] = ((float*)(vtx_ptr + posOff))[0]; - vtx[1 + i * 3] = ((float*)(vtx_ptr + posOff))[1]; - if (vert_decl.position.components == 3) - vtx[2 + i * 3] = ((float*)(vtx_ptr + posOff))[2]; - else - vtx[2 + i * 3] = 0; - // If this vertex format has per-vertex position matrix IDs, look it up. if (vert_decl.posmtx.enable) - mtxIdx = *((u32*)(vtx_ptr + mtxOff)); + mtxIdx = VertexLoaderManager::position_matrix_index[2 - i]; - VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4], mtxIdx); + if (vert_decl.position.components == 2) + VertexLoaderManager::position_cache[2 - i][2] = 0; + + VertexShaderManager::TransformToClipSpace(&VertexLoaderManager::position_cache[2 - i][0], &out[i * 4], mtxIdx); // Transform to Screenspace float inv_w = 1.0f / out[3 + i * 4]; From 10bd68936c11ae0ed74861ca2172d41bbac0dc1f Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sat, 6 Jun 2015 23:50:50 -0500 Subject: [PATCH 3/3] [AArch64] Implement ZFreeze cache --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index 36b6aae0f5..35d3a989e7 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -165,6 +165,18 @@ int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int c m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg)); } + // Z-Freeze + if (native_format == &m_native_vtx_decl.position) + { + CMP(count_reg, 3); + FixupBranch dont_store = B(CC_GT); + MOVI2R(EncodeRegTo64(scratch2_reg), (u64)VertexLoaderManager::position_cache); + ORR(scratch1_reg, WSP, count_reg, ArithOption(count_reg, ST_LSL, 4)); + ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg)); + m_float_emit.STUR(write_size, coords, EncodeRegTo64(scratch1_reg), -16); + SetJumpTarget(dont_store); + } + native_format->components = count_out; native_format->enable = true; native_format->offset = m_dst_ofs; @@ -342,6 +354,14 @@ void VertexLoaderARM64::GenerateVertexLoader() LDRB(INDEX_UNSIGNED, scratch1_reg, src_reg, m_src_ofs); AND(scratch1_reg, scratch1_reg, 0, 5); STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs); + + // Z-Freeze + CMP(count_reg, 3); + FixupBranch dont_store = B(CC_GT); + MOVI2R(EncodeRegTo64(scratch2_reg), (u64)VertexLoaderManager::position_matrix_index - sizeof(u32)); + STR(INDEX_UNSIGNED, scratch1_reg, EncodeRegTo64(scratch2_reg), 0); + SetJumpTarget(dont_store); + m_native_components |= VB_HAS_POSMTXIDX; m_native_vtx_decl.posmtx.components = 4; m_native_vtx_decl.posmtx.enable = true;