diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt
index 2101a719c2..ceb83564f1 100644
--- a/Source/Core/VideoCommon/CMakeLists.txt
+++ b/Source/Core/VideoCommon/CMakeLists.txt
@@ -45,7 +45,7 @@ set(SRCS BoundingBox.cpp
 set(LIBS core png)
 
 if(_M_X86)
-	set(SRCS ${SRCS} TextureDecoder_x64.cpp)
+	set(SRCS ${SRCS} TextureDecoder_x64.cpp VertexLoaderX64.cpp)
 else()
 	set(SRCS ${SRCS} TextureDecoder_Generic.cpp)
 endif()
diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h
index 124c30a04a..4fb4b67f20 100644
--- a/Source/Core/VideoCommon/CPMemory.h
+++ b/Source/Core/VideoCommon/CPMemory.h
@@ -20,10 +20,13 @@ enum
 // Vertex components
 enum
 {
-	NOT_PRESENT = 0,
-	DIRECT = 1,
-	INDEX8 = 2,
-	INDEX16 = 3,
+	NOT_PRESENT  = 0,
+	DIRECT       = 1,
+	INDEX8       = 2,
+	INDEX16      = 3,
+
+	MASK_INDEXED = 2,
+	MASK_ALL     = 3,
 };
 
 enum
diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp
index e13e406d8d..734ea9a4a6 100644
--- a/Source/Core/VideoCommon/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/VertexLoader.cpp
@@ -40,14 +40,9 @@ void VertexLoader::operator delete (void *p)
 
 static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader)
 {
-	BoundingBox::posMtxIdx = loader->m_curposmtx = DataReadU8() & 0x3f;
-	PRIM_LOG("posmtx: %d, ", loader->m_curposmtx);
-}
-
-static void LOADERDECL PosMtx_Write(VertexLoader* loader)
-{
-	// u8, 0, 0, 0
-	DataWrite<u32>(loader->m_curposmtx);
+	u8 posmtx = BoundingBox::posMtxIdx = DataReadU8() & 0x3f;
+	DataWrite<u32>(posmtx);
+	PRIM_LOG("posmtx: %d, ", posmtx);
 }
 
 static void LOADERDECL TexMtx_ReadDirect_UByte(VertexLoader* loader)
@@ -69,18 +64,16 @@ static void LOADERDECL TexMtx_Write_Float2(VertexLoader* loader)
 	DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++]));
 }
 
-static void LOADERDECL TexMtx_Write_Float4(VertexLoader* loader)
+static void LOADERDECL TexMtx_Write_Float3(VertexLoader* loader)
 {
 #if _M_SSE >= 0x200
 	__m128 output = _mm_cvtsi32_ss(_mm_castsi128_ps(_mm_setzero_si128()), loader->m_curtexmtx[loader->m_texmtxwrite++]);
 	_mm_storeu_ps((float*)g_vertex_manager_write_ptr, _mm_shuffle_ps(output, output, 0x45 /* 1, 1, 0, 1 */));
-	g_vertex_manager_write_ptr += sizeof(float) * 4;
+	g_vertex_manager_write_ptr += sizeof(float) * 3;
#else
 	DataWrite(0.f);
 	DataWrite(0.f);
 	DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++]));
-	// Just to fill out with 0.
-	DataWrite(0.f);
 #endif
 }
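
[Annotation, not part of the patch] The shuffle immediate 0x45 is what lets TexMtx_Write_Float3 drop the fourth float: 0x45 is 0b01'00'01'01, i.e. source lanes (1, 1, 0, 1), so the vector (x, 0, 0, 0) produced by _mm_cvtsi32_ss becomes (0, 0, x, 0) and only the first three floats need to be stored. A minimal standalone check:

```cpp
// Standalone sketch verifying the 0x45 shuffle used by TexMtx_Write_Float3.
#include <xmmintrin.h>
#include <cstdio>

int main()
{
	float out[4];
	__m128 v = _mm_cvtsi32_ss(_mm_setzero_ps(), 7); // lanes: (7.0, 0, 0, 0)
	v = _mm_shuffle_ps(v, v, 0x45);                 // lanes: (0, 0, 7.0, 0)
	_mm_storeu_ps(out, v);
	std::printf("%g %g %g\n", out[0], out[1], out[2]); // prints: 0 0 7
	return 0;
}
```
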
@@ -137,16 +130,22 @@ void VertexLoader::CompileVertexTranslator()
 
 	// Position in pc vertex format.
 	int nat_offset = 0;
-	memset(&m_native_vtx_decl, 0, sizeof(m_native_vtx_decl));
 
 	// Position Matrix Index
 	if (m_VtxDesc.PosMatIdx)
 	{
 		WriteCall(PosMtx_ReadDirect_UByte);
 		components |= VB_HAS_POSMTXIDX;
+		m_native_vtx_decl.posmtx.components = 4;
+		m_native_vtx_decl.posmtx.enable = true;
+		m_native_vtx_decl.posmtx.offset = nat_offset;
+		m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
+		m_native_vtx_decl.posmtx.integer = true;
+		nat_offset += 4;
 		m_VertexSize += 1;
 	}
+
 	if (m_VtxDesc.Tex0MatIdx) {m_VertexSize += 1; components |= VB_HAS_TEXMTXIDX0; WriteCall(TexMtx_ReadDirect_UByte); }
 	if (m_VtxDesc.Tex1MatIdx) {m_VertexSize += 1; components |= VB_HAS_TEXMTXIDX1; WriteCall(TexMtx_ReadDirect_UByte); }
 	if (m_VtxDesc.Tex2MatIdx) {m_VertexSize += 1; components |= VB_HAS_TEXMTXIDX2; WriteCall(TexMtx_ReadDirect_UByte); }
@@ -267,11 +266,7 @@ void VertexLoader::CompileVertexTranslator()
 		const int format = m_VtxAttr.texCoord[i].Format;
 		const int elements = m_VtxAttr.texCoord[i].Elements;
 
-		if (tc[i] == NOT_PRESENT)
-		{
-			components &= ~(VB_HAS_UV0 << i);
-		}
-		else
+		if (tc[i] != NOT_PRESENT)
 		{
 			_assert_msg_(VIDEO, DIRECT <= tc[i] && tc[i] <= INDEX16, "Invalid texture coordinates!\n(tc[i] = %d)", (u32)tc[i]);
 			_assert_msg_(VIDEO, FORMAT_UBYTE <= format && format <= FORMAT_FLOAT, "Invalid texture coordinates format!\n(format = %d)", format);
@@ -295,9 +290,9 @@ void VertexLoader::CompileVertexTranslator()
 			else
 			{
 				components |= VB_HAS_UV0 << i; // have to include since using now
-				m_native_vtx_decl.texcoords[i].components = 4;
-				nat_offset += 16; // still include the texture coordinate, but this time as 6 + 2 bytes
-				WriteCall(TexMtx_Write_Float4);
+				m_native_vtx_decl.texcoords[i].components = 3;
+				nat_offset += 12;
+				WriteCall(TexMtx_Write_Float3);
 			}
 		}
 		else
@@ -335,17 +330,6 @@ void VertexLoader::CompileVertexTranslator()
 	if (!g_ActiveConfig.backend_info.bSupportsBBox)
 		WriteCall(BoundingBox::Update);
 
-	if (m_VtxDesc.PosMatIdx)
-	{
-		WriteCall(PosMtx_Write);
-		m_native_vtx_decl.posmtx.components = 4;
-		m_native_vtx_decl.posmtx.enable = true;
-		m_native_vtx_decl.posmtx.offset = nat_offset;
-		m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
-		m_native_vtx_decl.posmtx.integer = true;
-		nat_offset += 4;
-	}
-
 	// indexed position formats may skip the vertex
 	if (m_VtxDesc.Position & 2)
 	{
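
[Annotation, not part of the patch] The matrix index is a single byte in the GC stream, but the native declaration above advertises it as four unsigned-byte integer components and advances nat_offset by 4, so everything that follows stays 4-byte aligned, and the index now sits at the front of the native vertex instead of being appended at the end by the removed PosMtx_Write pass. Illustrative layout only (the struct below is invented; the real buffer is written field by field by the loader):

```cpp
// Front of the native vertex when PosMatIdx is enabled (sketch).
struct NativeVertexFront
{
	unsigned char posmtx[4]; // index in byte 0, bytes 1-3 are zero padding
	float position[3];       // decoded position starts at offset 4
};
static_assert(sizeof(NativeVertexFront) == 16, "attributes stay 4-byte aligned");
```
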
diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h
index f1b1eae64b..a7acd5dba1 100644
--- a/Source/Core/VideoCommon/VertexLoader.h
+++ b/Source/Core/VideoCommon/VertexLoader.h
@@ -58,7 +58,6 @@ public:
 
 	// Matrix components are first in GC format but later in PC format - we need to store it temporarily
 	// when decoding each vertex.
-	u8 m_curposmtx;
 	u8 m_curtexmtx[8];
 	int m_texmtxwrite;
 	int m_texmtxread;
diff --git a/Source/Core/VideoCommon/VertexLoaderBase.cpp b/Source/Core/VideoCommon/VertexLoaderBase.cpp
index fd3f186a45..815e162689 100644
--- a/Source/Core/VideoCommon/VertexLoaderBase.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderBase.cpp
@@ -10,11 +10,17 @@
 #include "VideoCommon/VertexLoader.h"
 #include "VideoCommon/VertexLoaderBase.h"
 
+#ifdef _M_X86_64
+#include "VideoCommon/VertexLoaderX64.h"
+#endif
+
 VertexLoaderBase::VertexLoaderBase(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
 {
 	m_numLoadedVertices = 0;
 	m_VertexSize = 0;
 	m_native_vertex_format = nullptr;
+	m_native_components = 0;
+	memset(&m_native_vtx_decl, 0, sizeof(m_native_vtx_decl));
 
 	SetVAT(vtx_attr);
 	m_VtxDesc = vtx_desc;
@@ -198,15 +204,22 @@ VertexLoaderBase* VertexLoaderBase::CreateVertexLoader(const TVtxDesc& vtx_desc,
 {
 	VertexLoaderBase* loader;
 
-#if 0
+//#define COMPARE_VERTEXLOADERS
+
+#if defined(COMPARE_VERTEXLOADERS) && defined(_M_X86_64)
 	// first try: Any new VertexLoader vs the old one
 	loader = new VertexLoaderTester(
 			new VertexLoader(vtx_desc, vtx_attr), // the software one
-			new VertexLoader(vtx_desc, vtx_attr), // the new one to compare
+			new VertexLoaderX64(vtx_desc, vtx_attr), // the new one to compare
 			vtx_desc, vtx_attr);
 	if (loader->IsInitialized())
 		return loader;
 	delete loader;
+#elif defined(_M_X86_64)
+	loader = new VertexLoaderX64(vtx_desc, vtx_attr);
+	if (loader->IsInitialized())
+		return loader;
+	delete loader;
 #endif
 
 	// last try: The old VertexLoader
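
[Annotation, not part of the patch] CreateVertexLoader now tries the JIT first and quietly falls back to the interpreted loader whenever IsInitialized() declines, and flipping COMPARE_VERTEXLOADERS on runs both loaders side by side through VertexLoaderTester instead. Stripped of the diff context, the selection pattern is just this (error handling and the comparison path omitted):

```cpp
// Simplified sketch of the selection logic above.
VertexLoaderBase* CreateLoaderSketch(const TVtxDesc& vtx_desc, const VAT& vtx_attr)
{
#ifdef _M_X86_64
	VertexLoaderBase* jit = new VertexLoaderX64(vtx_desc, vtx_attr);
	if (jit->IsInitialized())
		return jit;
	delete jit; // JIT refused (e.g. missing CPU features); fall through
#endif
	return new VertexLoader(vtx_desc, vtx_attr); // reference C++ loader
}
```
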
diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp
new file mode 100644
index 0000000000..db29d8c9b7
--- /dev/null
+++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp
@@ -0,0 +1,373 @@
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include "Common/CPUDetect.h"
+#include "Common/x64ABI.h"
+#include "VideoCommon/VertexLoaderX64.h"
+
+using namespace Gen;
+
+static const X64Reg src_reg = ABI_PARAM1;
+static const X64Reg dst_reg = ABI_PARAM2;
+static const X64Reg count_reg = ABI_PARAM3;
+static const X64Reg scratch1 = RAX;
+static const X64Reg scratch2 = R8;
+static const X64Reg skipped_reg = R9;
+
+VertexLoaderX64::VertexLoaderX64(const TVtxDesc& vtx_desc, const VAT& vtx_att): VertexLoaderBase(vtx_desc, vtx_att)
+{
+	if (!IsInitialized())
+		return;
+
+	AllocCodeSpace(4096);
+	ClearCodeSpace();
+	GenerateVertexLoader();
+	WriteProtect();
+}
+
+OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute)
+{
+	OpArg data = MDisp(src_reg, m_src_ofs);
+	if (attribute & MASK_INDEXED)
+	{
+		if (attribute == INDEX8)
+		{
+			MOVZX(64, 8, scratch1, data);
+			m_src_ofs += 1;
+		}
+		else
+		{
+			MOVZX(64, 16, scratch1, data);
+			m_src_ofs += 2;
+			// Convert to little-endian.
+			ROR(16, R(scratch1), Imm8(8));
+		}
+		if (array == ARRAY_POSITION)
+		{
+			CMP(attribute == INDEX8 ? 8 : 16, R(scratch1), Imm8(-1));
+			m_skip_vertex = J_CC(CC_E, true);
+		}
+		// TODO: Move cached_arraybases into CPState and use MDisp() relative to a
+		// constant register loaded with &g_main_cp_state.
+		IMUL(32, scratch1, M(&g_main_cp_state.array_strides[array]));
+		MOV(64, R(scratch2), M(&cached_arraybases[array]));
+		return MRegSum(scratch1, scratch2);
+	}
+	else
+	{
+		return data;
+	}
+}
+
+int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format)
+{
+	static const __m128i shuffle_lut[5][3] = {
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L),  // 1x u8
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L),  // 2x u8
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L)}, // 3x u8
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00FFFFFFL),  // 1x s8
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL),  // 2x s8
+		 _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL)}, // 3x s8
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0001L),  // 1x u16
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L),  // 2x u16
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L)}, // 3x u16
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x0001FFFFL),  // 1x s16
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL),  // 2x s16
+		 _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL)}, // 3x s16
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L),  // 1x float
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L),  // 2x float
+		 _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)}, // 3x float
+	};
+	static const __m128 scale_factors[32] = {
+		_mm_set_ps1(1./(1u<< 0)), _mm_set_ps1(1./(1u<< 1)), _mm_set_ps1(1./(1u<< 2)), _mm_set_ps1(1./(1u<< 3)),
+		_mm_set_ps1(1./(1u<< 4)), _mm_set_ps1(1./(1u<< 5)), _mm_set_ps1(1./(1u<< 6)), _mm_set_ps1(1./(1u<< 7)),
+		_mm_set_ps1(1./(1u<< 8)), _mm_set_ps1(1./(1u<< 9)), _mm_set_ps1(1./(1u<<10)), _mm_set_ps1(1./(1u<<11)),
+		_mm_set_ps1(1./(1u<<12)), _mm_set_ps1(1./(1u<<13)), _mm_set_ps1(1./(1u<<14)), _mm_set_ps1(1./(1u<<15)),
+		_mm_set_ps1(1./(1u<<16)), _mm_set_ps1(1./(1u<<17)), _mm_set_ps1(1./(1u<<18)), _mm_set_ps1(1./(1u<<19)),
+		_mm_set_ps1(1./(1u<<20)), _mm_set_ps1(1./(1u<<21)), _mm_set_ps1(1./(1u<<22)), _mm_set_ps1(1./(1u<<23)),
+		_mm_set_ps1(1./(1u<<24)), _mm_set_ps1(1./(1u<<25)), _mm_set_ps1(1./(1u<<26)), _mm_set_ps1(1./(1u<<27)),
+		_mm_set_ps1(1./(1u<<28)), _mm_set_ps1(1./(1u<<29)), _mm_set_ps1(1./(1u<<30)), _mm_set_ps1(1./(1u<<31)),
+	};
+
+	X64Reg coords = XMM0;
+
+	int elem_size = 1 << (format / 2);
+	int load_bytes = elem_size * count_in;
+	if (load_bytes >= 8)
+		MOVDQU(coords, data);
+	else if (load_bytes >= 4)
+		MOVQ_xmm(coords, data);
+	else
+		MOVD_xmm(coords, data);
+
+	PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
+
+	if (format != FORMAT_FLOAT)
+	{
+		// Sign extend
+		if (format == FORMAT_BYTE)
+			PSRAD(coords, 24);
+		if (format == FORMAT_SHORT)
+			PSRAD(coords, 16);
+
+		CVTDQ2PS(coords, R(coords));
+
+		if (scaling_exponent)
+			MULPS(coords, M(&scale_factors[scaling_exponent]));
+	}
+
+	OpArg dest = MDisp(dst_reg, m_dst_ofs);
+	switch (count_out)
+	{
+	case 1: MOVSS(dest, coords); break;
+	case 2: MOVLPS(dest, coords); break;
+	case 3: MOVUPS(dest, coords); break;
+	}
+
+	native_format->components = count_out;
+	native_format->enable = true;
+	native_format->offset = m_dst_ofs;
+	native_format->type = VAR_FLOAT;
+	native_format->integer = false;
+	m_dst_ofs += sizeof(float) * count_out;
+
+	if (attribute == DIRECT)
+		m_src_ofs += load_bytes;
+
+	return load_bytes;
+}
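
[Annotation, not part of the patch] Per component, the SSE emitted by ReadVertex is equivalent to byte-swapping the big-endian source value, sign- or zero-extending it, converting to float, and multiplying by 1/2^frac (the PSHUFB lookup table does the swap and the placement into 32-bit lanes in one step). A scalar reference for the signed 16-bit case:

```cpp
// Scalar reference (sketch) for one s16 component of ReadVertex's output.
#include <cstdint>

float DecodeComponentS16(const uint8_t* src, uint8_t frac)
{
	// GC vertex data is big-endian; assemble, sign-extend, then scale.
	int16_t raw = static_cast<int16_t>((src[0] << 8) | src[1]);
	return static_cast<float>(raw) / static_cast<float>(1u << frac);
}
```
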
+
+// TODO: generate alternative code for pre-BMI2/MOVBE CPUs
+
+void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format, int elements)
+{
+	int load_bytes = 0;
+	static const u32 mask_565 = 0xF8FCF800;
+	static const u32 mask_0f = 0x0F0F0F0F;
+	static const u32 mask_f0 = 0xF0F0F0F0;
+	static const u32 mask_fc = 0xFCFCFCFC;
+	switch (format)
+	{
+	case FORMAT_24B_888:
+	case FORMAT_32B_888x:
+	case FORMAT_32B_8888:
+		MOV(32, R(scratch1), data);
+		if (format != FORMAT_32B_8888 || !elements)
+			OR(32, R(scratch1), Imm32(0xFF000000));
+		MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));
+		load_bytes = 3 + (format != FORMAT_24B_888);
+		break;
+
+	case FORMAT_16B_565:
+		//                   RRRRRGGG GGGBBBBB
+		// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
+		MOVBE(16, scratch1, data);
+		PDEP(32, scratch1, scratch1, M(&mask_565));
+
+		MOV(32, R(scratch2), R(scratch1));
+		SHR(32, R(scratch1), Imm8(5));
+		AND(32, R(scratch1), Imm32(0x07000700));
+		OR(32, R(scratch1), R(scratch2));
+
+		SHR(32, R(scratch2), Imm8(6));
+		AND(32, R(scratch2), Imm32(0x00030000));
+		OR(32, R(scratch1), R(scratch2));
+
+		OR(8, R(scratch1), Imm8(0xFF));
+		MOVBE(32, MDisp(dst_reg, m_dst_ofs), scratch1);
+		load_bytes = 2;
+		break;
+
+	case FORMAT_16B_4444:
+		//                   RRRRGGGG BBBBAAAA
+		// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
+		MOVBE(16, scratch1, data);
+		PDEP(32, scratch2, scratch1, M(&mask_0f));
+		PDEP(32, scratch1, scratch1, M(&mask_f0));
+		OR(32, R(scratch1), R(scratch2));
+		MOVBE(32, MDisp(dst_reg, m_dst_ofs), scratch1);
+		load_bytes = 2;
+		break;
+
+	case FORMAT_24B_6666:
+		//          RRRRRRGG GGGGBBBB BBAAAAAA
+		// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
+		data.offset -= 1;
+		MOVBE(32, scratch1, data);
+		PDEP(32, scratch1, scratch1, M(&mask_fc));
+
+		MOV(32, R(scratch2), R(scratch1));
+		SHR(32, R(scratch2), Imm8(6));
+		AND(32, R(scratch2), Imm32(0x03030303));
+		OR(32, R(scratch1), R(scratch2));
+
+		MOVBE(32, MDisp(dst_reg, m_dst_ofs), scratch1);
+		load_bytes = 3;
+		break;
+	}
+	if (attribute == DIRECT)
+		m_src_ofs += load_bytes;
+}
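
[Annotation, not part of the patch] In the FORMAT_16B_565 path, PDEP scatters the packed 5-6-5 source bits into the top bits of the R, G and B output bytes (that is exactly what the 0xF8FCF800 mask encodes: 0xF8, 0xFC, 0xF8 set bits per byte), and the SHR/AND/OR pairs then replicate the high bits downward so that full-scale 5- and 6-bit inputs expand to exactly 0xFF. A scalar mirror of the emitted sequence:

```cpp
// Scalar mirror (sketch) of the generated FORMAT_16B_565 code.
#include <cstdint>
#include <immintrin.h> // _pdep_u32, requires BMI2

uint32_t Decode565(uint16_t rgb565) // input bits: RRRRRGGGGGGBBBBB
{
	uint32_t v = _pdep_u32(rgb565, 0xF8FCF800u); // bytes: [R<<3][G<<2][B<<3][0]
	v |= (v >> 5) & 0x07000700u; // replicate top 3 bits of R and B
	v |= (v >> 6) & 0x00030000u; // replicate top 2 bits of G
	v |= 0xFFu;                  // force opaque alpha
	return v; // value is 0xRRGGBBAA; the MOVBE store makes memory order R,G,B,A
}
```
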
+
+void VertexLoaderX64::GenerateVertexLoader()
+{
+	// Backup count since we're going to count it down.
+	PUSH(32, R(count_reg));
+
+	if (m_VtxDesc.Position & MASK_INDEXED)
+		XOR(32, R(skipped_reg), R(skipped_reg));
+
+	// TODO: load constants into registers outside the main loop
+
+	const u8* loop_start = GetCodePtr();
+
+	if (m_VtxDesc.PosMatIdx)
+	{
+		MOVZX(32, 8, scratch1, MDisp(src_reg, m_src_ofs));
+		AND(32, R(scratch1), Imm8(0x3F));
+		MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));
+		m_native_components |= VB_HAS_POSMTXIDX;
+		m_native_vtx_decl.posmtx.components = 4;
+		m_native_vtx_decl.posmtx.enable = true;
+		m_native_vtx_decl.posmtx.offset = m_dst_ofs;
+		m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
+		m_native_vtx_decl.posmtx.integer = true;
+		m_src_ofs += sizeof(u8);
+		m_dst_ofs += sizeof(u32);
+	}
+
+	u32 texmatidx_ofs[8];
+	const u64 tm[8] = {
+		m_VtxDesc.Tex0MatIdx, m_VtxDesc.Tex1MatIdx, m_VtxDesc.Tex2MatIdx, m_VtxDesc.Tex3MatIdx,
+		m_VtxDesc.Tex4MatIdx, m_VtxDesc.Tex5MatIdx, m_VtxDesc.Tex6MatIdx, m_VtxDesc.Tex7MatIdx,
+	};
+	for (int i = 0; i < 8; i++)
+	{
+		if (tm[i])
+			texmatidx_ofs[i] = m_src_ofs++;
+	}
+
+	OpArg data = GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position);
+	ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, m_VtxAttr.PosFrac, &m_native_vtx_decl.position);
+
+	if (m_VtxDesc.Normal)
+	{
+		static const u8 map[] = {7, 6, 15, 14, 30};
+		u8 scaling_exponent = map[m_VtxAttr.NormalFormat];
+
+		for (int i = 0; i < (m_VtxAttr.NormalElements ? 3 : 1); i++)
+		{
+			if (!i || m_VtxAttr.NormalIndex3)
+			{
+				data = GetVertexAddr(ARRAY_NORMAL, m_VtxDesc.Normal);
+				int elem_size = 1 << (m_VtxAttr.NormalFormat / 2);
+				data.offset += i * elem_size * 3;
+			}
+			data.offset += ReadVertex(data, m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, scaling_exponent, &m_native_vtx_decl.normals[i]);
+		}
+
+		m_native_components |= VB_HAS_NRM0;
+		if (m_VtxAttr.NormalElements)
+			m_native_components |= VB_HAS_NRM1 | VB_HAS_NRM2;
+	}
+
+	const u64 col[2] = {m_VtxDesc.Color0, m_VtxDesc.Color1};
+	for (int i = 0; i < 2; i++)
+	{
+		if (col[i])
+		{
+			data = GetVertexAddr(ARRAY_COLOR + i, col[i]);
+			ReadColor(data, col[i], m_VtxAttr.color[i].Comp, m_VtxAttr.color[i].Elements);
+			m_native_components |= VB_HAS_COL0 << i;
+			m_native_vtx_decl.colors[i].components = 4;
+			m_native_vtx_decl.colors[i].enable = true;
+			m_native_vtx_decl.colors[i].offset = m_dst_ofs;
+			m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
+			m_native_vtx_decl.colors[i].integer = false;
+			m_dst_ofs += 4;
+		}
+	}
+
+	const u64 tc[8] = {
+		m_VtxDesc.Tex0Coord, m_VtxDesc.Tex1Coord, m_VtxDesc.Tex2Coord, m_VtxDesc.Tex3Coord,
+		m_VtxDesc.Tex4Coord, m_VtxDesc.Tex5Coord, m_VtxDesc.Tex6Coord, m_VtxDesc.Tex7Coord,
+	};
+	for (int i = 0; i < 8; i++)
+	{
+		int elements = m_VtxAttr.texCoord[i].Elements + 1;
+		if (tc[i])
+		{
+			data = GetVertexAddr(ARRAY_TEXCOORD0 + i, tc[i]);
+			u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac;
+			ReadVertex(data, tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements, scaling_exponent, &m_native_vtx_decl.texcoords[i]);
+			m_native_components |= VB_HAS_UV0 << i;
+		}
+		if (tm[i])
+		{
+			m_native_components |= VB_HAS_TEXMTXIDX0 << i;
+			m_native_vtx_decl.texcoords[i].components = 3;
+			m_native_vtx_decl.texcoords[i].enable = true;
+			m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
+			m_native_vtx_decl.texcoords[i].integer = false;
+			MOVZX(64, 8, scratch1, MDisp(src_reg, texmatidx_ofs[i]));
+			if (tc[i])
+			{
+				CVTSI2SS(XMM0, R(scratch1));
+				MOVSS(MDisp(dst_reg, m_dst_ofs), XMM0);
+				m_dst_ofs += sizeof(float);
+			}
+			else
+			{
+				m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
+				PXOR(XMM0, R(XMM0));
+				CVTSI2SS(XMM0, R(scratch1));
+				SHUFPS(XMM0, R(XMM0), 0x45);
+				MOVUPS(MDisp(dst_reg, m_dst_ofs), XMM0);
+				m_dst_ofs += sizeof(float) * 3;
+			}
+		}
+	}
+
+	// Prepare for the next vertex.
+	ADD(64, R(dst_reg), Imm32(m_dst_ofs));
+	const u8* cont = GetCodePtr();
+	ADD(64, R(src_reg), Imm32(m_src_ofs));
+
+	SUB(32, R(count_reg), Imm8(1));
+	J_CC(CC_NZ, loop_start);
+
+	// Get the original count.
+	POP(32, R(ABI_RETURN));
+
+	if (m_VtxDesc.Position & MASK_INDEXED)
+	{
+		SUB(32, R(ABI_RETURN), R(skipped_reg));
+		RET();
+
+		SetJumpTarget(m_skip_vertex);
+		// One whole vertex was skipped, regardless of index width.
+		ADD(32, R(skipped_reg), Imm8(1));
+		JMP(cont);
+	}
+	else
+	{
+		RET();
+	}
+
+	m_VertexSize = m_src_ofs;
+	m_native_vtx_decl.stride = m_dst_ofs;
+}
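
[Annotation, not part of the patch] The m_skip_vertex branch taken in GetVertexAddr when an indexed position reads an all-ones index lands at the end of the loop body: the vertex is consumed from the source stream, nothing is written to the destination (dst_reg is not advanced), and the final return value is the original count minus the skips. In scalar form the generated loop behaves like this sketch; the decode step and the stride constants below are invented stand-ins for the emitted code:

```cpp
// Behavioral sketch of the generated loop's skip-vertex protocol.
#include <cstdint>

constexpr int kSrcStride = 8;  // hypothetical source vertex size
constexpr int kDstStride = 16; // hypothetical native vertex size

int RunLoopSketch(const uint8_t* src, uint8_t* dst, int count)
{
	int skipped = 0;
	for (int i = 0; i < count; i++)
	{
		// Big-endian 16-bit position index at the front of the vertex.
		uint16_t index = static_cast<uint16_t>((src[0] << 8) | src[1]);
		if (index == 0xFFFF) // all-ones index: no vertex here
		{
			skipped++;
			src += kSrcStride; // consume the input, emit nothing
			continue;
		}
		for (int b = 0; b < kDstStride; b++) // stand-in for real decoding
			dst[b] = 0;
		src += kSrcStride;
		dst += kDstStride;
	}
	return count - skipped; // number of vertices actually emitted
}
```
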
+
+bool VertexLoaderX64::IsInitialized()
+{
+	// The generated code uses PSHUFB, MOVBE and PDEP (see the TODO above ReadColor).
+	return cpu_info.bSSSE3 && cpu_info.bMOVBE && cpu_info.bBMI2;
+}
+
+int VertexLoaderX64::RunVertices(int primitive, int count, DataReader src, DataReader dst)
+{
+	m_numLoadedVertices += count;
+	return ((int (*)(u8* src, u8* dst, int count))region)(src.GetPointer(), dst.GetPointer(), count);
+}
diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h
new file mode 100644
index 0000000000..e578fb49d8
--- /dev/null
+++ b/Source/Core/VideoCommon/VertexLoaderX64.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "Common/x64Emitter.h"
+#include "VideoCommon/VertexLoaderBase.h"
+
+class VertexLoaderX64 : public VertexLoaderBase, public Gen::X64CodeBlock
+{
+public:
+	VertexLoaderX64(const TVtxDesc& vtx_desc, const VAT& vtx_att);
+
+protected:
+	std::string GetName() const override { return "VertexLoaderJit"; }
+	bool IsInitialized() override;
+	int RunVertices(int primitive, int count, DataReader src, DataReader dst) override;
+
+private:
+	u32 m_src_ofs = 0;
+	u32 m_dst_ofs = 0;
+	Gen::FixupBranch m_skip_vertex;
+	Gen::OpArg GetVertexAddr(int array, u64 attribute);
+	int ReadVertex(Gen::OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format);
+	void ReadColor(Gen::OpArg data, u64 attribute, int format, int elements);
+	void GenerateVertexLoader();
+};
diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj
index b4db374e28..f76c59e8ba 100644
--- a/Source/Core/VideoCommon/VideoCommon.vcxproj
+++ b/Source/Core/VideoCommon/VideoCommon.vcxproj
@@ -66,6 +66,7 @@
     <ClCompile Include="VertexLoader.cpp" />
     <ClCompile Include="VertexLoaderBase.cpp" />
     <ClCompile Include="VertexLoaderManager.cpp" />
+    <ClCompile Include="VertexLoaderX64.cpp" />
     <ClCompile Include="VertexLoader_Color.cpp" />
     <ClCompile Include="VertexLoader_Normal.cpp" />
     <ClCompile Include="VertexLoader_Position.cpp" />
diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters
index d00ee5518c..eca9287439 100644
--- a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters
+++ b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters
@@ -122,6 +122,9 @@
     <ClCompile Include="VertexLoaderManager.cpp">
      <Filter>Vertex Loading</Filter>
    </ClCompile>
+    <ClCompile Include="VertexLoaderX64.cpp">
+      <Filter>Vertex Loading</Filter>
+    </ClCompile>
     <ClCompile Include="VertexLoader_Color.cpp">
      <Filter>Vertex Loading</Filter>
    </ClCompile>
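
[Closing annotation, not part of the patch] The generated block is entered exactly like a C function, which is what the cast in RunVertices relies on: src_reg, dst_reg and count_reg were declared as ABI_PARAM1/2/3, so the block has the signature int(u8* src, u8* dst, int count) and returns the number of vertices actually emitted. Invocation boils down to this sketch, where 'region' is the X64CodeBlock buffer that GenerateVertexLoader() filled:

```cpp
// Sketch of the call RunVertices performs.
using LoaderFn = int (*)(u8* src, u8* dst, int count);

int Invoke(u8* region, u8* src, u8* dst, int count)
{
	LoaderFn fn = reinterpret_cast<LoaderFn>(region);
	return fn(src, dst, count); // count minus any skipped vertices
}
```
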