Vertex Loader: SSE implementations of more position/texcoord/normal formats

~35-45% faster in NFS:HP2, and possibly in other vertex-bound games.
This commit is contained in:
Fiora
2014-11-11 01:48:38 -08:00
parent da962a3d2b
commit 3ddf82a318
7 changed files with 299 additions and 106 deletions

View File

@ -41,8 +41,9 @@ static int s_texmtxread = 0;
int tcIndex;
int colIndex;
int colElements[2];
// NOTE(review): this span comes from a diff view whose +/- markers were
// stripped. The scalar posScale / tcScale[8] declarations directly below are
// presumably the removed lines and the aligned arrays further down their
// replacements -- confirm against the real file before relying on this text.
float posScale;
float tcScale[8];
// Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT
// All four floats of posScale carry the same position scale, and both floats
// of tcScale[i] carry the same scale for texture coordinate i.
GC_ALIGNED128(float posScale[4]);
GC_ALIGNED64(float tcScale[8][2]);
static const float fractionTable[32] = {
1.0f / (1U << 0), 1.0f / (1U << 1), 1.0f / (1U << 2), 1.0f / (1U << 3),
@ -65,10 +66,8 @@ static void LOADERDECL PosMtx_ReadDirect_UByte()
// Passes the current position-matrix index (s_curposmtx) to DataWrite.
// NOTE(review): this span comes from a diff view whose +/- markers were
// stripped, so it appears to contain both the removed sequence (one u8 index
// followed by three u8 zeros) and its replacement (a single u32 write).
// Confirm which lines are live before relying on this text.
static void LOADERDECL PosMtx_Write()
{
// Byte-wise form: the index, then three zero bytes.
DataWrite<u8>(s_curposmtx);
DataWrite<u8>(0);
DataWrite<u8>(0);
DataWrite<u8>(0);
// u8, 0, 0, 0
// Single 32-bit form. Presumably produces the same four bytes as the
// byte-wise form; that depends on the byte order DataWrite<u32> uses
// (TODO confirm).
DataWrite<u32>(s_curposmtx);
}
static void LOADERDECL TexMtx_ReadDirect_UByte()
@ -451,10 +450,10 @@ void VertexLoader::SetupRunVertices(const VAT& vat, int primitive, int const cou
m_VtxAttr.texCoord[6].Frac = vat.g2.Tex6Frac;
m_VtxAttr.texCoord[7].Frac = vat.g2.Tex7Frac;
posScale = fractionTable[m_VtxAttr.PosFrac];
posScale[0] = posScale[1] = posScale[2] = posScale[3] = fractionTable[m_VtxAttr.PosFrac];
if (m_native_components & VB_HAS_UVALL)
for (int i = 0; i < 8; i++)
tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
tcScale[i][0] = tcScale[i][1] = fractionTable[m_VtxAttr.texCoord[i].Frac];
for (int i = 0; i < 2; i++)
colElements[i] = m_VtxAttr.color[i].Elements;