Merge pull request #2192 from Tilka/sse2

VertexLoaderX64: support SSE2 as a fallback
This commit is contained in:
skidau
2015-03-21 12:58:24 +11:00
10 changed files with 326 additions and 228 deletions

View File

@ -1638,6 +1638,7 @@ void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, ar
// Unpack-low family. Each wrapper forwards 0x66 and its opcode byte to
// WriteSSEOp (these match the 66 0F xx encodings in the Intel manual).
// PUNPCKLBW (0x60): interleave the low-order bytes of dest and arg.
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);}
// PUNPCKLWD (0x61): interleave the low-order 16-bit words of dest and arg.
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);}
// PUNPCKLDQ (0x62): interleave the low-order 32-bit doublewords of dest and arg.
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);}
// PUNPCKLQDQ (0x6C): interleave the low 64-bit quadwords of dest and arg.
void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6C, dest, arg);}
void XEmitter::PSRLW(X64Reg reg, int shift)
{

View File

@ -680,6 +680,7 @@ public:
// Unpack-low family: interleave the low-order elements of dest and arg
// (BW = bytes, WD = 16-bit words, DQ = 32-bit doublewords, QDQ = 64-bit quadwords).
void PUNPCKLBW(X64Reg dest, const OpArg &arg);
void PUNPCKLWD(X64Reg dest, const OpArg &arg);
void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
void PUNPCKLQDQ(X64Reg dest, const OpArg &arg);
void PTEST(X64Reg dest, OpArg arg);
void PAND(X64Reg dest, OpArg arg);

View File

@ -100,16 +100,17 @@ static T ReadNormalized(I value)
}
template <typename T, bool swap = false>
static void ReadVertexAttribute(T* dst, DataReader src, const AttributeFormat& format, int base_component, int max_components, bool reverse)
static void ReadVertexAttribute(T* dst, DataReader src, const AttributeFormat& format, int base_component, int components, bool reverse)
{
if (format.enable)
{
src.Skip(format.offset);
src.Skip(base_component * (1<<(format.type>>1)));
for (int i = 0; i < std::min(format.components - base_component, max_components); i++)
int i;
for (i = 0; i < std::min(format.components - base_component, components); i++)
{
int i_dst = reverse ? max_components - i - 1 : i;
int i_dst = reverse ? components - i - 1 : i;
switch (format.type)
{
case VAR_UNSIGNED_BYTE:
@ -131,6 +132,11 @@ static void ReadVertexAttribute(T* dst, DataReader src, const AttributeFormat& f
_assert_msg_(VIDEO, !format.integer || format.type != VAR_FLOAT, "only non-float values are allowed to be streamed as integer");
}
// Pad the components that the read loop above did not fill (i carries on
// from where that loop stopped): each one is written as 0, except
// component index 3, which is written as 1 (`i == 3` is a bool converted to T).
for (; i < components; i++)
{
// Same mirrored destination index as the read loop when `reverse` is set.
int i_dst = reverse ? components - i - 1 : i;
dst[i_dst] = i == 3;
}
}
}

View File

@ -131,12 +131,13 @@ void VertexLoader::CompileVertexTranslator()
WriteCall(VertexLoader_Position::GetFunction(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements));
m_VertexSize += VertexLoader_Position::GetSize(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements);
m_native_vtx_decl.position.components = 3;
int pos_elements = m_VtxAttr.PosElements + 2;
m_native_vtx_decl.position.components = pos_elements;
m_native_vtx_decl.position.enable = true;
m_native_vtx_decl.position.offset = nat_offset;
m_native_vtx_decl.position.type = VAR_FLOAT;
m_native_vtx_decl.position.integer = false;
nat_offset += 12;
nat_offset += pos_elements * sizeof(float);
// Normals
if (m_VtxDesc.Normal != NOT_PRESENT)

View File

@ -370,8 +370,9 @@ void VertexLoaderARM64::GenerateVertexLoader()
load_size <<= 3;
s32 offset = GetAddressImm(ARRAY_POSITION, m_VtxDesc.Position, EncodeRegTo64(scratch1_reg), load_size);
ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3,
m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset);
int pos_elements = m_VtxAttr.PosElements + 2;
ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, pos_elements, pos_elements,
m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset);
}
if (m_VtxDesc.Normal)

View File

@ -1,3 +1,4 @@
#include "Common/BitSet.h"
#include "Common/CPUDetect.h"
#include "Common/Intrinsics.h"
#include "Common/JitRegister.h"
@ -6,8 +7,6 @@
using namespace Gen;
#define VERTEX_LOADER_REGS {XMM0+16}
static const X64Reg src_reg = ABI_PARAM1;
static const X64Reg dst_reg = ABI_PARAM2;
static const X64Reg scratch1 = RAX;
@ -66,7 +65,7 @@ OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute)
int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format)
{
static const __m128i shuffle_lut[5][3] = {
static const __m128i shuffle_lut[4][3] = {
{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L), // 1x u8
_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L), // 2x u8
_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L)}, // 3x u8
@ -79,9 +78,6 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x0001FFFFL), // 1x s16
_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL), // 2x s16
_mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL)}, // 3x s16
{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L), // 1x float
_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L), // 2x float
_mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)}, // 3x float
};
static const __m128 scale_factors[32] = {
_mm_set_ps1(1./(1u<< 0)), _mm_set_ps1(1./(1u<< 1)), _mm_set_ps1(1./(1u<< 2)), _mm_set_ps1(1./(1u<< 3)),
@ -98,47 +94,109 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
int elem_size = 1 << (format / 2);
int load_bytes = elem_size * count_in;
if (load_bytes > 8)
MOVDQU(coords, data);
else if (load_bytes > 4)
MOVQ_xmm(coords, data);
else
MOVD_xmm(coords, data);
PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
if (format != FORMAT_FLOAT)
{
// Sign extend
if (format == FORMAT_BYTE)
PSRAD(coords, 24);
if (format == FORMAT_SHORT)
PSRAD(coords, 16);
CVTDQ2PS(coords, R(coords));
if (dequantize && scaling_exponent)
MULPS(coords, M(&scale_factors[scaling_exponent]));
}
OpArg dest = MDisp(dst_reg, m_dst_ofs);
switch (count_out)
{
case 1: MOVSS(dest, coords); break;
case 2: MOVLPS(dest, coords); break;
case 3: MOVUPS(dest, coords); break;
}
native_format->components = count_out;
native_format->enable = true;
native_format->offset = m_dst_ofs;
native_format->type = VAR_FLOAT;
native_format->integer = false;
m_dst_ofs += sizeof(float) * count_out;
if (attribute == DIRECT)
m_src_ofs += load_bytes;
if (format == FORMAT_FLOAT)
{
// Floats don't need to be scaled or converted,
// so we can just load/swap/store them directly
// and return early.
for (int i = 0; i < count_in; i++)
{
LoadAndSwap(32, scratch3, data);
MOV(32, dest, R(scratch3));
data.offset += sizeof(float);
dest.offset += sizeof(float);
}
return load_bytes;
}
// Widen the packed integer components to one 32-bit lane each, so that the
// CVTDQ2PS that follows this statement can convert them to floats.
if (cpu_info.bSSSE3)
{
// SSSE3: load just enough bytes to cover the input, then let a single
// PSHUFB place every component using shuffle_lut[format][count_in - 1].
if (load_bytes > 8)
MOVDQU(coords, data);
else if (load_bytes > 4)
MOVQ_xmm(coords, data);
else
MOVD_xmm(coords, data);
PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
// Sign-extend.
// PSRAD is an arithmetic shift: it moves the value down from the top of
// its lane while replicating the sign bit. This relies on the shuffle
// having left the value in the top bits of each lane, as the s16 rows of
// shuffle_lut do.
if (format == FORMAT_BYTE)
PSRAD(coords, 24);
if (format == FORMAT_SHORT)
PSRAD(coords, 16);
}
else
{
// SSE2
// PSHUFB is not available here, so the same one-value-per-lane layout is
// built from unpack, byte-swap and word-shuffle instructions instead.
// XMM1 serves as a scratch register.
X64Reg temp = XMM1;
switch (format)
{
case FORMAT_UBYTE:
// Zero-extend: interleaving with an all-zero register twice widens
// bytes -> words -> dwords.
MOVD_xmm(coords, data);
PXOR(temp, R(temp));
PUNPCKLBW(coords, R(temp));
PUNPCKLWD(coords, R(temp));
break;
case FORMAT_BYTE:
// Interleaving the register with itself twice replicates each byte
// across its whole 32-bit lane; the arithmetic shift by 24 then keeps
// the top copy and sign-extends it.
MOVD_xmm(coords, data);
PUNPCKLBW(coords, R(coords));
PUNPCKLWD(coords, R(coords));
PSRAD(coords, 24);
break;
case FORMAT_USHORT:
case FORMAT_SHORT:
// In the diagrams below each character is one 16-bit word of the XMM
// register, highest word on the left. X/Y/Z are the byte-swapped
// components and '.' is a word whose content does not matter.
// The goal is to have each component in the upper word of its own
// 32-bit lane.
switch (count_in)
{
case 1:
LoadAndSwap(32, scratch3, data);
MOVD_xmm(coords, R(scratch3)); // ......X.
break;
case 2:
LoadAndSwap(32, scratch3, data);
MOVD_xmm(coords, R(scratch3)); // ......XY
PSHUFLW(coords, R(coords), 0x24); // ....Y.X.
break;
case 3:
LoadAndSwap(64, scratch3, data);
MOVQ_xmm(coords, R(scratch3)); // ....XYZ.
PUNPCKLQDQ(coords, R(coords)); // ..Z.XYZ.
PSHUFLW(coords, R(coords), 0xAC); // ..Z.Y.X.
break;
}
// Bring each component down from the upper word of its lane:
// arithmetic shift for signed input (sign-extends), logical shift for
// unsigned input (zero-extends).
if (format == FORMAT_SHORT)
PSRAD(coords, 16);
else
PSRLD(coords, 16);
break;
}
}
CVTDQ2PS(coords, R(coords));
if (dequantize && scaling_exponent)
MULPS(coords, M(&scale_factors[scaling_exponent]));
switch (count_out)
{
case 1: MOVSS(dest, coords); break;
case 2: MOVLPS(dest, coords); break;
case 3: MOVUPS(dest, coords); break;
}
return load_bytes;
}
@ -290,7 +348,10 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
void VertexLoaderX64::GenerateVertexLoader()
{
ABI_PushRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8);
// XMM registers to preserve around the generated loader. XMM1 is only used
// as scratch by the SSE2 fallback in ReadVertex, so it is included only
// when SSSE3 is unavailable.
// NOTE(review): the +16 presumably places XMM registers after the 16
// general-purpose register bits of the set — confirm against the ABI helpers.
BitSet32 xmm_regs;
xmm_regs[XMM0+16] = true;
xmm_regs[XMM1+16] = !cpu_info.bSSSE3;
ABI_PushRegistersAndAdjustStack(xmm_regs, 8);
// Backup count since we're going to count it down.
PUSH(32, R(ABI_PARAM3));
@ -332,7 +393,8 @@ void VertexLoaderX64::GenerateVertexLoader()
}
OpArg data = GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position);
ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3,
int pos_elements = 2 + m_VtxAttr.PosElements;
ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, pos_elements, pos_elements,
m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position);
if (m_VtxDesc.Normal)
@ -408,7 +470,7 @@ void VertexLoaderX64::GenerateVertexLoader()
m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
PXOR(XMM0, R(XMM0));
CVTSI2SS(XMM0, R(scratch1));
SHUFPS(XMM0, R(XMM0), 0x45);
SHUFPS(XMM0, R(XMM0), 0x45); // 000X -> 0X00
MOVUPS(MDisp(dst_reg, m_dst_ofs), XMM0);
m_dst_ofs += sizeof(float) * 3;
}
@ -426,7 +488,7 @@ void VertexLoaderX64::GenerateVertexLoader()
// Get the original count.
POP(32, R(ABI_RETURN));
ABI_PopRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8);
ABI_PopRegistersAndAdjustStack(xmm_regs, 8);
if (m_VtxDesc.Position & MASK_INDEXED)
{
@ -446,12 +508,6 @@ void VertexLoaderX64::GenerateVertexLoader()
m_native_vtx_decl.stride = m_dst_ofs;
}
bool VertexLoaderX64::IsInitialized()
{
// Uses PSHUFB.
return cpu_info.bSSSE3;
}
int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive)
{
m_numLoadedVertices += count;

View File

@ -8,7 +8,7 @@ public:
protected:
std::string GetName() const override { return "VertexLoaderX64"; }
bool IsInitialized() override;
// Always reports the loader as usable: ReadVertex emits an SSE2 sequence
// when SSSE3 (PSHUFB) is not available, so no CPU feature check is needed.
bool IsInitialized() override { return true; }
int RunVertices(DataReader src, DataReader dst, int count, int primitive) override;
private:

View File

@ -30,8 +30,8 @@ void LOADERDECL Pos_ReadDirect(VertexLoader* loader)
DataReader dst(g_vertex_manager_write_ptr, nullptr);
DataReader src(g_video_buffer_read_ptr, nullptr);
for (int i = 0; i < 3; ++i)
dst.Write(i < N ? PosScale(src.Read<T>(), scale) : 0.f);
for (int i = 0; i < N; ++i)
dst.Write(PosScale(src.Read<T>(), scale));
g_vertex_manager_write_ptr = dst.GetPointer();
g_video_buffer_read_ptr = src.GetPointer();
@ -50,8 +50,8 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader)
auto const scale = loader->m_posScale;
DataReader dst(g_vertex_manager_write_ptr, nullptr);
for (int i = 0; i < 3; ++i)
dst.Write(i < N ? PosScale(Common::FromBigEndian(data[i]), scale) : 0.f);
for (int i = 0; i < N; ++i)
dst.Write(PosScale(Common::FromBigEndian(data[i]), scale));
g_vertex_manager_write_ptr = dst.GetPointer();
LOG_VTX();