mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-07-23 14:19:46 -06:00
TextureDecoder: Use target attributes on SSSE3 decoders
This commit is contained in:
@ -249,11 +249,11 @@ static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int hei
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FUNCTION_TARGET_SSSE3
|
||||||
static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
int Wsteps4, int Wsteps8)
|
int Wsteps4, int Wsteps8)
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
|
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
|
||||||
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
|
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
|
||||||
|
|
||||||
@ -296,7 +296,6 @@ static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat,
|
static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
@ -389,11 +388,11 @@ static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int hei
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FUNCTION_TARGET_SSSE3
|
||||||
static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
int Wsteps4, int Wsteps8)
|
int Wsteps4, int Wsteps8)
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
// xsacha optimized with SSSE3 intrinsics
|
// xsacha optimized with SSSE3 intrinsics
|
||||||
// Produces a ~10% speed improvement over SSE2 implementation
|
// Produces a ~10% speed improvement over SSE2 implementation
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
@ -418,7 +417,6 @@ static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat,
|
static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
@ -572,11 +570,11 @@ static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int he
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FUNCTION_TARGET_SSSE3
|
||||||
static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
int Wsteps4, int Wsteps8)
|
int Wsteps4, int Wsteps8)
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
// xsacha optimized with SSSE3 intrinsics.
|
// xsacha optimized with SSSE3 intrinsics.
|
||||||
// Produces an ~50% speed improvement over SSE2 implementation.
|
// Produces an ~50% speed improvement over SSE2 implementation.
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
@ -595,7 +593,6 @@ static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat,
|
static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
@ -767,11 +764,11 @@ static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FUNCTION_TARGET_SSSE3
|
||||||
static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
int Wsteps4, int Wsteps8)
|
int Wsteps4, int Wsteps8)
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
|
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
|
||||||
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
|
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
|
||||||
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
|
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
|
||||||
@ -872,7 +869,6 @@ static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int widt
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height,
|
||||||
@ -995,11 +991,11 @@ static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FUNCTION_TARGET_SSSE3
|
||||||
static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
int Wsteps4, int Wsteps8)
|
int Wsteps4, int Wsteps8)
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
// xsacha optimized with SSSE3 intrinsics
|
// xsacha optimized with SSSE3 intrinsics
|
||||||
// Produces a ~30% speed improvement over SSE2 implementation
|
// Produces a ~30% speed improvement over SSE2 implementation
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
@ -1028,7 +1024,6 @@ static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width
|
|||||||
_mm_storeu_si128(dst128, rgba11);
|
_mm_storeu_si128(dst128, rgba11);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height,
|
static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height,
|
||||||
@ -1414,14 +1409,6 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
|||||||
int Wsteps4 = (width + 3) / 4;
|
int Wsteps4 = (width + 3) / 4;
|
||||||
int Wsteps8 = (width + 7) / 8;
|
int Wsteps8 = (width + 7) / 8;
|
||||||
|
|
||||||
// If the binary was not compiled with SSSE3 support, the functions turn into no-ops.
|
|
||||||
// Therefore, we shouldn't call them based on what the CPU reports at runtime alone.
|
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
bool has_SSSE3 = cpu_info.bSSSE3;
|
|
||||||
#else
|
|
||||||
bool has_SSSE3 = false;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
switch (texformat)
|
switch (texformat)
|
||||||
{
|
{
|
||||||
case GX_TF_C4:
|
case GX_TF_C4:
|
||||||
@ -1429,7 +1416,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case GX_TF_I4:
|
case GX_TF_I4:
|
||||||
if (has_SSSE3)
|
if (cpu_info.bSSSE3)
|
||||||
TexDecoder_DecodeImpl_I4_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
TexDecoder_DecodeImpl_I4_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
Wsteps8);
|
Wsteps8);
|
||||||
else
|
else
|
||||||
@ -1437,7 +1424,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case GX_TF_I8:
|
case GX_TF_I8:
|
||||||
if (has_SSSE3)
|
if (cpu_info.bSSSE3)
|
||||||
TexDecoder_DecodeImpl_I8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
TexDecoder_DecodeImpl_I8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
Wsteps8);
|
Wsteps8);
|
||||||
else
|
else
|
||||||
@ -1453,7 +1440,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case GX_TF_IA8:
|
case GX_TF_IA8:
|
||||||
if (has_SSSE3)
|
if (cpu_info.bSSSE3)
|
||||||
TexDecoder_DecodeImpl_IA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
TexDecoder_DecodeImpl_IA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
Wsteps8);
|
Wsteps8);
|
||||||
else
|
else
|
||||||
@ -1472,7 +1459,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case GX_TF_RGB5A3:
|
case GX_TF_RGB5A3:
|
||||||
if (has_SSSE3)
|
if (cpu_info.bSSSE3)
|
||||||
TexDecoder_DecodeImpl_RGB5A3_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
TexDecoder_DecodeImpl_RGB5A3_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
Wsteps8);
|
Wsteps8);
|
||||||
else
|
else
|
||||||
@ -1481,7 +1468,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case GX_TF_RGBA8:
|
case GX_TF_RGBA8:
|
||||||
if (has_SSSE3)
|
if (cpu_info.bSSSE3)
|
||||||
TexDecoder_DecodeImpl_RGBA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
TexDecoder_DecodeImpl_RGBA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
Wsteps8);
|
Wsteps8);
|
||||||
else
|
else
|
||||||
|
Reference in New Issue
Block a user