Video: Clearly separate Texture and EFB Copy formats

Improve bookkeeping around formats to make the code less confusing.

- Rename TlutFormat -> TLUTFormat to follow conventions.
- Use enum classes to prevent using a Texture format where an EFB Copy format
  is expected or vice versa.
- Use common EFBCopyFormat names regardless of depth and YUV configurations.
This commit is contained in:
N.E.C
2017-07-30 12:45:55 -07:00
parent 9649494f67
commit c3a57bbad5
27 changed files with 1275 additions and 1319 deletions

View File

@ -212,12 +212,13 @@ static void DecodeDXTBlock(u32* dst, const DXTBlock* src, int pitch)
// free to make the assumption that addresses are multiples of 16 in the aligned case.
// TODO: complete SSE2 optimization of less often used texture formats.
// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int height,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
switch (tlutfmt)
{
case GX_TL_RGB5A3:
case TLUTFormat::RGB5A3:
{
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
@ -226,7 +227,7 @@ static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int hei
}
break;
case GX_TL_IA8:
case TLUTFormat::IA8:
{
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
@ -235,7 +236,7 @@ static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int hei
}
break;
case GX_TL_RGB565:
case TLUTFormat::RGB565:
{
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
@ -251,8 +252,8 @@ static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int hei
FUNCTION_TARGET_SSSE3
static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
@ -298,8 +299,9 @@ static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, i
}
}
static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
@ -390,8 +392,8 @@ static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int hei
FUNCTION_TARGET_SSSE3
static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
// xsacha optimized with SSSE3 intrinsics
// Produces a ~10% speed improvement over SSE2 implementation
@ -419,8 +421,9 @@ static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, i
}
}
static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics.
// Produces an ~86% speed improvement over reference C implementation.
@ -518,12 +521,13 @@ static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int hei
}
}
static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int height,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
switch (tlutfmt)
{
case GX_TL_RGB5A3:
case TLUTFormat::RGB5A3:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
@ -532,7 +536,7 @@ static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int hei
}
break;
case GX_TL_IA8:
case TLUTFormat::IA8:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
@ -541,7 +545,7 @@ static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int hei
}
break;
case GX_TL_RGB565:
case TLUTFormat::RGB565:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
@ -555,8 +559,9 @@ static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int hei
}
}
static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int height,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
for (int y = 0; y < height; y += 4)
{
@ -572,8 +577,8 @@ static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int he
FUNCTION_TARGET_SSSE3
static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
// xsacha optimized with SSSE3 intrinsics.
// Produces an ~50% speed improvement over SSE2 implementation.
@ -595,8 +600,9 @@ static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width,
}
}
static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics.
// Produces an ~80% speed improvement over reference C implementation.
@ -656,12 +662,12 @@ static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int he
}
static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
switch (tlutfmt)
{
case GX_TL_RGB5A3:
case TLUTFormat::RGB5A3:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
@ -670,7 +676,7 @@ static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int
}
break;
case GX_TL_IA8:
case TLUTFormat::IA8:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
@ -679,7 +685,7 @@ static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int
}
break;
case GX_TL_RGB565:
case TLUTFormat::RGB565:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
@ -694,8 +700,8 @@ static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int
}
static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics.
// Produces an ~78% speed improvement over reference C implementation.
@ -766,8 +772,8 @@ static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int
FUNCTION_TARGET_SSSE3
static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
@ -872,8 +878,8 @@ static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int widt
}
static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
@ -993,8 +999,8 @@ static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int
FUNCTION_TARGET_SSSE3
static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
TextureFormat texformat, const u8* tlut,
TLUTFormat tlutfmt, int Wsteps4, int Wsteps8)
{
// xsacha optimized with SSSE3 intrinsics
// Produces a ~30% speed improvement over SSE2 implementation
@ -1027,7 +1033,7 @@ static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width
}
static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics
@ -1148,7 +1154,7 @@ static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int
}
static void TexDecoder_DecodeImpl_CMPR(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
TextureFormat texformat, const u8* tlut, TLUTFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// The metroid games use this format almost exclusively.
@ -1403,19 +1409,19 @@ static void TexDecoder_DecodeImpl_CMPR(u32* dst, const u8* src, int width, int h
}
}
void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt)
void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, TextureFormat texformat,
const u8* tlut, TLUTFormat tlutfmt)
{
int Wsteps4 = (width + 3) / 4;
int Wsteps8 = (width + 7) / 8;
switch (texformat)
{
case GX_TF_C4:
case TextureFormat::C4:
TexDecoder_DecodeImpl_C4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
break;
case GX_TF_I4:
case TextureFormat::I4:
if (cpu_info.bSSSE3)
TexDecoder_DecodeImpl_I4_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
@ -1423,7 +1429,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
TexDecoder_DecodeImpl_I4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
break;
case GX_TF_I8:
case TextureFormat::I8:
if (cpu_info.bSSSE3)
TexDecoder_DecodeImpl_I8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
@ -1431,15 +1437,15 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
TexDecoder_DecodeImpl_I8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
break;
case GX_TF_C8:
case TextureFormat::C8:
TexDecoder_DecodeImpl_C8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
break;
case GX_TF_IA4:
case TextureFormat::IA4:
TexDecoder_DecodeImpl_IA4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
break;
case GX_TF_IA8:
case TextureFormat::IA8:
if (cpu_info.bSSSE3)
TexDecoder_DecodeImpl_IA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
@ -1448,17 +1454,17 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
Wsteps8);
break;
case GX_TF_C14X2:
case TextureFormat::C14X2:
TexDecoder_DecodeImpl_C14X2(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
break;
case GX_TF_RGB565:
case TextureFormat::RGB565:
TexDecoder_DecodeImpl_RGB565(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
break;
case GX_TF_RGB5A3:
case TextureFormat::RGB5A3:
if (cpu_info.bSSSE3)
TexDecoder_DecodeImpl_RGB5A3_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
@ -1467,7 +1473,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
Wsteps8);
break;
case GX_TF_RGBA8:
case TextureFormat::RGBA8:
if (cpu_info.bSSSE3)
TexDecoder_DecodeImpl_RGBA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
Wsteps8);
@ -1476,12 +1482,13 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
Wsteps8);
break;
case GX_TF_CMPR:
case TextureFormat::CMPR:
TexDecoder_DecodeImpl_CMPR(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
break;
default:
PanicAlert("Unhandled texture format %d", texformat);
PanicAlert("Invalid Texture Format (0x%X)! (_TexDecoder_DecodeImpl)",
static_cast<int>(texformat));
break;
}
}