VideoCommon: Added automatic selection routines for the SSSE3/SSE4.1 code paths. An SSSE3/SSE4.1 code path is selected only if the corresponding preprocessor definition is set at build time and the host CPU reports SSSE3/SSE4.1 support at run time. The selection routines in VertexLoader_* use function pointers; TextureDecoder uses a combination of "#if" directives and "if" statements.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5302 8ced0084-cf51-0410-be5f-012b33b47a6e
2025-07-21 05:09:34 -06:00 · 2010-04-09 15:13:42 +00:00
parent 91c6f5acba
commit 956b8eb54d
8 changed files with 295 additions and 158 deletions
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@ -366,37 +366,41 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
    }
 }

-#if _M_SSE >= 0x301
-	static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
-#endif
-
 inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
+	for (int x = 0; x < 8; x++)
+	{
+		u8 val = src[x];
+		*dst++ = Common::swap16(tlut[val]);
+	}
+}

 #if _M_SSE >= 0x301
+static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
+
+inline void decodebytesC8_To_Raw16_SSSE3(u16* dst, const u8* src, int tlutaddr)
+{
+	u16* tlut = (u16*)(texMem + tlutaddr);

 	// Make 8 16-bits unsigned integer values
-	const __m128i a = _mm_set_epi16(tlut[src[7]], tlut[src[6]], tlut[src[5]], tlut[src[4]], tlut[src[3]], tlut[src[2]], tlut[src[1]], tlut[src[0]]);
+	__m128i a = _mm_setzero_si128();
+	a = _mm_insert_epi16(a, tlut[src[0]], 0);
+	a = _mm_insert_epi16(a, tlut[src[1]], 1);
+	a = _mm_insert_epi16(a, tlut[src[2]], 2);
+	a = _mm_insert_epi16(a, tlut[src[3]], 3);
+	a = _mm_insert_epi16(a, tlut[src[4]], 4);
+	a = _mm_insert_epi16(a, tlut[src[5]], 5);
+	a = _mm_insert_epi16(a, tlut[src[6]], 6);
+	a = _mm_insert_epi16(a, tlut[src[7]], 7);

 	// Apply Common::swap16() to 16-bits unsigned integers at once
 	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16);

 	// Store values to dst without polluting the caches
 	_mm_stream_si128((__m128i*)dst, b);
-
-#else
-
-	for (int x = 0; x < 8; x++)
-	{
-		u8 val = src[x];
-		*dst++ = Common::swap16(tlut[val]);
-	}
-
-#endif
-
 }
-
+#endif

 inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr)
 {
@ -958,10 +962,26 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
        }
 		else
 		{
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
+
+#if _M_SSE >= 0x301
+
+			if (cpu_info.bSSSE3) {
+				for (int y = 0; y < height; y += 4)
+					for (int x = 0; x < width; x += 8)
+						for (int iy = 0; iy < 4; iy++, src += 8)
+							decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src, tlutaddr);
+				// No break here: it would leave the switch and skip the return that follows this if/else.
+			} else
+
+#endif
+
+			{
+				for (int y = 0; y < height; y += 4)
+					for (int x = 0; x < width; x += 8)
+						for (int iy = 0; iy < 4; iy++, src += 8)
+							decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
+
+			}
 		}
        return GetPCFormatFromTLUTFormat(tlutfmt);
    case GX_TF_IA4:
@ -1028,59 +1048,93 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
    case GX_TF_RGBA8:  // speed critical
        {

-#if _M_SSE >= 0x301
-
-			for (int y = 0; y < height; y += 4) {
-				__m128i* p = (__m128i*)(src + y * width * 4);
-				for (int x = 0; x < width; x += 4) {
+			// FIXME(nodchip): the following code is too complicated.

 #if _M_SSE >= 0x401
-					// Load 64-bytes at once.
-					const __m128i a0 = _mm_stream_load_si128(p++);
-					const __m128i a1 = _mm_stream_load_si128(p++);
-					const __m128i a2 = _mm_stream_load_si128(p++);
-					const __m128i a3 = _mm_stream_load_si128(p++);
-#else
-					const __m128i a0 = _mm_load_si128(p++);
-					const __m128i a1 = _mm_load_si128(p++);
-					const __m128i a2 = _mm_load_si128(p++);
-					const __m128i a3 = _mm_load_si128(p++);
+
+			if (cpu_info.bSSE4_1) {
+				for (int y = 0; y < height; y += 4) {
+					__m128i* p = (__m128i*)(src + y * width * 4);
+					for (int x = 0; x < width; x += 4) {
+
+						// Load 64-bytes at once.
+						const __m128i a0 = _mm_stream_load_si128(p++);
+						const __m128i a1 = _mm_stream_load_si128(p++);
+						const __m128i a2 = _mm_stream_load_si128(p++);
+						const __m128i a3 = _mm_stream_load_si128(p++);
+
+						// Shuffle 16-bit integers by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
+						// apply Common::swap32() by _mm_shuffle_epi8() and
+						// store them by _mm_stream_si128().
+						// See decodebytesARGB8_4() about the idea.
+						const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
+						const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
+
+						const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
+						const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
+
+						const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
+						const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
+
+						const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
+						const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
+					}
+				}
+			} else
+
 #endif

-					// Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
-					// apply Common::swap32() by _mm_shuffle_epi8() and
-					// store them by _mm_stream_si128().
-					// See decodebytesARGB8_4() about the idea.
-					const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
-					const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
-					_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
+#if _M_SSE >= 0x301

-					const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
-					const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
-					_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
+			if (cpu_info.bSSSE3) {
+				for (int y = 0; y < height; y += 4) {
+					__m128i* p = (__m128i*)(src + y * width * 4);
+					for (int x = 0; x < width; x += 4) {

-					const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
-					const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
-					_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
+						const __m128i a0 = _mm_load_si128(p++);
+						const __m128i a1 = _mm_load_si128(p++);
+						const __m128i a2 = _mm_load_si128(p++);
+						const __m128i a3 = _mm_load_si128(p++);

-					const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
-					const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
-					_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
+						// Shuffle 16-bit integers by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
+						// apply Common::swap32() by _mm_shuffle_epi8() and
+						// store them by _mm_stream_si128().
+						// See decodebytesARGB8_4() about the idea.
+						const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
+						const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
+
+						const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
+						const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
+
+						const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
+						const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
+
+						const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
+						const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
+						_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
+					}
 				}
+			} else
+
+#endif
+
+			{
+				for (int y = 0; y < height; y += 4)
+					for (int x = 0; x < width; x += 4)
+					{
+						for (int iy = 0; iy < 4; iy++)
+							decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
+						src += 64;
+					}
 			}
-
-#else
-
-			for (int y = 0; y < height; y += 4)
-				for (int x = 0; x < width; x += 4)
-				{
-					for (int iy = 0; iy < 4; iy++)
-						decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
-					src += 64;
-				}
-
-#endif
-        }
+		}
        return PC_TEX_FMT_BGRA32;
    case GX_TF_CMPR:  // speed critical
        // The metroid games use this format almost exclusively.