VertexLoaderX64: optimize more color swizzles

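Based on https://gist.github.com/rygorous/c5dba8c6c75bc83aa4ae, with a small tweak, plus a related change to the 565 case. Both paths now use LEA to derive the additional shifted copies of the input and mask the fields in place, instead of repeating MOV/SHL/AND/OR for every channel.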
2025-07-23 14:19:46 -06:00 · 2015-10-25 14:51:07 +01:00
parent bd20dd3962
commit b0d5b4483e
1 changed file with 15 additions and 32 deletions
--- a/Source/Core/VideoCommon/VertexLoaderX64.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp
@@ -280,17 +280,13 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
 			}
 			else
 			{
-				MOV(32, R(scratch3), R(scratch1));
-				SHL(32, R(scratch1), Imm8(16));
-				AND(32, R(scratch1), Imm32(0xF8000000));
-
-				MOV(32, R(scratch2), R(scratch3));
-				SHL(32, R(scratch2), Imm8(13));
+				SHL(32, R(scratch1), Imm8(11));
+				LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0));
+				LEA(32, scratch3, MScaled(scratch2, SCALE_8, 0));
+				AND(32, R(scratch1), Imm32(0x0000F800));
 				AND(32, R(scratch2), Imm32(0x00FC0000));
+				AND(32, R(scratch3), Imm32(0xF8000000));
 				OR(32, R(scratch1), R(scratch2));
-
-				SHL(32, R(scratch3), Imm8(11));
-				AND(32, R(scratch3), Imm32(0x0000F800));
 				OR(32, R(scratch1), R(scratch3));

 				MOV(32, R(scratch2), R(scratch1));
@@ -302,7 +298,6 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
 				AND(32, R(scratch2), Imm32(0x00030000));
 				OR(32, R(scratch1), R(scratch2));
 			}
-
 			OR(32, R(scratch1), Imm32(0x000000FF));
 			SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
 			load_bytes = 2;
@@ -328,7 +323,6 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
 				SHL(32, R(scratch1), Imm8(4));
 				OR(32, R(scratch1), R(scratch2));
 				AND(32, R(scratch1), Imm32(0x0F0F0F0F));
-
 			}
 			MOV(32, R(scratch2), R(scratch1));
 			SHL(32, R(scratch1), Imm8(4));
@@ -350,32 +344,21 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
 			}
 			else
 			{
-				MOV(32, R(scratch3), R(scratch1));
-				SHL(32, R(scratch1), Imm8(8));
-				AND(32, R(scratch1), Imm32(0xFC000000));
+				LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0)); // ______RR RRRRGGGG GGBBBBBB AAAAAA__
+				AND(32, R(scratch2), Imm32(0x00003FFC));          // ________ ________ __BBBBBB AAAAAA__
+				SHL(32, R(scratch1), Imm8(6));                    // __RRRRRR GGGGGGBB BBBBAAAA AA______
+				AND(32, R(scratch1), Imm32(0x3FFC0000));          // __RRRRRR GGGGGG__ ________ ________
+				OR(32, R(scratch1), R(scratch2));                 // __RRRRRR GGGGGG__ __BBBBBB AAAAAA__
+
+				LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0)); // RRRRRRGG GGGG____ BBBBBBAA AAAA____
+				AND(32, R(scratch2), Imm32(0xFC00FC00));          // RRRRRR__ ________ BBBBBB__ ________
+				AND(32, R(scratch1), Imm32(0x00FC00FC));          // ________ GGGGGG__ ________ AAAAAA__
+				OR(32, R(scratch1), R(scratch2));                 // RRRRRR__ GGGGGG__ BBBBBB__ AAAAAA__
 				MOV(32, R(scratch2), R(scratch1));
-
-				MOV(32, R(scratch1), R(scratch3));
-				SHL(32, R(scratch1), Imm8(6));
-				AND(32, R(scratch1), Imm32(0x00FC0000));
-				OR(32, R(scratch2), R(scratch1));
-
-				MOV(32, R(scratch1), R(scratch3));
-				SHL(32, R(scratch1), Imm8(4));
-				AND(32, R(scratch1), Imm32(0x0000FC00));
-				OR(32, R(scratch2), R(scratch1));
-
-				SHL(32, R(scratch3), Imm8(2));
-				AND(32, R(scratch3), Imm32(0x000000FC));
-				OR(32, R(scratch2), R(scratch3));
-
-				MOV(32, R(scratch1), R(scratch2));
 			}
-
 			SHR(32, R(scratch1), Imm8(6));
 			AND(32, R(scratch1), Imm32(0x03030303));
 			OR(32, R(scratch1), R(scratch2));
-
 			SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
 			load_bytes = 3;
 			break;