VertexLoaderX64: optimize more color swizzles

Based on https://gist.github.com/rygorous/c5dba8c6c75bc83aa4ae, but with
a small tweak and a somewhat related change for the 565 case.
This commit is contained in:
Tillmann Karras 2015-10-25 14:51:07 +01:00
parent bd20dd3962
commit b0d5b4483e

View File

@@ -280,17 +280,13 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
}
else
{
MOV(32, R(scratch3), R(scratch1));
SHL(32, R(scratch1), Imm8(16));
AND(32, R(scratch1), Imm32(0xF8000000));
MOV(32, R(scratch2), R(scratch3));
SHL(32, R(scratch2), Imm8(13));
SHL(32, R(scratch1), Imm8(11));
LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0));
LEA(32, scratch3, MScaled(scratch2, SCALE_8, 0));
AND(32, R(scratch1), Imm32(0x0000F800));
AND(32, R(scratch2), Imm32(0x00FC0000));
AND(32, R(scratch3), Imm32(0xF8000000));
OR(32, R(scratch1), R(scratch2));
SHL(32, R(scratch3), Imm8(11));
AND(32, R(scratch3), Imm32(0x0000F800));
OR(32, R(scratch1), R(scratch3));
MOV(32, R(scratch2), R(scratch1));
@@ -302,7 +298,6 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
AND(32, R(scratch2), Imm32(0x00030000));
OR(32, R(scratch1), R(scratch2));
}
OR(32, R(scratch1), Imm32(0x000000FF));
SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
load_bytes = 2;
@@ -328,7 +323,6 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
SHL(32, R(scratch1), Imm8(4));
OR(32, R(scratch1), R(scratch2));
AND(32, R(scratch1), Imm32(0x0F0F0F0F));
}
MOV(32, R(scratch2), R(scratch1));
SHL(32, R(scratch1), Imm8(4));
@@ -350,32 +344,21 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
}
else
{
MOV(32, R(scratch3), R(scratch1));
SHL(32, R(scratch1), Imm8(8));
AND(32, R(scratch1), Imm32(0xFC000000));
LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0)); // ______RR RRRRGGGG GGBBBBBB AAAAAA__
AND(32, R(scratch2), Imm32(0x00003FFC)); // ________ ________ __BBBBBB AAAAAA__
SHL(32, R(scratch1), Imm8(6)); // __RRRRRR GGGGGGBB BBBBAAAA AA______
AND(32, R(scratch1), Imm32(0x3FFC0000)); // __RRRRRR GGGGGG__ ________ ________
OR(32, R(scratch1), R(scratch2)); // __RRRRRR GGGGGG__ __BBBBBB AAAAAA__
LEA(32, scratch2, MScaled(scratch1, SCALE_4, 0)); // RRRRRRGG GGGG____ BBBBBBAA AAAA____
AND(32, R(scratch2), Imm32(0xFC00FC00)); // RRRRRR__ ________ BBBBBB__ ________
AND(32, R(scratch1), Imm32(0x00FC00FC)); // ________ GGGGGG__ ________ AAAAAA__
OR(32, R(scratch1), R(scratch2)); // RRRRRR__ GGGGGG__ BBBBBB__ AAAAAA__
MOV(32, R(scratch2), R(scratch1));
MOV(32, R(scratch1), R(scratch3));
SHL(32, R(scratch1), Imm8(6));
AND(32, R(scratch1), Imm32(0x00FC0000));
OR(32, R(scratch2), R(scratch1));
MOV(32, R(scratch1), R(scratch3));
SHL(32, R(scratch1), Imm8(4));
AND(32, R(scratch1), Imm32(0x0000FC00));
OR(32, R(scratch2), R(scratch1));
SHL(32, R(scratch3), Imm8(2));
AND(32, R(scratch3), Imm32(0x000000FC));
OR(32, R(scratch2), R(scratch3));
MOV(32, R(scratch1), R(scratch2));
}
SHR(32, R(scratch1), Imm8(6));
AND(32, R(scratch1), Imm32(0x03030303));
OR(32, R(scratch1), R(scratch2));
SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
load_bytes = 3;
break;