mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2024-11-14 21:37:52 -07:00
Remove unnecessary casting bloat. Since AMD Stream 2.2, these explicit casts are no longer required.
Stream 2.2 is the minimum requirement for OpenCL 1.1 and binary kernels as well (I hope to implement these soon). Remove unnecessary casting bloat. Since AMD Stream 2.2, this is no longer a requirement. Sidenote: Stream 2.2/Catalyst 10.7b is the minimum requirement for OpenCL 1.1 and binary kernels as well. I hope to implement these soon. For the latest ATI drivers and SDK: http://support.amd.com/us/kbarticles/Pages/OpenCL11ATICat107UpdateDriver.aspx git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6102 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
56d916bf93
commit
963ece2017
@ -24,9 +24,9 @@ kernel void DecodeI4(global uchar *dst,
|
|||||||
{
|
{
|
||||||
uchar4 val = vload4(srcOffset, src);
|
uchar4 val = vload4(srcOffset, src);
|
||||||
uchar8 res;
|
uchar8 res;
|
||||||
res.even = (val >> (uchar4)4) & (uchar4)0x0F;
|
res.even = (val >> 4) & 0x0F;
|
||||||
res.odd = val & (uchar4)0x0F;
|
res.odd = val & 0x0F;
|
||||||
res |= res << (uchar8)4;
|
res |= res << 4;
|
||||||
vstore8(res, 0, dst + ((y + iy)*width + x));
|
vstore8(res, 0, dst + ((y + iy)*width + x));
|
||||||
srcOffset++;
|
srcOffset++;
|
||||||
}
|
}
|
||||||
@ -41,9 +41,9 @@ kernel void DecodeI4_RGBA(global uint *dst,
|
|||||||
{
|
{
|
||||||
uchar4 val = vload4(srcOffset, src);
|
uchar4 val = vload4(srcOffset, src);
|
||||||
uchar8 res;
|
uchar8 res;
|
||||||
res.even = (val >> (uchar4)4) & (uchar4)0x0F;
|
res.even = (val >> 4) & 0x0F;
|
||||||
res.odd = val & (uchar4)0x0F;
|
res.odd = val & 0x0F;
|
||||||
res |= res << (uchar8)4;
|
res |= res << 4;
|
||||||
vstore8(upsample(upsample(res,res),upsample(res,res)), 0, dst + ((y + iy)*width + x));
|
vstore8(upsample(upsample(res,res),upsample(res,res)), 0, dst + ((y + iy)*width + x));
|
||||||
srcOffset++;
|
srcOffset++;
|
||||||
}
|
}
|
||||||
@ -108,8 +108,8 @@ kernel void DecodeIA4(global ushort *dst,
|
|||||||
for (int iy = 0; iy < 4; iy++)
|
for (int iy = 0; iy < 4; iy++)
|
||||||
{
|
{
|
||||||
val = vload8(srcOffset++, src);
|
val = vload8(srcOffset++, src);
|
||||||
res = upsample(val >> (uchar8)4, val & (uchar8)0xF);
|
res = upsample(val >> 4, val & 0xF);
|
||||||
res |= res << (ushort8)4;
|
res |= res << 4;
|
||||||
vstore8(res, 0, dst + y*width + x);
|
vstore8(res, 0, dst + y*width + x);
|
||||||
dst+=width;
|
dst+=width;
|
||||||
}
|
}
|
||||||
@ -125,10 +125,10 @@ kernel void DecodeIA4_RGBA(global uint *dst,
|
|||||||
for (int iy = 0; iy < 4; iy++)
|
for (int iy = 0; iy < 4; iy++)
|
||||||
{
|
{
|
||||||
val = vload8(srcOffset++, src);
|
val = vload8(srcOffset++, src);
|
||||||
uchar8 a = val >> (uchar8)4;
|
uchar8 a = val >> 4;
|
||||||
uchar8 l = val & (uchar8)0xF;
|
uchar8 l = val & 0xF;
|
||||||
res = upsample(upsample(a, l), upsample(l,l));
|
res = upsample(upsample(a, l), upsample(l,l));
|
||||||
res |= res << (uint8)4;
|
res |= res << 4;
|
||||||
vstore8(res, 0, dst + y*width + x);
|
vstore8(res, 0, dst + y*width + x);
|
||||||
dst+=width;
|
dst+=width;
|
||||||
}
|
}
|
||||||
@ -142,7 +142,7 @@ kernel void DecodeRGBA8(global ushort *dst,
|
|||||||
for (int iy = 0; iy < 4; iy++)
|
for (int iy = 0; iy < 4; iy++)
|
||||||
{
|
{
|
||||||
ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src));
|
ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src));
|
||||||
ushort8 bgra = rotate(val,(ushort8)8).s40516273;
|
ushort8 bgra = rotate(val,8).s40516273;
|
||||||
vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2);
|
vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2);
|
||||||
srcOffset++;
|
srcOffset++;
|
||||||
}
|
}
|
||||||
@ -175,7 +175,7 @@ kernel void DecodeRGB565(global ushort *dst,
|
|||||||
dst += width*y + x;
|
dst += width*y + x;
|
||||||
for (int iy = 0; iy < 4; iy++)
|
for (int iy = 0; iy < 4; iy++)
|
||||||
{
|
{
|
||||||
vstore4(rotate(vload4(srcOffset++, src),(ushort4)8), 0, dst + iy*width);
|
vstore4(rotate(vload4(srcOffset++, src),8), 0, dst + iy*width);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -189,10 +189,10 @@ kernel void DecodeRGB565_RGBA(global uchar *dst,
|
|||||||
uchar8 val = vload8(srcOffset++, src);
|
uchar8 val = vload8(srcOffset++, src);
|
||||||
|
|
||||||
uchar16 res;
|
uchar16 res;
|
||||||
res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7);
|
res.even.even = bitselect(val.even, val.even >> 5, 7);
|
||||||
res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3);
|
res.odd.even = bitselect((val.odd >> 3) | (val.even << 5), val.even >> 1, 3);
|
||||||
res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7);
|
res.even.odd = bitselect(val.odd << 3, val.odd >> 2, 7);
|
||||||
res.odd.odd = (uchar4)0xFF;
|
res.odd.odd = 0xFF;
|
||||||
|
|
||||||
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
||||||
}
|
}
|
||||||
@ -207,16 +207,16 @@ kernel void DecodeRGB5A3(global uchar *dst,
|
|||||||
uchar16 resNoAlpha, resAlpha, choice;
|
uchar16 resNoAlpha, resAlpha, choice;
|
||||||
#define iterateRGB5A3() \
|
#define iterateRGB5A3() \
|
||||||
val = vload8(srcOffset++, src); \
|
val = vload8(srcOffset++, src); \
|
||||||
resNoAlpha.s26AE = val.even << (uchar4)1; \
|
resNoAlpha.s26AE = val.even << 1; \
|
||||||
resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
|
resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \
|
||||||
resNoAlpha.s048C = val.odd << (uchar4)3; \
|
resNoAlpha.s048C = val.odd << 3; \
|
||||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \
|
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \
|
||||||
resNoAlpha.s37BF = (uchar4)(0xFF); \
|
resNoAlpha.s37BF = 0xFF; \
|
||||||
resAlpha.s26AE = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \
|
resAlpha.s26AE = bitselect(val.even << 4, val.even, 0xF); \
|
||||||
resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \
|
resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \
|
||||||
resAlpha.s048C = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \
|
resAlpha.s048C = bitselect(val.odd << 4, val.odd, 0xF); \
|
||||||
resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \
|
resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \
|
||||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \
|
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \
|
||||||
choice = (uchar16)((uchar4)(val.even.s0), \
|
choice = (uchar16)((uchar4)(val.even.s0), \
|
||||||
(uchar4)(val.even.s1), \
|
(uchar4)(val.even.s1), \
|
||||||
(uchar4)(val.even.s2), \
|
(uchar4)(val.even.s2), \
|
||||||
@ -237,16 +237,16 @@ kernel void DecodeRGB5A3_RGBA(global uchar *dst,
|
|||||||
uchar16 resNoAlpha, resAlpha, choice;
|
uchar16 resNoAlpha, resAlpha, choice;
|
||||||
#define iterateRGB5A3_RGBA() \
|
#define iterateRGB5A3_RGBA() \
|
||||||
val = vload8(srcOffset++, src); \
|
val = vload8(srcOffset++, src); \
|
||||||
resNoAlpha.s048C = val.even << (uchar4)1; \
|
resNoAlpha.s048C = val.even << 1; \
|
||||||
resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
|
resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \
|
||||||
resNoAlpha.s26AE = val.odd << (uchar4)3; \
|
resNoAlpha.s26AE = val.odd << 3; \
|
||||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \
|
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \
|
||||||
resNoAlpha.s37BF = (uchar4)(0xFF); \
|
resNoAlpha.s37BF = 0xFF; \
|
||||||
resAlpha.s048C = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \
|
resAlpha.s048C = bitselect(val.even << 4, val.even, 0xF); \
|
||||||
resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \
|
resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \
|
||||||
resAlpha.s26AE = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \
|
resAlpha.s26AE = bitselect(val.odd << 4, val.odd, 0xF); \
|
||||||
resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \
|
resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \
|
||||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \
|
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \
|
||||||
choice = (uchar16)((uchar4)(val.even.s0), \
|
choice = (uchar16)((uchar4)(val.even.s0), \
|
||||||
(uchar4)(val.even.s1), \
|
(uchar4)(val.even.s1), \
|
||||||
(uchar4)(val.even.s2), \
|
(uchar4)(val.even.s2), \
|
||||||
@ -274,13 +274,13 @@ kernel void decodeCMPRBlock(global uchar *dst,
|
|||||||
|
|
||||||
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
||||||
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
||||||
uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
|
uchar8 color32 = (uchar8)(bitselect(colora565 << 3, colora565 >> 2, 7),
|
||||||
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
|
bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3),
|
||||||
bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
|
bitselect(colorb565, colorb565 >> 5, 7),
|
||||||
(uchar2)0xFF);
|
(uchar2)0xFF);
|
||||||
|
|
||||||
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
||||||
uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
|
uchar4 frac = convert_uchar4((frac2 * 3) / 8);
|
||||||
|
|
||||||
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
||||||
rhadd(color32.odd, color32.even));
|
rhadd(color32.odd, color32.even));
|
||||||
@ -320,13 +320,13 @@ kernel void decodeCMPRBlock_RGBA(global uchar *dst,
|
|||||||
|
|
||||||
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
||||||
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
||||||
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
|
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> 5, 7),
|
||||||
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
|
bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3),
|
||||||
bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
|
bitselect(colora565 << 3, colora565 >> 2, 7),
|
||||||
(uchar2)0xFF);
|
(uchar2)0xFF);
|
||||||
|
|
||||||
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
||||||
uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
|
uchar4 frac = convert_uchar4((frac2 * 3) / 8);
|
||||||
|
|
||||||
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
||||||
rhadd(color32.odd, color32.even));
|
rhadd(color32.odd, color32.even));
|
||||||
|
Loading…
Reference in New Issue
Block a user