mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-07-26 07:39:45 -06:00
Fix TextureDecoder.cl to work on both NVidia and ATI video cards.
To do so I had to re-add the casting bloat removed in revision 6102. Also, for some odd reason the NVidia OpenCL drivers don't like 8-bit rotations but are okay with 2- and 4-bit rotations, so each 8-bit rotation is now performed as two consecutive 4-bit rotations. These are apparently bugs in the NVidia drivers that will hopefully be fixed in future versions. Also, on Linux, make sure the TextureDecoder.cl file is copied from the shared data directory to the user's directory. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6611 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
@ -24,9 +24,9 @@ kernel void DecodeI4(global uchar *dst,
|
||||
{
|
||||
uchar4 val = vload4(srcOffset, src);
|
||||
uchar8 res;
|
||||
res.even = (val >> 4) & 0x0F;
|
||||
res.odd = val & 0x0F;
|
||||
res |= res << 4;
|
||||
res.even = (val >> (uchar4)4) & (uchar4)0x0F;
|
||||
res.odd = val & (uchar4)0x0F;
|
||||
res |= res << (uchar8)4;
|
||||
vstore8(res, 0, dst + ((y + iy)*width + x));
|
||||
srcOffset++;
|
||||
}
|
||||
@ -41,9 +41,9 @@ kernel void DecodeI4_RGBA(global uint *dst,
|
||||
{
|
||||
uchar4 val = vload4(srcOffset, src);
|
||||
uchar8 res;
|
||||
res.even = (val >> 4) & 0x0F;
|
||||
res.odd = val & 0x0F;
|
||||
res |= res << 4;
|
||||
res.even = (val >> (uchar4)4) & (uchar4)0x0F;
|
||||
res.odd = val & (uchar4)0x0F;
|
||||
res |= res << (uchar8)4;
|
||||
vstore8(upsample(upsample(res,res),upsample(res,res)), 0, dst + ((y + iy)*width + x));
|
||||
srcOffset++;
|
||||
}
|
||||
@ -108,8 +108,8 @@ kernel void DecodeIA4(global ushort *dst,
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
val = vload8(srcOffset++, src);
|
||||
res = upsample(val >> 4, val & 0xF);
|
||||
res |= res << 4;
|
||||
res = upsample(val >> (uchar8)4, val & (uchar8)0xF);
|
||||
res |= res << (ushort8)4;
|
||||
vstore8(res, 0, dst + y*width + x);
|
||||
dst+=width;
|
||||
}
|
||||
@ -125,10 +125,10 @@ kernel void DecodeIA4_RGBA(global uint *dst,
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
val = vload8(srcOffset++, src);
|
||||
uchar8 a = val >> 4;
|
||||
uchar8 l = val & 0xF;
|
||||
uchar8 a = val >> (uchar8)4;
|
||||
uchar8 l = val & (uchar8)0xF;
|
||||
res = upsample(upsample(a, l), upsample(l,l));
|
||||
res |= res << 4;
|
||||
res |= res << (uint8)4;
|
||||
vstore8(res, 0, dst + y*width + x);
|
||||
dst+=width;
|
||||
}
|
||||
@ -142,7 +142,8 @@ kernel void DecodeRGBA8(global ushort *dst,
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src));
|
||||
ushort8 bgra = rotate(val,8).s40516273;
|
||||
ushort8 temp = rotate(val, (ushort8)4);
|
||||
ushort8 bgra = rotate(temp, (ushort8)4).s40516273;
|
||||
vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2);
|
||||
srcOffset++;
|
||||
}
|
||||
@ -175,7 +176,8 @@ kernel void DecodeRGB565(global ushort *dst,
|
||||
dst += width*y + x;
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
{
|
||||
vstore4(rotate(vload4(srcOffset++, src),8), 0, dst + iy*width);
|
||||
ushort4 val = rotate(vload4(srcOffset++, src),(ushort4)4);
|
||||
vstore4(rotate(val,(ushort4)4), 0, dst + iy*width);
|
||||
}
|
||||
}
|
||||
|
||||
@ -189,10 +191,10 @@ kernel void DecodeRGB565_RGBA(global uchar *dst,
|
||||
uchar8 val = vload8(srcOffset++, src);
|
||||
|
||||
uchar16 res;
|
||||
res.even.even = bitselect(val.even, val.even >> 5, 7);
|
||||
res.odd.even = bitselect((val.odd >> 3) | (val.even << 5), val.even >> 1, 3);
|
||||
res.even.odd = bitselect(val.odd << 3, val.odd >> 2, 7);
|
||||
res.odd.odd = 0xFF;
|
||||
res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7);
|
||||
res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3);
|
||||
res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7);
|
||||
res.odd.odd = (uchar4)0xFF;
|
||||
|
||||
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
||||
}
|
||||
@ -207,16 +209,16 @@ kernel void DecodeRGB5A3(global uchar *dst,
|
||||
uchar16 resNoAlpha, resAlpha, choice;
|
||||
#define iterateRGB5A3() \
|
||||
val = vload8(srcOffset++, src); \
|
||||
resNoAlpha.s26AE = val.even << 1; \
|
||||
resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \
|
||||
resNoAlpha.s048C = val.odd << 3; \
|
||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \
|
||||
resNoAlpha.s37BF = 0xFF; \
|
||||
resAlpha.s26AE = bitselect(val.even << 4, val.even, 0xF); \
|
||||
resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \
|
||||
resAlpha.s048C = bitselect(val.odd << 4, val.odd, 0xF); \
|
||||
resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \
|
||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \
|
||||
resNoAlpha.s26AE = val.even << (uchar4)1; \
|
||||
resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
|
||||
resNoAlpha.s048C = val.odd << (uchar4)3; \
|
||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \
|
||||
resNoAlpha.s37BF = (uchar4)(0xFF); \
|
||||
resAlpha.s26AE = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \
|
||||
resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \
|
||||
resAlpha.s048C = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \
|
||||
resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \
|
||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \
|
||||
choice = (uchar16)((uchar4)(val.even.s0), \
|
||||
(uchar4)(val.even.s1), \
|
||||
(uchar4)(val.even.s2), \
|
||||
@ -237,16 +239,16 @@ kernel void DecodeRGB5A3_RGBA(global uchar *dst,
|
||||
uchar16 resNoAlpha, resAlpha, choice;
|
||||
#define iterateRGB5A3_RGBA() \
|
||||
val = vload8(srcOffset++, src); \
|
||||
resNoAlpha.s048C = val.even << 1; \
|
||||
resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \
|
||||
resNoAlpha.s26AE = val.odd << 3; \
|
||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \
|
||||
resNoAlpha.s37BF = 0xFF; \
|
||||
resAlpha.s048C = bitselect(val.even << 4, val.even, 0xF); \
|
||||
resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \
|
||||
resAlpha.s26AE = bitselect(val.odd << 4, val.odd, 0xF); \
|
||||
resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \
|
||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \
|
||||
resNoAlpha.s048C = val.even << (uchar4)1; \
|
||||
resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \
|
||||
resNoAlpha.s26AE = val.odd << (uchar4)3; \
|
||||
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \
|
||||
resNoAlpha.s37BF = (uchar4)(0xFF); \
|
||||
resAlpha.s048C = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \
|
||||
resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \
|
||||
resAlpha.s26AE = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \
|
||||
resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \
|
||||
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \
|
||||
choice = (uchar16)((uchar4)(val.even.s0), \
|
||||
(uchar4)(val.even.s1), \
|
||||
(uchar4)(val.even.s2), \
|
||||
@ -274,13 +276,13 @@ kernel void decodeCMPRBlock(global uchar *dst,
|
||||
|
||||
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
||||
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
||||
uchar8 color32 = (uchar8)(bitselect(colora565 << 3, colora565 >> 2, 7),
|
||||
bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3),
|
||||
bitselect(colorb565, colorb565 >> 5, 7),
|
||||
uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
|
||||
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
|
||||
bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
|
||||
(uchar2)0xFF);
|
||||
|
||||
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
||||
uchar4 frac = convert_uchar4((frac2 * 3) / 8);
|
||||
uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
|
||||
|
||||
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
||||
rhadd(color32.odd, color32.even));
|
||||
@ -320,13 +322,13 @@ kernel void decodeCMPRBlock_RGBA(global uchar *dst,
|
||||
|
||||
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
||||
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
||||
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> 5, 7),
|
||||
bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3),
|
||||
bitselect(colora565 << 3, colora565 >> 2, 7),
|
||||
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7),
|
||||
bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3),
|
||||
bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7),
|
||||
(uchar2)0xFF);
|
||||
|
||||
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
||||
uchar4 frac = convert_uchar4((frac2 * 3) / 8);
|
||||
uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8);
|
||||
|
||||
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
||||
rhadd(color32.odd, color32.even));
|
||||
@ -356,4 +358,4 @@ kernel void DecodeCMPR_RGBA(global uchar *dst,
|
||||
decodeCMPRBlock_RGBA(dst + 16, src, width); src += 8;
|
||||
decodeCMPRBlock_RGBA(dst + 16 * width, src, width); src += 8;
|
||||
decodeCMPRBlock_RGBA(dst + 16 * (width + 1), src, width);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user