OpenCL: Moved the OpenCL code from the compiled source to a separate file TextureDecoder.cl and added a .rules files to copy the updated version to the build directory.

Fixed RGB5A3 decoding with alpha
New CMPR decoding, blocks with no alpha are great, still have to figure the problems with transparent blocks. Disabled for now.
Added a better error reporting to the base OpenCL functions

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4439 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
Orphis 2009-10-20 00:55:07 +00:00
parent e81d25562d
commit 094c73a080
6 changed files with 401 additions and 275 deletions

View File

@ -0,0 +1,252 @@
// Copyright (C) 2003 Dolphin Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
kernel void DecodeI4(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
int srcOffset = x + y * width / 8;
for (int iy = 0; iy < 8; iy++)
{
uchar4 val = vload4(srcOffset, src);
uchar8 res;
res.even = (val >> 4) & 0x0F;
res.odd = val & 0x0F;
res |= res << 4;
vstore8(res, 0, dst + ((y + iy)*width + x));
srcOffset++;
}
}
kernel void DecodeI8(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++)
{
vstore8(vload8(srcOffset, src),
0, dst + ((y + iy)*width + x));
srcOffset++;
}
}
kernel void DecodeIA8(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 4;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset, src);
uchar8 res;
res.odd = val.even;
res.even = val.odd;
vstore8(res, 0, dst + ((y + iy)*width + x) * 2);
srcOffset++;
}
}
kernel void DecodeIA4(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
int srcOffset = ((x * 4) + (y * width)) / 8;
for (int iy = 0; iy < 4; iy++)
{
uchar8 val = vload8(srcOffset, src);
uchar16 res;
res.odd = (val >> 4) & 0x0F;
res.even = val & 0x0F;
res |= res << 4;
vstore16(res, 0, dst + ((y + iy)*width + x) * 2);
srcOffset++;
}
}
kernel void DecodeRGBA8(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = (x * 2) + (y * width) / 2;
for (int iy = 0; iy < 4; iy++)
{
uchar8 ar = vload8(srcOffset, src);
uchar8 gb = vload8(srcOffset + 4, src);
uchar16 res;
res.even.even = gb.odd;
res.even.odd = ar.odd;
res.odd.even = gb.even;
res.odd.odd = ar.even;
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
srcOffset++;
}
}
kernel void DecodeRGB565(global ushort *dst,
const global ushort *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4;
for (int iy = 0; iy < 4; iy++)
{
ushort4 val = vload4(srcOffset, src);
val = (val >> 8) | (val << 8);
vstore4(val, 0, dst + ((y + iy)*width + x));
srcOffset++;
}
}
kernel void DecodeRGB5A3(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4;
for (int iy = 0; iy < 4; iy++)
{
ushort8 val = convert_ushort8(vload8(srcOffset, src));
ushort4 vs = val.odd | (val.even << 8);
uchar16 resNoAlpha;
resNoAlpha.s26AE = convert_uchar4(vs >> 7); // R
resNoAlpha.s159D = convert_uchar4(vs >> 2); // G
resNoAlpha.s048C = convert_uchar4(vs << 3); // B
resNoAlpha &= 0xF8;
resNoAlpha |= (resNoAlpha >> 5) & 3; // 5 -> 8
resNoAlpha.s37BF = (uchar4)(0xFF);
uchar16 resAlpha;
resAlpha.s26AE = convert_uchar4(vs >> 8); // R
resAlpha.s159D = convert_uchar4(vs >> 4); // G
resAlpha.s048C = convert_uchar4(vs); // B
resAlpha &= 0x0F;
resAlpha |= (resAlpha << 4);
resAlpha.s37BF = convert_uchar4(vs >> 7) & 0xE0;
resAlpha.s37BF |= ((resAlpha.s37BF >> 3) & 0x1C)
| ((resAlpha.s37BF >> 6) & 0x3);
uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8),
(uchar4)(vs.s1 >> 8),
(uchar4)(vs.s2 >> 8),
(uchar4)(vs.s3 >> 8));
uchar16 res;
res = select(resAlpha, resNoAlpha, choice);
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
srcOffset++;
}
}
uint4 unpack2bits(uchar b)
{
return (uint4)(b >> 6,
(b >> 4) & 3,
(b >> 2) & 3,
b & 3);
}
/*
Lots of debug code there that I'm using to find the problems with CMPR decoding
I think blocks having no alpha are properly decoded, only the blocks with alpha
are problematic. This is WIP !
*/
kernel void decodeCMPRBlock(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = (x * 2 + y * width / 2) / 8; //x / 4 + y * width / 16; //(x * 4) + (y * width) / 16;
uchar8 val = vload8(0, src);
ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8));
uchar8 color32 = convert_uchar8((ushort8)
(((color565 << 3) & 0xF8) | ((color565 >> 2) & 0x7),
((color565 >> 3) & 0xFC) | ((color565 >> 9) & 0x3),
((color565 >> 8) & 0xF8) | ((color565 >> 13) & 0x7),
0xFF, 0xFF));
uint4 colors;
uint4 choice = (uint4)((color565.s0 - color565.s1) << 16);
uint4 colorNoAlpha;
//uchar4 frac = (color32.odd - color32.even) / 2;
//frac = frac - (frac / 4);
uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & 0xFF) - (convert_ushort4(color32.odd) & 0xFF)) * 3) / 8);
//colorNoAlpha = convert_uint4(frac);
colorNoAlpha = convert_uint4(color32.even - frac);
colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd + frac);
colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd);
colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even);
uint4 colorAlpha;
//uchar4 midpoint = rhadd(color32.odd, color32.even);
uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + 1) / 2);
midpoint.s3 = 0xFF;
//colorAlpha = convert_uint4(color32.odd);
colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0));
colorAlpha = (colorAlpha << 8) | convert_uint4(midpoint);
colorAlpha = (colorAlpha << 8) | convert_uint4(color32.odd);
colorAlpha = (colorAlpha << 8) | convert_uint4(color32.even);
//colorNoAlpha = (uint4)(0xFFFFFFFF);
//colorAlpha = (uint4)(0, 0, 0, 0xFFFFFFFF);
colors = select(colorNoAlpha, colorAlpha, choice);
uint16 colorsFull = (uint16)(colors, colors, colors, colors);
uint4 shift0 = unpack2bits(val.s4);
uint4 shift1 = unpack2bits(val.s5);
uint4 shift2 = unpack2bits(val.s6);
uint4 shift3 = unpack2bits(val.s7);
uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3));
shifts = (shifts << 8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3));
shifts = (shifts << 8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3));
shifts = (shifts << 8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3));
shifts <<= 3;
for (int iy = 0; iy < 4; iy++)
{
uchar16 res;
res = convert_uchar16(colorsFull >> (shifts & 0xFF));
shifts >>= 8;
//uchar4 t = convert_uchar4((ushort4)(color565.s0 >> 8, color565.s0 & 0xFF, color565.s1 >> 8, color565.s1 & 0xFF));
//res = (uchar16)(t, t, t, t);
//res = (uchar16)(frac, color32.even - color32.odd, (color32.even - color32.odd) / 2, (color32.even - color32.odd) / 2 - ((color32.even - color32.odd) / 8));
//res = (uchar16)(color32.even, color32.odd, frac, convert_uchar4(choice));
//res = convert_uchar16((uint16)(colorNoAlpha >> 24, colorNoAlpha >> 16, colorNoAlpha >> 8, colorNoAlpha));
//res = convert_uchar16((uint16)(colorAlpha >> 24, colorAlpha >> 16, colorAlpha >> 8, colorAlpha));
//res = convert_uchar16((uint16)(colors >> 24, colors >> 16, colors >> 8, colors));
//res = convert_uchar16(shifts & 0xFF);
//res = convert_uchar16((uint16)(shift0, shift1, shift2, shift3));
//res = (uchar16)(((x))) + (uchar16)(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
//res.lo = val; res.s8 = x >> 8; res.s9 = x; res.sA = (iy + y) >> 8; res.sB = y + iy; res.sC = width >> 8; res.sD = width; res.sE = srcOffset >> 8; res.sF = srcOffset;
vstore16(res, 0, dst);
dst += width * 4;
}
}
kernel void DecodeCMPR(global uchar *dst,
const global uchar *src, int width)
{
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
src += x * 4 + (y * width) / 2;
decodeCMPRBlock(dst + (y * width + x) * 4, src, width);
src += 8;
decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width);
src += 8;
decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width);
src += 8;
decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width);
}

View File

@ -45,7 +45,7 @@ bool Initialize() {
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{ {
PanicAlert("Error: Failed to create a device group!"); HandleCLError(err, "Failed to create a device group!");
return false; return false;
} }
@ -54,7 +54,7 @@ bool Initialize() {
g_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); g_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!g_context) if (!g_context)
{ {
PanicAlert("Error: Failed to create a compute context!"); HandleCLError(err, "Failed to create a compute context!");
return false; return false;
} }
@ -63,7 +63,7 @@ bool Initialize() {
g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err); g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err);
if (!g_cmdq) if (!g_cmdq)
{ {
PanicAlert("Error: Failed to create a command commands!"); HandleCLError(err, "Failed to create a command commands!");
return false; return false;
} }
//PanicAlert("Initialized OpenCL fine!"); //PanicAlert("Initialized OpenCL fine!");
@ -88,7 +88,7 @@ cl_program CompileProgram(const char *Kernel) {
program = clCreateProgramWithSource(OpenCL::g_context, 1, (const char **) & Kernel, NULL, &err); program = clCreateProgramWithSource(OpenCL::g_context, 1, (const char **) & Kernel, NULL, &err);
if (!program) if (!program)
{ {
printf("Error: Failed to create compute program!"); HandleCLError(err, "Error: Failed to create compute program!");
return NULL; return NULL;
} }
@ -114,7 +114,7 @@ cl_kernel CompileKernel(cl_program program, const char *Function)
cl_kernel kernel = clCreateKernel(program, Function, &err); cl_kernel kernel = clCreateKernel(program, Function, &err);
if (!kernel || err != CL_SUCCESS) if (!kernel || err != CL_SUCCESS)
{ {
PanicAlert("Error: Failed to create compute kernel!"); HandleCLError(err, "Failed to create compute kernel!");
return NULL; return NULL;
} }
return kernel; return kernel;
@ -130,5 +130,66 @@ void Destroy() {
#endif #endif
} }
void HandleCLError(cl_int error, char* str)
{
char* name;
switch(error)
{
#define CL_ERROR(x) case (x): name = #x; break
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
#undef CL_ERROR
default:
name = "Unknown error code";
}
if(!str)
str = "";
PanicAlert("OpenCL error: %s %s (%d)", str, name, error);
}
}; };

View File

@ -60,6 +60,7 @@ void Destroy();
cl_program CompileProgram(const char *Kernel); cl_program CompileProgram(const char *Kernel);
cl_kernel CompileKernel(cl_program program, const char *Function); cl_kernel CompileKernel(cl_program program, const char *Function);
void HandleCLError(cl_int error, char* str = 0);
}; };

View File

@ -0,0 +1,18 @@
<?xml version="1.0" encoding="utf-8"?>
<VisualStudioToolFile
Name="OpenCL"
Version="8,00"
>
<Rules>
<CustomBuildRule
Name="OpenCL source"
DisplayName="OpenCL"
CommandLine="xcopy /I /Y $(InputPath) $(SolutionDir)\..\Binary\$(PlatformName)\User\OpenCL"
Outputs="$(SolutionDir)\..\Binary\$(PlatformName)\User\OpenCL\$(InputFileName)"
FileExtensions="*.cl"
>
<Properties>
</Properties>
</CustomBuildRule>
</Rules>
</VisualStudioToolFile>

View File

@ -18,6 +18,7 @@
#include "OCLTextureDecoder.h" #include "OCLTextureDecoder.h"
#include "OpenCL.h" #include "OpenCL.h"
#include "FileUtil.h"
#include <fcntl.h> #include <fcntl.h>
#include <stdio.h> #include <stdio.h>
@ -26,159 +27,16 @@
#include <math.h> #include <math.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <string>
//#define DEBUG_OPENCL
struct sDecoders struct sDecoders
{ {
const char name[256]; // kernel name const char name[256]; // kernel name
cl_kernel kernel; // compute kernel cl_kernel kernel; // compute kernel
}; };
const char *Kernel = " \n\
kernel void DecodeI4(global uchar *dst, \n\
const global uchar *src, int width) \n\
{ \n\
int x = get_global_id(0) * 8, y = get_global_id(1) * 8; \n\
int srcOffset = x + y * width / 8; \n\
for (int iy = 0; iy < 8; iy++) \n\
{ \n\
uchar4 val = vload4(srcOffset, src); \n\
uchar8 res; \n\
res.even = (val >> 4) & 0x0F; \n\
res.odd = val & 0x0F; \n\
res |= res << 4; \n\
vstore8(res, 0, dst + ((y + iy)*width + x)); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeI8(global uchar *dst, \n\
const global uchar *src, int width) \n\
{ \n\
int x = get_global_id(0) * 8, y = get_global_id(1) * 4; \n\
int srcOffset = ((x * 4) + (y * width)) / 8; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
vstore8(vload8(srcOffset, src), \n\
0, dst + ((y + iy)*width + x)); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeIA8(global uchar *dst, \n\
const global uchar *src, int width) \n\
{ \n\
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\
int srcOffset = ((x * 4) + (y * width)) / 4; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
uchar8 val = vload8(srcOffset, src); \n\
uchar8 res; \n\
res.odd = val.even; \n\
res.even = val.odd; \n\
vstore8(res, 0, dst + ((y + iy)*width + x) * 2); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeIA4(global uchar *dst, \n\
const global uchar *src, int width) \n\
{ \n\
int x = get_global_id(0) * 8, y = get_global_id(1) * 4; \n\
int srcOffset = ((x * 4) + (y * width)) / 8; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
uchar8 val = vload8(srcOffset, src); \n\
uchar16 res; \n\
res.odd = (val >> 4) & 0x0F; \n\
res.even = val & 0x0F; \n\
res |= res << 4; \n\
vstore16(res, 0, dst + ((y + iy)*width + x) * 2); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeRGBA8(global uchar *dst, \n\
const global uchar *src, int width) \n\
{ \n\
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\
int srcOffset = (x * 2) + (y * width) / 2; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
uchar8 ar = vload8(srcOffset, src); \n\
uchar8 gb = vload8(srcOffset + 4, src); \n\
uchar16 res; \n\
res.even.even = gb.odd; \n\
res.even.odd = ar.odd; \n\
res.odd.even = gb.even; \n\
res.odd.odd = ar.even; \n\
vstore16(res, 0, dst + ((y + iy)*width + x) * 4); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeRGB565(global ushort *dst, \n\
const global ushort *src, int width) \n\
{ \n\
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\
int srcOffset = x + (y * width) / 4; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
ushort4 val = vload4(srcOffset, src); \n\
val = (val >> 8) | (val << 8); \n\
vstore4(val, 0, dst + ((y + iy)*width + x)); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeRGB5A3(global uchar *dst, \n\
const global uchar *src, int width) \n\
{ \n\
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\
int srcOffset = x + (y * width) / 4; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
ushort8 val = convert_ushort8(vload8(srcOffset, src));\n\
ushort4 vs = val.odd | (val.even << 8); \n\
uchar16 resNoAlpha; \n\
resNoAlpha.odd.odd = (uchar4)(0xFF); \n\
resNoAlpha.even.odd = convert_uchar4(vs >> 7) & 0xF8; \n\
resNoAlpha.odd.even = convert_uchar4(vs >> 2) & 0xF8; \n\
resNoAlpha.even.even = convert_uchar4(vs << 3) & 0xF8;\n\
// Better but cause color bleeding \n\
//resNoAlpha |= resNoAlpha >> 5; \n\
uchar16 resAlpha; \n\
resAlpha.even.odd = convert_uchar4(vs >> 8) & 0x0F; \n\
resAlpha.odd.even = convert_uchar4(vs >> 4) & 0x0F; \n\
resAlpha.even.even = convert_uchar4(vs) & 0x0F; \n\
resAlpha |= resNoAlpha << 4; \n\
resAlpha.odd.odd = convert_uchar4(vs >> 7) & 0xE0; \n\
resAlpha.odd.odd |= (resAlpha.odd.odd >> 3) \n\
| (resAlpha.odd.odd >> 6); \n\
uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8), \n\
(uchar4)(vs.s1 >> 8), \n\
(uchar4)(vs.s2 >> 8), \n\
(uchar4)(vs.s3 >> 8)); \n\
uchar16 res; \n\
res = select(resAlpha, resNoAlpha, choice); \n\
vstore16(res, 0, dst + ((y + iy)*width + x) * 4); \n\
srcOffset++; \n\
} \n\
} \n\
\n\
kernel void DecodeDXT(global ulong *dst, \n\
const global ulong *src, int width) \n\
{ // TODO: PLEASE NOTE THAT THIS CODE DOES NOT WORK \n\
int x = get_global_id(0) * 8, y = get_global_id(1) * 8; \n\
int srcOffset = ((x * 4) + (y * width)) / 8; \n\
for (int iy = 0; iy < 4; iy++) \n\
{ \n\
vstore8(vload8(srcOffset, src), \n\
0, dst + ((y + iy)*width + x)); \n\
srcOffset++; \n\
} \n\
} \n\
";
cl_program g_program; cl_program g_program;
// NULL terminated set of kernels // NULL terminated set of kernels
sDecoders Decoders[] = { sDecoders Decoders[] = {
@ -189,22 +47,29 @@ sDecoders Decoders[] = {
{"DecodeRGBA8", NULL}, {"DecodeRGBA8", NULL},
{"DecodeRGB565", NULL}, {"DecodeRGB565", NULL},
{"DecodeRGB5A3", NULL}, {"DecodeRGB5A3", NULL},
{"DecodeDXT", NULL}, {"DecodeCMPR", NULL},
{"", NULL}, {"", NULL},
}; };
bool g_Inited = false; bool g_Inited = false;
cl_mem g_clsrc, g_cldst; // texture buffer memory objects cl_mem g_clsrc, g_cldst; // texture buffer memory objects
void TexDecoder_OpenCL_Initialize() { void TexDecoder_OpenCL_Initialize() {
#if defined(HAVE_OPENCL) && HAVE_OPENCL #if defined(HAVE_OPENCL) && HAVE_OPENCL
if(!g_Inited) if(!g_Inited)
{ {
if(!OpenCL::Initialize()) if(!OpenCL::Initialize())
return; return;
std::string code;
char* filename = "User/OpenCL/TextureDecoder.cl";
if (!File::ReadFileToString(true, filename, code))
{
ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename);
return;
}
g_program = OpenCL::CompileProgram(Kernel); g_program = OpenCL::CompileProgram(code.c_str());
int i = 0; int i = 0;
while(strlen(Decoders[i].name) > 0) { while(strlen(Decoders[i].name) > 0) {
@ -213,16 +78,18 @@ void TexDecoder_OpenCL_Initialize() {
} }
// Allocating maximal Wii texture size in advance, so that we don't have to allocate/deallocate per texture // Allocating maximal Wii texture size in advance, so that we don't have to allocate/deallocate per texture
#ifndef DEBUG_OPENCL
g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL); g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL);
g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL); g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL);
#endif
g_Inited = true; g_Inited = true;
} }
#endif #endif
} }
void TexDecoder_OpenCL_Shutdown() { void TexDecoder_OpenCL_Shutdown() {
#if defined(HAVE_OPENCL) && HAVE_OPENCL #if defined(HAVE_OPENCL) && HAVE_OPENCL && !defined(DEBUG_OPENCL)
if(g_clsrc) if(g_clsrc)
clReleaseMemObject(g_clsrc); clReleaseMemObject(g_clsrc);
@ -257,7 +124,6 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
formatResult = PC_TEX_FMT_I8; formatResult = PC_TEX_FMT_I8;
break; break;
case GX_TF_IA4: case GX_TF_IA4:
// Maybe a cleaner way is needed
kernelToRun = Decoders[2].kernel; kernelToRun = Decoders[2].kernel;
sizeOfSrc = sizeof(u8); sizeOfSrc = sizeof(u8);
sizeOfDst = sizeof(u16); sizeOfDst = sizeof(u16);
@ -295,9 +161,10 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
formatResult = PC_TEX_FMT_BGRA32; formatResult = PC_TEX_FMT_BGRA32;
break; break;
case GX_TF_CMPR: case GX_TF_CMPR:
return PC_TEX_FMT_NONE; // <-- TODO: Fix CMPR return PC_TEX_FMT_NONE; // Remove to test CMPR
kernelToRun = Decoders[7].kernel; kernelToRun = Decoders[7].kernel;
sizeOfSrc = sizeOfDst = sizeof(u32); sizeOfSrc = sizeof(u8) / 2.0f;
sizeOfDst = sizeof(u32);
xSkip = 8; xSkip = 8;
ySkip = 8; ySkip = 8;
formatResult = PC_TEX_FMT_BGRA32; formatResult = PC_TEX_FMT_BGRA32;
@ -306,6 +173,11 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
return PC_TEX_FMT_NONE; return PC_TEX_FMT_NONE;
} }
#ifdef DEBUG_OPENCL
g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL);
g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL);
#endif
clEnqueueWriteBuffer(OpenCL::GetCommandQueue(), g_clsrc, CL_TRUE, 0, (size_t)(width * height * sizeOfSrc), src, 0, NULL, NULL); clEnqueueWriteBuffer(OpenCL::GetCommandQueue(), g_clsrc, CL_TRUE, 0, (size_t)(width * height * sizeOfSrc), src, 0, NULL, NULL);
clSetKernelArg(kernelToRun, 0, sizeof(cl_mem), &g_cldst); clSetKernelArg(kernelToRun, 0, sizeof(cl_mem), &g_cldst);
@ -324,125 +196,22 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL); err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL);
if(err) if(err)
PanicAlert("Error queueing kernel"); OpenCL::HandleCLError(err, "Failed to enqueue kernel");
clFinish(OpenCL::GetCommandQueue()); clFinish(OpenCL::GetCommandQueue());
clEnqueueReadBuffer(OpenCL::GetCommandQueue(), g_cldst, CL_TRUE, 0, (size_t)(width * height * sizeOfDst), dst, 0, NULL, NULL); clEnqueueReadBuffer(OpenCL::GetCommandQueue(), g_cldst, CL_TRUE, 0, (size_t)(width * height * sizeOfDst), dst, 0, NULL, NULL);
#ifdef DEBUG_OPENCL
clReleaseMemObject(g_clsrc);
clReleaseMemObject(g_cldst);
#endif
return formatResult; return formatResult;
#else #else
return PC_TEX_FMT_NONE; return PC_TEX_FMT_NONE;
#endif #endif
/* switch (texformat)
{
case GX_TF_C4:
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
}
else
{
for (int y = 0; y < height; y += 8)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 8; iy++, src += 4)
decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
}
return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_C8:
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
}
else
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
}
return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_C14X2:
if (tlutfmt == 2)
{
// Special decoding is required for TLUT format 5A3
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
}
else
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
}
return GetPCFormatFromTLUTFormat(tlutfmt);
case GX_TF_RGB565:
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
{
u16 *ptr = (u16 *)dst + (y + iy) * width + x;
u16 *s = (u16 *)src;
for(int j = 0; j < 4; j++)
*ptr++ = Common::swap16(*s++);
}
}
return PC_TEX_FMT_RGB565;
case GX_TF_RGB5A3:
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
//decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src);
}
return PC_TEX_FMT_BGRA32;
case GX_TF_RGBA8: // speed critical
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
{
for (int iy = 0; iy < 4; iy++)
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
src += 64;
}
}
return PC_TEX_FMT_BGRA32;
case GX_TF_CMPR: // speed critical
// The metroid games use this format almost exclusively.
{
for (int y = 0; y < height; y += 8)
{
for (int x = 0; x < width; x += 8)
{
decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src, width);
src += sizeof(DXTBlock);
}
}
return PC_TEX_FMT_BGRA32;
}
}
*/
// The "copy" texture formats, too?
return PC_TEX_FMT_NONE; return PC_TEX_FMT_NONE;
} }

View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="Windows-1252"?> <?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject <VisualStudioProject
ProjectType="Visual C++" ProjectType="Visual C++"
Version="9.00" Version="9,00"
Name="VideoCommon" Name="VideoCommon"
ProjectGUID="{E5D1F0C0-AA07-4841-A4EB-4CF4DAA6B0FA}" ProjectGUID="{E5D1F0C0-AA07-4841-A4EB-4CF4DAA6B0FA}"
RootNamespace="VideoCommon" RootNamespace="VideoCommon"
@ -17,6 +17,9 @@
/> />
</Platforms> </Platforms>
<ToolFiles> <ToolFiles>
<ToolFile
RelativePath=".\OpenCL.rules"
/>
</ToolFiles> </ToolFiles>
<Configurations> <Configurations>
<Configuration <Configuration
@ -32,6 +35,9 @@
<Tool <Tool
Name="VCCustomBuildTool" Name="VCCustomBuildTool"
/> />
<Tool
Name="OpenCL source"
/>
<Tool <Tool
Name="VCXMLDataGeneratorTool" Name="VCXMLDataGeneratorTool"
/> />
@ -95,6 +101,9 @@
<Tool <Tool
Name="VCCustomBuildTool" Name="VCCustomBuildTool"
/> />
<Tool
Name="OpenCL source"
/>
<Tool <Tool
Name="VCXMLDataGeneratorTool" Name="VCXMLDataGeneratorTool"
/> />
@ -160,6 +169,9 @@
<Tool <Tool
Name="VCCustomBuildTool" Name="VCCustomBuildTool"
/> />
<Tool
Name="OpenCL source"
/>
<Tool <Tool
Name="VCXMLDataGeneratorTool" Name="VCXMLDataGeneratorTool"
/> />
@ -226,6 +238,9 @@
<Tool <Tool
Name="VCCustomBuildTool" Name="VCCustomBuildTool"
/> />
<Tool
Name="OpenCL source"
/>
<Tool <Tool
Name="VCXMLDataGeneratorTool" Name="VCXMLDataGeneratorTool"
/> />
@ -292,6 +307,9 @@
<Tool <Tool
Name="VCCustomBuildTool" Name="VCCustomBuildTool"
/> />
<Tool
Name="OpenCL source"
/>
<Tool <Tool
Name="VCXMLDataGeneratorTool" Name="VCXMLDataGeneratorTool"
/> />
@ -353,6 +371,9 @@
<Tool <Tool
Name="VCCustomBuildTool" Name="VCCustomBuildTool"
/> />
<Tool
Name="OpenCL source"
/>
<Tool <Tool
Name="VCXMLDataGeneratorTool" Name="VCXMLDataGeneratorTool"
/> />
@ -699,6 +720,10 @@
RelativePath=".\Src\OpenCL\OCLTextureDecoder.h" RelativePath=".\Src\OpenCL\OCLTextureDecoder.h"
> >
</File> </File>
<File
RelativePath="..\..\..\Data\User\OpenCL\TextureDecoder.cl"
>
</File>
</Filter> </Filter>
</Filter> </Filter>
<File <File