Shaders: Reintroduce the old ASCII-art shader.

Dolphin had a nice but horribly slow ASCII art shader.
Besides being slow, it used the font from Dolphin's old OSD implementation,
so it was also broken after the VideoCommon redesign.

This patch reintroduces the (almost) identical method for the ASCII art generation.
However with many improvements:
* Inline definition of the font, so no external dependency any more
* Optimized performance by partial unrolling with CSE'ing some memory loads
* Shader subgroup instructions, so 32 times faster on desktop GPUs on Vulkan + OGL (no D3D support)
* Option to select character size by internal or window resolution
* Dropped the last row of pixels of each character (only modification of the outcome)
This commit is contained in:
degasus 2023-01-27 23:54:09 +01:00
parent 7de01597c6
commit 1571098783

View File

@ -0,0 +1,411 @@
/*
[configuration]
[OptionBool]
GUIName = Use target window resolution
OptionName = USE_WINDOW_RES
DefaultValue = true
[OptionBool]
GUIName = Debug: Calculate only one character per subgroup
OptionName = DEBUG_ONLY_ONE_CHAR
DefaultValue = false
[/configuration]
*/
// Number of font characters that are tested; max 96, must be a multiple of 32
const uint MAX_CHARS = 96u;
// When the subgroup path is not taken, main() runs CalcCharFallback if this is
// true and passes the input color through otherwise
const bool HAVE_FULL_FEATURE_FALLBACK = true;
// Characters evaluated per outer loop iteration in CalcCharFallback
const uint UNROLL_FALLBACK = 4u;
// Characters evaluated per lane and outer loop iteration in CalcCharSIMD;
// max MAX_CHARS / 32
const uint UNROLL_SIMD = 3u;
// #undef SUPPORTS_SUBGROUP_REDUCTION
/*
The header-only font
We have 96 (ASCII) characters, each of them is 12 pixels high and 8 pixels wide.
To store the boolean value per pixel, 96 bits per character are needed.
So three 32 bit integers are used per character.
This takes in total roughly 1 kB of constant buffer.
The first character must be all-one for the optimized implementation below.
*/
// Geometry of one font character, in pixels
const uint char_width = 8u;
const uint char_height = 12u;
// Number of characters stored in rasters / raster_active_pixels
const uint char_count = 96u;
// Pixels (= font bits) per character
const uint char_pixels = char_width * char_height;
// Character size as float vector, used to convert between pixel and character coordinates
const float2 char_dim = float2(char_width, char_height);
// Font bitmaps: one row of 32 bit words per character.
// Pixel p = x + char_width * y of a character is stored in bit (p % 32) of word (p / 32).
// Entry 0 is the all-one character, the last entry is all-zero.
const uint rasters[char_count][(char_pixels + 31) / 32] = {
{0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}, {0x18181818, 0x00181818, 0x00181800},
{0x6c6c6c6c, 0x00000000, 0x00000000}, {0x66660000, 0xff6666ff, 0x00006666},
{0x1bff7e18, 0xd8f87e1f, 0x00187eff}, {0x6edb1b0e, 0x760c1830, 0x0070d8db},
{0x3333361c, 0x1b0e0e1b, 0x00fe63f3}, {0x18383070, 0x00000000, 0x00000000},
{0x0c0c1830, 0x0c0c0c0c, 0x0030180c}, {0x3030180c, 0x30303030, 0x000c1830},
{0x5a990000, 0x5a3cff3c, 0x00000099}, {0x18180000, 0x18ffff18, 0x00001818},
{0x00000000, 0x38000000, 0x000c1838}, {0x00000000, 0x00ffff00, 0x00000000},
{0x00000000, 0x00000000, 0x00001c1c}, {0x6060c0c0, 0x18183030, 0x06060c0c},
{0xe3c3663c, 0xc7cfdbf3, 0x003c66c3}, {0x181e1c18, 0x18181818, 0x007e1818},
{0x60c0e77e, 0x060c1830, 0x00ff0303}, {0xc0c0e77e, 0xc0e07ee0, 0x007ee7c0},
{0x363c3830, 0x3030ff33, 0x00303030}, {0x030303ff, 0xc0e07f03, 0x007ee7c0},
{0x0303e77e, 0xc3e37f03, 0x007ee7c3}, {0xc0c0c0ff, 0x0c183060, 0x000c0c0c},
{0xc3c3e77e, 0xc3e77ee7, 0x007ee7c3}, {0xc3c3e77e, 0xc0c0fee7, 0x007ee7c0},
{0x00000000, 0x00001c1c, 0x00001c1c}, {0x38000000, 0x38000038, 0x000c1838},
{0x0c183060, 0x0c060306, 0x00603018}, {0x00000000, 0xff00ffff, 0x000000ff},
{0x30180c06, 0x3060c060, 0x00060c18}, {0xc0c3c37e, 0x18183060, 0x00180000},
{0x7e000000, 0xdbcbbbc3, 0x00fc06f3}, {0xc3663c18, 0xc3ffc3c3, 0x00c3c3c3},
{0xc3c3e37f, 0xc3e37fe3, 0x007fe3c3}, {0x0303e77e, 0x03030303, 0x007ee703},
{0xc3e3733f, 0xc3c3c3c3, 0x003f73e3}, {0x030303ff, 0x03033f03, 0x00ff0303},
{0x030303ff, 0x0303033f, 0x00030303}, {0x0303e77e, 0xc3f30303, 0x007ee7c3},
{0xc3c3c3c3, 0xc3c3ffc3, 0x00c3c3c3}, {0x1818187e, 0x18181818, 0x007e1818},
{0x60606060, 0x60606060, 0x003e7763}, {0x1b3363c3, 0x1b0f070f, 0x00c36333},
{0x03030303, 0x03030303, 0x00ff0303}, {0xffffe7c3, 0xc3c3c3db, 0x00c3c3c3},
{0xcfcfc7c7, 0xf3fbdbdf, 0x00e3e3f3}, {0xc3c3e77e, 0xc3c3c3c3, 0x007ee7c3},
{0xc3c3e37f, 0x03037fe3, 0x00030303}, {0xc3c3663c, 0xdbc3c3c3, 0x00fc76fb},
{0xc3c3e37f, 0x1b0f7fe3, 0x00c36333}, {0x0303e77e, 0xc0e07e07, 0x007ee7c0},
{0x181818ff, 0x18181818, 0x00181818}, {0xc3c3c3c3, 0xc3c3c3c3, 0x007ee7c3},
{0xc3c3c3c3, 0x6666c3c3, 0x00183c3c}, {0xc3c3c3c3, 0xffdbdbc3, 0x00c3e7ff},
{0x3c6666c3, 0x3c3c183c, 0x00c36666}, {0x3c6666c3, 0x1818183c, 0x00181818},
{0x60c0c0ff, 0x060c7e30, 0x00ff0303}, {0x0c0c0c3c, 0x0c0c0c0c, 0x003c0c0c},
{0x0c0c0606, 0x30301818, 0xc0c06060}, {0x3030303c, 0x30303030, 0x003c3030},
{0xc3663c18, 0x00000000, 0x00000000}, {0x00000000, 0x00000000, 0xff000000},
{0x181c0c0e, 0x00000000, 0x00000000}, {0x00000000, 0xfec0c37e, 0x00fec3c3},
{0x03030303, 0xc3c37f03, 0x007fc3c3}, {0x00000000, 0x0303c37e, 0x007ec303},
{0xc0c0c0c0, 0xc3c3fec0, 0x00fec3c3}, {0x00000000, 0x7fc3c37e, 0x00fe0303},
{0x0c0ccc78, 0x0c0c3f0c, 0x000c0c0c}, {0x00000000, 0xc3c3c37e, 0xc3c0c0fe},
{0x03030303, 0xc3c3c37f, 0x00c3c3c3}, {0x00001800, 0x18181818, 0x00181818},
{0x00003000, 0x30303030, 0x36303030}, {0x03030303, 0x0f1b3363, 0x0063331f},
{0x1818181e, 0x18181818, 0x007e1818}, {0x00000000, 0xdbdbdb7f, 0x00dbdbdb},
{0x00000000, 0x6363633f, 0x00636363}, {0x00000000, 0x6363633e, 0x003e6363},
{0x00000000, 0xc3c3c37f, 0x03037fc3}, {0x00000000, 0xc3c3c3fe, 0xc0c0fec3},
{0x00000000, 0x0303077f, 0x00030303}, {0x00000000, 0x7e0303fe, 0x007fc0c0},
{0x0c0c0c00, 0x0c0c0c3f, 0x00386c0c}, {0x00000000, 0x63636363, 0x007e6363},
{0x00000000, 0x6666c3c3, 0x00183c3c}, {0x00000000, 0xdbc3c3c3, 0x00c3e7ff},
{0x00000000, 0x183c66c3, 0x00c3663c}, {0x00000000, 0x3c6666c3, 0x06060c18},
{0x00000000, 0x183060ff, 0x00ff060c}, {0x181818f0, 0x181c0f1c, 0x00f01818},
{0x18181818, 0x18181818, 0x18181818}, {0x1818180f, 0x1838f038, 0x000f1818},
{0x06000000, 0x0060f18f, 0x00000000}, {0x00000000, 0x00000000, 0x00000000}};
// Precalculated sum of all pixels per character, i.e. the number of set bits
// of the matching rasters entry. CalcCharRes reads this as "f" (sum of all
// font samples) instead of counting the bits at runtime.
const uint raster_active_pixels[char_count] = {
96, 18, 16, 40, 56, 42, 46, 10, 22, 22, 32, 28, 10, 16, 6, 24,
52, 29, 36, 44, 35, 42, 50, 28, 58, 51, 12, 16, 22, 32, 22, 26,
41, 46, 57, 38, 52, 38, 32, 46, 48, 30, 31, 43, 28, 56, 64, 52,
42, 52, 52, 44, 28, 48, 42, 58, 42, 32, 38, 26, 24, 26, 14, 8,
10, 34, 40, 26, 40, 32, 30, 33, 39, 16, 20, 37, 28, 43, 30, 30,
34, 34, 20, 28, 27, 30, 26, 36, 26, 24, 26, 30, 24, 30, 14, 0};
// Look up a single pixel of the font.
//   pos.x: pixel index inside the character (x + char_width * y)
//   pos.y: character index into rasters
// Returns 1.0 if the font bit is set and 0.0 otherwise.
float SampleFont(uint2 pos) {
  uint font_word = rasters[pos.y][pos.x / 32];
  uint bit_index = pos.x % 32;
  return float((font_word >> bit_index) & 1u);
}
// Fetch one framebuffer sample that belongs to a character cell.
//   char_pos: position of the cell, counted in characters
//   pixel:    pixel index inside the cell (x + char_width * y)
// Returns the RGB value sampled at the center of that pixel.
float3 SampleTex(uint2 char_pos, uint pixel) {
  // Scale from pixel coordinates to the normalized coordinates SampleLocation expects
  float2 inv_resolution;
  if (OptionEnabled(USE_WINDOW_RES)) {
    inv_resolution = GetInvWindowResolution();
  } else {
    inv_resolution = GetInvResolution();
  }
  float2 cell_origin = char_pos * char_dim;
  float2 pixel_in_cell = float2(pixel % char_width, pixel / char_width);
  // + 0.5 moves the sample position to the pixel center
  float2 tex_pos = cell_origin + pixel_in_cell + 0.5;
  return SampleLocation(tex_pos * inv_resolution).xyz;
}
// Result of evaluating one character against one character cell
struct CharResults {
float3 fg; // font color
float3 bg; // background color
float err; // error score of this configuration, lower is better (CalcCharRes drops the constant tt term, so it only ranks characters)
uint c; // character index
};
// Calculate the font color, the background color and the error score for a
// given character.
//   c:  character index into rasters / raster_active_pixels
//   t:  sum of all texture samples of the cell (per color channel)
//   ft: sum of all font samples * texture samples of the cell (per channel)
// Returns a CharResults with c always set. For the degenerate characters
// (all pixels set or no pixel set) err is char_pixels * char_pixels and fg/bg
// are zero; callers start their search with that same err value and compare
// with `<`, so such a result is never selected.
CharResults CalcCharRes(uint c, float3 t, float3 ft) {
  CharResults o;
  o.c = c;
  // Defined values for the early return below
  o.fg = float3(0.0, 0.0, 0.0);
  o.bg = float3(0.0, 0.0, 0.0);

  // Inputs:
  // tt: sum of all texture samples squared
  // t: sum of all texture samples
  // ff: sum of all font samples squared
  // f: sum of all font samples
  // ft: sum of all font samples * texture samples
  // The font is either 1.0 or 0.0, so ff == f
  // As the font is constant, this is pre-calculated
  float f = raster_active_pixels[c];
  float ff = f;

  // The divisor used below is f * f - ff * char_pixels = f * (f - char_pixels).
  // It is zero if the font is all-one or all-zero, so the calculation would
  // end in 0 / 0. Return max err instead.
  if (f == 0.0 || f == char_pixels) {
    o.err = char_pixels * char_pixels;
    return o;
  }

  // tt is only used as constant offset for the error, define it as zero
  float3 tt = float3(0.0, 0.0, 0.0);

  // The next lines are a bit harder, hf :-)
  // The idea is to find the perfect char with the perfect background color
  // and the perfect font color. As this is an equation with three unknowns,
  // we can't just try all chars and color combinations.
  // As criterion how "perfect" the selection is, we compare the "mean
  // squared error" of the resulted colors of all chars. So, now the big
  // issue: how to calculate the MSE without knowing the two colors ...
  // In the next steps, "a" is the font color, "b" is the background color,
  // "f" is the font value at this pixel, "t" is the texture value
  // So the square error of one pixel is:
  // e = ( t - a⋅f - b⋅(1-f) ) ^ 2
  // In longer:
  // e = a^2⋅f^2 - 2⋅a⋅b⋅f^2 + 2⋅a⋅b⋅f - 2⋅a⋅f⋅t + b^2⋅f^2 - 2⋅b^2⋅f + b^2 +
  //     2⋅b⋅f⋅t - 2⋅b⋅t + t^2
  // The sum of all errors is: (as shortcut, ff,f,ft,t,tt are now the sums
  // like declared above, sum(1) is the count of pixels)
  // sum(e) = a^2⋅ff - 2⋅a⋅b⋅ff + 2⋅a⋅b⋅f - 2⋅a⋅ft + b^2⋅ff - 2⋅b^2⋅f +
  //          b^2⋅sum(1) + 2⋅b⋅ft - 2⋅b⋅t + tt
  // tt is only used as a constant offset, so its value has no effect on a,b or
  // on the relative error. So it can be completely dropped.
  // To find the minimum, we have to derive this by "a" and "b":
  // d/da sum(e) = 2⋅a⋅ff + 2⋅b⋅f - 2⋅b⋅ff - 2⋅ft
  // d/db sum(e) = 2⋅a⋅f - 2⋅a⋅ff - 4⋅b⋅f + 2⋅b⋅ff + 2⋅b⋅sum(1) + 2⋅ft - 2⋅t
  // So, both equations must be zero at minimum and there is only one
  // solution.
  float3 a = (ft * (f - float(char_pixels)) + t * (f - ff)) / (f * f - ff * float(char_pixels));
  float3 b = (ft * f - t * ff) / (f * f - ff * float(char_pixels));

  // Insert a and b into sum(e) to get the error of this character
  float3 e = a * a * ff + 2.0 * a * b * (f - ff) - 2.0 * a * ft +
             b * b * (-2.0 * f + ff + float(char_pixels)) + 2.0 * b * ft -
             2.0 * b * t + tt;

  // Add up the error of all three color channels
  o.err = dot(e, float3(1.0, 1.0, 1.0));
  o.fg = a;
  o.bg = b;
  return o;
}
// Get the color of the pixel of this invocation based on the character details.
//   char_out: selected character (c) with its font color (fg) and background color (bg)
// Returns fg where the font bit of this pixel is set and bg elsewhere.
float3 GetFinalPixel(CharResults char_out) {
  float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
  // Character cell this pixel belongs to
  uint2 char_pos = uint2(floor(GetCoordinates() * resolution / char_dim));
  // Position of this pixel inside its cell
  uint2 pixel_offset = uint2(floor(GetCoordinates() * resolution) - char_pos * char_dim);
  // SampleFont takes a uint2 of (pixel index, character index)
  uint pixel_index = pixel_offset.x + char_width * pixel_offset.y;
  float font = SampleFont(uint2(pixel_index, char_out.c));
  return char_out.fg * font + char_out.bg * (1.0 - font);
}
/*
This shader performs some kind of brute force evaluation, which character fits best.
for c in characters:
for p in pixels:
ft += font(c,p) * texture(p)
res = CalcCharRes(ft)
min(res.err)
Terrible in performance, only for reference.
*/
// Reference implementation: test every character pixel by pixel and keep the
// one with the smallest error.
//   char_pos: position of the cell, counted in characters
CharResults CalcCharTrivial(uint2 char_pos) {
  CharResults best;
  best.err = char_pixels * char_pixels;
  // Sum of all texture samples; taken from character 0, whose font is all-one
  float3 tex_sum;
  for (uint glyph = 0; glyph < MAX_CHARS; ++glyph) {
    // Sum of font * texture for this character
    float3 weighted_sum = float3(0.0, 0.0, 0.0);
    for (uint p = 0; p < char_pixels; ++p) {
      float3 texel = SampleTex(char_pos, p);
      float lit = SampleFont(uint2(p, glyph));
      weighted_sum += lit * texel;
    }
    if (glyph == 0) {
      tex_sum = weighted_sum;
    }
    CharResults candidate = CalcCharRes(glyph, tex_sum, weighted_sum);
    if (candidate.err < best.err) {
      best = candidate;
    }
  }
  return best;
}
/*
However for better performance, some characters are tested at once. This saves some expensive texture() calls.
Also split the loop over the pixels in groups of 32 for only fetching the uint32 of the font once.
*/
// Search the best character for one cell without subgroup operations.
// UNROLL_FALLBACK characters are evaluated together so that each texture
// sample is fetched once per batch.
//   char_pos: position of the cell, counted in characters
CharResults CalcCharFallback(uint2 char_pos) {
  CharResults best;
  best.err = char_pixels * char_pixels;
  // Sum of all texture samples of the cell, captured while handling character 0
  float3 tex_sum;
  for (uint base = 0; base < MAX_CHARS; base += UNROLL_FALLBACK) {
    // One "sum of font * texture" accumulator per character of this batch
    float3 acc[UNROLL_FALLBACK];
    for (uint n = 0; n < UNROLL_FALLBACK; ++n) {
      acc[n] = float3(0.0, 0.0, 0.0);
    }
    // Walk the pixels in groups of 32, so every font word (one bit per pixel)
    // is loaded only once per character.
    for (uint word_start = 0; word_start < char_pixels; word_start += 32) {
      uint font_bits[UNROLL_FALLBACK];
      for (uint n = 0; n < UNROLL_FALLBACK; ++n) {
        font_bits[n] = rasters[base + n][word_start / 32];
      }
      for (uint bit = 0; bit < 32; ++bit) {
        float3 texel = SampleTex(char_pos, word_start + bit);
        // Inner kernel of `ft += font * tex`. Most of the time is spent here.
        for (uint n = 0; n < UNROLL_FALLBACK; ++n) {
          float lit = float((font_bits[n] >> bit) & 1u);
          acc[n] += lit * texel;
        }
      }
    }
    if (base == 0) {
      // The first character is all-one, so its accumulator equals the plain texture sum.
      tex_sum = acc[0];
    }
    // Keep whichever character of this batch beats the best one so far.
    for (uint n = 0; n < UNROLL_FALLBACK; ++n) {
      CharResults candidate = CalcCharRes(base + n, tex_sum, acc[n]);
      if (candidate.err < best.err) {
        best = candidate;
      }
    }
  }
  return best;
}
/*
SIMD optimized version with subgroup intrinsics
- distribute all characters over the lanes and check for them in parallel
- distribute the uniform texture access and broadcast each back to each lane
*/
// Search the best character for one cell with `simd_width` cooperating lanes.
//   char_pos:   position of the cell, counted in characters
//   simd_width: number of lanes (counted from lane 0) that take part in the search
// Each participating lane tests UNROLL_SIMD consecutive characters per outer
// iteration. The per-lane winners are then reduced, so the lanes executing the
// reduction end up with the fg, bg, c and err of the overall best character.
// Without SUPPORTS_SUBGROUP_REDUCTION only char_out.err is initialized.
CharResults CalcCharSIMD(uint2 char_pos, uint simd_width) {
// Font color, bg color, character, error -- of character with minimum error
CharResults char_out;
char_out.err = char_pixels * char_pixels;
// Sum of all texture samples of the cell, set in the first outer iteration
float3 t;
#ifdef SUPPORTS_SUBGROUP_REDUCTION
// Hack: Work in hard-coded fixed SIMD mode, lanes >= simd_width skip the search
if (gl_SubgroupInvocationID < simd_width) {
// Loop over all characters
for (uint c = 0; c < MAX_CHARS; c += UNROLL_SIMD * simd_width) {
// registers for "sum of font * texture"
float3 ft[UNROLL_SIMD];
for (uint i = 0; i < UNROLL_SIMD; i++)
ft[i] = float3(0.0, 0.0, 0.0);
for (uint pixel = 0; pixel < char_pixels; pixel += 32) {
// Preload the font uint32 for the next 32 pixels
uint font_i[UNROLL_SIMD];
for (uint i = 0; i < UNROLL_SIMD; i++)
font_i[i] = rasters[c + UNROLL_SIMD*gl_SubgroupInvocationID + i][pixel / 32];
for (uint pixel_offset = 0; pixel_offset < 32; pixel_offset += simd_width) {
// Every lane fetches one texture sample, so simd_width consecutive pixels
// are held in registers and shuffled around for later usage. This avoids
// one memory transaction per tested pixel & character.
float3 tex_simd = SampleTex(char_pos, pixel + pixel_offset + gl_SubgroupInvocationID);
for (uint k = 0; k < simd_width; k += 1) {
// Sample of pixel (pixel + pixel_offset + k), fetched by lane k
float3 tex = subgroupBroadcast(tex_simd, k);
// Note: As pixel iterates based on power-of-two gl_SubgroupSize, the
// const memory access to rasters is CSE'd and the inner loop
// after unrolling only contains: testing one bit + shuffle +
// conditional add
for (uint i = 0; i < UNROLL_SIMD; i++) {
float font = (font_i[i] >> (k + pixel_offset % 32)) & uint(1);
ft[i] += font * tex;
}
}
}
}
if (c == 0) {
// font[0] is a hardcoded 1 font, so t = ft. Lane 0 holds character 0 here.
t = subgroupBroadcast(ft[0], 0);
}
// Keep the best character of this lane
for (uint i = 0; i < UNROLL_SIMD; i++) {
CharResults res = CalcCharRes(c + UNROLL_SIMD*gl_SubgroupInvocationID + i, t, ft[i]);
if (res.err < char_out.err)
char_out = res;
}
}
}
// Broadcast to get the best character of all threads: find the minimal error,
// pick the lowest lane holding it and copy its result to every lane.
float err_min = subgroupMin(char_out.err);
uint smallest = subgroupBallotFindLSB(subgroupBallot(err_min == char_out.err));
char_out.fg = subgroupBroadcast(char_out.fg, smallest);
char_out.bg = subgroupBroadcast(char_out.bg, smallest);
char_out.c = subgroupBroadcast(char_out.c, smallest);
char_out.err = err_min;
#endif
return char_out;
}
// Check whether the lanes 0 .. simd_width-1 of this subgroup are all active.
//   simd_width: number of lanes to test, at most 32
// Always false if SUPPORTS_SUBGROUP_REDUCTION is not defined.
bool supportsSIMD(uint simd_width) {
#ifdef SUPPORTS_SUBGROUP_REDUCTION
  // Bit mask with the lowest simd_width bits set. 32 is handled separately,
  // as the shift expression cannot produce a full 32 bit mask.
  uint wanted = 0xFFFFFFFFu;
  if (simd_width != 32u) {
    wanted = (1u << simd_width) - 1u;
  }
  uint4 active_lanes = subgroupBallot(true);
  return (active_lanes.x & wanted) == wanted;
#else
  return false;
#endif
}
// Entry point: determine the character cell of this pixel, find the best
// matching character for that cell and output the color of this pixel.
// With subgroup support, the lanes of a subgroup work together on one cell
// after the other; otherwise every invocation searches on its own (or passes
// the input through if HAVE_FULL_FEATURE_FALLBACK is false).
void main() {
  // Calculate the character position of this pixel
  float2 resolution = OptionEnabled(USE_WINDOW_RES) ? GetWindowResolution() : GetResolution();
  uint2 char_pos_self = uint2(floor(GetCoordinates() * resolution / char_dim));

  // Start with a defined color: lanes that leave the loop below through the
  // DEBUG_ONLY_ONE_CHAR break never get a color assigned, but still reach
  // SetOutput().
  float3 color_out = float3(0.0, 0.0, 0.0);

#ifdef SUPPORTS_SUBGROUP_REDUCTION
  if (supportsSIMD(8u)) {
    // Loop over all character positions covered by this wave
    bool pixel_active = !gl_HelperInvocation;
    CharResults char_out;
    while (true) {
      // Fetch the next active character position
      uint4 active_lanes = subgroupBallot(pixel_active);
      if (active_lanes == uint4(0, 0, 0, 0)) {
        break;
      }
      uint2 char_pos = subgroupBroadcast(char_pos_self, subgroupBallotFindLSB(active_lanes));

      // And calculate everything for this character position,
      // using the widest lane count that is fully active
      if (supportsSIMD(32u)) {
        char_out = CalcCharSIMD(char_pos, 32u);
      } else if (supportsSIMD(16u)) {
        char_out = CalcCharSIMD(char_pos, 16u);
      } else if (supportsSIMD(8u)) {
        char_out = CalcCharSIMD(char_pos, 8u);
      }

      // Draw the character on screen. Only lanes inside this cell take the
      // result, they are done afterwards.
      if (char_pos == char_pos_self) {
        color_out = GetFinalPixel(char_out);
        pixel_active = false;
      }
      if (OptionEnabled(DEBUG_ONLY_ONE_CHAR)) {
        break;
      }
    }
  } else
#endif
  if (HAVE_FULL_FEATURE_FALLBACK) {
    color_out = GetFinalPixel(CalcCharFallback(char_pos_self));
  } else {
    color_out = Sample().xyz;
  }

  SetOutput(float4(color_out, 1.0));
}