Files
melonDS/src/GPU3D_Compute.cpp
2023-09-26 23:20:16 +02:00

1501 lines
52 KiB
C++

/*
Copyright 2016-2022 melonDS team
This file is part of melonDS.
melonDS is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with melonDS. If not, see http://www.gnu.org/licenses/.
*/
#include "GPU3D_Compute.h"
#include <assert.h>
#define XXH_STATIC_LINKING_ONLY
#include "xxhash/xxhash.h"
#include "OpenGLSupport.h"
#include "GPU3D_Compute_shaders.h"
namespace GPU3D
{
// Constructs the compute renderer. No GL objects are created here; that
// happens in Init() and SetRenderSettings().
// NOTE(review): the `true` passed to the Renderer3D base presumably flags the
// renderer as hardware accelerated — confirm against Renderer3D's constructor.
ComputeRenderer::ComputeRenderer()
: Renderer3D(true)
{}
// Nothing to release here: GL resources are freed explicitly through DeInit().
ComputeRenderer::~ComputeRenderer() = default;
// Assembles the full GLSL source for one compute program and compiles it.
//
// The generated source consists of, in order:
//   1. the "#version 430 core" line,
//   2. one "#define <name>" line per entry in `defines`,
//   3. "#define" lines for ScreenWidth, ScreenHeight and MaxWorkTiles taken
//      from the renderer's current settings,
//   4. the shared ComputeRendererShaders::Common code,
//   5. the program specific `source`.
//
// shader  - receives the handle of the compiled program.
// source  - program specific GLSL code.
// defines - preprocessor symbols selecting the program variant; they are also
//           joined (comma terminated) into the name handed to the compiler
//           helper.
// Returns the result of OpenGL::CompileComputeProgram.
bool ComputeRenderer::CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines)
{
    std::string shaderName;
    std::string shaderSource = "#version 430 core\n";

    for (const char* define : defines)
    {
        shaderSource += "#define ";
        shaderSource += define;
        shaderSource += '\n';

        shaderName += define;
        shaderName += ',';
    }

    shaderSource += "#define ScreenWidth ";
    shaderSource += std::to_string(ScreenWidth);
    shaderSource += "\n#define ScreenHeight ";
    shaderSource += std::to_string(ScreenHeight);
    shaderSource += "\n#define MaxWorkTiles ";
    shaderSource += std::to_string(MaxWorkTiles);
    // Terminate the last #define explicitly so the result does not depend on
    // the common source happening to start with a line break.
    shaderSource += '\n';

    shaderSource += ComputeRendererShaders::Common;
    shaderSource += source;

    return OpenGL::CompileComputeProgram(shader, shaderSource.c_str(), shaderName.c_str());
}
// Debug message callback matching the parameter list expected by
// glDebugMessageCallback (see the commented-out registration in Init()).
// Only the message text is used; it is printed to stdout followed by a newline.
void blah(GLenum /*source*/, GLenum /*type*/, GLuint /*id*/, GLenum /*severity*/,
          GLsizei /*length*/, const GLchar* message, const void* /*userParam*/)
{
    printf("%s\n", message);
}
bool ComputeRenderer::Init()
{
//glDebugMessageCallback(blah, NULL);
//glEnable(GL_DEBUG_OUTPUT);
glGenBuffers(1, &YSpanSetupMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &RenderPolygonMemory);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);
glGenBuffers(1, &XSpanSetupMemory);
glGenBuffers(1, &BinResultMemory);
glGenBuffers(1, &FinalTileMemory);
glGenBuffers(1, &YSpanIndicesTextureMemory);
glGenBuffers(1, &TileMemory);
glGenTextures(1, &YSpanIndicesTexture);
glGenTextures(1, &LowResFramebuffer);
glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192);
glGenBuffers(1, &MetaUniformMemory);
glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);
glGenSamplers(9, Samplers);
for (u32 j = 0; j < 3; j++)
{
for (u32 i = 0; i < 3; i++)
{
const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT};
glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_WRAP_S, translateWrapMode[i]);
glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_WRAP_T, translateWrapMode[j]);
glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glSamplerParameterf(Samplers[i+j*3], GL_TEXTURE_MAG_FILTER, GL_NEAREST);
}
}
glGenBuffers(1, &PixelBuffer);
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ);
return true;
}
void ComputeRenderer::DeInit()
{
ResetTexcache();
glDeleteBuffers(1, &YSpanSetupMemory);
glDeleteBuffers(1, &RenderPolygonMemory);
glDeleteBuffers(1, &TileMemory);
glDeleteBuffers(1, &XSpanSetupMemory);
glDeleteBuffers(1, &BinResultMemory);
glDeleteBuffers(1, &FinalTileMemory);
glDeleteBuffers(1, &YSpanIndicesTextureMemory);
glDeleteTextures(1, &YSpanIndicesTexture);
glDeleteTextures(1, &Framebuffer);
glDeleteBuffers(1, &MetaUniformMemory);
glDeleteSamplers(9, Samplers);
glDeleteBuffers(1, &PixelBuffer);
}
void ComputeRenderer::DeleteShaders()
{
std::initializer_list<GLuint> allPrograms =
{
ShaderInterpXSpans[0],
ShaderInterpXSpans[1],
ShaderBinCombined,
ShaderDepthBlend[0],
ShaderDepthBlend[1],
ShaderRasteriseNoTexture[0],
ShaderRasteriseNoTexture[1],
ShaderRasteriseNoTextureToon[0],
ShaderRasteriseNoTextureToon[1],
ShaderRasteriseNoTextureHighlight[0],
ShaderRasteriseNoTextureHighlight[1],
ShaderRasteriseUseTextureDecal[0],
ShaderRasteriseUseTextureDecal[1],
ShaderRasteriseUseTextureModulate[0],
ShaderRasteriseUseTextureModulate[1],
ShaderRasteriseUseTextureToon[0],
ShaderRasteriseUseTextureToon[1],
ShaderRasteriseUseTextureHighlight[0],
ShaderRasteriseUseTextureHighlight[1],
ShaderRasteriseShadowMask[0],
ShaderRasteriseShadowMask[1],
ShaderClearCoarseBinMask,
ShaderClearIndirectWorkCount,
ShaderCalculateWorkListOffset,
ShaderSortWork,
ShaderFinalPass[0],
ShaderFinalPass[1],
ShaderFinalPass[2],
ShaderFinalPass[3],
ShaderFinalPass[4],
ShaderFinalPass[5],
ShaderFinalPass[6],
ShaderFinalPass[7],
};
for (GLuint program : allPrograms)
glDeleteProgram(program);
}
void ComputeRenderer::ResetTexcache()
{
for (u32 i = 0; i < 8; i++)
{
for (u32 j = 0; j < 8; j++)
{
for (u32 k = 0; k < TexArrays[i][j].size(); k++)
glDeleteTextures(1, &TexArrays[i][j][k]);
TexArrays[i][j].clear();
FreeTextures[i][j].clear();
}
}
TexCache.clear();
}
void ComputeRenderer::Reset()
{
ResetTexcache();
}
// Applies a new internal resolution: derives the screen and tile dimensions
// from settings.GL_ScaleFactor, (re)allocates every buffer and texture whose
// size depends on them, and compiles all compute programs, which have the
// dimensions baked in as #defines by CompileShader().
void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings)
{
    // A scale factor other than -1 means programs were compiled before
    // (presumably -1 is the initial value); release them first.
    if (ScaleFactor != -1)
        DeleteShaders();

    ScaleFactor = settings.GL_ScaleFactor;
    ScreenWidth = 256 * ScaleFactor;
    ScreenHeight = 192 * ScaleFactor;

    TilesPerLine = ScreenWidth/TileSize;
    TileLines = ScreenHeight/TileSize;

    MaxWorkTiles = TilesPerLine*TileLines*8;

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);

    // Header followed by the individual sections of the binning result buffer.
    int binBufferBytes = sizeof(BinResultHeader)
        + MaxWorkTiles*2*4 // UnsortedWorkDescs
        + MaxWorkTiles*2*4 // SortedWork
        + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
        + TilesPerLine*TileLines*BinStride*4 // BinnedMask
        + TilesPerLine*TileLines*BinStride*4; // WorkOffsets
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, binBufferBytes, nullptr, GL_DYNAMIC_DRAW);

    // The output texture has immutable storage, so it is recreated on resize.
    if (Framebuffer != 0)
        glDeleteTextures(1, &Framebuffer);
    glGenTextures(1, &Framebuffer);
    glBindTexture(GL_TEXTURE_2D, Framebuffer);
    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight);

    // eh those are pretty bad guesses
    // though real hw shouldn't be able to render all 2048 polygons on every line either
    int spanIndexCapacity = 64*2048 * ScaleFactor;
    YSpanIndices.resize(spanIndexCapacity);

    glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
    glBufferData(GL_TEXTURE_BUFFER, spanIndexCapacity*2*4, nullptr, GL_DYNAMIC_DRAW);

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*spanIndexCapacity, nullptr, GL_DYNAMIC_DRAW);

    glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory);

    // Two-element program arrays hold a "ZBuffer" variant at [0] and a
    // "WBuffer" variant at [1]; each pair is compiled back to back.
    const char* const depthDefine[2] = {"ZBuffer", "WBuffer"};

    for (int d = 0; d < 2; d++)
        CompileShader(ShaderInterpXSpans[d], ComputeRendererShaders::InterpSpans, {"InterpSpans", depthDefine[d]});

    CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"});

    for (int d = 0; d < 2; d++)
        CompileShader(ShaderDepthBlend[d], ComputeRendererShaders::DepthBlend, {"DepthBlend", depthDefine[d]});

    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseNoTexture[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "NoTexture"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseNoTextureToon[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "NoTexture", "Toon"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseNoTextureHighlight[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "NoTexture", "Highlight"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseUseTextureDecal[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "UseTexture", "Decal"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseUseTextureModulate[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "UseTexture", "Modulate"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseUseTextureToon[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "UseTexture", "Toon"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseUseTextureHighlight[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "UseTexture", "Highlight"});
    for (int d = 0; d < 2; d++)
        CompileShader(ShaderRasteriseShadowMask[d], ComputeRendererShaders::Rasterise, {"Rasterise", depthDefine[d], "ShadowMask"});

    CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"});
    CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"});
    CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"});
    CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"});

    // Final pass variants: index bit 0 = edge marking, bit 1 = fog,
    // bit 2 = anti-aliasing.
    CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"});
    CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"});
    CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"});
    CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"});
    CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"});
    CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"});
    CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"});
    CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
}
// This renderer performs no work in this hook.
// NOTE(review): presumably called when the scanline counter reaches 144 —
// confirm against the caller.
void ComputeRenderer::VCount144()
{
}
// Copies the per-vertex attributes of an edge's two end points into `span`.
// Fields suffixed 0 come from vertex `from`, fields suffixed 1 from vertex `to`:
// depth (FinalZ / FinalW), the three colour channels and the texture coordinates.
void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to)
{
    const auto* start = poly->Vertices[from];
    const auto* end = poly->Vertices[to];

    // start point
    span->Z0 = poly->FinalZ[from];
    span->W0 = poly->FinalW[from];
    span->ColorR0 = start->FinalColor[0];
    span->ColorG0 = start->FinalColor[1];
    span->ColorB0 = start->FinalColor[2];
    span->TexcoordU0 = start->TexCoords[0];
    span->TexcoordV0 = start->TexCoords[1];

    // end point
    span->Z1 = poly->FinalZ[to];
    span->W1 = poly->FinalW[to];
    span->ColorR1 = end->FinalColor[0];
    span->ColorG1 = end->FinalColor[1];
    span->ColorB1 = end->FinalColor[2];
    span->TexcoordU1 = end->TexCoords[0];
    span->TexcoordV1 = end->TexCoords[1];
}
// Builds a degenerate edge that starts and ends at `vertex`.
//
// rp        - polygon record whose horizontal bounds (XMin/XMax and the Y at
//             which they occur) are widened to include this edge.
// span      - edge record to fill in; it is flagged with IsDummy.
// poly      - source polygon providing the vertex attributes.
// vertex    - index of the vertex used for both end points.
// side      - non-zero selects the variant that is shifted one pixel to the
//             left and starts with DxInitial = -0x40000.
// positions - screen positions of the polygon's vertices as {x, y} pairs.
void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2])
{
    const s32 x = side ? positions[vertex][0] - 1 : positions[vertex][0];
    span->DxInitial = side ? -0x40000 : 0;

    // Both end points coincide.
    span->X0 = span->X1 = x;
    span->XMin = x;
    span->XMax = x;
    span->Y0 = span->Y1 = positions[vertex][1];

    // Extend the polygon's horizontal bounds.
    if (span->XMin < rp->XMin)
    {
        rp->XMin = span->XMin;
        rp->XMinY = span->Y0;
    }
    if (span->XMax > rp->XMax)
    {
        rp->XMax = span->XMax;
        rp->XMaxY = span->Y0;
    }

    // There is no slope to step along and nothing to interpolate over.
    span->Increment = 0;
    span->I0 = span->I1 = span->IRecip = 0;
    span->Linear = true;
    span->XCovIncr = 0;
    span->IsDummy = true;

    SetupAttrs(span, poly, vertex, vertex);
}
// Fills `span` with the setup data for the polygon edge running from vertex
// `from` to vertex `to`, and widens the horizontal bounds stored in `rp`.
//
// rp        - polygon record; XMin/XMax (and the Y where they occur) are updated.
// span      - edge record to fill in.
// poly      - source polygon providing the vertex attributes.
// from, to  - indices of the edge's start and end vertex.
// side      - zero for a left edge, non-zero for a right edge.
// positions - screen positions of the polygon's vertices as {x, y} pairs.
void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2])
{
span->X0 = positions[from][0];
span->X1 = positions[to][0];
span->Y0 = positions[from][1];
span->Y1 = positions[to][1];
SetupAttrs(span, poly, from, to);
// Y coordinates at which the edge reaches its minimum / maximum X.
s32 minXY, maxXY;
// true when X decreases from the start to the end vertex
bool negative = false;
if (span->X1 > span->X0)
{
span->XMin = span->X0;
span->XMax = span->X1-1;
minXY = span->Y0;
maxXY = span->Y1;
}
else if (span->X1 < span->X0)
{
span->XMin = span->X1;
span->XMax = span->X0-1;
negative = true;
minXY = span->Y1;
maxXY = span->Y0;
}
else
{
// vertical edge: a single column, shifted one pixel left for right edges
span->XMin = span->X0;
if (side) span->XMin--;
span->XMax = span->XMin;
// doesn't matter for completely vertical slope
minXY = span->Y0;
maxXY = span->Y0;
}
// Extend the polygon's horizontal bounds with this edge.
if (span->XMin < rp->XMin)
{
rp->XMin = span->XMin;
rp->XMinY = minXY;
}
if (span->XMax > rp->XMax)
{
rp->XMax = span->XMax;
rp->XMaxY = maxXY;
}
span->IsDummy = false;
s32 xlen = span->XMax+1 - span->XMin;
s32 ylen = span->Y1 - span->Y0;
// slope increment has a 18-bit fractional part
// note: for some reason, x/y isn't calculated directly,
// instead, 1/y is calculated and then multiplied by x
// TODO: this is still not perfect (see for example x=169 y=33)
if (ylen == 0)
{
span->Increment = 0;
}
else if (ylen == xlen)
{
// exactly diagonal: one pixel (1.0 in 18-bit fixed point) per line
span->Increment = 0x40000;
}
else
{
s32 yrecip = (1<<18) / ylen;
span->Increment = (span->X1-span->X0) * yrecip;
// only the magnitude is stored; the direction is tracked by `negative`
if (span->Increment < 0) span->Increment = -span->Increment;
}
// X-major: the edge advances more than one pixel per scanline.
bool xMajor = (span->Increment > 0x40000);
// Initial X offset, in the same 18-bit fixed point format as Increment.
if (side)
{
// right
if (xMajor)
span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000);
else if (span->Increment != 0)
span->DxInitial = negative ? 0x40000 : 0;
else
span->DxInitial = -0x40000;
}
else
{
// left
if (xMajor)
span->DxInitial = negative ? ((span->Increment - 0x20000) + 0x40000) : 0x20000;
else if (span->Increment != 0)
span->DxInitial = negative ? 0x40000 : 0;
else
span->DxInitial = 0;
}
// Interpolation range: along X for X-major edges, along Y otherwise.
if (xMajor)
{
if (side)
{
span->I0 = span->X0 - 1;
span->I1 = span->X1 - 1;
}
else
{
span->I0 = span->X0;
span->I1 = span->X1;
}
// used for calculating AA coverage
span->XCovIncr = (ylen << 10) / xlen;
}
else
{
// note: XCovIncr is left untouched on this path
span->I0 = span->Y0;
span->I1 = span->Y1;
}
// Reciprocal of the interpolation length, scaled by 1<<30; zero when empty.
if (span->I0 != span->I1)
span->IRecip = (1<<30) / (span->I1 - span->I0);
else
span->IRecip = 0;
// Linear is set when both W values are equal and have bits 1-6 clear.
span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E);
// Halved W terms; when W0 is odd and W1 is even, W0 is rounded down for
// W0n and up for W0d.
if ((span->W0 & 0x1) && !(span->W1 & 0x1))
{
span->W0n = (span->W0 - 1) >> 1;
span->W0d = (span->W0 + 1) >> 1;
span->W1d = span->W1 >> 1;
}
else
{
span->W0n = span->W0 >> 1;
span->W0d = span->W0 >> 1;
span->W1d = span->W1 >> 1;
}
}
// Texture width in texels: 8 shifted left by the 3-bit field at bits 20-22
// of the texture parameter word (8 .. 1024).
inline u32 TextureWidth(u32 texparam)
{
    const u32 widthLog2 = (texparam >> 20) & 0x7;
    return 8 << widthLog2;
}
// Texture height in texels: 8 shifted left by the 3-bit field at bits 23-25
// of the texture parameter word (8 .. 1024).
inline u32 TextureHeight(u32 texparam)
{
    const u32 heightLog2 = (texparam >> 23) & 0x7;
    return 8 << heightLog2;
}
// Averages two RGB555 colours channel by channel (rounding down).
// Bit 15 of the inputs is ignored and is clear in the result.
inline u16 ColorAvg(u16 color0, u16 color1)
{
    const u32 a = color0;
    const u32 b = color1;

    // Green and blue are averaged in place; the mask drops the bit that the
    // shift moves into the channel below.
    const u32 red = ((a & 0x001F) + (b & 0x001F)) >> 1;
    const u32 green = (((a & 0x03E0) + (b & 0x03E0)) >> 1) & 0x03E0;
    const u32 blue = (((a & 0x7C00) + (b & 0x7C00)) >> 1) & 0x7C00;

    return red | green | blue;
}
// Blends two RGB555 colours channel by channel as (5*color0 + 3*color1) / 8,
// rounding down. Bit 15 of the inputs is ignored and is clear in the result.
inline u16 Color5of3(u16 color0, u16 color1)
{
    const u32 a = color0;
    const u32 b = color1;

    // Channels are blended in place; the masks drop fractional bits that the
    // shift moves into the channel below.
    const u32 red = ((a & 0x001F) * 5 + (b & 0x001F) * 3) >> 3;
    const u32 green = (((a & 0x03E0) * 5 + (b & 0x03E0) * 3) >> 3) & 0x03E0;
    const u32 blue = (((a & 0x7C00) * 5 + (b & 0x7C00) * 3) >> 3) & 0x7C00;

    return red | green | blue;
}
// Blends two RGB555 colours channel by channel as (3*color0 + 5*color1) / 8,
// rounding down. Bit 15 of the inputs is ignored and is clear in the result.
inline u16 Color3of5(u16 color0, u16 color1)
{
    const u32 a = color0;
    const u32 b = color1;

    // Channels are blended in place; the masks drop fractional bits that the
    // shift moves into the channel below.
    const u32 red = ((a & 0x001F) * 3 + (b & 0x001F) * 5) >> 3;
    const u32 green = (((a & 0x03E0) * 3 + (b & 0x03E0) * 5) >> 3) & 0x03E0;
    const u32 blue = (((a & 0x7C00) * 3 + (b & 0x7C00) * 5) >> 3) & 0x7C00;

    return red | green | blue;
}
// Expands an RGB555 colour to 8 bits per channel, with red in bits 0-7,
// green in bits 8-15 and blue in bits 16-23. Each 5-bit value lands in the
// upper five bits of its byte; the lower three bits and bits 24-31 stay zero.
inline u32 ConvertRGB5ToRGB8(u16 val)
{
    const u32 color = val;
    const u32 red = (color & 0x1F) << 3;
    const u32 green = (color & 0x3E0) << 6;
    const u32 blue = (color & 0x7C00) << 9;
    return red | green | blue;
}
// Expands an RGB555 colour to 8 bits per channel in BGR byte order: blue in
// bits 0-7, green in bits 8-15 and red in bits 16-23. Each 5-bit value lands in
// the upper five bits of its byte; the lower three bits and bits 24-31 stay
// zero. This mirrors ConvertRGB5ToRGB8 with red and blue swapped.
inline u32 ConvertRGB5ToBGR8(u16 val)
{
    return (((u32)val & 0x1F) << 19)   // red:   bits 0-4   -> bits 19-23
        | (((u32)val & 0x3E0) << 6)    // green: bits 5-9   -> bits 11-15
        | (((u32)val & 0x7C00) >> 7);  // blue:  bits 10-14 -> bits 3-7
}
// Expands an RGB555 colour to 6 bits per channel, one channel per byte (red in
// byte 0, green in byte 1, blue in byte 2). A 5-bit value c becomes 2*c + 1,
// except that 0 stays 0, so the full range 0..31 maps onto 0..63.
inline u32 ConvertRGB5ToRGB6(u16 val)
{
    const auto expand = [](u32 c5) -> u32
    {
        return c5 != 0 ? (c5 << 1) + 1 : 0;
    };

    const u32 red = expand(val & 0x1F);
    const u32 green = expand((val >> 5) & 0x1F);
    const u32 blue = expand((val >> 10) & 0x1F);

    return red | (green << 8) | (blue << 16);
}
// Pixel layouts the texture converters below can produce; the enumerators are
// passed as the `outputFmt` template argument.
enum
{
    outputFmt_RGB6A5 = 0, // 6-bit colour channels, 5-bit alpha
    outputFmt_RGBA8 = 1,  // 8-bit channels, red in the lowest byte
    outputFmt_BGRA8 = 2   // 8-bit channels, blue in the lowest byte
};
// Decodes a 4x4-block compressed texture into one 32-bit pixel per texel.
//
// width, height - texture size in texels; the loops process whole 4x4 blocks.
// output        - destination, width*height pixels in row-major order.
// texData       - one 32-bit word per block holding sixteen 2-bit colour
//                 selectors (texel 0 in the lowest bits).
// texAuxData    - one 16-bit word per block: bits 0-13 palette offset (in
//                 units of two colours), bits 14-15 block mode.
// palData       - RGB555 palette.
//
// Block modes (selector -> colour):
//   0: palette[0], palette[1], palette[2], transparent
//   1: palette[0], palette[1], average of both, transparent
//   2: palette[0], palette[1], palette[2], palette[3]
//   3: palette[0], palette[1], 5:3 blend, 3:5 blend
template <int outputFmt>
void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData)
{
    // Channel-wise (c0*w0 + c1*w1) >> shift on RGB555 values. Bit 15 of the
    // inputs is ignored; the result has bit 15 set to mark it opaque.
    const auto blend = [](u16 c0, u16 c1, u32 w0, u32 w1, u32 shift) -> u16
    {
        const u32 r = ((c0 & 0x001F) * w0 + (c1 & 0x001F) * w1) >> shift;
        const u32 g = (((c0 & 0x03E0) * w0 + (c1 & 0x03E0) * w1) >> shift) & 0x03E0;
        const u32 b = (((c0 & 0x7C00) * w0 + (c1 & 0x7C00) * w1) >> shift) & 0x7C00;
        return r | g | b | 0x8000;
    };

    const u32 blocksPerRow = width / 4;

    // we process a whole block at the time
    for (u32 y = 0; y < height / 4; y++)
    {
        for (u32 x = 0; x < blocksPerRow; x++)
        {
            const u32 data = ((u32*)texData)[x + y * blocksPerRow];
            const u16 auxData = ((u16*)texAuxData)[x + y * blocksPerRow];

            const u32 paletteOffset = auxData & 0x3FFF;
            // Bit 15 is used as the "opaque" flag of the decoded colours.
            const u16 color0 = palData[paletteOffset*2] | 0x8000;
            const u16 color1 = palData[paletteOffset*2+1] | 0x8000;
            u16 color2 = 0, color3 = 0;
            switch ((auxData >> 14) & 0x3)
            {
            case 0:
                color2 = palData[paletteOffset*2+2] | 0x8000;
                color3 = 0;
                break;
            case 1:
                color2 = blend(color0, color1, 1, 1, 1);
                color3 = 0;
                break;
            case 2:
                color2 = palData[paletteOffset*2+2] | 0x8000;
                color3 = palData[paletteOffset*2+3] | 0x8000;
                break;
            case 3:
                color2 = blend(color0, color1, 5, 3, 3);
                color3 = blend(color0, color1, 3, 5, 3);
                break;
            }

            // in 2020 our default data types are big enough to be used as lookup tables...
            const u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48);

            for (u32 j = 0; j < 4; j++)
            {
                for (u32 i = 0; i < 4; i++)
                {
                    // Isolate this texel's 2-bit selector before turning it
                    // into a shift amount; without the mask the selectors of
                    // the following texels leak in and the shift exceeds the
                    // width of `packed`.
                    const u32 selector = (data >> (2 * (i + j * 4))) & 0x3;
                    const u16 color = (packed >> (16 * selector)) & 0xFFFF;

                    u32 res = 0;
                    switch (outputFmt)
                    {
                    case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
                        | ((color & 0x8000) ? 0x1F000000 : 0); break;
                    case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
                    case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
                    }
                    output[x * 4 + i + (y * 4 + j) * width] = res;
                }
            }
        }
    }
}
// Decodes a texture whose texels are one byte each, made of a Y-bit palette
// index in the low bits and an X-bit alpha value above it.
//
// width, height - texture size in texels.
// output        - destination, width*height 32-bit pixels in row-major order.
// texData       - source texels, one byte each, row-major.
// palData       - RGB555 palette addressed by the texel's index.
template <int outputFmt, int X, int Y>
void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData)
{
    for (u32 row = 0; row < height; row++)
    {
        for (u32 col = 0; col < width; col++)
        {
            const u32 pos = col + row * width;
            const u8 texel = texData[pos];

            const u32 paletteIdx = texel & ((1 << Y) - 1);
            const u16 color = palData[paletteIdx];

            u32 alpha = (texel >> Y) & ((1 << X) - 1);
            // Alpha narrower than five bits is rescaled to the 0..31 range.
            if (X != 5)
                alpha = alpha * 4 + alpha / 2;

            u32 res;
            switch (outputFmt)
            {
            case outputFmt_RGB6A5:
                res = ConvertRGB5ToRGB6(color) | alpha << 24;
                break;
            // make sure full alpha == 255
            case outputFmt_RGBA8:
                res = ConvertRGB5ToRGB8(color) | (alpha << 27 | (alpha & 0x1C) << 22);
                break;
            case outputFmt_BGRA8:
                res = ConvertRGB5ToBGR8(color) | (alpha << 27 | (alpha & 0x1C) << 22);
                break;
            }
            output[pos] = res;
        }
    }
}
// Decodes a paletted texture with `colorBits` bits per texel; texels are packed
// into bytes starting at the least significant bits.
//
// width, height     - texture size in texels.
// output            - destination, width*height 32-bit pixels, row-major.
// texData           - packed palette indices.
// palData           - RGB555 palette.
// color0Transparent - when set, texels with palette index 0 get alpha 0;
//                     all other texels are fully opaque.
template <int outputFmt, int colorBits>
void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent)
{
    const u32 texelsPerByte = 8 / colorBits;
    const u32 bytesPerRow = width / texelsPerByte;
    const u32 indexMask = (1 << colorBits) - 1;

    for (u32 row = 0; row < height; row++)
    {
        for (u32 byteIdx = 0; byteIdx < bytesPerRow; byteIdx++)
        {
            const u8 packed = texData[byteIdx + row * bytesPerRow];

            for (u32 t = 0; t < texelsPerByte; t++)
            {
                const u32 index = (packed >> (t * colorBits)) & indexMask;
                const u16 color = palData[index];
                const bool transparent = color0Transparent && index == 0;

                u32 res;
                switch (outputFmt)
                {
                case outputFmt_RGB6A5:
                    res = ConvertRGB5ToRGB6(color) | (transparent ? 0 : 0x1F000000);
                    break;
                case outputFmt_RGBA8:
                    res = ConvertRGB5ToRGB8(color) | (transparent ? 0 : 0xFF000000);
                    break;
                case outputFmt_BGRA8:
                    res = ConvertRGB5ToBGR8(color) | (transparent ? 0 : 0xFF000000);
                    break;
                }
                output[byteIdx * texelsPerByte + row * width + t] = res;
            }
        }
    }
}
// Returns the cache entry for the texture described by `texParam` and
// `palBase`, creating it on a cache miss.
//
// On a miss the texture is decoded from GPU::VRAMFlat_Texture (and, for
// paletted formats, GPU::VRAMFlat_TexPal) into TextureDecodingBuffer in the
// RGB6A5 layout, the source memory ranges are hashed and stored in the entry,
// and the pixels are uploaded into a free layer of an array texture holding
// textures of the same size.
//
// texParam - texture parameter word; the format is read from bits 26-28, the
//            size from bits 20-25 and the VRAM address from bits 0-15.
// palBase  - palette base; ignored for format 7.
// Returns a reference to the entry stored in TexCache.
ComputeRenderer::TexCacheEntry& ComputeRenderer::GetTexture(u32 texParam, u32 palBase)
{
// remove sampling and texcoord gen params
texParam &= ~0xC00F0000;
u32 fmt = (texParam >> 26) & 0x7;
// Cache key: the masked texture parameters, plus the palette base in the
// upper 32 bits for every format except 7.
u64 key = texParam;
if (fmt != 7)
{
key |= (u64)palBase << 32;
// bit 29 (the color0Transparent flag below) is dropped from the key for format 5
if (fmt == 5)
key &= ~((u64)1 << 29);
}
//printf("%" PRIx64 " %" PRIx32 " %" PRIx32 "\n", key, texParam, palBase);
assert(fmt != 0 && "no texture is not a texture format!");
auto it = TexCache.find(key);
if (it != TexCache.end())
return it->second;
u32 widthLog2 = (texParam >> 20) & 0x7;
u32 heightLog2 = (texParam >> 23) & 0x7;
u32 width = 8 << widthLog2;
u32 height = 8 << heightLog2;
u32 addr = (texParam & 0xFFFF) * 8;
TexCacheEntry entry = {0};
entry.TextureRAMStart[0] = addr;
entry.WidthLog2 = widthLog2;
entry.HeightLog2 = heightLog2;
// apparently a new texture
if (fmt == 7)
{
// format 7: one 16-bit colour per texel, bit 15 selects opaque/transparent
entry.TextureRAMSize[0] = width*height*2;
for (u32 i = 0; i < width*height; i++)
{
u16 value = *(u16*)&GPU::VRAMFlat_Texture[addr + i * 2];
TextureDecodingBuffer[i] = ConvertRGB5ToRGB6(value) | (value & 0x8000 ? 0x1F000000 : 0);
}
}
else if (fmt == 5)
{
// format 5: 4x4 block compressed; the per-block auxiliary words are read
// from a second VRAM range derived from the texture address
u8* texData = &GPU::VRAMFlat_Texture[addr];
u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1);
if (addr >= 0x40000)
slot1addr += 0x10000;
u8* texAuxData = &GPU::VRAMFlat_Texture[slot1addr];
u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palBase*16);
// 4 bytes of selectors and 2 bytes of auxiliary data per 16-texel block
entry.TextureRAMSize[0] = width*height/16*4;
entry.TextureRAMStart[1] = slot1addr;
entry.TextureRAMSize[1] = width*height/16*2;
entry.TexPalStart = palBase*16;
// NOTE(review): a fixed 0x10000 byte palette range is tracked (and hashed
// below) regardless of palBase — confirm it cannot run past the end of
// VRAMFlat_TexPal.
entry.TexPalSize = 0x10000;
ConvertCompressedTexture<outputFmt_RGB6A5>(width, height, TextureDecodingBuffer, texData, texAuxData, palData);
}
else
{
// remaining formats are paletted
u32 texSize, palAddr = palBase*16, numPalEntries;
// NOTE(review): fmt 0 is only rejected by the assert above; with asserts
// disabled texSize and numPalEntries stay uninitialised for it.
switch (fmt)
{
case 1: texSize = width*height; numPalEntries = 32; break;
case 6: texSize = width*height; numPalEntries = 8; break;
case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break;
case 3: texSize = width*height/2; numPalEntries = 16; break;
case 4: texSize = width*height; numPalEntries = 256; break;
}
palAddr &= 0x1FFFF;
/*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr);
svcSleepThread(1000*1000);*/
entry.TextureRAMSize[0] = texSize;
entry.TexPalStart = palAddr;
entry.TexPalSize = numPalEntries*2;
u8* texData = &GPU::VRAMFlat_Texture[addr];
u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palAddr);
//assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024);
bool color0Transparent = texParam & (1 << 29);
switch (fmt)
{
case 1: ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(width, height, TextureDecodingBuffer, texData, palData); break;
case 6: ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(width, height, TextureDecodingBuffer, texData, palData); break;
case 2: ConvertNColorsTexture<outputFmt_RGB6A5, 2>(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break;
case 3: ConvertNColorsTexture<outputFmt_RGB6A5, 4>(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break;
case 4: ConvertNColorsTexture<outputFmt_RGB6A5, 8>(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break;
}
}
// Hash the source ranges that were recorded in the entry above.
for (int i = 0; i < 2; i++)
{
if (entry.TextureRAMSize[i])
entry.TextureHash[i] = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
}
if (entry.TexPalSize)
entry.TexPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
// Array textures and free layers are kept per texture size.
auto& texArrays = TexArrays[widthLog2][heightLog2];
auto& freeTextures = FreeTextures[widthLog2][heightLog2];
if (freeTextures.size() == 0)
{
texArrays.resize(texArrays.size()+1);
GLuint& array = texArrays[texArrays.size()-1];
// as many layers as fit into 8 MiB at 4 bytes per texel, at most 64
u32 layers = std::min<u32>((8*1024*1024) / (width*height*4), 64);
// allocate new array texture
glGenTextures(1, &array);
glBindTexture(GL_TEXTURE_2D_ARRAY, array);
glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers);
//printf("allocating new layer set for %d %d %d %d\n", width, height, texArrays.size()-1, array.ImageDescriptor);
// every layer of the new array starts out free
for (u32 i = 0; i < layers; i++)
{
freeTextures.push_back(TexArrayEntry{array, i});
}
}
// take the most recently added free layer
TexArrayEntry storagePlace = freeTextures[freeTextures.size()-1];
freeTextures.pop_back();
//printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor);
// upload the decoded pixels into that layer
glBindTexture(GL_TEXTURE_2D_ARRAY, storagePlace.TextureID);
glTexSubImage3D(GL_TEXTURE_2D_ARRAY,
0, 0, 0, storagePlace.Layer,
width, height, 1,
GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, TextureDecodingBuffer);
entry.Texture = storagePlace;
return TexCache.emplace(std::make_pair(key, entry)).first->second;
}
// One combination of GL state that a batch of polygons is drawn with.
struct Variant
{
    GLuint Texture, Sampler; // GL object names; 0 when the polygon is untextured
    u16 Width, Height;       // texture size in texels
    u8 BlendMode;

    // Two variants are equal when texture, sampler and blend mode match.
    // Width and Height are not compared.
    // Const-qualified so it can be used on const objects and does not become
    // an ambiguous candidate under C++20's rewritten comparison rules.
    bool operator==(const Variant& other) const
    {
        return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode;
    }
};
/*
Antialiasing
W-Buffer
With Texture
0
1, 3
2
without Texture
2
0, 1, 3
=> 20 Shader + 1x Shadow Mask
*/
void ComputeRenderer::RenderFrame()
{
//printf("render frame\n");
auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture);
auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal);
bool textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty);
bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty);
if (textureChanged || texPalChanged)
{
//printf("check invalidation %d\n", TexCache.size());
for (auto it = TexCache.begin(); it != TexCache.end();)
{
TexCacheEntry& entry = it->second;
if (textureChanged)
{
for (u32 i = 0; i < 2; i++)
{
u32 startBit = entry.TextureRAMStart[i] / GPU::VRAMDirtyGranularity;
u32 bitsCount = ((entry.TextureRAMStart[i] + entry.TextureRAMSize[i] + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
u32 startEntry = startBit >> 6;
u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
for (u32 j = startEntry; j < startEntry + entriesCount; j++)
{
if (GetRangedBitMask(j, startBit, bitsCount) & textureDirty.Data[j])
{
u64 newTexHash = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
if (newTexHash != entry.TextureHash[i])
goto invalidate;
}
}
}
}
if (texPalChanged && entry.TexPalSize > 0)
{
u32 startBit = entry.TexPalStart / GPU::VRAMDirtyGranularity;
u32 bitsCount = ((entry.TexPalStart + entry.TexPalSize + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
u32 startEntry = startBit >> 6;
u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
for (u32 j = startEntry; j < startEntry + entriesCount; j++)
{
if (GetRangedBitMask(j, startBit, bitsCount) & texPalDirty.Data[j])
{
u64 newPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
if (newPalHash != entry.TexPalHash)
goto invalidate;
}
}
}
it++;
continue;
invalidate:
FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture);
//printf("invalidating texture %d\n", entry.ImageDescriptor);
it = TexCache.erase(it);
}
}
else if (RenderFrameIdentical)
{
return;
}
int numYSpans = 0;
int numSetupIndices = 0;
/*
Some games really like to spam small textures, often
to store the data like PPU tiles. E.g. Shantae
or some Mega Man game. Fortunately they are usually kind
enough to not vary the texture size all too often (usually
they just use 8x8 or 16x for everything).
This is the reason we have this whole mess where textures of
the same size are put into array textures. This allows
to increase the batch size.
Less variance between each Variant hah!
*/
u32 numVariants = 0, prevVariant, prevTexLayer;
Variant variants[MaxVariants];
int foundviatexcache = 0, foundviaprev = 0, numslow = 0;
bool enableTextureMaps = RenderDispCnt & (1<<0);
for (int i = 0; i < RenderNumPolygons; i++)
{
Polygon* polygon = RenderPolygonRAM[i];
u32 nverts = polygon->NumVertices;
u32 vtop = polygon->VTop, vbot = polygon->VBottom;
u32 curVL = vtop, curVR = vtop;
u32 nextVL, nextVR;
RenderPolygons[i].FirstXSpan = numSetupIndices;
RenderPolygons[i].Attr = polygon->Attr;
bool foundVariant = false;
if (i > 0)
{
// if the whole texture attribute matches
// the texture layer will also match
Polygon* prevPolygon = RenderPolygonRAM[i - 1];
foundVariant = prevPolygon->TexParam == polygon->TexParam
&& prevPolygon->TexPalette == polygon->TexPalette
&& (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30)
&& prevPolygon->IsShadowMask == polygon->IsShadowMask;
if (foundVariant)
foundviaprev++;
}
if (!foundVariant)
{
Variant variant;
variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3);
variant.Texture = 0;
variant.Sampler = 0;
TexCacheEntry* texcacheEntry = nullptr;
// we always need to look up the texture to get the layer of the array texture
if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7)
{
texcacheEntry = &GetTexture(polygon->TexParam, polygon->TexPalette);
bool wrapS = (polygon->TexParam >> 16) & 1;
bool wrapT = (polygon->TexParam >> 17) & 1;
bool mirrorS = (polygon->TexParam >> 18) & 1;
bool mirrorT = (polygon->TexParam >> 19) & 1;
variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 2 : 1) : 0) * 3];
variant.Texture = texcacheEntry->Texture.TextureID;
prevTexLayer = texcacheEntry->Texture.Layer;
if (texcacheEntry->LastVariant < numVariants && variants[texcacheEntry->LastVariant] == variant)
{
foundVariant = true;
prevVariant = texcacheEntry->LastVariant;
foundviatexcache++;
}
}
if (!foundVariant)
{
numslow++;
for (int j = numVariants - 1; j >= 0; j--)
{
if (variants[j] == variant)
{
foundVariant = true;
prevVariant = j;
goto foundVariant;
}
}
prevVariant = numVariants;
variants[numVariants] = variant;
variants[numVariants].Width = TextureWidth(polygon->TexParam);
variants[numVariants].Height = TextureHeight(polygon->TexParam);
numVariants++;
assert(numVariants <= MaxVariants);
foundVariant:;
if (texcacheEntry)
texcacheEntry->LastVariant = prevVariant;
}
}
RenderPolygons[i].Variant = prevVariant;
RenderPolygons[i].TextureLayer = (float)prevTexLayer;
if (polygon->FacingView)
{
nextVL = curVL + 1;
if (nextVL >= nverts) nextVL = 0;
nextVR = curVR - 1;
if ((s32)nextVR < 0) nextVR = nverts - 1;
}
else
{
nextVL = curVL - 1;
if ((s32)nextVL < 0) nextVL = nverts - 1;
nextVR = curVR + 1;
if (nextVR >= nverts) nextVR = 0;
}
s32 scaledPositions[10][2];
s32 ytop = ScreenHeight, ybot = 0;
for (int i = 0; i < polygon->NumVertices; i++)
{
scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4;
scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4;
ytop = std::min(scaledPositions[i][1], ytop);
ybot = std::max(scaledPositions[i][1], ybot);
}
RenderPolygons[i].YTop = ytop;
RenderPolygons[i].YBot = ybot;
RenderPolygons[i].XMin = ScreenWidth;
RenderPolygons[i].XMax = 0;
if (ybot == ytop)
{
vtop = 0; vbot = 0;
RenderPolygons[i].YBot++;
int j = 1;
if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
j = nverts - 1;
if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
assert(numYSpans < MaxYSpanSetups);
u32 curSpanL = numYSpans;
SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions);
assert(numYSpans < MaxYSpanSetups);
u32 curSpanR = numYSpans;
SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions);
YSpanIndices[numSetupIndices].PolyIdx = i;
YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
YSpanIndices[numSetupIndices].Y = ytop;
numSetupIndices++;
}
else
{
u32 curSpanL = numYSpans;
assert(numYSpans < MaxYSpanSetups);
SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
u32 curSpanR = numYSpans;
assert(numYSpans < MaxYSpanSetups);
SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
for (u32 y = ytop; y < ybot; y++)
{
if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
{
while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
{
curVL = nextVL;
if (polygon->FacingView)
{
nextVL = curVL + 1;
if (nextVL >= nverts)
nextVL = 0;
}
else
{
nextVL = curVL - 1;
if ((s32)nextVL < 0)
nextVL = nverts - 1;
}
}
assert(numYSpans < MaxYSpanSetups);
curSpanL = numYSpans;
SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
}
if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
{
while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
{
curVR = nextVR;
if (polygon->FacingView)
{
nextVR = curVR - 1;
if ((s32)nextVR < 0)
nextVR = nverts - 1;
}
else
{
nextVR = curVR + 1;
if (nextVR >= nverts)
nextVR = 0;
}
}
assert(numYSpans < MaxYSpanSetups);
curSpanR = numYSpans;
SetupYSpan(&RenderPolygons[i] ,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
}
YSpanIndices[numSetupIndices].PolyIdx = i;
YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
YSpanIndices[numSetupIndices].Y = y;
numSetupIndices++;
}
}
//printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY);
}
/*for (u32 i = 0; i < RenderNumPolygons; i++)
{
if (RenderPolygons[i].Variant >= numVariants)
{
printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
}
//assert(RenderPolygons[i].Variant < numVariants);
}*/
if (numYSpans > 0)
{
glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups);
glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data());
glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons);
// we haven't accessed image data yet, so we don't need to invalidate anything
}
//printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons);
// bind everything
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, YSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, RenderPolygonMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, BinResultMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, TileMemory);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory);
MetaUniform meta;
meta.DispCnt = RenderDispCnt;
meta.NumPolygons = RenderNumPolygons;
meta.NumVariants = numVariants;
meta.AlphaRef = RenderAlphaRef;
{
u32 r = (RenderClearAttr1 << 1) & 0x3E; if (r) r++;
u32 g = (RenderClearAttr1 >> 4) & 0x3E; if (g) g++;
u32 b = (RenderClearAttr1 >> 9) & 0x3E; if (b) b++;
u32 a = (RenderClearAttr1 >> 16) & 0x1F;
meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24);
meta.ClearDepth = ((RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF;
meta.ClearAttr = RenderClearAttr1 & 0x3F008000;
}
for (u32 i = 0; i < 32; i++)
{
u32 color = RenderToonTable[i];
u32 r = (color << 1) & 0x3E;
u32 g = (color >> 4) & 0x3E;
u32 b = (color >> 9) & 0x3E;
if (r) r++;
if (g) g++;
if (b) b++;
meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16);
}
for (u32 i = 0; i < 34; i++)
{
meta.ToonTable[i*4+1] = RenderFogDensityTable[i];
}
for (u32 i = 0; i < 8; i++)
{
u32 color = RenderEdgeTable[i];
u32 r = (color << 1) & 0x3E;
u32 g = (color >> 4) & 0x3E;
u32 b = (color >> 9) & 0x3E;
if (r) r++;
if (g) g++;
if (b) b++;
meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16);
}
meta.FogOffset = RenderFogOffset;
meta.FogShift = RenderFogShift;
{
u32 fogR = (RenderFogColor << 1) & 0x3E; if (fogR) fogR++;
u32 fogG = (RenderFogColor >> 4) & 0x3E; if (fogG) fogG++;
u32 fogB = (RenderFogColor >> 9) & 0x3E; if (fogB) fogB++;
u32 fogA = (RenderFogColor >> 16) & 0x1F;
meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24);
}
glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta);
glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);
glUseProgram(ShaderClearCoarseBinMask);
glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);
bool wbuffer = false;
if (numYSpans > 0)
{
wbuffer = RenderPolygonRAM[0]->WBuffer;
glUseProgram(ShaderClearIndirectWorkCount);
glDispatchCompute((numVariants+31)/32, 1, 1);
// calculate x-spans
glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
glUseProgram(ShaderInterpXSpans[wbuffer]);
glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// bin polygons
glUseProgram(ShaderBinCombined);
glDispatchCompute(((RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// calculate list offsets
glUseProgram(ShaderCalculateWorkListOffset);
glDispatchCompute((numVariants + 31) / 32, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
// sort shader work
glUseProgram(ShaderSortWork);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
glActiveTexture(GL_TEXTURE0);
// rasterise
{
bool highLightMode = RenderDispCnt & (1<<1);
GLuint shadersNoTexture[] =
{
ShaderRasteriseNoTexture[wbuffer],
ShaderRasteriseNoTexture[wbuffer],
highLightMode
? ShaderRasteriseNoTextureHighlight[wbuffer]
: ShaderRasteriseNoTextureToon[wbuffer],
ShaderRasteriseNoTexture[wbuffer],
ShaderRasteriseShadowMask[wbuffer]
};
GLuint shadersUseTexture[] =
{
ShaderRasteriseUseTextureModulate[wbuffer],
ShaderRasteriseUseTextureDecal[wbuffer],
highLightMode
? ShaderRasteriseUseTextureHighlight[wbuffer]
: ShaderRasteriseUseTextureToon[wbuffer],
ShaderRasteriseUseTextureDecal[wbuffer],
ShaderRasteriseShadowMask[wbuffer]
};
GLuint prevShader = 0;
s32 prevTexture = 0, prevSampler = 0;
for (int i = 0; i < numVariants; i++)
{
GLuint shader = 0;
if (variants[i].Texture == 0)
{
shader = shadersNoTexture[variants[i].BlendMode];
}
else
{
shader = shadersUseTexture[variants[i].BlendMode];
if (variants[i].Texture != prevTexture)
{
glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture);
prevTexture = variants[i].Texture;
}
if (variants[i].Sampler != prevSampler)
{
glBindSampler(0, variants[i].Sampler);
prevSampler = variants[i].Sampler;
}
}
assert(shader != 0);
if (shader != prevShader)
{
glUseProgram(shader);
prevShader = shader;
}
glUniform1ui(UniformIdxCurVariant, i);
glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height);
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4);
}
}
}
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// compose final image
glUseProgram(ShaderDepthBlend[wbuffer]);
glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
u32 finalPassShader = 0;
if (RenderDispCnt & (1<<4))
finalPassShader |= 0x4;
if (RenderDispCnt & (1<<7))
finalPassShader |= 0x2;
if (RenderDispCnt & (1<<5))
finalPassShader |= 0x1;
glUseProgram(ShaderFinalPass[finalPassShader]);
glDispatchCompute(ScreenWidth/32, ScreenHeight, 1);
glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
/*u64 starttime = armGetSystemTick();
EmuQueue.waitIdle();
printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/
/*for (u32 i = 0; i < RenderNumPolygons; i++)
{
if (RenderPolygons[i].Variant >= numVariants)
{
printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
}
//assert(RenderPolygons[i].Variant < numVariants);
}*/
/*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++)
{
printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]);
}*/
/* if (polygonvisible != -1)
{
SpanSetupX* xspans = Gfx::DataHeap->CpuAddr<SpanSetupX>(XSpanSetupMemory);
printf("span result\n");
Polygon* poly = RenderPolygonRAM[polygonvisible];
u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan;
for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++)
{
printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot);
}
}*/
/*
printf("xspans: %d\n", numSetupIndices);
SpanSetupX* xspans = Gfx::DataHeap->CpuAddr<SpanSetupX>(XSpanSetupMemory[curSlice]);
for (int i = 0; i < numSetupIndices; i++)
{
printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1);
}
printf("bin result\n");
BinResult* binresult = Gfx::DataHeap->CpuAddr<BinResult>(BinResultMemory);
for (u32 y = 0; y < 192/8; y++)
{
for (u32 x = 0; x < 256/8; x++)
{
printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]);
}
printf("\n");
}*/
}
// Frame-restart hook for the compute renderer.
// The body is empty: this renderer performs no work when a frame is restarted.
// NOTE(review): presumably overrides a Renderer3D virtual -- confirm against the header.
void ComputeRenderer::RestartFrame()
{
}
// Returns a pointer to scanline `line` inside FramebufferCPU, the CPU-side
// copy of the rendered frame (256 pixels per line).
//
// When `line` is 0 the CPU copy is refreshed first: PixelBuffer is bound as
// the pixel pack buffer, mapped read-only, and 4*256*192 bytes are copied
// into FramebufferCPU. For any other `line` the previously copied data is
// returned as-is, so a frame is expected to be read starting at line 0.
//
// If mapping fails, FramebufferCPU keeps its previous contents.
// `line` is not range-checked here.
u32* ComputeRenderer::GetLine(int line)
{
    const int stride = 256;

    if (line == 0)
    {
        glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);

        // memcpy only needs an untyped source pointer, so no cast is required.
        const void* mapped = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
        if (mapped)
        {
            memcpy(&FramebufferCPU[0], mapped, 4*stride*192);

            // Unmap only after a successful map: calling glUnmapBuffer on a
            // buffer that is not mapped raises GL_INVALID_OPERATION.
            glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
        }
    }

    return &FramebufferCPU[stride * line];
}
// Makes the renderer's Framebuffer texture the current GL_TEXTURE_2D binding.
void ComputeRenderer::SetupAccelFrame()
{
    const auto& outputTexture = Framebuffer;
    glBindTexture(GL_TEXTURE_2D, outputTexture);
}
// Starts a read-back of LowResFramebuffer (mip level 0, GL_RGBA_INTEGER /
// GL_UNSIGNED_BYTE) with PixelBuffer bound as the pixel pack buffer.
// GetLine() later maps PixelBuffer to fetch the pixels on the CPU.
void ComputeRenderer::PrepareCaptureFrame()
{
    // While a pixel pack buffer is bound, the final glGetTexImage argument is
    // interpreted as a byte offset into that buffer rather than a client
    // pointer; null means "write from the start of the buffer".
    void* const packOffset = nullptr;

    glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
    glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, packOffset);
}
}