/*
    Copyright 2016-2022 melonDS team

    This file is part of melonDS.

    melonDS is free software: you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation, either version 3 of the License, or (at your option)
    any later version.

    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with melonDS. If not, see http://www.gnu.org/licenses/.
*/

#include "GPU3D_Compute.h"

// NOTE(review): the angle-bracket include on this line was lost in the mangled
// source; <assert.h> is restored because this file calls assert(). The other
// standard headers cover std::string, std::min, std::initializer_list and printf.
#include <assert.h>
#include <stdio.h>

#include <algorithm>
#include <initializer_list>
#include <string>

#define XXH_STATIC_LINKING_ONLY
#include "xxhash/xxhash.h"

#include "OpenGLSupport.h"

#include "GPU3D_Compute_shaders.h"

namespace GPU3D
{

ComputeRenderer::ComputeRenderer()
    : Renderer3D(true)
{}

ComputeRenderer::~ComputeRenderer()
{}

// Assembles the GLSL source for one compute program and compiles it.
//
// The source is built as: "#version 430 core", one "#define <name>" per entry
// of `defines`, the ScreenWidth/ScreenHeight/MaxWorkTiles defines taken from
// the current members, ComputeRendererShaders::Common, then `source`.
// The comma-joined define names are passed to the compiler as the shader name.
//
// Returns whatever OpenGL::CompileComputeProgram returns.
bool ComputeRenderer::CompileShader(GLuint& shader, const char* source, const std::initializer_list<const char*>& defines)
{
    std::string shaderName;
    std::string shaderSource;
    shaderSource += "#version 430 core\n";
    for (const char* define : defines)
    {
        shaderSource += "#define ";
        shaderSource += define;
        shaderSource += '\n';
        shaderName += define;
        shaderName += ',';
    }
    shaderSource += "#define ScreenWidth ";
    shaderSource += std::to_string(ScreenWidth);
    shaderSource += "\n#define ScreenHeight ";
    shaderSource += std::to_string(ScreenHeight);
    shaderSource += "\n#define MaxWorkTiles ";
    shaderSource += std::to_string(MaxWorkTiles);

    // NOTE(review): no newline is appended after the MaxWorkTiles value, so
    // Common presumably starts with one - confirm in GPU3D_Compute_shaders.h.
    shaderSource += ComputeRendererShaders::Common;
    shaderSource += source;

    return OpenGL::CompileComputeProgram(shader, shaderSource.c_str(), shaderName.c_str());
}

// GL debug-output callback: prints the driver message. Only referenced from the
// commented-out glDebugMessageCallback() call in Init().
void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam)
{
    printf("%s\n", message);
}

// Creates the GL objects whose size does not depend on the scale factor
// (buffers sized here, the 256x192 low-res framebuffer, the 3x3 sampler grid
// and the pixel-pack buffer). Buffers that are only generated here get their
// storage in SetRenderSettings(). Always returns true.
bool ComputeRenderer::Init()
{
    //glDebugMessageCallback(blah, NULL);
    //glEnable(GL_DEBUG_OUTPUT);
    glGenBuffers(1, &YSpanSetupMemory);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);

    glGenBuffers(1, &RenderPolygonMemory);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);

    glGenBuffers(1, &XSpanSetupMemory);
    glGenBuffers(1, &BinResultMemory);
    glGenBuffers(1, &FinalTileMemory);
    glGenBuffers(1, &YSpanIndicesTextureMemory);
    glGenBuffers(1, &TileMemory);

    glGenTextures(1, &YSpanIndicesTexture);
    glGenTextures(1, &LowResFramebuffer);
    glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192);

    glGenBuffers(1, &MetaUniformMemory);
    glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
    glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);

    // One sampler per (wrap S, wrap T) combination: index = s + t*3, where
    // 0 = clamp, 1 = repeat, 2 = mirrored repeat. All use nearest filtering.
    glGenSamplers(9, Samplers);
    for (u32 j = 0; j < 3; j++)
    {
        for (u32 i = 0; i < 3; i++)
        {
            const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT};
            glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_WRAP_S, translateWrapMode[i]);
            glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_WRAP_T, translateWrapMode[j]);
            glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_MIN_FILTER, GL_NEAREST);
            // enum-valued parameter: use the integer entry point like the others
            glSamplerParameteri(Samplers[i+j*3], GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        }
    }

    glGenBuffers(1, &PixelBuffer);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
    glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, nullptr, GL_DYNAMIC_READ);

    return true;
}

// Releases everything created by Init() and SetRenderSettings().
void ComputeRenderer::DeInit()
{
    ResetTexcache();

    // The programs are compiled in SetRenderSettings(); ScaleFactor == -1 is
    // the "nothing compiled yet" marker that function checks before deleting.
    if (ScaleFactor != -1)
    {
        DeleteShaders();
        ScaleFactor = -1;
    }

    glDeleteBuffers(1, &YSpanSetupMemory);
    glDeleteBuffers(1, &RenderPolygonMemory);
    glDeleteBuffers(1, &TileMemory);
    glDeleteBuffers(1, &XSpanSetupMemory);
    glDeleteBuffers(1, &BinResultMemory);
    glDeleteBuffers(1, &FinalTileMemory);
    glDeleteBuffers(1, &YSpanIndicesTextureMemory);
    glDeleteTextures(1, &YSpanIndicesTexture);
    glDeleteTextures(1, &LowResFramebuffer); // created in Init(), was never freed
    glDeleteTextures(1, &Framebuffer);
    // SetRenderSettings() deletes Framebuffer when it is non-zero; clear the
    // stale name so a later call cannot delete an unrelated texture.
    Framebuffer = 0;
    glDeleteBuffers(1, &MetaUniformMemory);
    glDeleteSamplers(9, Samplers);
    glDeleteBuffers(1, &PixelBuffer);
}

// Deletes every compute program compiled by SetRenderSettings().
void ComputeRenderer::DeleteShaders()
{
    std::initializer_list<GLuint> allPrograms =
    {
        ShaderInterpXSpans[0],
        ShaderInterpXSpans[1],
        ShaderBinCombined,
        ShaderDepthBlend[0],
        ShaderDepthBlend[1],
        ShaderRasteriseNoTexture[0],
        ShaderRasteriseNoTexture[1],
        ShaderRasteriseNoTextureToon[0],
        ShaderRasteriseNoTextureToon[1],
        ShaderRasteriseNoTextureHighlight[0],
        ShaderRasteriseNoTextureHighlight[1],
        ShaderRasteriseUseTextureDecal[0],
        ShaderRasteriseUseTextureDecal[1],
        ShaderRasteriseUseTextureModulate[0],
        ShaderRasteriseUseTextureModulate[1],
        ShaderRasteriseUseTextureToon[0],
        ShaderRasteriseUseTextureToon[1],
        ShaderRasteriseUseTextureHighlight[0],
        ShaderRasteriseUseTextureHighlight[1],
        ShaderRasteriseShadowMask[0],
        ShaderRasteriseShadowMask[1],
        ShaderClearCoarseBinMask,
        ShaderClearIndirectWorkCount,
        ShaderCalculateWorkListOffset,
        ShaderSortWork,
        ShaderFinalPass[0],
        ShaderFinalPass[1],
        ShaderFinalPass[2],
        ShaderFinalPass[3],
        ShaderFinalPass[4],
        ShaderFinalPass[5],
        ShaderFinalPass[6],
        ShaderFinalPass[7],
    };
    for (GLuint program : allPrograms)
        glDeleteProgram(program);
}

// Deletes every array texture of every (widthLog2, heightLog2) bucket and
// empties the free-layer lists and the texture cache.
void ComputeRenderer::ResetTexcache()
{
    for (u32 i = 0; i < 8; i++)
    {
        for (u32 j = 0; j < 8; j++)
        {
            for (u32 k = 0; k < TexArrays[i][j].size(); k++)
                glDeleteTextures(1, &TexArrays[i][j][k]);
            TexArrays[i][j].clear();
            FreeTextures[i][j].clear();
        }
    }
    TexCache.clear();
}

void ComputeRenderer::Reset()
{
    ResetTexcache();
}

// Applies the scale factor from `settings`: recomputes the screen/tile
// dimensions, (re)allocates the resolution-dependent buffers and the output
// framebuffer, and recompiles all compute programs (their source embeds the
// screen size, see CompileShader()).
void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings)
{
    // programs from a previous call have to go before they are recompiled
    if (ScaleFactor != -1)
    {
        DeleteShaders();
    }

    ScaleFactor = settings.GL_ScaleFactor;
    ScreenWidth = 256 * ScaleFactor;
    ScreenHeight = 192 * ScaleFactor;

    TilesPerLine = ScreenWidth/TileSize;
    TileLines = ScreenHeight/TileSize;

    MaxWorkTiles = TilesPerLine*TileLines*8;

    glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory);
    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW); int binResultSize = sizeof(BinResultHeader) + MaxWorkTiles*2*4 // UnsortedWorkDescs + MaxWorkTiles*2*4 // SortedWork + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse + TilesPerLine*TileLines*BinStride*4 // BinnedMask + TilesPerLine*TileLines*BinStride*4; // WorkOffsets glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW); if (Framebuffer != 0) glDeleteTextures(1, &Framebuffer); glGenTextures(1, &Framebuffer); glBindTexture(GL_TEXTURE_2D, Framebuffer); glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight); // eh those are pretty bad guesses // though real hw shouldn't be eable to render all 2048 polygons on every line either int maxYSpanIndices = 64*2048 * ScaleFactor; YSpanIndices.resize(maxYSpanIndices); glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW); glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW); glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture); glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory); CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"}); CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"}); CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"}); CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"}); CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"}); CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"}); 
CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"}); CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"}); CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"}); CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"}); CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"}); CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"}); CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"}); CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"}); CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"}); CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"}); CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"}); CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"}); CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"}); CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"}); CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"}); 
CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"}); CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"}); CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"}); CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"}); CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"}); CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"}); CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"}); CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"}); CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"}); CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"}); CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"}); CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"}); } void ComputeRenderer::VCount144() { } void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to) { span->Z0 = poly->FinalZ[from]; span->W0 = poly->FinalW[from]; span->Z1 = poly->FinalZ[to]; span->W1 = poly->FinalW[to]; span->ColorR0 = poly->Vertices[from]->FinalColor[0]; span->ColorG0 = poly->Vertices[from]->FinalColor[1]; span->ColorB0 = poly->Vertices[from]->FinalColor[2]; span->ColorR1 = poly->Vertices[to]->FinalColor[0]; span->ColorG1 = poly->Vertices[to]->FinalColor[1]; span->ColorB1 = poly->Vertices[to]->FinalColor[2]; span->TexcoordU0 = poly->Vertices[from]->TexCoords[0]; span->TexcoordV0 = poly->Vertices[from]->TexCoords[1]; span->TexcoordU1 = poly->Vertices[to]->TexCoords[0]; span->TexcoordV1 = 
poly->Vertices[to]->TexCoords[1]; } void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]) { s32 x0 = positions[vertex][0]; if (side) { span->DxInitial = -0x40000; x0--; } else { span->DxInitial = 0; } span->X0 = span->X1 = x0; span->XMin = x0; span->XMax = x0; span->Y0 = span->Y1 = positions[vertex][1]; if (span->XMin < rp->XMin) { rp->XMin = span->XMin; rp->XMinY = span->Y0; } if (span->XMax > rp->XMax) { rp->XMax = span->XMax; rp->XMaxY = span->Y0; } span->Increment = 0; span->I0 = span->I1 = span->IRecip = 0; span->Linear = true; span->XCovIncr = 0; span->IsDummy = true; SetupAttrs(span, poly, vertex, vertex); } void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]) { span->X0 = positions[from][0]; span->X1 = positions[to][0]; span->Y0 = positions[from][1]; span->Y1 = positions[to][1]; SetupAttrs(span, poly, from, to); s32 minXY, maxXY; bool negative = false; if (span->X1 > span->X0) { span->XMin = span->X0; span->XMax = span->X1-1; minXY = span->Y0; maxXY = span->Y1; } else if (span->X1 < span->X0) { span->XMin = span->X1; span->XMax = span->X0-1; negative = true; minXY = span->Y1; maxXY = span->Y0; } else { span->XMin = span->X0; if (side) span->XMin--; span->XMax = span->XMin; // doesn't matter for completely vertical slope minXY = span->Y0; maxXY = span->Y0; } if (span->XMin < rp->XMin) { rp->XMin = span->XMin; rp->XMinY = minXY; } if (span->XMax > rp->XMax) { rp->XMax = span->XMax; rp->XMaxY = maxXY; } span->IsDummy = false; s32 xlen = span->XMax+1 - span->XMin; s32 ylen = span->Y1 - span->Y0; // slope increment has a 18-bit fractional part // note: for some reason, x/y isn't calculated directly, // instead, 1/y is calculated and then multiplied by x // TODO: this is still not perfect (see for example x=169 y=33) if (ylen == 0) { span->Increment = 0; } else if (ylen == xlen) { span->Increment = 
0x40000; } else { s32 yrecip = (1<<18) / ylen; span->Increment = (span->X1-span->X0) * yrecip; if (span->Increment < 0) span->Increment = -span->Increment; } bool xMajor = (span->Increment > 0x40000); if (side) { // right if (xMajor) span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000); else if (span->Increment != 0) span->DxInitial = negative ? 0x40000 : 0; else span->DxInitial = -0x40000; } else { // left if (xMajor) span->DxInitial = negative ? ((span->Increment - 0x20000) + 0x40000) : 0x20000; else if (span->Increment != 0) span->DxInitial = negative ? 0x40000 : 0; else span->DxInitial = 0; } if (xMajor) { if (side) { span->I0 = span->X0 - 1; span->I1 = span->X1 - 1; } else { span->I0 = span->X0; span->I1 = span->X1; } // used for calculating AA coverage span->XCovIncr = (ylen << 10) / xlen; } else { span->I0 = span->Y0; span->I1 = span->Y1; } if (span->I0 != span->I1) span->IRecip = (1<<30) / (span->I1 - span->I0); else span->IRecip = 0; span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E); if ((span->W0 & 0x1) && !(span->W1 & 0x1)) { span->W0n = (span->W0 - 1) >> 1; span->W0d = (span->W0 + 1) >> 1; span->W1d = span->W1 >> 1; } else { span->W0n = span->W0 >> 1; span->W0d = span->W0 >> 1; span->W1d = span->W1 >> 1; } } inline u32 TextureWidth(u32 texparam) { return 8 << ((texparam >> 20) & 0x7); } inline u32 TextureHeight(u32 texparam) { return 8 << ((texparam >> 23) & 0x7); } inline u16 ColorAvg(u16 color0, u16 color1) { u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; u32 b0 = color0 & 0x7C00; u32 r1 = color1 & 0x001F; u32 g1 = color1 & 0x03E0; u32 b1 = color1 & 0x7C00; u32 r = (r0 + r1) >> 1; u32 g = ((g0 + g1) >> 1) & 0x03E0; u32 b = ((b0 + b1) >> 1) & 0x7C00; return r | g | b; } inline u16 Color5of3(u16 color0, u16 color1) { u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; u32 b0 = color0 & 0x7C00; u32 r1 = color1 & 0x001F; u32 g1 = color1 & 0x03E0; u32 b1 = color1 & 0x7C00; u32 r = (r0*5 + r1*3) 
>> 3; u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0; u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00; return r | g | b; } inline u16 Color3of5(u16 color0, u16 color1) { u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; u32 b0 = color0 & 0x7C00; u32 r1 = color1 & 0x001F; u32 g1 = color1 & 0x03E0; u32 b1 = color1 & 0x7C00; u32 r = (r0*3 + r1*5) >> 3; u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0; u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00; return r | g | b; } inline u32 ConvertRGB5ToRGB8(u16 val) { return (((u32)val & 0x1F) << 3) | (((u32)val & 0x3E0) << 6) | (((u32)val & 0x7C00) << 9); } inline u32 ConvertRGB5ToBGR8(u16 val) { return (((u32)val & 0x1F) << 9) | (((u32)val & 0x3E0) << 6) | (((u32)val & 0x7C00) << 3); } inline u32 ConvertRGB5ToRGB6(u16 val) { u8 r = (val & 0x1F) << 1; u8 g = (val & 0x3E0) >> 4; u8 b = (val & 0x7C00) >> 9; if (r) r++; if (g) g++; if (b) b++; return (u32)r | ((u32)g << 8) | ((u32)b << 16); } enum { outputFmt_RGB6A5, outputFmt_RGBA8, outputFmt_BGRA8 }; template void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData) { // we process a whole block at the time for (int y = 0; y < height / 4; y++) { for (int x = 0; x < width / 4; x++) { u32 data = ((u32*)texData)[x + y * (width / 4)]; u16 auxData = ((u16*)texAuxData)[x + y * (width / 4)]; u32 paletteOffset = auxData & 0x3FFF; u16 color0 = palData[paletteOffset*2] | 0x8000; u16 color1 = palData[paletteOffset*2+1] | 0x8000; u16 color2, color3; switch ((auxData >> 14) & 0x3) { case 0: color2 = palData[paletteOffset*2+2] | 0x8000; color3 = 0; break; case 1: { u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; u32 b0 = color0 & 0x7C00; u32 r1 = color1 & 0x001F; u32 g1 = color1 & 0x03E0; u32 b1 = color1 & 0x7C00; u32 r = (r0 + r1) >> 1; u32 g = ((g0 + g1) >> 1) & 0x03E0; u32 b = ((b0 + b1) >> 1) & 0x7C00; color2 = r | g | b | 0x8000; } color3 = 0; break; case 2: color2 = palData[paletteOffset*2+2] | 0x8000; color3 = palData[paletteOffset*2+3] | 0x8000; break; case 3: { u32 
r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; u32 b0 = color0 & 0x7C00; u32 r1 = color1 & 0x001F; u32 g1 = color1 & 0x03E0; u32 b1 = color1 & 0x7C00; u32 r = (r0*5 + r1*3) >> 3; u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0; u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00; color2 = r | g | b | 0x8000; } { u32 r0 = color0 & 0x001F; u32 g0 = color0 & 0x03E0; u32 b0 = color0 & 0x7C00; u32 r1 = color1 & 0x001F; u32 g1 = color1 & 0x03E0; u32 b1 = color1 & 0x7C00; u32 r = (r0*3 + r1*5) >> 3; u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0; u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00; color3 = r | g | b | 0x8000; } break; } // in 2020 our default data types are big enough to be used as lookup tables... u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48); for (int j = 0; j < 4; j++) { for (int i = 0; i < 4; i++) { u16 color = (packed >> 16 * (data >> 2 * (i + j * 4))) & 0xFFFF; u32 res; switch (outputFmt) { case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | ((color & 0x8000) ? 0x1F000000 : 0); break; case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | ((color & 0x8000) ? 0xFF000000 : 0); break; case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | ((color & 0x8000) ? 
0xFF000000 : 0); break; } output[x * 4 + i + (y * 4 + j) * width] = res; } } } } } template void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData) { for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { u8 val = texData[x + y * width]; u32 idx = val & ((1 << Y) - 1); u16 color = palData[idx]; u32 alpha = (val >> Y) & ((1 << X) - 1); if (X != 5) alpha = alpha * 4 + alpha / 2; u32 res; switch (outputFmt) { case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | alpha << 24; break; // make sure full alpha == 255 case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break; case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break; } output[x + y * width] = res; } } } template void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent) { for (int y = 0; y < height; y++) { for (int x = 0; x < width / (8 / colorBits); x++) { u8 val = texData[x + y * (width / (8 / colorBits))]; for (int i = 0; i < 8 / colorBits; i++) { u32 index = (val >> (i * colorBits)) & ((1 << colorBits) - 1); u16 color = palData[index]; bool transparent = color0Transparent && index == 0; u32 res; switch (outputFmt) { case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | (transparent ? 0 : 0x1F000000); break; case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | (transparent ? 0 : 0xFF000000); break; case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | (transparent ? 
0 : 0xFF000000); break; } output[x * (8 / colorBits) + y * width + i] = res; } } } } ComputeRenderer::TexCacheEntry& ComputeRenderer::GetTexture(u32 texParam, u32 palBase) { // remove sampling and texcoord gen params texParam &= ~0xC00F0000; u32 fmt = (texParam >> 26) & 0x7; u64 key = texParam; if (fmt != 7) { key |= (u64)palBase << 32; if (fmt == 5) key &= ~((u64)1 << 29); } //printf("%" PRIx64 " %" PRIx32 " %" PRIx32 "\n", key, texParam, palBase); assert(fmt != 0 && "no texture is not a texture format!"); auto it = TexCache.find(key); if (it != TexCache.end()) return it->second; u32 widthLog2 = (texParam >> 20) & 0x7; u32 heightLog2 = (texParam >> 23) & 0x7; u32 width = 8 << widthLog2; u32 height = 8 << heightLog2; u32 addr = (texParam & 0xFFFF) * 8; TexCacheEntry entry = {0}; entry.TextureRAMStart[0] = addr; entry.WidthLog2 = widthLog2; entry.HeightLog2 = heightLog2; // apparently a new texture if (fmt == 7) { entry.TextureRAMSize[0] = width*height*2; for (u32 i = 0; i < width*height; i++) { u16 value = *(u16*)&GPU::VRAMFlat_Texture[addr + i * 2]; TextureDecodingBuffer[i] = ConvertRGB5ToRGB6(value) | (value & 0x8000 ? 
0x1F000000 : 0); } } else if (fmt == 5) { u8* texData = &GPU::VRAMFlat_Texture[addr]; u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1); if (addr >= 0x40000) slot1addr += 0x10000; u8* texAuxData = &GPU::VRAMFlat_Texture[slot1addr]; u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palBase*16); entry.TextureRAMSize[0] = width*height/16*4; entry.TextureRAMStart[1] = slot1addr; entry.TextureRAMSize[1] = width*height/16*2; entry.TexPalStart = palBase*16; entry.TexPalSize = 0x10000; ConvertCompressedTexture(width, height, TextureDecodingBuffer, texData, texAuxData, palData); } else { u32 texSize, palAddr = palBase*16, numPalEntries; switch (fmt) { case 1: texSize = width*height; numPalEntries = 32; break; case 6: texSize = width*height; numPalEntries = 8; break; case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break; case 3: texSize = width*height/2; numPalEntries = 16; break; case 4: texSize = width*height; numPalEntries = 256; break; } palAddr &= 0x1FFFF; /*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr); svcSleepThread(1000*1000);*/ entry.TextureRAMSize[0] = texSize; entry.TexPalStart = palAddr; entry.TexPalSize = numPalEntries*2; u8* texData = &GPU::VRAMFlat_Texture[addr]; u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palAddr); //assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024); bool color0Transparent = texParam & (1 << 29); switch (fmt) { case 1: ConvertAXIYTexture(width, height, TextureDecodingBuffer, texData, palData); break; case 6: ConvertAXIYTexture(width, height, TextureDecodingBuffer, texData, palData); break; case 2: ConvertNColorsTexture(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break; case 3: ConvertNColorsTexture(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break; case 4: ConvertNColorsTexture(width, height, TextureDecodingBuffer, texData, palData, color0Transparent); break; } } for (int i = 0; i < 2; i++) { if 
(entry.TextureRAMSize[i]) entry.TextureHash[i] = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]); } if (entry.TexPalSize) entry.TexPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize); auto& texArrays = TexArrays[widthLog2][heightLog2]; auto& freeTextures = FreeTextures[widthLog2][heightLog2]; if (freeTextures.size() == 0) { texArrays.resize(texArrays.size()+1); GLuint& array = texArrays[texArrays.size()-1]; u32 layers = std::min((8*1024*1024) / (width*height*4), 64); // allocate new array texture glGenTextures(1, &array); glBindTexture(GL_TEXTURE_2D_ARRAY, array); glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers); //printf("allocating new layer set for %d %d %d %d\n", width, height, texArrays.size()-1, array.ImageDescriptor); for (u32 i = 0; i < layers; i++) { freeTextures.push_back(TexArrayEntry{array, i}); } } TexArrayEntry storagePlace = freeTextures[freeTextures.size()-1]; freeTextures.pop_back(); //printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor); glBindTexture(GL_TEXTURE_2D_ARRAY, storagePlace.TextureID); glTexSubImage3D(GL_TEXTURE_2D_ARRAY, 0, 0, 0, storagePlace.Layer, width, height, 1, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, TextureDecodingBuffer); entry.Texture = storagePlace; return TexCache.emplace(std::make_pair(key, entry)).first->second; } struct Variant { GLuint Texture, Sampler; u16 Width, Height; u8 BlendMode; bool operator==(const Variant& other) { return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode; } }; /* Antialiasing W-Buffer With Texture 0 1, 3 2 without Texture 2 0, 1, 3 => 20 Shader + 1x Shadow Mask */ void ComputeRenderer::RenderFrame() { //printf("render frame\n"); auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture); auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal); bool 
textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty); bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty); if (textureChanged || texPalChanged) { //printf("check invalidation %d\n", TexCache.size()); for (auto it = TexCache.begin(); it != TexCache.end();) { TexCacheEntry& entry = it->second; if (textureChanged) { for (u32 i = 0; i < 2; i++) { u32 startBit = entry.TextureRAMStart[i] / GPU::VRAMDirtyGranularity; u32 bitsCount = ((entry.TextureRAMStart[i] + entry.TextureRAMSize[i] + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit; u32 startEntry = startBit >> 6; u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; for (u32 j = startEntry; j < startEntry + entriesCount; j++) { if (GetRangedBitMask(j, startBit, bitsCount) & textureDirty.Data[j]) { u64 newTexHash = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]); if (newTexHash != entry.TextureHash[i]) goto invalidate; } } } } if (texPalChanged && entry.TexPalSize > 0) { u32 startBit = entry.TexPalStart / GPU::VRAMDirtyGranularity; u32 bitsCount = ((entry.TexPalStart + entry.TexPalSize + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit; u32 startEntry = startBit >> 6; u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; for (u32 j = startEntry; j < startEntry + entriesCount; j++) { if (GetRangedBitMask(j, startBit, bitsCount) & texPalDirty.Data[j]) { u64 newPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize); if (newPalHash != entry.TexPalHash) goto invalidate; } } } it++; continue; invalidate: FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture); //printf("invalidating texture %d\n", entry.ImageDescriptor); it = TexCache.erase(it); } } else if (RenderFrameIdentical) { return; } int numYSpans = 0; int numSetupIndices = 0; /* Some games really like to spam small textures, often to store the data like PPU tiles. E.g. 
Shantae or some Mega Man game. Fortunately they are usually kind enough to not vary the texture size all too often (usually they just use 8x8 or 16x for everything). This is the reason we have this whole mess where textures of the same size are put into array textures. This allows to increase the batch size. Less variance between each Variant hah! */ u32 numVariants = 0, prevVariant, prevTexLayer; Variant variants[MaxVariants]; int foundviatexcache = 0, foundviaprev = 0, numslow = 0; bool enableTextureMaps = RenderDispCnt & (1<<0); for (int i = 0; i < RenderNumPolygons; i++) { Polygon* polygon = RenderPolygonRAM[i]; u32 nverts = polygon->NumVertices; u32 vtop = polygon->VTop, vbot = polygon->VBottom; u32 curVL = vtop, curVR = vtop; u32 nextVL, nextVR; RenderPolygons[i].FirstXSpan = numSetupIndices; RenderPolygons[i].Attr = polygon->Attr; bool foundVariant = false; if (i > 0) { // if the whole texture attribute matches // the texture layer will also match Polygon* prevPolygon = RenderPolygonRAM[i - 1]; foundVariant = prevPolygon->TexParam == polygon->TexParam && prevPolygon->TexPalette == polygon->TexPalette && (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30) && prevPolygon->IsShadowMask == polygon->IsShadowMask; if (foundVariant) foundviaprev++; } if (!foundVariant) { Variant variant; variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3); variant.Texture = 0; variant.Sampler = 0; TexCacheEntry* texcacheEntry = nullptr; // we always need to look up the texture to get the layer of the array texture if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7) { texcacheEntry = &GetTexture(polygon->TexParam, polygon->TexPalette); bool wrapS = (polygon->TexParam >> 16) & 1; bool wrapT = (polygon->TexParam >> 17) & 1; bool mirrorS = (polygon->TexParam >> 18) & 1; bool mirrorT = (polygon->TexParam >> 19) & 1; variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 
2 : 1) : 0) * 3]; variant.Texture = texcacheEntry->Texture.TextureID; prevTexLayer = texcacheEntry->Texture.Layer; if (texcacheEntry->LastVariant < numVariants && variants[texcacheEntry->LastVariant] == variant) { foundVariant = true; prevVariant = texcacheEntry->LastVariant; foundviatexcache++; } } if (!foundVariant) { numslow++; for (int j = numVariants - 1; j >= 0; j--) { if (variants[j] == variant) { foundVariant = true; prevVariant = j; goto foundVariant; } } prevVariant = numVariants; variants[numVariants] = variant; variants[numVariants].Width = TextureWidth(polygon->TexParam); variants[numVariants].Height = TextureHeight(polygon->TexParam); numVariants++; assert(numVariants <= MaxVariants); foundVariant:; if (texcacheEntry) texcacheEntry->LastVariant = prevVariant; } } RenderPolygons[i].Variant = prevVariant; RenderPolygons[i].TextureLayer = (float)prevTexLayer; if (polygon->FacingView) { nextVL = curVL + 1; if (nextVL >= nverts) nextVL = 0; nextVR = curVR - 1; if ((s32)nextVR < 0) nextVR = nverts - 1; } else { nextVL = curVL - 1; if ((s32)nextVL < 0) nextVL = nverts - 1; nextVR = curVR + 1; if (nextVR >= nverts) nextVR = 0; } s32 scaledPositions[10][2]; s32 ytop = ScreenHeight, ybot = 0; for (int i = 0; i < polygon->NumVertices; i++) { scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4; scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4; ytop = std::min(scaledPositions[i][1], ytop); ybot = std::max(scaledPositions[i][1], ybot); } RenderPolygons[i].YTop = ytop; RenderPolygons[i].YBot = ybot; RenderPolygons[i].XMin = ScreenWidth; RenderPolygons[i].XMax = 0; if (ybot == ytop) { vtop = 0; vbot = 0; RenderPolygons[i].YBot++; int j = 1; if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j; if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j; j = nverts - 1; if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j; if (scaledPositions[j][0] > scaledPositions[vbot][0]) 
vbot = j; assert(numYSpans < MaxYSpanSetups); u32 curSpanL = numYSpans; SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions); assert(numYSpans < MaxYSpanSetups); u32 curSpanR = numYSpans; SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions); YSpanIndices[numSetupIndices].PolyIdx = i; YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; YSpanIndices[numSetupIndices].Y = ytop; numSetupIndices++; } else { u32 curSpanL = numYSpans; assert(numYSpans < MaxYSpanSetups); SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions); u32 curSpanR = numYSpans; assert(numYSpans < MaxYSpanSetups); SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions); for (u32 y = ytop; y < ybot; y++) { if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom) { while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom) { curVL = nextVL; if (polygon->FacingView) { nextVL = curVL + 1; if (nextVL >= nverts) nextVL = 0; } else { nextVL = curVL - 1; if ((s32)nextVL < 0) nextVL = nverts - 1; } } assert(numYSpans < MaxYSpanSetups); curSpanL = numYSpans; SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions); } if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom) { while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom) { curVR = nextVR; if (polygon->FacingView) { nextVR = curVR - 1; if ((s32)nextVR < 0) nextVR = nverts - 1; } else { nextVR = curVR + 1; if (nextVR >= nverts) nextVR = 0; } } assert(numYSpans < MaxYSpanSetups); curSpanR = numYSpans; SetupYSpan(&RenderPolygons[i] ,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions); } YSpanIndices[numSetupIndices].PolyIdx = i; YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; YSpanIndices[numSetupIndices].SpanIdxR = 
curSpanR; YSpanIndices[numSetupIndices].Y = y; numSetupIndices++; } } //printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY); } /*for (u32 i = 0; i < RenderNumPolygons; i++) { if (RenderPolygons[i].Variant >= numVariants) { printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons); } //assert(RenderPolygons[i].Variant < numVariants); }*/ if (numYSpans > 0) { glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory); glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups); glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data()); glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory); glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons); // we haven't accessed image data yet, so we don't need to invalidate anything } //printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons); // bind everything glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, YSpanSetupMemory); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, RenderPolygonMemory); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, BinResultMemory); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, TileMemory); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory); MetaUniform meta; meta.DispCnt = RenderDispCnt; meta.NumPolygons = RenderNumPolygons; meta.NumVariants = numVariants; meta.AlphaRef = RenderAlphaRef; { u32 r = (RenderClearAttr1 << 1) & 0x3E; if (r) r++; u32 g = (RenderClearAttr1 >> 4) & 0x3E; if (g) g++; u32 b = (RenderClearAttr1 >> 9) & 0x3E; if (b) b++; u32 a = (RenderClearAttr1 >> 16) & 0x1F; meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24); meta.ClearDepth = ((RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; meta.ClearAttr = 
RenderClearAttr1 & 0x3F008000; } for (u32 i = 0; i < 32; i++) { u32 color = RenderToonTable[i]; u32 r = (color << 1) & 0x3E; u32 g = (color >> 4) & 0x3E; u32 b = (color >> 9) & 0x3E; if (r) r++; if (g) g++; if (b) b++; meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16); } for (u32 i = 0; i < 34; i++) { meta.ToonTable[i*4+1] = RenderFogDensityTable[i]; } for (u32 i = 0; i < 8; i++) { u32 color = RenderEdgeTable[i]; u32 r = (color << 1) & 0x3E; u32 g = (color >> 4) & 0x3E; u32 b = (color >> 9) & 0x3E; if (r) r++; if (g) g++; if (b) b++; meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16); } meta.FogOffset = RenderFogOffset; meta.FogShift = RenderFogShift; { u32 fogR = (RenderFogColor << 1) & 0x3E; if (fogR) fogR++; u32 fogG = (RenderFogColor >> 4) & 0x3E; if (fogG) fogG++; u32 fogB = (RenderFogColor >> 9) & 0x3E; if (fogB) fogB++; u32 fogA = (RenderFogColor >> 16) & 0x1F; meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24); } glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory); glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta); glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory); glUseProgram(ShaderClearCoarseBinMask); glDispatchCompute(TilesPerLine*TileLines/32, 1, 1); bool wbuffer = false; if (numYSpans > 0) { wbuffer = RenderPolygonRAM[0]->WBuffer; glUseProgram(ShaderClearIndirectWorkCount); glDispatchCompute((numVariants+31)/32, 1, 1); // calculate x-spans glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI); glUseProgram(ShaderInterpXSpans[wbuffer]); glDispatchCompute((numSetupIndices + 31) / 32, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); // bin polygons glUseProgram(ShaderBinCombined); glDispatchCompute(((RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); // calculate list offsets glUseProgram(ShaderCalculateWorkListOffset); glDispatchCompute((numVariants + 31) / 32, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); // 
sort shader work glUseProgram(ShaderSortWork); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount)); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); glActiveTexture(GL_TEXTURE0); // rasterise { bool highLightMode = RenderDispCnt & (1<<1); GLuint shadersNoTexture[] = { ShaderRasteriseNoTexture[wbuffer], ShaderRasteriseNoTexture[wbuffer], highLightMode ? ShaderRasteriseNoTextureHighlight[wbuffer] : ShaderRasteriseNoTextureToon[wbuffer], ShaderRasteriseNoTexture[wbuffer], ShaderRasteriseShadowMask[wbuffer] }; GLuint shadersUseTexture[] = { ShaderRasteriseUseTextureModulate[wbuffer], ShaderRasteriseUseTextureDecal[wbuffer], highLightMode ? ShaderRasteriseUseTextureHighlight[wbuffer] : ShaderRasteriseUseTextureToon[wbuffer], ShaderRasteriseUseTextureDecal[wbuffer], ShaderRasteriseShadowMask[wbuffer] }; GLuint prevShader = 0; s32 prevTexture = 0, prevSampler = 0; for (int i = 0; i < numVariants; i++) { GLuint shader = 0; if (variants[i].Texture == 0) { shader = shadersNoTexture[variants[i].BlendMode]; } else { shader = shadersUseTexture[variants[i].BlendMode]; if (variants[i].Texture != prevTexture) { glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture); prevTexture = variants[i].Texture; } if (variants[i].Sampler != prevSampler) { glBindSampler(0, variants[i].Sampler); prevSampler = variants[i].Sampler; } } assert(shader != 0); if (shader != prevShader) { glUseProgram(shader); prevShader = shader; } glUniform1ui(UniformIdxCurVariant, i); glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4); } } } glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); // compose final image glUseProgram(ShaderDepthBlend[wbuffer]); glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); 
glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8); glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI); u32 finalPassShader = 0; if (RenderDispCnt & (1<<4)) finalPassShader |= 0x4; if (RenderDispCnt & (1<<7)) finalPassShader |= 0x2; if (RenderDispCnt & (1<<5)) finalPassShader |= 0x1; glUseProgram(ShaderFinalPass[finalPassShader]); glDispatchCompute(ScreenWidth/32, ScreenHeight, 1); glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); /*u64 starttime = armGetSystemTick(); EmuQueue.waitIdle(); printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/ /*for (u32 i = 0; i < RenderNumPolygons; i++) { if (RenderPolygons[i].Variant >= numVariants) { printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons); } //assert(RenderPolygons[i].Variant < numVariants); }*/ /*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++) { printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]); }*/ /* if (polygonvisible != -1) { SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory); printf("span result\n"); Polygon* poly = RenderPolygonRAM[polygonvisible]; u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan; for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++) { printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot); } }*/ /* printf("xspans: %d\n", numSetupIndices); SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory[curSlice]); for (int i = 0; i < numSetupIndices; i++) { printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1); } printf("bin result\n"); BinResult* binresult = Gfx::DataHeap->CpuAddr(BinResultMemory); for (u32 
y = 0; y < 192/8; y++) { for (u32 x = 0; x < 256/8; x++) { printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]); } printf("\n"); }*/ } void ComputeRenderer::RestartFrame() { } u32* ComputeRenderer::GetLine(int line) { int stride = 256; if (line == 0) { glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); if (data) memcpy(&FramebufferCPU[0], data, 4*stride*192); glUnmapBuffer(GL_PIXEL_PACK_BUFFER); } return &FramebufferCPU[stride * line]; } void ComputeRenderer::SetupAccelFrame() { glBindTexture(GL_TEXTURE_2D, Framebuffer); } void ComputeRenderer::PrepareCaptureFrame() { glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); glBindTexture(GL_TEXTURE_2D, LowResFramebuffer); glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr); } }