From e7168ac5639315dbcf6ceda642c6b69f74b7f8bb Mon Sep 17 00:00:00 2001 From: RSDuck Date: Mon, 8 May 2023 19:34:05 +0200 Subject: [PATCH] stop using fixed size buffers based on scale factor in shaders this makes shader compile times tolerable on Wintel - beginning of the shader cache - increase size of tile idx in workdesc to 20 bits --- src/GPU3D_Compute.cpp | 38 ++- src/GPU3D_Compute.h | 17 +- src/GPU3D_Compute_shaders.h | 532 ++++++++++++++++++++---------------- src/GPU3D_OpenGL.cpp | 25 +- src/GPU3D_OpenGL.h | 2 +- src/OpenGLSupport.cpp | 217 ++++++++++++++- src/OpenGLSupport.h | 10 +- 7 files changed, 550 insertions(+), 291 deletions(-) diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp index a7fd9cb8..34856235 100644 --- a/src/GPU3D_Compute.cpp +++ b/src/GPU3D_Compute.cpp @@ -39,7 +39,7 @@ ComputeRenderer::~ComputeRenderer() -bool ComputeRenderer::CompileShader(GLuint& shader, const char* source, const std::initializer_list& defines) +bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list& defines) { std::string shaderName; std::string shaderSource; @@ -72,8 +72,8 @@ void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,con bool ComputeRenderer::Init() { - //glDebugMessageCallback(blah, NULL); - //glEnable(GL_DEBUG_OUTPUT); + glDebugMessageCallback(blah, NULL); + glEnable(GL_DEBUG_OUTPUT); glGenBuffers(1, &YSpanSetupMemory); glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW); @@ -86,7 +86,8 @@ bool ComputeRenderer::Init() glGenBuffers(1, &BinResultMemory); glGenBuffers(1, &FinalTileMemory); glGenBuffers(1, &YSpanIndicesTextureMemory); - glGenBuffers(1, &TileMemory); + glGenBuffers(tilememoryLayer_Num, TileMemory); + glGenBuffers(1, &WorkDescMemory); glGenTextures(1, &YSpanIndicesTexture); glGenTextures(1, &LowResFramebuffer); @@ -123,9 +124,10 @@ void 
ComputeRenderer::DeInit() glDeleteBuffers(1, &YSpanSetupMemory); glDeleteBuffers(1, &RenderPolygonMemory); - glDeleteBuffers(1, &TileMemory); glDeleteBuffers(1, &XSpanSetupMemory); glDeleteBuffers(1, &BinResultMemory); + glDeleteBuffers(tilememoryLayer_Num, TileMemory); + glDeleteBuffers(1, &WorkDescMemory); glDeleteBuffers(1, &FinalTileMemory); glDeleteBuffers(1, &YSpanIndicesTextureMemory); glDeleteTextures(1, &YSpanIndicesTexture); @@ -214,21 +216,25 @@ void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings) MaxWorkTiles = TilesPerLine*TileLines*8; - glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory); - glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW); + for (int i = 0; i < tilememoryLayer_Num; i++) + { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]); + glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW); + } glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW); int binResultSize = sizeof(BinResultHeader) - + MaxWorkTiles*2*4 // UnsortedWorkDescs - + MaxWorkTiles*2*4 // SortedWork + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse + TilesPerLine*TileLines*BinStride*4 // BinnedMask + TilesPerLine*TileLines*BinStride*4; // WorkOffsets glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory); glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW); + if (Framebuffer != 0) glDeleteTextures(1, &Framebuffer); glGenTextures(1, &Framebuffer); @@ -1237,12 +1243,14 @@ void ComputeRenderer::RenderFrame() //printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons); // bind everything - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, 
YSpanSetupMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, RenderPolygonMemory); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, BinResultMemory); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, TileMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory); MetaUniform meta; meta.DispCnt = RenderDispCnt; @@ -1327,7 +1335,6 @@ void ComputeRenderer::RenderFrame() glDispatchCompute((numVariants + 31) / 32, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BUFFER); - // sort shader work glUseProgram(ShaderSortWork); glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); @@ -1336,6 +1343,9 @@ void ComputeRenderer::RenderFrame() glActiveTexture(GL_TEXTURE0); + for (int i = 0; i < tilememoryLayer_Num; i++) + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]); + // rasterise { bool highLightMode = RenderDispCnt & (1<<1); diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h index 721c7a61..b28a3006 100644 --- a/src/GPU3D_Compute.h +++ b/src/GPU3D_Compute.h @@ -74,7 +74,17 @@ private: GLuint XSpanSetupMemory; GLuint BinResultMemory; GLuint RenderPolygonMemory; - GLuint TileMemory; + GLuint WorkDescMemory; + + enum + { + tilememoryLayer_Color, + tilememoryLayer_Depth, + tilememoryLayer_Attr, + tilememoryLayer_Num, + }; + + GLuint TileMemory[tilememoryLayer_Num]; GLuint FinalTileMemory; u32 DummyLine[256] = {}; @@ -102,7 +112,7 @@ private: s32 DxInitial; s32 XCovIncr; - u32 IsDummy, __pad1; + u32 IsDummy; }; struct SpanSetupX { @@ -138,7 +148,6 @@ private: u32 Attr; float TextureLayer; - u32 __pad0, __pad1; }; static constexpr int TileSize = 8; @@ -233,7 +242,7 @@ private: void SetupYSpan(RenderPolygon* rp, 
SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]); void SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]); - bool CompileShader(GLuint& shader, const char* source, const std::initializer_list& defines); + bool CompileShader(GLuint& shader, const std::string& source, const std::initializer_list& defines); }; } diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h index 6b63919a..dccbe6ee 100644 --- a/src/GPU3D_Compute_shaders.h +++ b/src/GPU3D_Compute_shaders.h @@ -19,6 +19,8 @@ #ifndef GPU3D_COMPUTE_SHADERS #define GPU3D_COMPUTE_SHADERS +#include + namespace GPU3D { @@ -69,23 +71,67 @@ namespace ComputeRendererShaders */ +const std::string XSpanSetupBuffer{R"( -const char* Common = R"( -struct Polygon +const uint XSpanSetup_Linear = 1U << 0; +const uint XSpanSetup_FillInside = 1U << 1; +const uint XSpanSetup_FillLeft = 1U << 2; +const uint XSpanSetup_FillRight = 1U << 3; + +struct XSpanSetup { - int FirstXSpan; - int YTop, YBot; + int X0, X1; - int XMin, XMax; - int XMinY, XMaxY; + int InsideStart, InsideEnd, EdgeCovL, EdgeCovR; - int Variant; + int XRecip; - uint Attr; + uint Flags; - float TextureLayer; + int Z0, Z1, W0, W1; + int ColorR0, ColorG0, ColorB0; + int ColorR1, ColorG1, ColorB1; + int TexcoordU0, TexcoordV0; + int TexcoordU1, TexcoordV1; + + int CovLInitial, CovRInitial; }; +#if defined(Rasterise) +int CalcYFactorX(XSpanSetup span, int x) +{ + x -= span.X0; + + if (span.X0 != span.X1) + { + uint numLo = uint(x) * uint(span.W0); + uint numHi = 0U; + numHi |= numLo >> (32U-YFactorShift); + numLo <<= YFactorShift; + + uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1); + + if (den == 0) + return 0; + else + return int(Div64_32_32(numHi, numLo, den)); + } + else + { + return 0; + } +} +#endif + +layout (std430, binding = 1) buffer XSpanSetupsBuffer +{ + XSpanSetup XSpanSetups[]; +}; + +)"}; + +const std::string 
YSpanSetupBuffer{R"( + struct YSpanSetup { // Attributes @@ -113,53 +159,185 @@ struct YSpanSetup bool IsDummy; }; -const uint XSpanSetup_Linear = 1U << 0; -const uint XSpanSetup_FillInside = 1U << 1; -const uint XSpanSetup_FillLeft = 1U << 2; -const uint XSpanSetup_FillRight = 1U << 3; - -struct XSpanSetup +#if defined(InterpSpans) +int CalcYFactorY(YSpanSetup span, int i) { - int X0, X1; + /* + maybe it would be better to do use a 32x32=64 multiplication? + */ + uint numLo = uint(abs(i)) * uint(span.W0n); + uint numHi = 0U; + numHi |= numLo >> (32U-YFactorShift); + numLo <<= YFactorShift; - int InsideStart, InsideEnd, EdgeCovL, EdgeCovR; + uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d; - int XRecip; + if (den == 0) + { + return 0; + } + else + { + return int(Div64_32_32(numHi, numLo, den)); + } +} - uint Flags; +int CalculateDx(int y, YSpanSetup span) +{ + return span.DxInitial + (y - span.Y0) * span.Increment; +} - int Z0, Z1, W0, W1; - int ColorR0, ColorG0, ColorB0; - int ColorR1, ColorG1, ColorB1; - int TexcoordU0, TexcoordV0; - int TexcoordU1, TexcoordV1; +int CalculateX(int dx, YSpanSetup span) +{ + int x = span.X0; + if (span.X1 < span.X0) + x -= dx >> 18; + else + x += dx >> 18; + return clamp(x, span.XMin, span.XMax); +} - int CovLInitial, CovRInitial; -}; +void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) +{ + bool negative = span.X1 < span.X0; + int len; + if (side != negative) + len = (dx >> 18) - ((dx-span.Increment) >> 18); + else + len = ((dx+span.Increment) >> 18) - (dx >> 18); + edgelen = len; -layout (std140, binding = 0) readonly buffer YSpanSetupsBuffer + int xlen = span.XMax + 1 - span.XMin; + int startx = dx >> 18; + if (negative) startx = xlen - startx; + if (side) startx = startx - len + 1; + + uint r; + int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r)); + edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | 
(span.XCovIncr & 0x3FF); +} + +void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) +{ + bool negative = span.X1 < span.X0; + edgelen = 1; + + if (span.Increment == 0) + { + edgecov = 31; + } + else + { + int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4; + if ((cov >> 5) != (dx >> 18)) cov = 31; + cov &= 0x1F; + if (side == negative) cov = 0x1F - cov; + + edgecov = cov; + } +} +#endif + +layout (std430, binding = 2) buffer YSpanSetupsBuffer { YSpanSetup YSpanSetups[]; }; -#if defined(InterpSpans) || defined(BinCombined) || defined(Rasterise) -layout (std140, binding = 1) -#ifdef InterpSpans -writeonly -#endif -#if defined(BinCombined) || defined(Rasterise) -readonly -#endif -buffer XSpanSetupsBuffer -{ - XSpanSetup XSpanSetups[]; -}; -#endif +)"}; -layout (std140, binding = 2) readonly buffer PolygonBuffer +const std::string PolygonBuffer{R"( +struct Polygon +{ + int FirstXSpan; + int YTop, YBot; + + int XMin, XMax; + int XMinY, XMaxY; + + int Variant; + + uint Attr; + + float TextureLayer; +}; + +layout (std430, binding = 0) readonly buffer PolygonBuffer { Polygon Polygons[]; }; +)"}; + +const std::string BinningBuffer{R"( + +layout (std430, binding = 6) buffer BinResultBuffer +{ + uvec4 VariantWorkCount[MaxVariants]; + uint SortedWorkOffset[MaxVariants]; + + uvec4 SortWorkWorkCount; + + uint BinningMaskAndOffset[]; + //uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride]; + //uint BinnedMask[TilesPerLine*TileLines*BinStride]; + //uint WorkOffsets[TilesPerLine*TileLines*BinStride]; +}; + +const int BinningCoarseMaskStart = 0; +const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride; +const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride; + +)"}; + +/* + structure of each WorkDesc item: + y: + bits 0-11: polygon idx + bits 12-31: tile idx (before sorting: index within the variant; after sorting: index among all tiles) + x: + bits 0-15: X position on screen + bits
16-31: Y position on screen +*/ +const std::string WorkDescBuffer{R"( +layout (std430, binding = 7) buffer WorkDescBuffer +{ + //uvec2 UnsortedWorkDescs[MaxWorkTiles]; + //uvec2 SortedWorkDescs[MaxWorkTiles]; + uvec2 WorkDescs[]; +}; + +const uint WorkDescsUnsortedStart = 0; +const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles; + +)"}; + +const std::string Tilebuffers{R"( +layout (std430, binding = 2) buffer ColorTileBuffer +{ + uint ColorTiles[]; +}; +layout (std430, binding = 3) buffer DepthTileBuffer +{ + uint DepthTiles[]; +}; +layout (std430, binding = 4) buffer AttrTileBuffer +{ + uint AttrTiles[]; +}; + +)"}; + +const std::string ResultBuffer{R"( +layout (std430, binding = 5) buffer ResultBuffer +{ + uint ResultValue[]; +}; + +const uint ResultColorStart = 0; +const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2; +const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2; +)"}; + +const char* Common = R"( #define TileSize 8 const int CoarseTileCountX = 8; @@ -174,56 +352,8 @@ const int TileLines = ScreenHeight/TileSize; const int BinStride = 2048/32; const int CoarseBinStride = BinStride/32; - const int MaxVariants = 256; -layout (std430, binding = 3) -buffer BinResultBuffer -{ - uvec4 VariantWorkCount[MaxVariants]; - uint SortedWorkOffset[MaxVariants]; - - uvec4 SortWorkWorkCount; - uvec2 UnsortedWorkDescs[MaxWorkTiles]; - uvec2 SortedWork[MaxWorkTiles]; - - uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride]; - uint BinnedMask[TilesPerLine*TileLines*BinStride]; - uint WorkOffsets[TilesPerLine*TileLines*BinStride]; -}; - -#if defined(Rasterise) || defined(DepthBlend) -layout (std430, binding = 4) -#ifdef Rasterise -writeonly -#endif -#ifdef DepthBlend -readonly -#endif -buffer TilesBuffer -{ - uint ColorTiles[MaxWorkTiles*TileSize*TileSize]; - uint DepthTiles[MaxWorkTiles*TileSize*TileSize]; - uint AttrTiles[MaxWorkTiles*TileSize*TileSize]; -}; -#endif - -#if defined(DepthBlend) ||
defined(FinalPass) -layout (std430, binding = 5) -#ifdef DepthBlend -writeonly -#endif -#ifdef FinalPass -readonly -#endif -buffer RasterResult -{ - uint ColorResult[ScreenWidth*ScreenHeight*2]; - uint DepthResult[ScreenWidth*ScreenHeight*2]; - uint AttrResult[ScreenWidth*ScreenHeight*2]; -}; -#endif - layout (std140, binding = 0) uniform MetaUniform { uint NumPolygons; @@ -243,6 +373,12 @@ layout (std140, binding = 0) uniform MetaUniform uint FogOffset, FogShift, FogColor; }; +#ifdef InterpSpans +const int YFactorShift = 9; +#else +const int YFactorShift = 8; +#endif + #if defined(InterpSpans) || defined(Rasterise) uint Umulh(uint a, uint b) { @@ -338,58 +474,6 @@ uint Div64_32_32(uint numHi, uint numLo, uint den) return bitfieldInsert(qhat, q1, 16, 16); } -#ifdef InterpSpans -const int YFactorShift = 9; -#else -const int YFactorShift = 8; -#endif - -int CalcYFactorY(YSpanSetup span, int i) -{ - /* - maybe it would be better to do use a 32x32=64 multiplication? - */ - uint numLo = uint(abs(i)) * uint(span.W0n); - uint numHi = 0U; - numHi |= numLo >> (32U-YFactorShift); - numLo <<= YFactorShift; - - uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d; - - if (den == 0) - { - return 0; - } - else - { - return int(Div64_32_32(numHi, numLo, den)); - } -} - -int CalcYFactorX(XSpanSetup span, int x) -{ - x -= span.X0; - - if (span.X0 != span.X1) - { - uint numLo = uint(x) * uint(span.W0); - uint numHi = 0U; - numHi |= numLo >> (32U-YFactorShift); - numLo <<= YFactorShift; - - uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1); - - if (den == 0) - return 0; - else - return int(Div64_32_32(numHi, numLo, den)); - } - else - { - return 0; - } -} - int InterpolateAttrPersp(int y0, int y1, int ifactor) { if (y0 == y1) @@ -548,67 +632,14 @@ uint InterpolateZWBuffer(int z0, int z1, int ifactor) return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<> YFactorShift); }*/ } - -int CalculateDx(int y, YSpanSetup span) -{ 
- return span.DxInitial + (y - span.Y0) * span.Increment; -} - -int CalculateX(int dx, YSpanSetup span) -{ - int x = span.X0; - if (span.X1 < span.X0) - x -= dx >> 18; - else - x += dx >> 18; - return clamp(x, span.XMin, span.XMax); -} - -void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) -{ - bool negative = span.X1 < span.X0; - int len; - if (side != negative) - len = (dx >> 18) - ((dx-span.Increment) >> 18); - else - len = ((dx+span.Increment) >> 18) - (dx >> 18); - edgelen = len; - - int xlen = span.XMax + 1 - span.XMin; - int startx = dx >> 18; - if (negative) startx = xlen - startx; - if (side) startx = startx - len + 1; - - uint r; - int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r)); - edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF); -} - -void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) -{ - bool negative = span.X1 < span.X0; - edgelen = 1; - - if (span.Increment == 0) - { - edgecov = 31; - } - else - { - int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4; - if ((cov >> 5) != (dx >> 18)) cov = 31; - cov &= 0x1F; - if (side == negative) cov = 0x1F - cov; - - edgecov = cov; - } -} #endif )"; -const char* InterpSpans = R"( - +const std::string InterpSpans = + PolygonBuffer + + XSpanSetupBuffer + + YSpanSetupBuffer + R"( layout (local_size_x = 32) in; layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices; @@ -803,7 +834,8 @@ void main() )"; -const char* ClearIndirectWorkCount = R"( +const std::string ClearIndirectWorkCount = + BinningBuffer + R"( layout (local_size_x = 32) in; @@ -814,19 +846,23 @@ void main() )"; -const char* ClearCoarseBinMask = R"( - +const std::string ClearCoarseBinMask = + BinningBuffer + R"( layout (local_size_x = 32) in; void main() { - BinnedMaskCoarse[gl_GlobalInvocationID.x*CoarseBinStride+0] = 0; - BinnedMaskCoarse[gl_GlobalInvocationID.x*CoarseBinStride+1] = 0; 
+ BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+0] = 0; + BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+1] = 0; } )"; -const char* BinCombined = R"( +const std::string BinCombined = + PolygonBuffer + + BinningBuffer + + XSpanSetupBuffer + + WorkDescBuffer + R"( layout (local_size_x = 32) in; @@ -942,15 +978,15 @@ void main() int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY; - BinnedMask[linearTile * BinStride + groupIdx] = binnedMask; + BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask; int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5); if (binnedMask != 0U) - atomicOr(BinnedMaskCoarse[coarseMaskIdx], 1U << (groupIdx & 0x1F)); + atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F)); if (binnedMask != 0U) { uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask))); - WorkOffsets[linearTile * BinStride + groupIdx] = workOffset; + BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset; uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16); @@ -964,7 +1000,7 @@ void main() int variantIdx = Polygons[polygonIdx].Variant; int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1)); - UnsortedWorkDescs[workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(inVariantOffset, polygonIdx, 16, 16)); + WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 12, 20)); idx++; } @@ -973,7 +1009,8 @@ void main() )"; -const char* CalcOffsets = R"( +const std::string CalcOffsets = + BinningBuffer + R"( layout (local_size_x = 32) in; @@ -993,7 +1030,10 @@ void main() )"; -const char* SortWork = R"( +const std::string SortWork = + 
PolygonBuffer + + BinningBuffer + + WorkDescBuffer + R"( layout (local_size_x = 32) in; @@ -1001,19 +1041,24 @@ void main() { if (gl_GlobalInvocationID.x < VariantWorkCount[0].w) { - uvec2 workDesc = UnsortedWorkDescs[gl_GlobalInvocationID.x]; - int inVariantOffset = int(bitfieldExtract(workDesc.y, 0, 16)); - int polygonIdx = int(bitfieldExtract(workDesc.y, 16, 16)); + uvec2 workDesc = WorkDescs[WorkDescsUnsortedStart + gl_GlobalInvocationID.x]; + int inVariantOffset = int(bitfieldExtract(workDesc.y, 12, 20)); + int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 12)); int variantIdx = Polygons[polygonIdx].Variant; int sortedIndex = int(SortedWorkOffset[variantIdx]) + inVariantOffset; - SortedWork[sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 0, 16)); + WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 12, 20)); } } )"; -const char* Rasterise = R"( +const std::string Rasterise = + PolygonBuffer + + WorkDescBuffer + + XSpanSetupBuffer + + BinningBuffer + + Tilebuffers + R"( layout (local_size_x = TileSize, local_size_y = TileSize) in; @@ -1024,10 +1069,10 @@ layout (location = 1) uniform vec2 InvTextureSize; void main() { - uvec2 workDesc = SortedWork[SortedWorkOffset[CurVariant] + gl_WorkGroupID.z]; - Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 16, 16)]; + uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z]; + Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 12)]; ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy); - int tileOffset = int(bitfieldExtract(workDesc.y, 0, 16)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x); + int tileOffset = int(bitfieldExtract(workDesc.y, 12, 20)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x); 
uint color = 0U; if (position.y >= polygon.YTop && position.y < polygon.YBot) @@ -1203,7 +1248,11 @@ void main() )"; -const char* DepthBlend = R"( +const std::string DepthBlend = + PolygonBuffer + + Tilebuffers + + ResultBuffer + + BinningBuffer + R"( layout (local_size_x = TileSize, local_size_y = TileSize) in; @@ -1253,8 +1302,8 @@ void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset, uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset; - uint fineMask = BinnedMask[tileOffset]; - uint workIdx = WorkOffsets[tileOffset]; + uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset]; + uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset]; while (fineMask != 0U) { @@ -1403,8 +1452,8 @@ void main() { int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine)); - uint coarseMaskLo = BinnedMaskCoarse[linearTile*CoarseBinStride + 0]; - uint coarseMaskHi = BinnedMaskCoarse[linearTile*CoarseBinStride + 1]; + uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0]; + uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1]; uvec2 color = uvec2(ClearColor, 0U); uvec2 depth = uvec2(ClearDepth, 0U); @@ -1416,17 +1465,18 @@ void main() ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask); int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth; - ColorResult[resultOffset] = color.x; - ColorResult[resultOffset+FramebufferStride] = color.y; - DepthResult[resultOffset] = depth.x; - DepthResult[resultOffset+FramebufferStride] = depth.y; - AttrResult[resultOffset] = attr.x; - AttrResult[resultOffset+FramebufferStride] = attr.y; + ResultValue[ResultColorStart+resultOffset] = color.x; + ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y; + ResultValue[ResultDepthStart+resultOffset] = depth.x; + 
ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y; + ResultValue[ResultAttrStart+resultOffset] = attr.x; + ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y; } )"; -const char* FinalPass = R"( +const std::string FinalPass = + ResultBuffer + R"( layout (local_size_x = 32) in; @@ -1481,9 +1531,9 @@ void main() int srcX = int(gl_GlobalInvocationID.x); int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth; - uvec2 color = uvec2(ColorResult[resultOffset], ColorResult[resultOffset+FramebufferStride]); - uvec2 depth = uvec2(DepthResult[resultOffset], DepthResult[resultOffset+FramebufferStride]); - uvec2 attr = uvec2(AttrResult[resultOffset], AttrResult[resultOffset+FramebufferStride]); + uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]); + uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]); + uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]); #ifdef EdgeMarking if ((attr.x & 0xFU) != 0U) @@ -1493,23 +1543,23 @@ void main() if (srcX > 0U) { - otherAttr.x = AttrResult[resultOffset-1]; - otherDepth.x = DepthResult[resultOffset-1]; + otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart]; + otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart]; } if (srcX < ScreenWidth-1) { - otherAttr.y = AttrResult[resultOffset+1]; - otherDepth.y = DepthResult[resultOffset+1]; + otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart]; + otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart]; } if (gl_GlobalInvocationID.y > 0U) { - otherAttr.z = AttrResult[resultOffset-ScreenWidth]; - otherDepth.z = DepthResult[resultOffset-ScreenWidth]; + otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart]; + otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart]; } if 
(gl_GlobalInvocationID.y < ScreenHeight-1) { - otherAttr.w = AttrResult[resultOffset+ScreenWidth]; - otherDepth.w = DepthResult[resultOffset+ScreenWidth]; + otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart]; + otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart]; } uint polyId = bitfieldExtract(attr.x, 24, 6); diff --git a/src/GPU3D_OpenGL.cpp b/src/GPU3D_OpenGL.cpp index 6d7a021b..825a4711 100644 --- a/src/GPU3D_OpenGL.cpp +++ b/src/GPU3D_OpenGL.cpp @@ -28,26 +28,22 @@ namespace GPU3D { -bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs) +bool GLRenderer::BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs) { char shadername[32]; sprintf(shadername, "RenderShader%02X", flags); int headerlen = strlen(kShaderHeader); - int vslen = strlen(vs); - int vsclen = strlen(kRenderVSCommon); - char* vsbuf = new char[headerlen + vsclen + vslen + 1]; - strcpy(&vsbuf[0], kShaderHeader); - strcpy(&vsbuf[headerlen], kRenderVSCommon); - strcpy(&vsbuf[headerlen + vsclen], vs); + std::string vsbuf; + vsbuf += kShaderHeader; + vsbuf += kRenderVSCommon; + vsbuf += vs; - int fslen = strlen(fs); - int fsclen = strlen(kRenderFSCommon); - char* fsbuf = new char[headerlen + fsclen + fslen + 1]; - strcpy(&fsbuf[0], kShaderHeader); - strcpy(&fsbuf[headerlen], kRenderFSCommon); - strcpy(&fsbuf[headerlen + fsclen], fs); + std::string fsbuf; + fsbuf += kShaderHeader; + fsbuf += kRenderFSCommon; + fsbuf += fs; GLuint prog; bool ret = OpenGL::CompileVertexFragmentProgram(prog, @@ -56,9 +52,6 @@ bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs) {{"vPosition", 0}, {"vColor", 1}, {"vTexcoord", 2}, {"vPolygonAttr", 3}}, {{"oColor", 0}, {"oAttr", 1}}); - delete[] vsbuf; - delete[] fsbuf; - if (!ret) return false; GLint uni_id = glGetUniformBlockIndex(prog, "uConfig"); diff --git a/src/GPU3D_OpenGL.h b/src/GPU3D_OpenGL.h index 9505f50e..9337d44a 100644 --- a/src/GPU3D_OpenGL.h +++ 
b/src/GPU3D_OpenGL.h @@ -67,7 +67,7 @@ private: RendererPolygon PolygonList[2048] {}; - bool BuildRenderShader(u32 flags, const char* vs, const char* fs); + bool BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs); void UseRenderShader(u32 flags); void SetupPolygon(RendererPolygon* rp, Polygon* polygon); u32* SetupVertex(Polygon* poly, int vid, Vertex* vtx, u32 vtxattr, u32* vptr); diff --git a/src/OpenGLSupport.cpp b/src/OpenGLSupport.cpp index 5b3df9e3..e94dd3f8 100644 --- a/src/OpenGLSupport.cpp +++ b/src/OpenGLSupport.cpp @@ -18,15 +18,174 @@ #include "OpenGLSupport.h" +#include +#include + +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + using Platform::Log; using Platform::LogLevel; namespace OpenGL { -bool CompilerShader(GLuint& id, const char* source, const char* name, const char* type) +struct ShaderCacheEntry +{ + u32 Length; + u8* Data; + u32 BinaryFormat; + + ShaderCacheEntry(u8* data, u32 length, u32 binaryFmt) + : Length(length), Data(data), BinaryFormat(binaryFmt) + { + assert(data != nullptr); + } + + ShaderCacheEntry(const ShaderCacheEntry&) = delete; + ShaderCacheEntry(ShaderCacheEntry&& other) + { + Data = other.Data; + Length = other.Length; + BinaryFormat = other.BinaryFormat; + + other.Data = nullptr; + other.Length = 0; + other.BinaryFormat = 0; + } + + ~ShaderCacheEntry() + { + if (Data) // check whether it was moved + delete[] Data; + } +}; + +std::unordered_map ShaderCache; +std::vector NewShaders; + +constexpr u32 ShaderCacheMagic = 0x11CAC4E1; +constexpr u32 ShaderCacheVersion = 1; + +void LoadShaderCache() +{ + // for now the shader cache only contains only compute shaders + // because they take the longest to compile + FILE* file = Platform::OpenLocalFile("shadercache", "rb"); + if (file == nullptr) + { + Log(LogLevel::Error, "Could not find shader cache\n"); + return; + } + + u32 magic, version, numPrograms; + if (fread(&magic, 4, 1, file) != 1 || magic != ShaderCacheMagic) + { + 
Log(LogLevel::Error, "Shader cache file has invalid magic\n"); + goto fileInvalid; + } + + if (fread(&version, 4, 1, file) != 1 || version != ShaderCacheVersion) + { + Log(LogLevel::Error, "Shader cache file has bad version\n"); + goto fileInvalid; + } + + if (fread(&numPrograms, 4, 1, file) != 1) + { + Log(LogLevel::Error, "Shader cache file invalid program count\n"); + goto fileInvalid; + } + + // not the best approach, because once changes pile up + // we read and overwrite the old files + for (u32 i = 0; i < numPrograms; i++) + { + int error = 3; + + u32 length, binaryFormat; + u64 sourceHash; + error -= fread(&sourceHash, 8, 1, file); + error -= fread(&length, 4, 1, file); + error -= fread(&binaryFormat, 4, 1, file); + + if (error != 0) + { + Log(LogLevel::Error, "Invalid shader cache entry\n"); + goto fileInvalid; + } + + u8* data = new u8[length]; + if (fread(data, length, 1, file) != 1) + { + Log(LogLevel::Error, "Could not read shader cache entry data\n"); + delete[] data; + goto fileInvalid; + } + + ShaderCache.erase(sourceHash); + ShaderCache.emplace(sourceHash, ShaderCacheEntry(data, length, binaryFormat)); + } + +fileInvalid: + fclose(file); +} + +void SaveShaderCache() +{ + FILE* file = Platform::OpenLocalFile("shadercache", "rb+"); + if (file == nullptr) + file = Platform::OpenLocalFile("shadercache", "wb"); + + if (file == nullptr) + { + Log(LogLevel::Error, "Could not open or create shader cache file\n"); + return; + } + + int written = 3; + u32 magic = ShaderCacheMagic, version = ShaderCacheVersion, numPrograms = ShaderCache.size(); + written -= fwrite(&magic, 4, 1, file); + written -= fwrite(&version, 4, 1, file); + written -= fwrite(&numPrograms, 4, 1, file); + + if (written != 0) + { + Log(LogLevel::Error, "Could not write shader cache header\n"); + goto writeError; + } + + fseek(file, 0, SEEK_END); + + printf("new shaders %d\n", NewShaders.size()); + + for (u64 newShader : NewShaders) + { + int error = 4; + auto it = 
ShaderCache.find(newShader); + + error -= fwrite(&it->first, 8, 1, file); + error -= fwrite(&it->second.Length, 4, 1, file); + error -= fwrite(&it->second.BinaryFormat, 4, 1, file); + error -= fwrite(it->second.Data, it->second.Length, 1, file); + + if (error != 0) + { + Log(LogLevel::Error, "Could not insert new shader cache entry\n"); + goto writeError; + } + } + +writeError: + fclose(file); + + NewShaders.clear(); +} + +bool CompilerShader(GLuint& id, const std::string& source, const std::string& name, const std::string& type) { - int len; int res; if (!glCreateShader) @@ -35,8 +194,10 @@ bool CompilerShader(GLuint& id, const char* source, const char* name, const char return false; } - len = strlen(source); - glShaderSource(id, 1, &source, &len); + const char* sourceC = source.c_str(); + int len = source.length(); + glShaderSource(id, 1, &sourceC, &len); + glCompileShader(id); glGetShaderiv(id, GL_COMPILE_STATUS, &res); @@ -46,8 +207,8 @@ bool CompilerShader(GLuint& id, const char* source, const char* name, const char if (res < 1) res = 1024; char* log = new char[res+1]; glGetShaderInfoLog(id, res+1, NULL, log); - Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type, name, log); - Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source); + Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type.c_str(), name.c_str(), log); + Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source.c_str()); delete[] log; return false; @@ -92,8 +253,29 @@ bool LinkProgram(GLuint& result, GLuint* ids, int numIds) return true; } -bool CompileComputeProgram(GLuint& result, const char* source, const char* name) +bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name) { + result = glCreateProgram(); + + /*u64 sourceHash = XXH64(source.data(), source.size(), 0); + auto it = ShaderCache.find(sourceHash); + if (it != ShaderCache.end()) + { + glProgramBinary(result, it->second.BinaryFormat, 
it->second.Data, it->second.Length); + + GLint linkStatus; + glGetProgramiv(result, GL_LINK_STATUS, &linkStatus); + if (linkStatus == GL_TRUE) + { + Log(LogLevel::Info, "Restored shader %s from cache\n", name.c_str()); + return true; + } + else + { + } + }*/ + Log(LogLevel::Error, "Shader %s from cache was rejected\n", name.c_str()); + GLuint shader = glCreateShader(GL_COMPUTE_SHADER); bool linkingSucess = false; if (glDeleteProgram) @@ -101,9 +283,6 @@ bool CompileComputeProgram(GLuint& result, const char* source, const char* name) goto error; } - result = glCreateProgram(); - - printf("compiling %s", name); if (!CompilerShader(shader, source, name, "compute")) goto error; @@ -113,14 +292,28 @@ error: glDeleteShader(shader); if (!linkingSucess) + { glDeleteProgram(result); + } + /*else + { + GLint length; + GLenum format; + glGetProgramiv(result, GL_PROGRAM_BINARY_LENGTH, &length); + + u8* buffer = new u8[length]; + glGetProgramBinary(result, length, nullptr, &format, buffer); + + ShaderCache.emplace(sourceHash, ShaderCacheEntry(buffer, length, format)); + NewShaders.push_back(sourceHash); + }*/ return linkingSucess; } bool CompileVertexFragmentProgram(GLuint& result, - const char* vs, const char* fs, - const char* name, + const std::string& vs, const std::string& fs, + const std::string& name, const std::initializer_list& vertexInAttrs, const std::initializer_list& fragmentOutAttrs) { diff --git a/src/OpenGLSupport.h b/src/OpenGLSupport.h index 7d8aae44..ec2cb1f1 100644 --- a/src/OpenGLSupport.h +++ b/src/OpenGLSupport.h @@ -29,19 +29,23 @@ namespace OpenGL { +void LoadShaderCache(); +void SaveShaderCache(); + struct AttributeTarget { const char* Name; u32 Location; }; + bool CompileVertexFragmentProgram(GLuint& result, - const char* vs, const char* fs, - const char* name, + const std::string& vs, const std::string& fs, + const std::string& name, const std::initializer_list& vertexInAttrs, const std::initializer_list& fragmentOutAttrs); -bool 
CompileComputeProgram(GLuint& result, const char* source, const char* name); +bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name); }