Compute shader renderer (#2041)

* nothing works yet

* don't double buffer 3D framebuffers for the GL Renderer
looks like leftovers from when 3D+2D composition was done in the frontend

* oops

* it works!

* implement display capture for compute renderer
it's actually just all stolen from the regular OpenGL renderer

* fix bad indirect call

* handle cleanup properly

* add hires rendering to the compute shader renderer

* fix UB
also misc changes to use more unsigned multiplication
also fix framebuffer resize

* correct edge filling behaviour when AA is disabled

* fix full color textures

* fix edge marking (polygon id is 6-bit not 5)
also make the code a bit nicer

* take all edge cases into account for XMin/XMax calculation

* use hires coordinate again

* stop using fixed size buffers based on scale factor in shaders
this makes shader compile times tolerable on Wintel
- beginning of the shader cache
- increase size of tile idx in workdesc to 20 bits

* apparently & is not defined on bvec4
why does this even compile on Intel and Nvidia?

* put the texture cache into its own file

* add compute shader renderer properly to the GUI
also add option to toggle using high resolution vertex coordinates

* unbind sampler object in compute shader renderer

* fix GetRangedBitMask for 64 bit aligned 64 bits
pretty embarrassing

* convert NonStupidBitfield.h back to LF only new lines

* actually adapt to latest changes

* fix stupid merge

* actually make compute shader renderer work with newest changes

* show progress on shader compilation

* remove merge leftover
This commit is contained in:
RSDuck 2024-05-13 17:17:39 +02:00 committed by GitHub
parent c85a2103bb
commit 043244a56d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
35 changed files with 4389 additions and 382 deletions

View File

@ -35,6 +35,8 @@ add_library(core STATIC
GPU2D_Soft.cpp GPU2D_Soft.cpp
GPU3D.cpp GPU3D.cpp
GPU3D_Soft.cpp GPU3D_Soft.cpp
GPU3D_Texcache.cpp
GPU3D_Texcache.h
melonDLDI.h melonDLDI.h
NDS.cpp NDS.cpp
NDSCart.cpp NDSCart.cpp
@ -78,6 +80,9 @@ if (ENABLE_OGLRENDERER)
GPU_OpenGL.cpp GPU_OpenGL.cpp
GPU_OpenGL_shaders.h GPU_OpenGL_shaders.h
GPU3D_OpenGL.cpp GPU3D_OpenGL.cpp
GPU3D_Compute.cpp
GPU3D_TexcacheOpenGL.cpp
GPU3D_TexcacheOpenGL.h
GPU3D_OpenGL_shaders.h GPU3D_OpenGL_shaders.h
OpenGLSupport.cpp) OpenGLSupport.cpp)

View File

@ -21,6 +21,7 @@
#include "DSi.h" #include "DSi.h"
#include "DMA.h" #include "DMA.h"
#include "GPU.h" #include "GPU.h"
#include "GPU3D.h"
#include "DMA_Timings.h" #include "DMA_Timings.h"
#include "Platform.h" #include "Platform.h"

View File

@ -22,6 +22,7 @@
#include "DSi_NDMA.h" #include "DSi_NDMA.h"
#include "GPU.h" #include "GPU.h"
#include "DSi_AES.h" #include "DSi_AES.h"
#include "GPU3D.h"
namespace melonDS namespace melonDS
{ {

View File

@ -23,7 +23,7 @@
#include "ARMJIT.h" #include "ARMJIT.h"
#include "GPU2D_Soft.h" #include "GPU2D_Soft.h"
#include "GPU3D_Soft.h" #include "GPU3D.h"
namespace melonDS namespace melonDS
{ {

View File

@ -20,6 +20,7 @@
#include <string.h> #include <string.h>
#include "NDS.h" #include "NDS.h"
#include "GPU.h" #include "GPU.h"
#include "GPU3D.h"
namespace melonDS namespace melonDS
{ {

View File

@ -18,7 +18,7 @@
#include "GPU2D_Soft.h" #include "GPU2D_Soft.h"
#include "GPU.h" #include "GPU.h"
#include "GPU3D_OpenGL.h" #include "GPU3D.h"
namespace melonDS namespace melonDS
{ {

View File

@ -24,6 +24,7 @@
#include "FIFO.h" #include "FIFO.h"
#include "GPU3D_Soft.h" #include "GPU3D_Soft.h"
#include "Platform.h" #include "Platform.h"
#include "GPU3D.h"
namespace melonDS namespace melonDS
{ {

View File

@ -349,7 +349,14 @@ public:
virtual void RestartFrame(GPU& gpu) {}; virtual void RestartFrame(GPU& gpu) {};
virtual u32* GetLine(int line) = 0; virtual u32* GetLine(int line) = 0;
virtual void Blit(const GPU& gpu) {}; virtual void Blit(const GPU& gpu) {};
virtual void SetupAccelFrame() {}
virtual void PrepareCaptureFrame() {} virtual void PrepareCaptureFrame() {}
virtual void BindOutputTexture(int buffer) {}
virtual bool NeedsShaderCompile() { return false; }
virtual void ShaderCompileStep(int& current, int& count) {}
protected: protected:
Renderer3D(bool Accelerated); Renderer3D(bool Accelerated);
}; };

1136
src/GPU3D_Compute.cpp Normal file

File diff suppressed because it is too large Load Diff

242
src/GPU3D_Compute.h Normal file
View File

@ -0,0 +1,242 @@
/*
Copyright 2016-2022 melonDS team
This file is part of melonDS.
melonDS is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with melonDS. If not, see http://www.gnu.org/licenses/.
*/
#ifndef GPU3D_COMPUTE
#define GPU3D_COMPUTE
#include <memory>
#include "types.h"
#include "GPU3D.h"
#include "OpenGLSupport.h"
#include "GPU_OpenGL.h"
#include "GPU3D_TexcacheOpenGL.h"
#include "NonStupidBitfield.h"
namespace melonDS
{
/// 3D renderer built on OpenGL compute shaders.
///
/// Instances are created through New(); the constructor is private.
/// All GLuint members are OpenGL object names owned by this renderer.
class ComputeRenderer : public Renderer3D
{
public:
    static std::unique_ptr<ComputeRenderer> New();
    ~ComputeRenderer() override;

    void Reset(GPU& gpu) override;

    /// Not part of the Renderer3D interface (no `override`): configures the
    /// scale factor and whether high resolution vertex coordinates are used.
    void SetRenderSettings(int scale, bool highResolutionCoordinates);

    void VCount144(GPU& gpu) override;
    void RenderFrame(GPU& gpu) override;
    void RestartFrame(GPU& gpu) override;
    u32* GetLine(int line) override;

    void SetupAccelFrame() override;
    void PrepareCaptureFrame() override;
    void BindOutputTexture(int buffer) override;

    void Blit(const GPU& gpu) override;
    void Stop(const GPU& gpu) override;

    /// Reports whether shader compilation is still pending.
    /// Returns true until ShaderStepIdx reaches 33.
    // NOTE(review): 33 is presumably the number of steps performed by
    // ShaderCompileStep() - confirm against GPU3D_Compute.cpp.
    bool NeedsShaderCompile() override { return ShaderStepIdx != 33; }
    void ShaderCompileStep(int& current, int& count) override;

private:
    ComputeRenderer(GLCompositor&& compositor);

    // Compute shader handles. The two-element arrays presumably hold two
    // variants of the same shader (TODO confirm what the index selects).
    GLuint ShaderInterpXSpans[2];
    GLuint ShaderBinCombined;
    GLuint ShaderDepthBlend[2];
    GLuint ShaderRasteriseNoTexture[2];
    GLuint ShaderRasteriseNoTextureToon[2];
    GLuint ShaderRasteriseNoTextureHighlight[2];
    GLuint ShaderRasteriseUseTextureDecal[2];
    GLuint ShaderRasteriseUseTextureModulate[2];
    GLuint ShaderRasteriseUseTextureToon[2];
    GLuint ShaderRasteriseUseTextureHighlight[2];
    GLuint ShaderRasteriseShadowMask[2];
    GLuint ShaderClearCoarseBinMask;
    GLuint ShaderClearIndirectWorkCount;
    GLuint ShaderCalculateWorkListOffset;
    GLuint ShaderSortWork;
    GLuint ShaderFinalPass[8];

    // GPU-side storage for the per-frame setup data declared below.
    GLuint YSpanIndicesTextureMemory;
    GLuint YSpanIndicesTexture;
    GLuint YSpanSetupMemory;
    GLuint XSpanSetupMemory;
    GLuint BinResultMemory;
    GLuint RenderPolygonMemory;
    GLuint WorkDescMemory;

    // Indices into TileMemory; tilememoryLayer_Num is the layer count.
    enum
    {
        tilememoryLayer_Color,
        tilememoryLayer_Depth,
        tilememoryLayer_Attr,
        tilememoryLayer_Num,
    };

    GLuint TileMemory[tilememoryLayer_Num];
    GLuint FinalTileMemory;

    // One zero-initialised 256 pixel line.
    u32 DummyLine[256] = {};

    // Setup data for one polygon edge (a span along Y).
    // NOTE(review): presumably mirrored by a struct in the compute shaders,
    // so field order and sizes must stay as they are - confirm.
    struct SpanSetupY
    {
        // Attributes
        s32 Z0, Z1, W0, W1;
        s32 ColorR0, ColorG0, ColorB0;
        s32 ColorR1, ColorG1, ColorB1;
        s32 TexcoordU0, TexcoordV0;
        s32 TexcoordU1, TexcoordV1;

        // Interpolator
        s32 I0, I1;
        s32 Linear;
        s32 IRecip;
        s32 W0n, W0d, W1d;

        // Slope
        s32 Increment;

        s32 X0, X1, Y0, Y1;
        s32 XMin, XMax;
        s32 DxInitial;

        s32 XCovIncr;

        u32 IsDummy;
    };

    // Setup data for one horizontal span (same layout caveat as SpanSetupY).
    struct SpanSetupX
    {
        s32 X0, X1;

        s32 EdgeLenL, EdgeLenR, EdgeCovL, EdgeCovR;

        s32 XRecip;

        u32 Flags;

        s32 Z0, Z1, W0, W1;
        s32 ColorR0, ColorG0, ColorB0;
        s32 ColorR1, ColorG1, ColorB1;
        s32 TexcoordU0, TexcoordV0;
        s32 TexcoordU1, TexcoordV1;

        s32 CovLInitial, CovRInitial;
    };

    // Ties a scanline (Y) of a polygon to its left and right Y spans.
    struct SetupIndices
    {
        u16 PolyIdx, SpanIdxL, SpanIdxR, Y;
    };

    // Per-polygon data (same layout caveat as SpanSetupY).
    struct RenderPolygon
    {
        u32 FirstXSpan;
        s32 YTop, YBot;

        s32 XMin, XMax;
        s32 XMinY, XMaxY;

        u32 Variant;
        u32 Attr;

        float TextureLayer;
    };

    // Tiling and binning constants.
    static constexpr int TileSize = 8;
    static constexpr int CoarseTileCountX = 8;
    static constexpr int CoarseTileCountY = 4;
    static constexpr int CoarseTileW = CoarseTileCountX * TileSize;
    static constexpr int CoarseTileH = CoarseTileCountY * TileSize;

    static constexpr int BinStride = 2048/32;
    static constexpr int CoarseBinStride = BinStride/32;

    static constexpr int MaxVariants = 256;

    static constexpr int UniformIdxCurVariant = 0;
    static constexpr int UniformIdxTextureSize = 1;

    static constexpr int MaxFullscreenLayers = 16;

    struct BinResultHeader
    {
        u32 VariantWorkCount[MaxVariants*4];
        u32 SortedWorkOffset[MaxVariants];

        u32 SortWorkWorkCount[4];
    };

    // constexpr for consistency with the other class constants above.
    static constexpr int MaxYSpanSetups = 6144*2;
    std::vector<SetupIndices> YSpanIndices;
    SpanSetupY YSpanSetups[MaxYSpanSetups];
    RenderPolygon RenderPolygons[2048];

    TexcacheOpenGL Texcache;

    // Frame-wide parameters (same layout caveat as SpanSetupY).
    struct MetaUniform
    {
        u32 NumPolygons;
        u32 NumVariants;

        u32 AlphaRef;
        u32 DispCnt;

        u32 ToonTable[4*34];

        u32 ClearColor, ClearDepth, ClearAttr;

        u32 FogOffset, FogShift, FogColor;
    };
    GLuint MetaUniformMemory;

    GLuint Samplers[9];

    GLuint Framebuffer = 0;
    GLuint LowResFramebuffer;
    GLuint PixelBuffer;

    // CPU-side copy of a 256x192 frame.
    u32 FramebufferCPU[256*192];

    int ScreenWidth, ScreenHeight;
    int TilesPerLine, TileLines;
    // Starts at -1, which is not a valid scale.
    int ScaleFactor = -1;
    int MaxWorkTiles;
    bool HiresCoordinates;

    GLCompositor CurGLCompositor;

    // Compared against 33 by NeedsShaderCompile().
    int ShaderStepIdx = 0;

    void DeleteShaders();

    void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to);
    void SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]);
    void SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]);

    bool CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines);
};
}
#endif

1665
src/GPU3D_Compute_shaders.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -28,46 +28,32 @@
namespace melonDS namespace melonDS
{ {
bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs) bool GLRenderer::BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs)
{ {
char shadername[32]; char shadername[32];
snprintf(shadername, sizeof(shadername), "RenderShader%02X", flags); snprintf(shadername, sizeof(shadername), "RenderShader%02X", flags);
int headerlen = strlen(kShaderHeader); int headerlen = strlen(kShaderHeader);
int vslen = strlen(vs); std::string vsbuf;
int vsclen = strlen(kRenderVSCommon); vsbuf += kShaderHeader;
char* vsbuf = new char[headerlen + vsclen + vslen + 1]; vsbuf += kRenderVSCommon;
strcpy(&vsbuf[0], kShaderHeader); vsbuf += vs;
strcpy(&vsbuf[headerlen], kRenderVSCommon);
strcpy(&vsbuf[headerlen + vsclen], vs);
int fslen = strlen(fs); std::string fsbuf;
int fsclen = strlen(kRenderFSCommon); fsbuf += kShaderHeader;
char* fsbuf = new char[headerlen + fsclen + fslen + 1]; fsbuf += kRenderFSCommon;
strcpy(&fsbuf[0], kShaderHeader); fsbuf += fs;
strcpy(&fsbuf[headerlen], kRenderFSCommon);
strcpy(&fsbuf[headerlen + fsclen], fs);
bool ret = OpenGL::BuildShaderProgram(vsbuf, fsbuf, RenderShader[flags], shadername); GLuint prog;
bool ret = OpenGL::CompileVertexFragmentProgram(prog,
delete[] vsbuf; vsbuf, fsbuf,
delete[] fsbuf; shadername,
{{"vPosition", 0}, {"vColor", 1}, {"vTexcoord", 2}, {"vPolygonAttr", 3}},
{{"oColor", 0}, {"oAttr", 1}});
if (!ret) return false; if (!ret) return false;
GLuint prog = RenderShader[flags][2];
glBindAttribLocation(prog, 0, "vPosition");
glBindAttribLocation(prog, 1, "vColor");
glBindAttribLocation(prog, 2, "vTexcoord");
glBindAttribLocation(prog, 3, "vPolygonAttr");
glBindFragDataLocation(prog, 0, "oColor");
glBindFragDataLocation(prog, 1, "oAttr");
if (!OpenGL::LinkShaderProgram(RenderShader[flags]))
return false;
GLint uni_id = glGetUniformBlockIndex(prog, "uConfig"); GLint uni_id = glGetUniformBlockIndex(prog, "uConfig");
glUniformBlockBinding(prog, uni_id, 0); glUniformBlockBinding(prog, uni_id, 0);
@ -78,13 +64,15 @@ bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
uni_id = glGetUniformLocation(prog, "TexPalMem"); uni_id = glGetUniformLocation(prog, "TexPalMem");
glUniform1i(uni_id, 1); glUniform1i(uni_id, 1);
RenderShader[flags] = prog;
return true; return true;
} }
void GLRenderer::UseRenderShader(u32 flags) void GLRenderer::UseRenderShader(u32 flags)
{ {
if (CurShaderID == flags) return; if (CurShaderID == flags) return;
glUseProgram(RenderShader[flags][2]); glUseProgram(RenderShader[flags]);
CurShaderID = flags; CurShaderID = flags;
} }
@ -125,21 +113,17 @@ std::unique_ptr<GLRenderer> GLRenderer::New() noexcept
glDepthRange(0, 1); glDepthRange(0, 1);
glClearDepth(1.0); glClearDepth(1.0);
if (!OpenGL::CompileVertexFragmentProgram(result->ClearShaderPlain,
if (!OpenGL::BuildShaderProgram(kClearVS, kClearFS, result->ClearShaderPlain, "ClearShader")) kClearVS, kClearFS,
"ClearShader",
{{"vPosition", 0}},
{{"oColor", 0}, {"oAttr", 1}}))
return nullptr; return nullptr;
glBindAttribLocation(result->ClearShaderPlain[2], 0, "vPosition"); result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain, "uColor");
glBindFragDataLocation(result->ClearShaderPlain[2], 0, "oColor"); result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain, "uDepth");
glBindFragDataLocation(result->ClearShaderPlain[2], 1, "oAttr"); result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain, "uOpaquePolyID");
result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain, "uFogFlag");
if (!OpenGL::LinkShaderProgram(result->ClearShaderPlain))
return nullptr;
result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain[2], "uColor");
result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain[2], "uDepth");
result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain[2], "uOpaquePolyID");
result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain[2], "uFogFlag");
memset(result->RenderShader, 0, sizeof(RenderShader)); memset(result->RenderShader, 0, sizeof(RenderShader));
@ -167,42 +151,35 @@ std::unique_ptr<GLRenderer> GLRenderer::New() noexcept
if (!result->BuildRenderShader(RenderFlag_ShadowMask | RenderFlag_WBuffer, kRenderVS_W, kRenderFS_WSM)) if (!result->BuildRenderShader(RenderFlag_ShadowMask | RenderFlag_WBuffer, kRenderVS_W, kRenderFS_WSM))
return nullptr; return nullptr;
if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassEdgeFS, result->FinalPassEdgeShader, "FinalPassEdgeShader")) if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassEdgeShader,
kFinalPassVS, kFinalPassEdgeFS,
"FinalPassEdgeShader",
{{"vPosition", 0}},
{{"oColor", 0}}))
return nullptr;
if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassFogShader,
kFinalPassVS, kFinalPassFogFS,
"FinalPassFogShader",
{{"vPosition", 0}},
{{"oColor", 0}}))
return nullptr; return nullptr;
if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassFogFS, result->FinalPassFogShader, "FinalPassFogShader")) GLuint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader, "uConfig");
return nullptr; glUniformBlockBinding(result->FinalPassEdgeShader, uni_id, 0);
glBindAttribLocation(result->FinalPassEdgeShader[2], 0, "vPosition"); glUseProgram(result->FinalPassEdgeShader);
glBindFragDataLocation(result->FinalPassEdgeShader[2], 0, "oColor"); uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "DepthBuffer");
if (!OpenGL::LinkShaderProgram(result->FinalPassEdgeShader))
return nullptr;
GLint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader[2], "uConfig");
glUniformBlockBinding(result->FinalPassEdgeShader[2], uni_id, 0);
glUseProgram(result->FinalPassEdgeShader[2]);
uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "DepthBuffer");
glUniform1i(uni_id, 0); glUniform1i(uni_id, 0);
uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "AttrBuffer"); uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "AttrBuffer");
glUniform1i(uni_id, 1); glUniform1i(uni_id, 1);
glBindAttribLocation(result->FinalPassFogShader[2], 0, "vPosition"); uni_id = glGetUniformBlockIndex(result->FinalPassFogShader, "uConfig");
glBindFragDataLocation(result->FinalPassFogShader[2], 0, "oColor"); glUniformBlockBinding(result->FinalPassFogShader, uni_id, 0);
if (!OpenGL::LinkShaderProgram(result->FinalPassFogShader)) glUseProgram(result->FinalPassFogShader);
return nullptr; uni_id = glGetUniformLocation(result->FinalPassFogShader, "DepthBuffer");
uni_id = glGetUniformBlockIndex(result->FinalPassFogShader[2], "uConfig");
glUniformBlockBinding(result->FinalPassFogShader[2], uni_id, 0);
glUseProgram(result->FinalPassFogShader[2]);
uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "DepthBuffer");
glUniform1i(uni_id, 0); glUniform1i(uni_id, 0);
uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "AttrBuffer"); uni_id = glGetUniformLocation(result->FinalPassFogShader, "AttrBuffer");
glUniform1i(uni_id, 1); glUniform1i(uni_id, 1);
@ -255,29 +232,26 @@ std::unique_ptr<GLRenderer> GLRenderer::New() noexcept
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, result->IndexBufferID); glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, result->IndexBufferID);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(IndexBuffer), nullptr, GL_DYNAMIC_DRAW); glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(IndexBuffer), nullptr, GL_DYNAMIC_DRAW);
glGenFramebuffers(4, &result->FramebufferID[0]); glGenFramebuffers(1, &result->MainFramebuffer);
glBindFramebuffer(GL_FRAMEBUFFER, result->FramebufferID[0]);
glGenTextures(8, &result->FramebufferTex[0]);
result->FrontBuffer = 0;
// color buffers // color buffers
SetupDefaultTexParams(result->FramebufferTex[0]); glGenTextures(1, &result->ColorBufferTex);
SetupDefaultTexParams(result->FramebufferTex[1]); SetupDefaultTexParams(result->ColorBufferTex);
// depth/stencil buffer // depth/stencil buffer
SetupDefaultTexParams(result->FramebufferTex[4]); glGenTextures(1, &result->DepthBufferTex);
SetupDefaultTexParams(result->FramebufferTex[6]); SetupDefaultTexParams(result->DepthBufferTex);
// attribute buffer // attribute buffer
// R: opaque polyID (for edgemarking) // R: opaque polyID (for edgemarking)
// G: edge flag // G: edge flag
// B: fog flag // B: fog flag
SetupDefaultTexParams(result->FramebufferTex[5]); glGenTextures(1, &result->AttrBufferTex);
SetupDefaultTexParams(result->FramebufferTex[7]); SetupDefaultTexParams(result->AttrBufferTex);
// downscale framebuffer for display capture (always 256x192) // downscale framebuffer for display capture (always 256x192)
SetupDefaultTexParams(result->FramebufferTex[3]); glGenTextures(1, &result->DownScaleBufferTex);
SetupDefaultTexParams(result->DownScaleBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 256, 192, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 256, 192, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glEnable(GL_BLEND); glEnable(GL_BLEND);
@ -315,8 +289,12 @@ GLRenderer::~GLRenderer()
glDeleteTextures(1, &TexMemID); glDeleteTextures(1, &TexMemID);
glDeleteTextures(1, &TexPalMemID); glDeleteTextures(1, &TexPalMemID);
glDeleteFramebuffers(4, &FramebufferID[0]); glDeleteFramebuffers(1, &MainFramebuffer);
glDeleteTextures(8, &FramebufferTex[0]); glDeleteFramebuffers(1, &DownscaleFramebuffer);
glDeleteTextures(1, &ColorBufferTex);
glDeleteTextures(1, &DepthBufferTex);
glDeleteTextures(1, &AttrBufferTex);
glDeleteTextures(1, &DownScaleBufferTex);
glDeleteVertexArrays(1, &VertexArrayID); glDeleteVertexArrays(1, &VertexArrayID);
glDeleteBuffers(1, &VertexBufferID); glDeleteBuffers(1, &VertexBufferID);
@ -327,8 +305,8 @@ GLRenderer::~GLRenderer()
for (int i = 0; i < 16; i++) for (int i = 0; i < 16; i++)
{ {
if (!RenderShader[i][2]) continue; if (!RenderShader[i]) continue;
OpenGL::DeleteShaderProgram(RenderShader[i]); glDeleteProgram(RenderShader[i]);
} }
} }
@ -361,40 +339,25 @@ void GLRenderer::SetRenderSettings(bool betterpolygons, int scale) noexcept
ScreenW = 256 * scale; ScreenW = 256 * scale;
ScreenH = 192 * scale; ScreenH = 192 * scale;
glBindTexture(GL_TEXTURE_2D, FramebufferTex[0]); glBindTexture(GL_TEXTURE_2D, ColorBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[1]);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[4]); glBindTexture(GL_TEXTURE_2D, DepthBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[5]); glBindTexture(GL_TEXTURE_2D, AttrBufferTex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[6]); glBindFramebuffer(GL_FRAMEBUFFER, DownscaleFramebuffer);
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, DownScaleBufferTex, 0);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[7]);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[3]);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[3], 0);
GLenum fbassign[2] = {GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1}; GLenum fbassign[2] = {GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1};
glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]); glBindFramebuffer(GL_FRAMEBUFFER, MainFramebuffer);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[0], 0); glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, ColorBufferTex, 0);
glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[4], 0); glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, DepthBufferTex, 0);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[5], 0); glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, AttrBufferTex, 0);
glDrawBuffers(2, fbassign); glDrawBuffers(2, fbassign);
glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[1]);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[1], 0);
glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[6], 0);
glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[7], 0);
glDrawBuffers(2, fbassign);
glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]);
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID); glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ); glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ);
@ -1103,9 +1066,9 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h)
glStencilMask(0); glStencilMask(0);
glActiveTexture(GL_TEXTURE0); glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 6 : 4]); glBindTexture(GL_TEXTURE_2D, DepthBufferTex);
glActiveTexture(GL_TEXTURE1); glActiveTexture(GL_TEXTURE1);
glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 7 : 5]); glBindTexture(GL_TEXTURE_2D, AttrBufferTex);
glBindBuffer(GL_ARRAY_BUFFER, ClearVertexBufferID); glBindBuffer(GL_ARRAY_BUFFER, ClearVertexBufferID);
glBindVertexArray(ClearVertexArrayID); glBindVertexArray(ClearVertexArrayID);
@ -1115,7 +1078,7 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h)
// edge marking // edge marking
// TODO: depth/polyid values at screen edges // TODO: depth/polyid values at screen edges
glUseProgram(FinalPassEdgeShader[2]); glUseProgram(FinalPassEdgeShader);
glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE); glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE);
@ -1126,7 +1089,7 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h)
{ {
// fog // fog
glUseProgram(FinalPassFogShader[2]); glUseProgram(FinalPassFogShader);
if (gpu3d.RenderDispCnt & (1<<6)) if (gpu3d.RenderDispCnt & (1<<6))
glBlendFuncSeparate(GL_ZERO, GL_ONE, GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_ALPHA); glBlendFuncSeparate(GL_ZERO, GL_ONE, GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_ALPHA);
@ -1154,7 +1117,7 @@ void GLRenderer::RenderFrame(GPU& gpu)
CurShaderID = -1; CurShaderID = -1;
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[FrontBuffer]); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, MainFramebuffer);
ShaderConfig.uScreenSize[0] = ScreenW; ShaderConfig.uScreenSize[0] = ScreenW;
ShaderConfig.uScreenSize[1] = ScreenH; ShaderConfig.uScreenSize[1] = ScreenH;
@ -1260,7 +1223,7 @@ void GLRenderer::RenderFrame(GPU& gpu)
// TODO: check whether 'clear polygon ID' affects translucent polyID // TODO: check whether 'clear polygon ID' affects translucent polyID
// (for example when alpha is 1..30) // (for example when alpha is 1..30)
{ {
glUseProgram(ClearShaderPlain[2]); glUseProgram(ClearShaderPlain);
glDepthFunc(GL_ALWAYS); glDepthFunc(GL_ALWAYS);
u32 r = gpu.GPU3D.RenderClearAttr1 & 0x1F; u32 r = gpu.GPU3D.RenderClearAttr1 & 0x1F;
@ -1320,8 +1283,6 @@ void GLRenderer::RenderFrame(GPU& gpu)
RenderSceneChunk(gpu.GPU3D, 0, 192); RenderSceneChunk(gpu.GPU3D, 0, 192);
} }
FrontBuffer = FrontBuffer ? 0 : 1;
} }
void GLRenderer::Stop(const GPU& gpu) void GLRenderer::Stop(const GPU& gpu)
@ -1331,16 +1292,14 @@ void GLRenderer::Stop(const GPU& gpu)
void GLRenderer::PrepareCaptureFrame() void GLRenderer::PrepareCaptureFrame()
{ {
// TODO: make sure this picks the right buffer when doing antialiasing glBindFramebuffer(GL_READ_FRAMEBUFFER, MainFramebuffer);
int original_fb = FrontBuffer^1;
glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[original_fb]);
glReadBuffer(GL_COLOR_ATTACHMENT0); glReadBuffer(GL_COLOR_ATTACHMENT0);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[3]); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, DownscaleFramebuffer);
glDrawBuffer(GL_COLOR_ATTACHMENT0); glDrawBuffer(GL_COLOR_ATTACHMENT0);
glBlitFramebuffer(0, 0, ScreenW, ScreenH, 0, 0, 256, 192, GL_COLOR_BUFFER_BIT, GL_NEAREST); glBlitFramebuffer(0, 0, ScreenW, ScreenH, 0, 0, 256, 192, GL_COLOR_BUFFER_BIT, GL_NEAREST);
glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[3]); glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
glBindFramebuffer(GL_READ_FRAMEBUFFER, DownscaleFramebuffer);
glReadPixels(0, 0, 256, 192, GL_BGRA, GL_UNSIGNED_BYTE, NULL); glReadPixels(0, 0, 256, 192, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
} }
@ -1349,12 +1308,18 @@ void GLRenderer::Blit(const GPU& gpu)
CurGLCompositor.RenderFrame(gpu, *this); CurGLCompositor.RenderFrame(gpu, *this);
} }
void GLRenderer::BindOutputTexture(int buffer)
{
CurGLCompositor.BindOutputTexture(buffer);
}
u32* GLRenderer::GetLine(int line) u32* GLRenderer::GetLine(int line)
{ {
int stride = 256; int stride = 256;
if (line == 0) if (line == 0)
{ {
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
if (data) memcpy(&Framebuffer[stride*0], data, 4*stride*192); if (data) memcpy(&Framebuffer[stride*0], data, 4*stride*192);
glUnmapBuffer(GL_PIXEL_PACK_BUFFER); glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
@ -1374,7 +1339,7 @@ u32* GLRenderer::GetLine(int line)
void GLRenderer::SetupAccelFrame() void GLRenderer::SetupAccelFrame()
{ {
glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer]); glBindTexture(GL_TEXTURE_2D, ColorBufferTex);
} }
} }

View File

@ -44,12 +44,11 @@ public:
void Stop(const GPU& gpu) override; void Stop(const GPU& gpu) override;
u32* GetLine(int line) override; u32* GetLine(int line) override;
void SetupAccelFrame(); void SetupAccelFrame() override;
void PrepareCaptureFrame() override; void PrepareCaptureFrame() override;
void Blit(const GPU& gpu) override; void Blit(const GPU& gpu) override;
[[nodiscard]] const GLCompositor& GetCompositor() const noexcept { return CurGLCompositor; } void BindOutputTexture(int buffer) override;
GLCompositor& GetCompositor() noexcept { return CurGLCompositor; }
static std::unique_ptr<GLRenderer> New() noexcept; static std::unique_ptr<GLRenderer> New() noexcept;
private: private:
@ -77,7 +76,7 @@ private:
GLCompositor CurGLCompositor; GLCompositor CurGLCompositor;
RendererPolygon PolygonList[2048] {}; RendererPolygon PolygonList[2048] {};
bool BuildRenderShader(u32 flags, const char* vs, const char* fs); bool BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs);
void UseRenderShader(u32 flags); void UseRenderShader(u32 flags);
void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const;
u32* SetupVertex(const Polygon* poly, int vid, const Vertex* vtx, u32 vtxattr, u32* vptr) const; u32* SetupVertex(const Polygon* poly, int vid, const Vertex* vtx, u32 vtxattr, u32* vptr) const;
@ -96,13 +95,13 @@ private:
}; };
GLuint ClearShaderPlain[3] {}; GLuint ClearShaderPlain {};
GLuint RenderShader[16][3] {}; GLuint RenderShader[16] {};
GLuint CurShaderID = -1; GLuint CurShaderID = -1;
GLuint FinalPassEdgeShader[3] {}; GLuint FinalPassEdgeShader {};
GLuint FinalPassFogShader[3] {}; GLuint FinalPassFogShader {};
// std140 compliant structure // std140 compliant structure
struct struct
@ -155,12 +154,12 @@ private:
bool BetterPolygons {}; bool BetterPolygons {};
int ScreenW {}, ScreenH {}; int ScreenW {}, ScreenH {};
GLuint FramebufferTex[8] {}; GLuint ColorBufferTex {}, DepthBufferTex {}, AttrBufferTex {};
int FrontBuffer {}; GLuint DownScaleBufferTex {};
GLuint FramebufferID[4] {}, PixelbufferID {}; GLuint PixelbufferID {};
GLuint MainFramebuffer {}, DownscaleFramebuffer {};
u32 Framebuffer[256*192] {}; u32 Framebuffer[256*192] {};
}; };
} }
#endif #endif

View File

@ -95,8 +95,8 @@ void SoftRenderer::EnableRenderThread()
} }
} }
SoftRenderer::SoftRenderer(bool threaded) noexcept SoftRenderer::SoftRenderer() noexcept
: Renderer3D(false), Threaded(threaded) : Renderer3D(false)
{ {
Sema_RenderStart = Platform::Semaphore_Create(); Sema_RenderStart = Platform::Semaphore_Create();
Sema_RenderDone = Platform::Semaphore_Create(); Sema_RenderDone = Platform::Semaphore_Create();

View File

@ -29,7 +29,7 @@ namespace melonDS
class SoftRenderer : public Renderer3D class SoftRenderer : public Renderer3D
{ {
public: public:
SoftRenderer(bool threaded = false) noexcept; SoftRenderer() noexcept;
~SoftRenderer() override; ~SoftRenderer() override;
void Reset(GPU& gpu) override; void Reset(GPU& gpu) override;
@ -504,7 +504,7 @@ private:
// threading // threading
bool Threaded; bool Threaded = false;
Platform::Thread* RenderThread; Platform::Thread* RenderThread;
std::atomic_bool RenderThreadRunning; std::atomic_bool RenderThreadRunning;
std::atomic_bool RenderThreadRendering; std::atomic_bool RenderThreadRendering;

269
src/GPU3D_Texcache.cpp Normal file
View File

@ -0,0 +1,269 @@
#include "GPU3D_Texcache.h"
namespace melonDS
{
// Averages two 15-bit colours channel by channel; each channel rounds down.
// Bit 15 of both inputs is ignored and is always clear in the result.
inline u16 ColorAvg(u16 color0, u16 color1)
{
    const u32 lhs = color0;
    const u32 rhs = color1;

    // The low channel cannot spill out of its 5 bits, so it needs no mask.
    const u32 low  =  ((lhs & 0x001F) + (rhs & 0x001F)) >> 1;
    // The other two are averaged in place; the mask drops the bit that the
    // shift pushes into the channel below.
    const u32 mid  = (((lhs & 0x03E0) + (rhs & 0x03E0)) >> 1) & 0x03E0;
    const u32 high = (((lhs & 0x7C00) + (rhs & 0x7C00)) >> 1) & 0x7C00;

    return low | mid | high;
}
// Blends two 15-bit colours per channel as (color0*5 + color1*3) / 8,
// rounding down. Bit 15 of both inputs is ignored and clear in the result.
inline u16 Color5of3(u16 color0, u16 color1)
{
    const u32 first = color0;
    const u32 second = color1;

    // Weighted sum of one channel, computed in place and masked back into
    // the channel. For the lowest channel the mask changes nothing.
    const auto mix = [first, second](u32 mask) -> u32
    {
        return (((first & mask) * 5 + (second & mask) * 3) >> 3) & mask;
    };

    return mix(0x001F) | mix(0x03E0) | mix(0x7C00);
}
// Blends two 15-bit colours per channel as (color0*3 + color1*5) / 8,
// rounding down. Bit 15 of both inputs is ignored and clear in the result.
inline u16 Color3of5(u16 color0, u16 color1)
{
    const u32 first = color0;
    const u32 second = color1;

    // Weighted sum of one channel, computed in place and masked back into
    // the channel. For the lowest channel the mask changes nothing.
    const auto mix = [first, second](u32 mask) -> u32
    {
        return (((first & mask) * 3 + (second & mask) * 5) >> 3) & mask;
    };

    return mix(0x001F) | mix(0x03E0) | mix(0x7C00);
}
// Expands a 15-bit colour to three 8-bit channels packed in a u32:
// bits 0-4 of val end up in bits 3-7, bits 5-9 in bits 11-15 and
// bits 10-14 in bits 19-23. The low three bits of every output byte are zero
// and bit 15 of val is ignored.
inline u32 ConvertRGB5ToRGB8(u16 val)
{
    const u32 pixel = val;

    const u32 byte0 = (pixel & 0x1F) << 3;
    const u32 byte1 = (pixel & 0x3E0) << 6;
    const u32 byte2 = (pixel & 0x7C00) << 9;

    return byte0 | byte1 | byte2;
}
// Expands a 15-bit colour to three 8-bit channels packed in a u32, with the
// first and third channel swapped relative to ConvertRGB5ToRGB8:
// bits 0-4 of val end up in bits 19-23, bits 5-9 in bits 11-15 and
// bits 10-14 in bits 3-7. Bit 15 of val is ignored.
//
// The previous shifts (<< 9 and << 3) placed the outer channels at bits 9-13
// and 13-17, where they overlapped the middle channel instead of landing in
// their own bytes.
inline u32 ConvertRGB5ToBGR8(u16 val)
{
    return (((u32)val & 0x1F) << 19)
        | (((u32)val & 0x3E0) << 6)
        | (((u32)val & 0x7C00) >> 7);
}
// Expands a 15-bit colour to three 6-bit channels, one per byte of the
// result (bits 0-7, 8-15 and 16-23). A 5-bit channel value c becomes
// 2*c + 1, except that 0 stays 0. Bit 15 of val is ignored.
inline u32 ConvertRGB5ToRGB6(u16 val)
{
    const auto widen = [](u32 channel5) -> u32
    {
        return channel5 ? ((channel5 << 1) | 1) : 0;
    };

    const u32 first  = widen(val & 0x1F);
    const u32 second = widen((val >> 5) & 0x1F);
    const u32 third  = widen((val >> 10) & 0x1F);

    return first | (second << 8) | (third << 16);
}
// Converts a direct-color texture (one 16-bit texel per pixel) into 32-bit
// texels of the requested output format.
//
// width, height: texture size in texels
// output:        receives width*height converted texels
// texData:       source texels, two bytes each; bit 15 is the alpha flag
template <int outputFmt>
void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData)
{
    for (u32 texel = 0; texel < width*height; texel++)
    {
        const u16 color = ((u16*)texData)[texel];
        const bool opaque = (color & 0x8000) != 0;

        // alpha is all-or-nothing: full for the target format, or zero
        if (outputFmt == outputFmt_RGB6A5)
            output[texel] = ConvertRGB5ToRGB6(color) | (opaque ? 0x1F000000 : 0);
        else if (outputFmt == outputFmt_RGBA8)
            output[texel] = ConvertRGB5ToRGB8(color) | (opaque ? 0xFF000000 : 0);
        else if (outputFmt == outputFmt_BGRA8)
            output[texel] = ConvertRGB5ToBGR8(color) | (opaque ? 0xFF000000 : 0);
    }
}
template void ConvertBitmapTexture<outputFmt_RGB6A5>(u32 width, u32 height, u32* output, u8* texData);
// Decodes a block-compressed texture into 32-bit texels of the requested
// output format.
//
// The texture is stored as 4x4 texel blocks. Each block has:
//  - one 32-bit word in texData holding sixteen 2-bit color selectors
//    (texel (i, j) of the block uses bits 2*(i + j*4) and up), and
//  - one 16-bit word in texAuxData: bits 0-13 are the palette offset (in units
//    of two palette entries), bits 14-15 select how colors 2 and 3 are formed:
//      0: color2 from the palette, color3 transparent
//      1: color2 = average of color0 and color1, color3 transparent
//      2: color2 and color3 from the palette
//      3: color2 = (5*color0 + 3*color1)/8, color3 = (3*color0 + 5*color1)/8
//
// width, height: texture size in texels (processed in whole 4x4 blocks)
// output:        receives the decoded texels, width texels per row
// palData:       palette the block palette offsets index into
template <int outputFmt>
void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData)
{
    // Weighted per-channel mix of two RGB555 colors, (c0*w0 + c1*w1) / 8.
    auto mixChannel = [](u32 c0, u32 c1, u32 w0, u32 w1, u32 mask) -> u32
    {
        return (((c0 & mask) * w0 + (c1 & mask) * w1) >> 3) & mask;
    };
    // Mixes all three channels and marks the result opaque (bit 15).
    auto mix = [&mixChannel](u16 c0, u16 c1, u32 w0, u32 w1) -> u16
    {
        return mixChannel(c0, c1, w0, w1, 0x001F)
            | mixChannel(c0, c1, w0, w1, 0x03E0)
            | mixChannel(c0, c1, w0, w1, 0x7C00)
            | 0x8000;
    };

    const u32 blocksPerRow = width / 4;
    const u32 blockRows = height / 4;

    // we process a whole block at a time
    for (u32 y = 0; y < blockRows; y++)
    {
        for (u32 x = 0; x < blocksPerRow; x++)
        {
            const u32 blockIdx = x + y * blocksPerRow;
            const u32 data = ((u32*)texData)[blockIdx];
            const u16 auxData = ((u16*)texAuxData)[blockIdx];

            const u32 paletteOffset = auxData & 0x3FFF;
            u16 colors[4];
            colors[0] = palData[paletteOffset*2] | 0x8000;
            colors[1] = palData[paletteOffset*2+1] | 0x8000;
            switch ((auxData >> 14) & 0x3)
            {
            case 0:
                colors[2] = palData[paletteOffset*2+2] | 0x8000;
                colors[3] = 0;
                break;
            case 1:
                // equal weights of 4/8 give the plain average
                colors[2] = mix(colors[0], colors[1], 4, 4);
                colors[3] = 0;
                break;
            case 2:
                colors[2] = palData[paletteOffset*2+2] | 0x8000;
                colors[3] = palData[paletteOffset*2+3] | 0x8000;
                break;
            case 3:
                colors[2] = mix(colors[0], colors[1], 5, 3);
                colors[3] = mix(colors[0], colors[1], 3, 5);
                break;
            }

            for (u32 j = 0; j < 4; j++)
            {
                for (u32 i = 0; i < 4; i++)
                {
                    // Only the two selector bits of this texel may be used:
                    // without the mask the higher texels' bits leak into the
                    // index.
                    const u32 colorIdx = (data >> (2 * (i + j * 4))) & 0x3;
                    const u16 color = colors[colorIdx];

                    u32 res = 0;
                    switch (outputFmt)
                    {
                    case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color)
                        | ((color & 0x8000) ? 0x1F000000 : 0); break;
                    case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color)
                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
                    case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color)
                        | ((color & 0x8000) ? 0xFF000000 : 0); break;
                    }
                    output[x * 4 + i + (y * 4 + j) * width] = res;
                }
            }
        }
    }
}
template void ConvertCompressedTexture<outputFmt_RGB6A5>(u32, u32, u32*, u8*, u8*, u16*);
// Converts a translucent paletted texture into 32-bit texels of the requested
// output format. Every source byte holds a Y-bit palette index in its low bits
// and an X-bit alpha value directly above it.
//
// width, height: texture size in texels (one byte per texel in texData)
// output:        receives width*height converted texels
// palData:       palette the indices refer to
template <int outputFmt, int X, int Y>
void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData)
{
    const u32 indexMask = (1 << Y) - 1;
    const u32 alphaMask = (1 << X) - 1;

    for (u32 row = 0; row < height; row++)
    {
        const u8* srcRow = texData + row * width;
        u32* dstRow = output + row * width;

        for (u32 col = 0; col < width; col++)
        {
            const u8 texel = srcRow[col];
            const u16 color = palData[texel & indexMask];

            u32 alpha = (texel >> Y) & alphaMask;
            // alpha narrower than 5 bits is stretched to the 0..31 range
            if (X != 5)
                alpha = alpha * 4 + alpha / 2;

            // for the 8-bit formats the top three alpha bits are replicated
            // into the low bits so that full alpha becomes 255
            const u32 alpha8 = (alpha << 27) | ((alpha & 0x1C) << 22);

            u32 pixel;
            if (outputFmt == outputFmt_RGB6A5)
                pixel = ConvertRGB5ToRGB6(color) | (alpha << 24);
            else if (outputFmt == outputFmt_RGBA8)
                pixel = ConvertRGB5ToRGB8(color) | alpha8;
            else if (outputFmt == outputFmt_BGRA8)
                pixel = ConvertRGB5ToBGR8(color) | alpha8;
            dstRow[col] = pixel;
        }
    }
}
template void ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(u32, u32, u32*, u8*, u16*);
template void ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(u32, u32, u32*, u8*, u16*);
// Converts a paletted texture with colorBits bits per texel into 32-bit texels
// of the requested output format. Texels are packed into bytes starting at the
// least significant bits.
//
// width, height:     texture size in texels
// output:            receives width*height converted texels
// palData:           palette the indices refer to
// color0Transparent: when set, palette index 0 is emitted with zero alpha
template <int outputFmt, int colorBits>
void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent)
{
    const u32 texelsPerByte = 8 / colorBits;
    const u32 bytesPerRow = width / texelsPerByte;
    const u32 indexMask = (1 << colorBits) - 1;

    for (u32 row = 0; row < height; row++)
    {
        for (u32 byteCol = 0; byteCol < bytesPerRow; byteCol++)
        {
            const u8 packed = texData[byteCol + row * bytesPerRow];
            u32* dst = &output[byteCol * texelsPerByte + row * width];

            for (u32 sub = 0; sub < texelsPerByte; sub++)
            {
                const u32 index = (packed >> (sub * colorBits)) & indexMask;
                const u16 color = palData[index];
                const bool opaque = !(color0Transparent && index == 0);

                u32 pixel;
                if (outputFmt == outputFmt_RGB6A5)
                    pixel = ConvertRGB5ToRGB6(color) | (opaque ? 0x1F000000 : 0);
                else if (outputFmt == outputFmt_RGBA8)
                    pixel = ConvertRGB5ToRGB8(color) | (opaque ? 0xFF000000 : 0);
                else if (outputFmt == outputFmt_BGRA8)
                    pixel = ConvertRGB5ToBGR8(color) | (opaque ? 0xFF000000 : 0);
                dst[sub] = pixel;
            }
        }
    }
}
template void ConvertNColorsTexture<outputFmt_RGB6A5, 2>(u32, u32, u32*, u8*, u16*, bool);
template void ConvertNColorsTexture<outputFmt_RGB6A5, 4>(u32, u32, u32*, u8*, u16*, bool);
template void ConvertNColorsTexture<outputFmt_RGB6A5, 8>(u32, u32, u32*, u8*, u16*, bool);
}

310
src/GPU3D_Texcache.h Normal file
View File

@ -0,0 +1,310 @@
#ifndef GPU3D_TEXCACHE
#define GPU3D_TEXCACHE
#include "types.h"
#include "GPU.h"
#include <assert.h>
#include <unordered_map>
#include <vector>
#define XXH_STATIC_LINKING_ONLY
#include "xxhash/xxhash.h"
namespace melonDS
{
// Returns the texture width in texels encoded in bits 20-22 of texparam
// (8 shifted left by that 3-bit value, i.e. 8 up to 1024).
inline u32 TextureWidth(u32 texparam)
{
    const u32 widthLog2 = (texparam >> 20) & 0x7;
    return 8 << widthLog2;
}
// Returns the texture height in texels encoded in bits 23-25 of texparam
// (8 shifted left by that 3-bit value, i.e. 8 up to 1024).
inline u32 TextureHeight(u32 texparam)
{
    const u32 heightLog2 = (texparam >> 23) & 0x7;
    return 8 << heightLog2;
}
// Texel layouts the Convert*Texture templates can produce; passed as their
// outputFmt template argument. In every layout the alpha value occupies the
// top byte of the 32-bit texel.
enum
{
    // color converted with ConvertRGB5ToRGB6 (6 bits per channel, one channel
    // per byte), alpha in the range 0..0x1F
    outputFmt_RGB6A5,
    // color converted with ConvertRGB5ToRGB8, alpha in the range 0..0xFF
    outputFmt_RGBA8,
    // color converted with ConvertRGB5ToBGR8, alpha in the range 0..0xFF
    outputFmt_BGRA8
};
// Texture decoders, defined in GPU3D_Texcache.cpp. Each one reads a texture in
// one source layout and writes 32-bit texels in the layout selected by
// outputFmt into `output`.
// NOTE(review): the .cpp only explicitly instantiates these for
// outputFmt_RGB6A5; other formats would need additional instantiations.

// Direct-color texture: one 16-bit texel per pixel.
template <int outputFmt>
void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData);
// Block-compressed texture: 4x4 blocks with per-block data in texAuxData.
template <int outputFmt>
void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData);
// Translucent paletted texture: X alpha bits above Y palette index bits per byte.
template <int outputFmt, int X, int Y>
void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData);
// Paletted texture with colorBits bits per texel.
template <int outputFmt, int colorBits>
void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent);
// Cache of decoded textures, keyed by texture parameters and palette base.
//
// Decoded textures are stored as layers of array textures, grouped by texture
// size. The actual texture objects are managed by the backend:
//   TexLoaderT must provide
//     TexHandleT GenerateTexture(u32 width, u32 height, u32 layers);
//     void UploadTexture(TexHandleT handle, u32 width, u32 height, u32 layer, void* data);
//     void DeleteTexture(TexHandleT handle);
//   TexHandleT is the backend's handle for one array texture.
template <typename TexLoaderT, typename TexHandleT>
class Texcache
{
public:
    Texcache(const TexLoaderT& texloader)
        : TexLoader(texloader) // probably better if this would be a move constructor???
    {}

    // Brings the flat texture/palette VRAM copies up to date and drops every
    // cached texture whose source data actually changed.
    //
    // Returns true if texture or palette memory was reported as changed (even
    // if no cache entry ended up being invalidated), false otherwise.
    bool Update(GPU& gpu)
    {
        auto textureDirty = gpu.VRAMDirty_Texture.DeriveState(gpu.VRAMMap_Texture, gpu);
        auto texPalDirty = gpu.VRAMDirty_TexPal.DeriveState(gpu.VRAMMap_TexPal, gpu);

        bool textureChanged = gpu.MakeVRAMFlat_TextureCoherent(textureDirty);
        bool texPalChanged = gpu.MakeVRAMFlat_TexPalCoherent(texPalDirty);

        if (!textureChanged && !texPalChanged)
            return false;

        for (auto it = Cache.begin(); it != Cache.end();)
        {
            TexCacheEntry& entry = it->second;

            if (textureChanged)
            {
                for (u32 i = 0; i < 2; i++)
                {
                    u32 startBit = entry.TextureRAMStart[i] / VRAMDirtyGranularity;
                    u32 bitsCount = ((entry.TextureRAMStart[i] + entry.TextureRAMSize[i] + VRAMDirtyGranularity - 1) / VRAMDirtyGranularity) - startBit;

                    u32 startEntry = startBit >> 6;
                    u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
                    for (u32 j = startEntry; j < startEntry + entriesCount; j++)
                    {
                        if (GetRangedBitMask(j, startBit, bitsCount) & textureDirty.Data[j])
                        {
                            // the range was written to; only invalidate if the contents really differ
                            u64 newTexHash = XXH3_64bits(&gpu.VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);

                            if (newTexHash != entry.TextureHash[i])
                                goto invalidate;

                            // The whole range has been verified; hashing it again
                            // for further dirty words would give the same result.
                            break;
                        }
                    }
                }
            }

            if (texPalChanged && entry.TexPalSize > 0)
            {
                u32 startBit = entry.TexPalStart / VRAMDirtyGranularity;
                u32 bitsCount = ((entry.TexPalStart + entry.TexPalSize + VRAMDirtyGranularity - 1) / VRAMDirtyGranularity) - startBit;

                u32 startEntry = startBit >> 6;
                u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
                for (u32 j = startEntry; j < startEntry + entriesCount; j++)
                {
                    if (GetRangedBitMask(j, startBit, bitsCount) & texPalDirty.Data[j])
                    {
                        u64 newPalHash = XXH3_64bits(&gpu.VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);

                        if (newPalHash != entry.TexPalHash)
                            goto invalidate;

                        // see above: one successful comparison covers the whole range
                        break;
                    }
                }
            }

            it++;
            continue;

        invalidate:
            // hand the layer back so a later texture of the same size can reuse it
            FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture);
            it = Cache.erase(it);
        }

        return true;
    }

    // Looks up the texture described by texParam/palBase, decoding and
    // uploading it first if it is not cached yet.
    //
    // texParam fields used here: bits 0-15 address (in units of 8 bytes),
    // bits 20-22 / 23-25 log2 of width / height relative to 8, bits 26-28
    // format, bit 29 "color 0 is transparent".
    //
    // textureHandle: receives the array texture holding the texture
    // layer:         receives the layer within that array texture
    // helper:        receives a pointer to the entry's LastVariant field
    void GetTexture(GPU& gpu, u32 texParam, u32 palBase, TexHandleT& textureHandle, u32& layer, u32*& helper)
    {
        // remove sampling and texcoord gen params
        texParam &= ~0xC00F0000;

        u32 fmt = (texParam >> 26) & 0x7;
        u64 key = texParam;
        if (fmt != 7)
        {
            // every format except direct color depends on the palette
            key |= (u64)palBase << 32;
            // the color 0 transparency bit is ignored for compressed textures
            if (fmt == 5)
                key &= ~((u64)1 << 29);
        }

        assert(fmt != 0 && "no texture is not a texture format!");

        auto it = Cache.find(key);
        if (it != Cache.end())
        {
            textureHandle = it->second.Texture.TextureID;
            layer = it->second.Texture.Layer;
            helper = &it->second.LastVariant;
            return;
        }

        u32 widthLog2 = (texParam >> 20) & 0x7;
        u32 heightLog2 = (texParam >> 23) & 0x7;
        u32 width = 8 << widthLog2;
        u32 height = 8 << heightLog2;

        u32 addr = (texParam & 0xFFFF) * 8;

        TexCacheEntry entry = {0};

        entry.TextureRAMStart[0] = addr;
        entry.WidthLog2 = widthLog2;
        entry.HeightLog2 = heightLog2;

        // apparently a new texture
        if (fmt == 7)
        {
            entry.TextureRAMSize[0] = width*height*2;

            ConvertBitmapTexture<outputFmt_RGB6A5>(width, height, DecodingBuffer, &gpu.VRAMFlat_Texture[addr]);
        }
        else if (fmt == 5)
        {
            u8* texData = &gpu.VRAMFlat_Texture[addr];
            // the per-block auxiliary data lives in a second memory range
            u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1);
            if (addr >= 0x40000)
                slot1addr += 0x10000;
            u8* texAuxData = &gpu.VRAMFlat_Texture[slot1addr];
            u16* palData = (u16*)(gpu.VRAMFlat_TexPal + palBase*16);

            entry.TextureRAMSize[0] = width*height/16*4;
            entry.TextureRAMStart[1] = slot1addr;
            entry.TextureRAMSize[1] = width*height/16*2;
            entry.TexPalStart = palBase*16;
            entry.TexPalSize = 0x10000;

            ConvertCompressedTexture<outputFmt_RGB6A5>(width, height, DecodingBuffer, texData, texAuxData, palData);
        }
        else
        {
            // Zero-initialised so an unexpected format (only reachable when the
            // assert above is compiled out) cannot leave the sizes indeterminate.
            u32 texSize = 0, palAddr = palBase*16, numPalEntries = 0;
            switch (fmt)
            {
            case 1: texSize = width*height; numPalEntries = 32; break;
            case 6: texSize = width*height; numPalEntries = 8; break;
            case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break;
            case 3: texSize = width*height/2; numPalEntries = 16; break;
            case 4: texSize = width*height; numPalEntries = 256; break;
            }

            palAddr &= 0x1FFFF;

            entry.TextureRAMSize[0] = texSize;
            entry.TexPalStart = palAddr;
            entry.TexPalSize = numPalEntries*2;

            u8* texData = &gpu.VRAMFlat_Texture[addr];
            u16* palData = (u16*)(gpu.VRAMFlat_TexPal + palAddr);

            bool color0Transparent = texParam & (1 << 29);

            switch (fmt)
            {
            case 1: ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(width, height, DecodingBuffer, texData, palData); break;
            case 6: ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(width, height, DecodingBuffer, texData, palData); break;
            case 2: ConvertNColorsTexture<outputFmt_RGB6A5, 2>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
            case 3: ConvertNColorsTexture<outputFmt_RGB6A5, 4>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
            case 4: ConvertNColorsTexture<outputFmt_RGB6A5, 8>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
            }
        }

        // remember hashes of the source data so Update() can tell real changes
        // apart from writes that left the contents untouched
        for (int i = 0; i < 2; i++)
        {
            if (entry.TextureRAMSize[i])
                entry.TextureHash[i] = XXH3_64bits(&gpu.VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
        }
        if (entry.TexPalSize)
            entry.TexPalHash = XXH3_64bits(&gpu.VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);

        auto& texArrays = TexArrays[widthLog2][heightLog2];
        auto& freeTextures = FreeTextures[widthLog2][heightLog2];

        if (freeTextures.size() == 0)
        {
            // no free layer of this size left: allocate a new array texture
            // (at most 64 layers and at most 8 MB of 32-bit texels)
            u32 layers = std::min<u32>((8*1024*1024) / (width*height*4), 64);

            TexHandleT array = TexLoader.GenerateTexture(width, height, layers);
            texArrays.push_back(array);

            for (u32 i = 0; i < layers; i++)
            {
                freeTextures.push_back(TexArrayEntry{array, i});
            }
        }

        TexArrayEntry storagePlace = freeTextures.back();
        freeTextures.pop_back();

        entry.Texture = storagePlace;

        TexLoader.UploadTexture(storagePlace.TextureID, width, height, storagePlace.Layer, DecodingBuffer);

        textureHandle = storagePlace.TextureID;
        layer = storagePlace.Layer;
        helper = &Cache.emplace(key, entry).first->second.LastVariant;
    }

    // Deletes all array textures and forgets every cached texture.
    void Reset()
    {
        for (u32 i = 0; i < 8; i++)
        {
            for (u32 j = 0; j < 8; j++)
            {
                for (TexHandleT& array : TexArrays[i][j])
                    TexLoader.DeleteTexture(array);
                TexArrays[i][j].clear();
                FreeTextures[i][j].clear();
            }
        }
        Cache.clear();
    }

private:
    // One layer of one array texture.
    struct TexArrayEntry
    {
        TexHandleT TextureID;
        u32 Layer;
    };

    struct TexCacheEntry
    {
        u32 LastVariant; // very cheap way to make variant lookup faster

        // source ranges in flat texture memory ([1] is only used by compressed textures)
        u32 TextureRAMStart[2], TextureRAMSize[2];
        // source range in flat palette memory (size 0 if no palette is used)
        u32 TexPalStart, TexPalSize;
        u8 WidthLog2, HeightLog2;
        TexArrayEntry Texture;

        // hashes of the source ranges at decode time
        u64 TextureHash[2];
        u64 TexPalHash;
    };
    std::unordered_map<u64, TexCacheEntry> Cache;

    TexLoaderT TexLoader;

    // indexed by [WidthLog2][HeightLog2]
    std::vector<TexArrayEntry> FreeTextures[8][8];
    std::vector<TexHandleT> TexArrays[8][8];

    // scratch space for one decoded texture of the maximum size (1024x1024)
    u32 DecodingBuffer[1024*1024];
};
}
#endif

View File

@ -0,0 +1,29 @@
#include "GPU3D_TexcacheOpenGL.h"
namespace melonDS
{
// Creates a 2D array texture with `layers` layers of width x height texels
// (single mip level, GL_RGBA8UI) and returns its name.
// The new texture is left bound to GL_TEXTURE_2D_ARRAY.
GLuint TexcacheOpenGLLoader::GenerateTexture(u32 width, u32 height, u32 layers)
{
    GLuint arrayTexture;
    glGenTextures(1, &arrayTexture);

    glBindTexture(GL_TEXTURE_2D_ARRAY, arrayTexture);
    glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers);

    return arrayTexture;
}
// Uploads one width x height image (four unsigned bytes per texel) into layer
// `layer` of the array texture `handle`.
// The texture is left bound to GL_TEXTURE_2D_ARRAY.
void TexcacheOpenGLLoader::UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data)
{
    glBindTexture(GL_TEXTURE_2D_ARRAY, handle);

    // mip level 0, starting at the origin of the selected layer, one layer deep
    glTexSubImage3D(GL_TEXTURE_2D_ARRAY, 0,
        0, 0, layer,
        width, height, 1,
        GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, data);
}
// Deletes the texture object `handle`.
void TexcacheOpenGLLoader::DeleteTexture(GLuint handle)
{
    const GLuint textures[1] = { handle };
    glDeleteTextures(1, textures);
}
}

View File

@ -0,0 +1,25 @@
#ifndef GPU3D_TEXCACHEOPENGL
#define GPU3D_TEXCACHEOPENGL
#include "GPU3D_Texcache.h"
#include "OpenGLSupport.h"
namespace melonDS
{
template <typename, typename>
class Texcache;
// OpenGL backend for Texcache: creates, fills and deletes the array textures
// that hold the decoded textures.
class TexcacheOpenGLLoader
{
public:
    // Creates a GL_TEXTURE_2D_ARRAY with `layers` layers of width x height
    // texels and returns its name.
    GLuint GenerateTexture(u32 width, u32 height, u32 layers);
    // Uploads one width x height image to layer `layer` of the array texture `handle`.
    void UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data);
    // Deletes the texture `handle`.
    void DeleteTexture(GLuint handle);
};
using TexcacheOpenGL = Texcache<TexcacheOpenGLLoader, GLuint>;
}
#endif

View File

@ -36,32 +36,27 @@ using namespace OpenGL;
std::optional<GLCompositor> GLCompositor::New() noexcept std::optional<GLCompositor> GLCompositor::New() noexcept
{ {
assert(glBindAttribLocation != nullptr); assert(glBindAttribLocation != nullptr);
GLuint CompShader {};
std::array<GLuint, 3> CompShader {}; if (!OpenGL::CompileVertexFragmentProgram(CompShader,
if (!OpenGL::BuildShaderProgram(kCompositorVS, kCompositorFS_Nearest, &CompShader[0], "CompositorShader")) kCompositorVS, kCompositorFS_Nearest,
return std::nullopt; "CompositorShader",
{{"vPosition", 0}, {"vTexcoord", 1}},
glBindAttribLocation(CompShader[2], 0, "vPosition"); {{"oColor", 0}}))
glBindAttribLocation(CompShader[2], 1, "vTexcoord");
glBindFragDataLocation(CompShader[2], 0, "oColor");
if (!OpenGL::LinkShaderProgram(CompShader.data()))
// OpenGL::LinkShaderProgram already deletes the shader program object
// if linking the shaders together failed.
return std::nullopt; return std::nullopt;
return { GLCompositor(CompShader) }; return { GLCompositor(CompShader) };
} }
GLCompositor::GLCompositor(std::array<GLuint, 3> compShader) noexcept : CompShader(compShader) GLCompositor::GLCompositor(GLuint compShader) noexcept : CompShader(compShader)
{ {
CompScaleLoc = glGetUniformLocation(CompShader[2], "u3DScale"); CompScaleLoc = glGetUniformLocation(CompShader, "u3DScale");
Comp3DXPosLoc = glGetUniformLocation(CompShader[2], "u3DXPos"); Comp3DXPosLoc = glGetUniformLocation(CompShader, "u3DXPos");
glUseProgram(CompShader[2]); glUseProgram(CompShader);
GLuint screenTextureUniform = glGetUniformLocation(CompShader[2], "ScreenTex"); GLuint screenTextureUniform = glGetUniformLocation(CompShader, "ScreenTex");
glUniform1i(screenTextureUniform, 0); glUniform1i(screenTextureUniform, 0);
GLuint _3dTextureUniform = glGetUniformLocation(CompShader[2], "_3DTex"); GLuint _3dTextureUniform = glGetUniformLocation(CompShader, "_3DTex");
glUniform1i(_3dTextureUniform, 1); glUniform1i(_3dTextureUniform, 1);
// all this mess is to prevent bleeding // all this mess is to prevent bleeding
@ -136,7 +131,7 @@ GLCompositor::~GLCompositor()
glDeleteVertexArrays(1, &CompVertexArrayID); glDeleteVertexArrays(1, &CompVertexArrayID);
glDeleteBuffers(1, &CompVertexBufferID); glDeleteBuffers(1, &CompVertexBufferID);
OpenGL::DeleteShaderProgram(CompShader.data()); glDeleteProgram(CompShader);
} }
@ -174,7 +169,7 @@ GLCompositor& GLCompositor::operator=(GLCompositor&& other) noexcept
CompVertices = other.CompVertices; CompVertices = other.CompVertices;
// Clean up these resources before overwriting them // Clean up these resources before overwriting them
OpenGL::DeleteShaderProgram(CompShader.data()); glDeleteProgram(CompShader);
CompShader = other.CompShader; CompShader = other.CompShader;
glDeleteBuffers(1, &CompVertexBufferID); glDeleteBuffers(1, &CompVertexBufferID);
@ -244,11 +239,11 @@ void GLCompositor::Stop(const GPU& gpu) noexcept
glBindFramebuffer(GL_FRAMEBUFFER, 0); glBindFramebuffer(GL_FRAMEBUFFER, 0);
} }
void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept void GLCompositor::RenderFrame(const GPU& gpu, Renderer3D& renderer) noexcept
{ {
int frontbuf = gpu.FrontBuffer; int backbuf = gpu.FrontBuffer ^ 1;
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[frontbuf]); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[backbuf]);
glDisable(GL_DEPTH_TEST); glDisable(GL_DEPTH_TEST);
glDisable(GL_STENCIL_TEST); glDisable(GL_STENCIL_TEST);
@ -260,7 +255,7 @@ void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept
glClear(GL_COLOR_BUFFER_BIT); glClear(GL_COLOR_BUFFER_BIT);
// TODO: select more shaders (filtering, etc) // TODO: select more shaders (filtering, etc)
OpenGL::UseShaderProgram(CompShader.data()); glUseProgram(CompShader);
glUniform1ui(CompScaleLoc, Scale); glUniform1ui(CompScaleLoc, Scale);
// TODO: support setting this midframe, if ever needed // TODO: support setting this midframe, if ever needed
@ -269,12 +264,12 @@ void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept
glActiveTexture(GL_TEXTURE0); glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, CompScreenInputTex); glBindTexture(GL_TEXTURE_2D, CompScreenInputTex);
if (gpu.Framebuffer[frontbuf][0] && gpu.Framebuffer[frontbuf][1]) if (gpu.Framebuffer[backbuf][0] && gpu.Framebuffer[backbuf][1])
{ {
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER, glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER,
GL_UNSIGNED_BYTE, gpu.Framebuffer[frontbuf][0].get()); GL_UNSIGNED_BYTE, gpu.Framebuffer[backbuf][0].get());
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER, glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER,
GL_UNSIGNED_BYTE, gpu.Framebuffer[frontbuf][1].get()); GL_UNSIGNED_BYTE, gpu.Framebuffer[backbuf][1].get());
} }
glActiveTexture(GL_TEXTURE1); glActiveTexture(GL_TEXTURE1);

View File

@ -28,6 +28,7 @@ namespace melonDS
class GPU; class GPU;
struct RenderSettings; struct RenderSettings;
class GLRenderer; class GLRenderer;
class Renderer3D;
class GLCompositor class GLCompositor
{ {
public: public:
@ -42,14 +43,14 @@ public:
[[nodiscard]] int GetScaleFactor() const noexcept { return Scale; } [[nodiscard]] int GetScaleFactor() const noexcept { return Scale; }
void Stop(const GPU& gpu) noexcept; void Stop(const GPU& gpu) noexcept;
void RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept; void RenderFrame(const GPU& gpu, Renderer3D& renderer) noexcept;
void BindOutputTexture(int buf); void BindOutputTexture(int buf);
private: private:
GLCompositor(std::array<GLuint, 3> CompShader) noexcept; GLCompositor(GLuint CompShader) noexcept;
int Scale = 0; int Scale = 0;
int ScreenH = 0, ScreenW = 0; int ScreenH = 0, ScreenW = 0;
std::array<GLuint, 3> CompShader {}; GLuint CompShader {};
GLuint CompScaleLoc = 0; GLuint CompScaleLoc = 0;
GLuint Comp3DXPosLoc = 0; GLuint Comp3DXPosLoc = 0;

View File

@ -26,11 +26,38 @@
#include <initializer_list> #include <initializer_list>
#include <algorithm> #include <algorithm>
namespace melonDS
{
// Returns the part of a bit range that falls into one 64-bit word.
//
// The range covers bitsCount bits starting at bit startBit of a bit array made
// of 64-bit words. The result has a bit set for every bit of word number idx
// that lies inside the range; it is 0 if the word does not overlap the range
// at all (including when bitsCount is 0).
inline u64 GetRangedBitMask(u32 idx, u32 startBit, u32 bitsCount)
{
    const u32 endBit = startBit + bitsCount; // one past the last bit of the range
    const u32 startEntry = startBit >> 6;
    const u32 entriesCount = ((endBit + 0x3F) >> 6) - startEntry;

    // words outside the range never contribute any bits
    if (idx < startEntry || idx >= startEntry + entriesCount)
        return 0;

    u64 mask = 0xFFFFFFFFFFFFFFFFULL;
    // the first word loses the bits below the start of the range
    if (idx == startEntry)
        mask &= 0xFFFFFFFFFFFFFFFFULL << (startBit & 0x3F);
    // the last word loses the bits from the end of the range upwards, unless
    // the range ends exactly on a word boundary
    if (idx == startEntry + entriesCount - 1 && (endBit & 0x3F))
        mask &= ~(0xFFFFFFFFFFFFFFFFULL << (endBit & 0x3F));
    return mask;
}
// like std::bitset but less stupid and optimised for // like std::bitset but less stupid and optimised for
// our use case (keeping track of memory invalidations) // our use case (keeping track of memory invalidations)
namespace melonDS
{
template <u32 Size> template <u32 Size>
struct NonStupidBitField struct NonStupidBitField
{ {
@ -166,6 +193,11 @@ struct NonStupidBitField
return Ref{*this, idx}; return Ref{*this, idx};
} }
// Read-only access: returns whether bit idx is set.
bool operator[](u32 idx) const
{
    const u64 word = Data[idx >> 6];
    return ((word >> (idx & 0x3F)) & 1) != 0;
}
void SetRange(u32 startBit, u32 bitsCount) void SetRange(u32 startBit, u32 bitsCount)
{ {
u32 startEntry = startBit >> 6; u32 startEntry = startBit >> 6;
@ -187,6 +219,26 @@ struct NonStupidBitField
} }
} }
// Returns the index of the lowest set bit, or -1 if no bit is set.
int Min() const
{
    int word = 0;
    while (word < DataLength)
    {
        // the first non-zero word contains the lowest set bit
        if (Data[word])
            return word * 64 + __builtin_ctzll(Data[word]);
        word++;
    }
    return -1;
}
// Returns the index of the highest set bit, or -1 if no bit is set.
int Max() const
{
    int word = DataLength - 1;
    while (word >= 0)
    {
        // the last non-zero word contains the highest set bit
        if (Data[word])
            return word * 64 + (63 - __builtin_clzll(Data[word]));
        word--;
    }
    return -1;
}
NonStupidBitField& operator|=(const NonStupidBitField<Size>& other) NonStupidBitField& operator|=(const NonStupidBitField<Size>& other)
{ {
for (u32 i = 0; i < DataLength; i++) for (u32 i = 0; i < DataLength; i++)
@ -195,6 +247,7 @@ struct NonStupidBitField
} }
return *this; return *this;
} }
NonStupidBitField& operator&=(const NonStupidBitField<Size>& other) NonStupidBitField& operator&=(const NonStupidBitField<Size>& other)
{ {
for (u32 i = 0; i < DataLength; i++) for (u32 i = 0; i < DataLength; i++)
@ -203,6 +256,20 @@ struct NonStupidBitField
} }
return *this; return *this;
} }
// True if any of the Size bits is set.
operator bool() const
{
    // every word but the last one is used in full
    for (int word = 0; word < DataLength - 1; word++)
    {
        if (Data[word])
            return true;
    }

    // the last word may reach beyond Size; bits past the end are ignored
    const u64 lastWordMask = (Size&0x3F)
        ? ~(0xFFFFFFFFFFFFFFFF << (Size&0x3F))
        : 0xFFFFFFFFFFFFFFFF;
    if (Data[DataLength-1] & lastWordMask)
        return true;

    return false;
}
}; };
} }

View File

@ -18,6 +18,14 @@
#include "OpenGLSupport.h" #include "OpenGLSupport.h"
#include <unordered_map>
#include <vector>
#include <assert.h>
#define XXH_STATIC_LINKING_ONLY
#include "xxhash/xxhash.h"
namespace melonDS namespace melonDS
{ {
@ -27,9 +35,158 @@ using Platform::LogLevel;
namespace OpenGL namespace OpenGL
{ {
bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char* name) struct ShaderCacheEntry
{
u32 Length;
u8* Data;
u32 BinaryFormat;
ShaderCacheEntry(u8* data, u32 length, u32 binaryFmt)
: Length(length), Data(data), BinaryFormat(binaryFmt)
{
assert(data != nullptr);
}
ShaderCacheEntry(const ShaderCacheEntry&) = delete;
ShaderCacheEntry(ShaderCacheEntry&& other)
{
Data = other.Data;
Length = other.Length;
BinaryFormat = other.BinaryFormat;
other.Data = nullptr;
other.Length = 0;
other.BinaryFormat = 0;
}
~ShaderCacheEntry()
{
if (Data) // check whether it was moved
delete[] Data;
}
};
std::unordered_map<u64, ShaderCacheEntry> ShaderCache;
std::vector<u64> NewShaders;
constexpr u32 ShaderCacheMagic = 0x11CAC4E1;
constexpr u32 ShaderCacheVersion = 1;
void LoadShaderCache()
{
// for now the shader cache only contains only compute shaders
// because they take the longest to compile
Platform::FileHandle* file = Platform::OpenLocalFile("shadercache", Platform::FileMode::Read);
if (file == nullptr)
{
Log(LogLevel::Error, "Could not find shader cache\n");
return;
}
u32 magic, version, numPrograms;
if (Platform::FileRead(&magic, 4, 1, file) != 1 || magic != ShaderCacheMagic)
{
Log(LogLevel::Error, "Shader cache file has invalid magic\n");
goto fileInvalid;
}
if (Platform::FileRead(&version, 4, 1, file) != 1 || version != ShaderCacheVersion)
{
Log(LogLevel::Error, "Shader cache file has bad version\n");
goto fileInvalid;
}
if (Platform::FileRead(&numPrograms, 4, 1, file) != 1)
{
Log(LogLevel::Error, "Shader cache file invalid program count\n");
goto fileInvalid;
}
// not the best approach, because once changes pile up
// we read and overwrite the old files
for (u32 i = 0; i < numPrograms; i++)
{
int error = 3;
u32 length, binaryFormat;
u64 sourceHash;
error -= Platform::FileRead(&sourceHash, 8, 1, file);
error -= Platform::FileRead(&length, 4, 1, file);
error -= Platform::FileRead(&binaryFormat, 4, 1, file);
if (error != 0)
{
Log(LogLevel::Error, "Invalid shader cache entry\n");
goto fileInvalid;
}
u8* data = new u8[length];
if (Platform::FileRead(data, length, 1, file) != 1)
{
Log(LogLevel::Error, "Could not read shader cache entry data\n");
delete[] data;
goto fileInvalid;
}
ShaderCache.erase(sourceHash);
ShaderCache.emplace(sourceHash, ShaderCacheEntry(data, length, binaryFormat));
}
fileInvalid:
Platform::CloseFile(file);
}
void SaveShaderCache()
{
Platform::FileHandle* file = Platform::OpenLocalFile("shadercache", Platform::FileMode::ReadWrite);
if (file == nullptr)
{
Log(LogLevel::Error, "Could not open or create shader cache file\n");
return;
}
int written = 3;
u32 magic = ShaderCacheMagic, version = ShaderCacheVersion, numPrograms = ShaderCache.size();
written -= Platform::FileWrite(&magic, 4, 1, file);
written -= Platform::FileWrite(&version, 4, 1, file);
written -= Platform::FileWrite(&numPrograms, 4, 1, file);
if (written != 0)
{
Log(LogLevel::Error, "Could not write shader cache header\n");
goto writeError;
}
Platform::FileSeek(file, 0, Platform::FileSeekOrigin::End);
printf("new shaders %d\n", NewShaders.size());
for (u64 newShader : NewShaders)
{
int error = 4;
auto it = ShaderCache.find(newShader);
error -= Platform::FileWrite(&it->first, 8, 1, file);
error -= Platform::FileWrite(&it->second.Length, 4, 1, file);
error -= Platform::FileWrite(&it->second.BinaryFormat, 4, 1, file);
error -= Platform::FileWrite(it->second.Data, it->second.Length, 1, file);
if (error != 0)
{
Log(LogLevel::Error, "Could not insert new shader cache entry\n");
goto writeError;
}
}
writeError:
Platform::CloseFile(file);
NewShaders.clear();
}
bool CompilerShader(GLuint& id, const std::string& source, const std::string& name, const std::string& type)
{ {
int len;
int res; int res;
if (!glCreateShader) if (!glCreateShader)
@ -38,61 +195,32 @@ bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char*
return false; return false;
} }
ids[0] = glCreateShader(GL_VERTEX_SHADER); const char* sourceC = source.c_str();
len = strlen(vs); int len = source.length();
glShaderSource(ids[0], 1, &vs, &len); glShaderSource(id, 1, &sourceC, &len);
glCompileShader(ids[0]);
glGetShaderiv(ids[0], GL_COMPILE_STATUS, &res); glCompileShader(id);
glGetShaderiv(id, GL_COMPILE_STATUS, &res);
if (res != GL_TRUE) if (res != GL_TRUE)
{ {
glGetShaderiv(ids[0], GL_INFO_LOG_LENGTH, &res); glGetShaderiv(id, GL_INFO_LOG_LENGTH, &res);
if (res < 1) res = 1024; if (res < 1) res = 1024;
char* log = new char[res+1]; char* log = new char[res+1];
glGetShaderInfoLog(ids[0], res+1, NULL, log); glGetShaderInfoLog(id, res+1, NULL, log);
Log(LogLevel::Error, "OpenGL: failed to compile vertex shader %s: %s\n", name, log); Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type.c_str(), name.c_str(), log);
Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", vs); Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source.c_str());
delete[] log; delete[] log;
glDeleteShader(ids[0]); glDeleteShader(id);
return false; return false;
} }
ids[1] = glCreateShader(GL_FRAGMENT_SHADER);
len = strlen(fs);
glShaderSource(ids[1], 1, &fs, &len);
glCompileShader(ids[1]);
glGetShaderiv(ids[1], GL_COMPILE_STATUS, &res);
if (res != GL_TRUE)
{
glGetShaderiv(ids[1], GL_INFO_LOG_LENGTH, &res);
if (res < 1) res = 1024;
char* log = new char[res+1];
glGetShaderInfoLog(ids[1], res+1, NULL, log);
Log(LogLevel::Error, "OpenGL: failed to compile fragment shader %s: %s\n", name, log);
//printf("shader source:\n--\n%s\n--\n", fs);
delete[] log;
Platform::FileHandle* logf = Platform::OpenFile("shaderfail.log", Platform::FileMode::WriteText);
Platform::FileWrite(fs, len+1, 1, logf);
Platform::CloseFile(logf);
glDeleteShader(ids[0]);
glDeleteShader(ids[1]);
return false;
}
ids[2] = glCreateProgram();
glAttachShader(ids[2], ids[0]);
glAttachShader(ids[2], ids[1]);
return true; return true;
} }
bool LinkShaderProgram(GLuint* ids) bool LinkProgram(GLuint& result, GLuint* ids, int numIds)
{ {
int res; int res;
@ -102,46 +230,132 @@ bool LinkShaderProgram(GLuint* ids)
return false; return false;
} }
glLinkProgram(ids[2]); for (int i = 0; i < numIds; i++)
{
glAttachShader(result, ids[i]);
}
glDetachShader(ids[2], ids[0]); glLinkProgram(result);
glDetachShader(ids[2], ids[1]);
glDeleteShader(ids[0]); for (int i = 0; i < numIds; i++)
glDeleteShader(ids[1]); glDetachShader(result, ids[i]);
glGetProgramiv(ids[2], GL_LINK_STATUS, &res); glGetProgramiv(result, GL_LINK_STATUS, &res);
if (res != GL_TRUE) if (res != GL_TRUE)
{ {
glGetProgramiv(ids[2], GL_INFO_LOG_LENGTH, &res); glGetProgramiv(result, GL_INFO_LOG_LENGTH, &res);
if (res < 1) res = 1024; if (res < 1) res = 1024;
char* log = new char[res+1]; char* log = new char[res+1];
glGetProgramInfoLog(ids[2], res+1, NULL, log); glGetProgramInfoLog(result, res+1, NULL, log);
Log(LogLevel::Error, "OpenGL: failed to link shader program: %s\n", log); Log(LogLevel::Error, "OpenGL: failed to link shader program: %s\n", log);
delete[] log; delete[] log;
glDeleteProgram(ids[2]);
return false; return false;
} }
return true; return true;
} }
void DeleteShaderProgram(GLuint* ids) bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name)
{ {
if (glDeleteProgram) result = glCreateProgram();
{ // If OpenGL isn't loaded, then there's no shader program to delete
glDeleteProgram(ids[2]); /*u64 sourceHash = XXH64(source.data(), source.size(), 0);
auto it = ShaderCache.find(sourceHash);
if (it != ShaderCache.end())
{
glProgramBinary(result, it->second.BinaryFormat, it->second.Data, it->second.Length);
GLint linkStatus;
glGetProgramiv(result, GL_LINK_STATUS, &linkStatus);
if (linkStatus == GL_TRUE)
{
Log(LogLevel::Info, "Restored shader %s from cache\n", name.c_str());
return true;
} }
else
{
}
}*/
Log(LogLevel::Error, "Shader %s from cache was rejected\n", name.c_str());
GLuint shader;
bool linkingSucess = false;
if (!glCreateShader || !glDeleteShader)
goto error;
shader = glCreateShader(GL_COMPUTE_SHADER);
if (!CompilerShader(shader, source, name, "compute"))
goto error;
linkingSucess = LinkProgram(result, &shader, 1);
error:
glDeleteShader(shader);
if (!linkingSucess)
{
glDeleteProgram(result);
}
/*else
{
GLint length;
GLenum format;
glGetProgramiv(result, GL_PROGRAM_BINARY_LENGTH, &length);
u8* buffer = new u8[length];
glGetProgramBinary(result, length, nullptr, &format, buffer);
ShaderCache.emplace(sourceHash, ShaderCacheEntry(buffer, length, format));
NewShaders.push_back(sourceHash);
}*/
return linkingSucess;
} }
void UseShaderProgram(GLuint* ids) bool CompileVertexFragmentProgram(GLuint& result,
const std::string& vs, const std::string& fs,
const std::string& name,
const std::initializer_list<AttributeTarget>& vertexInAttrs,
const std::initializer_list<AttributeTarget>& fragmentOutAttrs)
{ {
if (glUseProgram) GLuint shaders[2] =
{ // If OpenGL isn't loaded, then there's no shader program to use {
glUseProgram(ids[2]); glCreateShader(GL_VERTEX_SHADER),
glCreateShader(GL_FRAGMENT_SHADER)
};
result = glCreateProgram();
bool linkingSucess = false;
if (!CompilerShader(shaders[0], vs, name, "vertex"))
goto error;
if (!CompilerShader(shaders[1], fs, name, "fragment"))
goto error;
for (const AttributeTarget& target : vertexInAttrs)
{
glBindAttribLocation(result, target.Location, target.Name);
} }
for (const AttributeTarget& target : fragmentOutAttrs)
{
glBindFragDataLocation(result, target.Location, target.Name);
}
linkingSucess = LinkProgram(result, shaders, 2);
error:
glDeleteShader(shaders[1]);
glDeleteShader(shaders[0]);
if (!linkingSucess)
glDeleteProgram(result);
return linkingSucess;
} }
} }

View File

@ -28,10 +28,23 @@
namespace melonDS::OpenGL namespace melonDS::OpenGL
{ {
bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char* name); void LoadShaderCache();
bool LinkShaderProgram(GLuint* ids); void SaveShaderCache();
void DeleteShaderProgram(GLuint* ids);
void UseShaderProgram(GLuint* ids); struct AttributeTarget
{
const char* Name;
u32 Location;
};
bool CompileVertexFragmentProgram(GLuint& result,
const std::string& vs, const std::string& fs,
const std::string& name,
const std::initializer_list<AttributeTarget>& vertexInAttrs,
const std::initializer_list<AttributeTarget>& fragmentOutAttrs);
bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name);
} }

View File

@ -22,6 +22,7 @@
#include <inttypes.h> #include <inttypes.h>
#include "Platform.h" #include "Platform.h"
#include "Config.h" #include "Config.h"
#include "GPU.h"
namespace Config namespace Config
@ -59,6 +60,7 @@ bool Threaded3D;
int GL_ScaleFactor; int GL_ScaleFactor;
bool GL_BetterPolygons; bool GL_BetterPolygons;
bool GL_HiresCoordinates;
bool LimitFPS; bool LimitFPS;
int MaxFPS; int MaxFPS;
@ -246,11 +248,12 @@ ConfigEntry ConfigFile[] =
{"ScreenVSync", 1, &ScreenVSync, false, false}, {"ScreenVSync", 1, &ScreenVSync, false, false},
{"ScreenVSyncInterval", 0, &ScreenVSyncInterval, 1, false}, {"ScreenVSyncInterval", 0, &ScreenVSyncInterval, 1, false},
{"3DRenderer", 0, &_3DRenderer, 0, false}, {"3DRenderer", 0, &_3DRenderer, renderer3D_Software, false},
{"Threaded3D", 1, &Threaded3D, true, false}, {"Threaded3D", 1, &Threaded3D, true, false},
{"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, false}, {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, false},
{"GL_BetterPolygons", 1, &GL_BetterPolygons, false, false}, {"GL_BetterPolygons", 1, &GL_BetterPolygons, false, false},
{"GL_HiresCoordinates", 1, &GL_HiresCoordinates, true, false},
{"LimitFPS", 1, &LimitFPS, true, false}, {"LimitFPS", 1, &LimitFPS, true, false},
{"MaxFPS", 0, &MaxFPS, 1000, false}, {"MaxFPS", 0, &MaxFPS, 1000, false},

View File

@ -51,6 +51,16 @@ enum
micInputType_MAX, micInputType_MAX,
}; };
// Identifiers for the selectable 3D renderers. Config::_3DRenderer is
// compared against these values.
enum
{
// Software rasteriser; always available and pinned to 0.
renderer3D_Software = 0,
#ifdef OGLRENDERER_ENABLED
// The two OpenGL-based renderers only exist when OGLRENDERER_ENABLED is defined.
renderer3D_OpenGL,
renderer3D_OpenGLCompute,
#endif
// Number of renderer identifiers declared above, i.e. one past the last
// valid value; not itself a selectable renderer.
renderer3D_Max,
};
namespace Config namespace Config
{ {
@ -103,6 +113,7 @@ extern bool Threaded3D;
extern int GL_ScaleFactor; extern int GL_ScaleFactor;
extern bool GL_BetterPolygons; extern bool GL_BetterPolygons;
extern bool GL_HiresCoordinates;
extern bool LimitFPS; extern bool LimitFPS;
extern int MaxFPS; extern int MaxFPS;

View File

@ -52,10 +52,12 @@
#include "DSi_I2C.h" #include "DSi_I2C.h"
#include "GPU3D_Soft.h" #include "GPU3D_Soft.h"
#include "GPU3D_OpenGL.h" #include "GPU3D_OpenGL.h"
#include "GPU3D_Compute.h"
#include "Savestate.h" #include "Savestate.h"
#include "ROMManager.h" #include "ROMManager.h"
#include "EmuThread.h"
//#include "ArchiveUtil.h" //#include "ArchiveUtil.h"
//#include "CameraManager.h" //#include "CameraManager.h"
@ -94,9 +96,8 @@ EmuThread::EmuThread(QObject* parent) : QThread(parent)
} }
std::unique_ptr<NDS> EmuThread::CreateConsole( std::unique_ptr<NDS> EmuThread::CreateConsole(
std::unique_ptr<melonDS::NDSCart::CartCommon>&& ndscart, std::unique_ptr<melonDS::NDSCart::CartCommon> &&ndscart,
std::unique_ptr<melonDS::GBACart::CartCommon>&& gbacart std::unique_ptr<melonDS::GBACart::CartCommon> &&gbacart) noexcept
) noexcept
{ {
auto arm7bios = ROMManager::LoadARM7BIOS(); auto arm7bios = ROMManager::LoadARM7BIOS();
if (!arm7bios) if (!arm7bios)
@ -326,21 +327,12 @@ void EmuThread::run()
videoRenderer = 0; videoRenderer = 0;
} }
if (videoRenderer == 0) updateRenderer();
{ // If we're using the software renderer...
NDS->GPU.SetRenderer3D(std::make_unique<SoftRenderer>(Config::Threaded3D != 0));
}
else
{
auto glrenderer = melonDS::GLRenderer::New();
glrenderer->SetRenderSettings(Config::GL_BetterPolygons, Config::GL_ScaleFactor);
NDS->GPU.SetRenderer3D(std::move(glrenderer));
}
Input::Init(); Input::Init();
u32 nframes = 0; u32 nframes = 0;
double perfCountsSec = 1.0 / SDL_GetPerformanceFrequency(); perfCountsSec = 1.0 / SDL_GetPerformanceFrequency();
double lastTime = SDL_GetPerformanceCounter() * perfCountsSec; double lastTime = SDL_GetPerformanceCounter() * perfCountsSec;
double frameLimitError = 0.0; double frameLimitError = 0.0;
double lastMeasureTime = lastTime; double lastMeasureTime = lastTime;
@ -451,20 +443,9 @@ void EmuThread::run()
videoRenderer = 0; videoRenderer = 0;
} }
videoRenderer = screenGL ? Config::_3DRenderer : 0; updateRenderer();
videoSettingsDirty = false; videoSettingsDirty = false;
if (videoRenderer == 0)
{ // If we're using the software renderer...
NDS->GPU.SetRenderer3D(std::make_unique<SoftRenderer>(Config::Threaded3D != 0));
}
else
{
auto glrenderer = melonDS::GLRenderer::New();
glrenderer->SetRenderSettings(Config::GL_BetterPolygons, Config::GL_ScaleFactor);
NDS->GPU.SetRenderer3D(std::move(glrenderer));
}
} }
// process input and hotkeys // process input and hotkeys
@ -512,7 +493,16 @@ void EmuThread::run()
// emulate // emulate
u32 nlines = NDS->RunFrame(); u32 nlines;
if (NDS->GPU.GetRenderer3D().NeedsShaderCompile())
{
compileShaders();
nlines = 0;
}
else
{
nlines = NDS->RunFrame();
}
if (ROMManager::NDSSave) if (ROMManager::NDSSave)
ROMManager::NDSSave->CheckFlush(); ROMManager::NDSSave->CheckFlush();
@ -750,3 +740,53 @@ bool EmuThread::emuIsActive()
{ {
return (RunningSomething == 1); return (RunningSomething == 1);
} }
void EmuThread::updateRenderer()
{
if (videoRenderer != lastVideoRenderer)
{
printf("creating renderer %d\n", videoRenderer);
switch (videoRenderer)
{
case renderer3D_Software:
NDS->GPU.SetRenderer3D(std::make_unique<SoftRenderer>());
break;
case renderer3D_OpenGL:
NDS->GPU.SetRenderer3D(GLRenderer::New());
break;
case renderer3D_OpenGLCompute:
NDS->GPU.SetRenderer3D(ComputeRenderer::New());
break;
default: __builtin_unreachable();
}
}
lastVideoRenderer = videoRenderer;
switch (videoRenderer)
{
case renderer3D_Software:
static_cast<SoftRenderer&>(NDS->GPU.GetRenderer3D()).SetThreaded(Config::Threaded3D, NDS->GPU);
break;
case renderer3D_OpenGL:
static_cast<GLRenderer&>(NDS->GPU.GetRenderer3D()).SetRenderSettings(Config::GL_BetterPolygons, Config::GL_ScaleFactor);
break;
case renderer3D_OpenGLCompute:
static_cast<ComputeRenderer&>(NDS->GPU.GetRenderer3D()).SetRenderSettings(Config::GL_ScaleFactor, Config::GL_HiresCoordinates);
break;
default: __builtin_unreachable();
}
}
// Runs shader compilation steps on the current 3D renderer until either no
// more compilation is needed or the wall-clock budget (1/6 s, checked after
// each step) is used up, then posts a progress message to the OSD.
void EmuThread::compileShaders()
{
    // Zero-initialised so the progress message never reads indeterminate
    // values should ShaderCompileStep leave either output untouched.
    int currentShader = 0;
    int shadersCount = 0;
    u64 startTime = SDL_GetPerformanceCounter();
    // kind of hacky to look at the wallclock, though it is easier
    // than disabling vsync
    do
    {
        NDS->GPU.GetRenderer3D().ShaderCompileStep(currentShader, shadersCount);
    } while (NDS->GPU.GetRenderer3D().NeedsShaderCompile() &&
             (SDL_GetPerformanceCounter() - startTime) * perfCountsSec < 1.0 / 6.0);
    // currentShader is shown 1-based.
    mainWindow->osdAddMessage(0, "Compiling shader %d/%d", currentShader+1, shadersCount);
}

View File

@ -94,6 +94,9 @@ signals:
void syncVolumeLevel(); void syncVolumeLevel();
private: private:
void updateRenderer();
void compileShaders();
std::unique_ptr<melonDS::NDS> CreateConsole( std::unique_ptr<melonDS::NDS> CreateConsole(
std::unique_ptr<melonDS::NDSCart::CartCommon>&& ndscart, std::unique_ptr<melonDS::NDSCart::CartCommon>&& ndscart,
std::unique_ptr<melonDS::GBACart::CartCommon>&& gbacart std::unique_ptr<melonDS::GBACart::CartCommon>&& gbacart
@ -127,8 +130,9 @@ private:
int autoScreenSizing; int autoScreenSizing;
int videoRenderer; int lastVideoRenderer = -1;
bool videoSettingsDirty;
double perfCountsSec;
}; };
#endif // EMUTHREAD_H #endif // EMUTHREAD_H

View File

@ -709,19 +709,17 @@ void ScreenPanelGL::initOpenGL()
glContext->MakeCurrent(); glContext->MakeCurrent();
OpenGL::BuildShaderProgram(kScreenVS, kScreenFS, screenShaderProgram, "ScreenShader"); OpenGL::CompileVertexFragmentProgram(screenShaderProgram,
GLuint pid = screenShaderProgram[2]; kScreenVS, kScreenFS,
glBindAttribLocation(pid, 0, "vPosition"); "ScreenShader",
glBindAttribLocation(pid, 1, "vTexcoord"); {{"vPosition", 0}, {"vTexcoord", 1}},
glBindFragDataLocation(pid, 0, "oColor"); {{"oColor", 0}});
OpenGL::LinkShaderProgram(screenShaderProgram); glUseProgram(screenShaderProgram);
glUniform1i(glGetUniformLocation(screenShaderProgram, "ScreenTex"), 0);
glUseProgram(pid); screenShaderScreenSizeULoc = glGetUniformLocation(screenShaderProgram, "uScreenSize");
glUniform1i(glGetUniformLocation(pid, "ScreenTex"), 0); screenShaderTransformULoc = glGetUniformLocation(screenShaderProgram, "uTransform");
screenShaderScreenSizeULoc = glGetUniformLocation(pid, "uScreenSize");
screenShaderTransformULoc = glGetUniformLocation(pid, "uTransform");
// to prevent bleeding between both parts of the screen // to prevent bleeding between both parts of the screen
// with bilinear filtering enabled // with bilinear filtering enabled
@ -769,21 +767,19 @@ void ScreenPanelGL::initOpenGL()
memset(zeroData, 0, sizeof(zeroData)); memset(zeroData, 0, sizeof(zeroData));
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 2, GL_RGBA, GL_UNSIGNED_BYTE, zeroData); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 2, GL_RGBA, GL_UNSIGNED_BYTE, zeroData);
OpenGL::CompileVertexFragmentProgram(osdShader,
kScreenVS_OSD, kScreenFS_OSD,
"OSDShader",
{{"vPosition", 0}},
{{"oColor", 0}});
OpenGL::BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, osdShader, "OSDShader"); glUseProgram(osdShader);
glUniform1i(glGetUniformLocation(osdShader, "OSDTex"), 0);
pid = osdShader[2]; osdScreenSizeULoc = glGetUniformLocation(osdShader, "uScreenSize");
glBindAttribLocation(pid, 0, "vPosition"); osdPosULoc = glGetUniformLocation(osdShader, "uOSDPos");
glBindFragDataLocation(pid, 0, "oColor"); osdSizeULoc = glGetUniformLocation(osdShader, "uOSDSize");
osdScaleFactorULoc = glGetUniformLocation(osdShader, "uScaleFactor");
OpenGL::LinkShaderProgram(osdShader);
glUseProgram(pid);
glUniform1i(glGetUniformLocation(pid, "OSDTex"), 0);
osdScreenSizeULoc = glGetUniformLocation(pid, "uScreenSize");
osdPosULoc = glGetUniformLocation(pid, "uOSDPos");
osdSizeULoc = glGetUniformLocation(pid, "uOSDSize");
osdScaleFactorULoc = glGetUniformLocation(pid, "uScaleFactor");
const float osdvertices[6*2] = const float osdvertices[6*2] =
{ {
@ -818,8 +814,7 @@ void ScreenPanelGL::deinitOpenGL()
glDeleteVertexArrays(1, &screenVertexArray); glDeleteVertexArrays(1, &screenVertexArray);
glDeleteBuffers(1, &screenVertexBuffer); glDeleteBuffers(1, &screenVertexBuffer);
OpenGL::DeleteShaderProgram(screenShaderProgram); glDeleteProgram(screenShaderProgram);
for (const auto& [key, tex] : osdTextures) for (const auto& [key, tex] : osdTextures)
{ {
@ -830,8 +825,7 @@ void ScreenPanelGL::deinitOpenGL()
glDeleteVertexArrays(1, &osdVertexArray); glDeleteVertexArrays(1, &osdVertexArray);
glDeleteBuffers(1, &osdVertexBuffer); glDeleteBuffers(1, &osdVertexBuffer);
OpenGL::DeleteShaderProgram(osdShader); glDeleteProgram(osdShader);
glContext->DoneCurrent(); glContext->DoneCurrent();
@ -885,7 +879,7 @@ void ScreenPanelGL::drawScreenGL()
glViewport(0, 0, w, h); glViewport(0, 0, w, h);
glUseProgram(screenShaderProgram[2]); glUseProgram(screenShaderProgram);
glUniform2f(screenShaderScreenSizeULoc, w / factor, h / factor); glUniform2f(screenShaderScreenSizeULoc, w / factor, h / factor);
int frontbuf = emuThread->FrontBuffer; int frontbuf = emuThread->FrontBuffer;
@ -895,7 +889,7 @@ void ScreenPanelGL::drawScreenGL()
if (emuThread->NDS->GPU.GetRenderer3D().Accelerated) if (emuThread->NDS->GPU.GetRenderer3D().Accelerated)
{ {
// hardware-accelerated render // hardware-accelerated render
static_cast<GLRenderer&>(emuThread->NDS->GPU.GetRenderer3D()).GetCompositor().BindOutputTexture(frontbuf); emuThread->NDS->GPU.GetRenderer3D().BindOutputTexture(frontbuf);
} }
else else
#endif #endif
@ -936,7 +930,7 @@ void ScreenPanelGL::drawScreenGL()
u32 y = kOSDMargin; u32 y = kOSDMargin;
glUseProgram(osdShader[2]); glUseProgram(osdShader);
glUniform2f(osdScreenSizeULoc, w, h); glUniform2f(osdScreenSizeULoc, w, h);
glUniform1f(osdScaleFactorULoc, factor); glUniform1f(osdScaleFactorULoc, factor);

View File

@ -172,7 +172,7 @@ private:
GLuint screenVertexBuffer, screenVertexArray; GLuint screenVertexBuffer, screenVertexArray;
GLuint screenTexture; GLuint screenTexture;
GLuint screenShaderProgram[3]; GLuint screenShaderProgram;
GLuint screenShaderTransformULoc, screenShaderScreenSizeULoc; GLuint screenShaderTransformULoc, screenShaderScreenSizeULoc;
QMutex screenSettingsLock; QMutex screenSettingsLock;
@ -181,7 +181,7 @@ private:
int lastScreenWidth = -1, lastScreenHeight = -1; int lastScreenWidth = -1, lastScreenHeight = -1;
GLuint osdShader[3]; GLuint osdShader;
GLint osdScreenSizeULoc, osdPosULoc, osdSizeULoc; GLint osdScreenSizeULoc, osdPosULoc, osdSizeULoc;
GLfloat osdScaleFactorULoc; GLfloat osdScaleFactorULoc;
GLuint osdVertexArray; GLuint osdVertexArray;

View File

@ -23,6 +23,7 @@
#include "types.h" #include "types.h"
#include "Platform.h" #include "Platform.h"
#include "Config.h" #include "Config.h"
#include "GPU.h"
#include "VideoSettingsDialog.h" #include "VideoSettingsDialog.h"
#include "ui_VideoSettingsDialog.h" #include "ui_VideoSettingsDialog.h"
@ -30,11 +31,20 @@
inline bool UsesGL() inline bool UsesGL()
{ {
return (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0); return (Config::ScreenUseGL != 0) || (Config::_3DRenderer != renderer3D_Software);
} }
VideoSettingsDialog* VideoSettingsDialog::currentDlg = nullptr; VideoSettingsDialog* VideoSettingsDialog::currentDlg = nullptr;
// Enables or disables the renderer-dependent widgets according to the
// 3D renderer currently stored in Config::_3DRenderer.
void VideoSettingsDialog::setEnabled()
{
    const auto renderer = Config::_3DRenderer;
    const bool isSoftware = (renderer == renderer3D_Software);
    const bool isClassicGL = (renderer == renderer3D_OpenGL);
    const bool isComputeGL = (renderer == renderer3D_OpenGLCompute);

    // Available only with the software renderer.
    ui->cbGLDisplay->setEnabled(isSoftware);
    ui->cbSoftwareThreaded->setEnabled(isSoftware);

    // Available with any non-software renderer.
    ui->cbxGLResolution->setEnabled(!isSoftware);

    // Each of these belongs to exactly one renderer.
    ui->cbBetterPolygons->setEnabled(isClassicGL);
    ui->cbxComputeHiResCoords->setEnabled(isComputeGL);
}
VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::VideoSettingsDialog) VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::VideoSettingsDialog)
{ {
@ -48,10 +58,12 @@ VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(
oldSoftThreaded = Config::Threaded3D; oldSoftThreaded = Config::Threaded3D;
oldGLScale = Config::GL_ScaleFactor; oldGLScale = Config::GL_ScaleFactor;
oldGLBetterPolygons = Config::GL_BetterPolygons; oldGLBetterPolygons = Config::GL_BetterPolygons;
oldHiresCoordinates = Config::GL_HiresCoordinates;
grp3DRenderer = new QButtonGroup(this); grp3DRenderer = new QButtonGroup(this);
grp3DRenderer->addButton(ui->rb3DSoftware, 0); grp3DRenderer->addButton(ui->rb3DSoftware, renderer3D_Software);
grp3DRenderer->addButton(ui->rb3DOpenGL, 1); grp3DRenderer->addButton(ui->rb3DOpenGL, renderer3D_OpenGL);
grp3DRenderer->addButton(ui->rb3DCompute, renderer3D_OpenGLCompute);
#if QT_VERSION < QT_VERSION_CHECK(5, 15, 0) #if QT_VERSION < QT_VERSION_CHECK(5, 15, 0)
connect(grp3DRenderer, SIGNAL(buttonClicked(int)), this, SLOT(onChange3DRenderer(int))); connect(grp3DRenderer, SIGNAL(buttonClicked(int)), this, SLOT(onChange3DRenderer(int)));
#else #else
@ -75,25 +87,13 @@ VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(
ui->cbxGLResolution->setCurrentIndex(Config::GL_ScaleFactor-1); ui->cbxGLResolution->setCurrentIndex(Config::GL_ScaleFactor-1);
ui->cbBetterPolygons->setChecked(Config::GL_BetterPolygons != 0); ui->cbBetterPolygons->setChecked(Config::GL_BetterPolygons != 0);
ui->cbxComputeHiResCoords->setChecked(Config::GL_HiresCoordinates != 0);
if (!Config::ScreenVSync) if (!Config::ScreenVSync)
ui->sbVSyncInterval->setEnabled(false); ui->sbVSyncInterval->setEnabled(false);
setVsyncControlEnable(UsesGL()); setVsyncControlEnable(UsesGL());
if (Config::_3DRenderer == 0) setEnabled();
{
ui->cbGLDisplay->setEnabled(true);
ui->cbSoftwareThreaded->setEnabled(true);
ui->cbxGLResolution->setEnabled(false);
ui->cbBetterPolygons->setEnabled(false);
}
else
{
ui->cbGLDisplay->setEnabled(false);
ui->cbSoftwareThreaded->setEnabled(false);
ui->cbxGLResolution->setEnabled(true);
ui->cbBetterPolygons->setEnabled(true);
}
} }
VideoSettingsDialog::~VideoSettingsDialog() VideoSettingsDialog::~VideoSettingsDialog()
@ -119,6 +119,7 @@ void VideoSettingsDialog::on_VideoSettingsDialog_rejected()
Config::Threaded3D = oldSoftThreaded; Config::Threaded3D = oldSoftThreaded;
Config::GL_ScaleFactor = oldGLScale; Config::GL_ScaleFactor = oldGLScale;
Config::GL_BetterPolygons = oldGLBetterPolygons; Config::GL_BetterPolygons = oldGLBetterPolygons;
Config::GL_HiresCoordinates = oldHiresCoordinates;
emit updateVideoSettings(old_gl != UsesGL()); emit updateVideoSettings(old_gl != UsesGL());
@ -133,31 +134,18 @@ void VideoSettingsDialog::setVsyncControlEnable(bool hasOGL)
void VideoSettingsDialog::onChange3DRenderer(int renderer) void VideoSettingsDialog::onChange3DRenderer(int renderer)
{ {
bool old_gl = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0); bool old_gl = UsesGL();
Config::_3DRenderer = renderer; Config::_3DRenderer = renderer;
if (renderer == 0) setEnabled();
{
ui->cbGLDisplay->setEnabled(true);
ui->cbSoftwareThreaded->setEnabled(true);
ui->cbxGLResolution->setEnabled(false);
ui->cbBetterPolygons->setEnabled(false);
}
else
{
ui->cbGLDisplay->setEnabled(false);
ui->cbSoftwareThreaded->setEnabled(false);
ui->cbxGLResolution->setEnabled(true);
ui->cbBetterPolygons->setEnabled(true);
}
emit updateVideoSettings(old_gl != UsesGL()); emit updateVideoSettings(old_gl != UsesGL());
} }
void VideoSettingsDialog::on_cbGLDisplay_stateChanged(int state) void VideoSettingsDialog::on_cbGLDisplay_stateChanged(int state)
{ {
bool old_gl = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0); bool old_gl = UsesGL();
Config::ScreenUseGL = (state != 0); Config::ScreenUseGL = (state != 0);
@ -205,3 +193,10 @@ void VideoSettingsDialog::on_cbBetterPolygons_stateChanged(int state)
emit updateVideoSettings(false); emit updateVideoSettings(false);
} }
// Slot for the "high resolution coordinates" checkbox: stores the new state
// in Config and signals a video settings update that does not involve a
// GL on/off change.
void VideoSettingsDialog::on_cbxComputeHiResCoords_stateChanged(int state)
{
    // Any non-zero check state counts as enabled.
    const bool hiresEnabled = (state != 0);
    Config::GL_HiresCoordinates = hiresEnabled;

    emit updateVideoSettings(false);
}

View File

@ -65,10 +65,12 @@ private slots:
void on_cbxGLResolution_currentIndexChanged(int idx); void on_cbxGLResolution_currentIndexChanged(int idx);
void on_cbBetterPolygons_stateChanged(int state); void on_cbBetterPolygons_stateChanged(int state);
void on_cbxComputeHiResCoords_stateChanged(int state);
void on_cbSoftwareThreaded_stateChanged(int state); void on_cbSoftwareThreaded_stateChanged(int state);
private: private:
void setVsyncControlEnable(bool hasOGL); void setVsyncControlEnable(bool hasOGL);
void setEnabled();
Ui::VideoSettingsDialog* ui; Ui::VideoSettingsDialog* ui;
@ -81,6 +83,7 @@ private:
int oldSoftThreaded; int oldSoftThreaded;
int oldGLScale; int oldGLScale;
int oldGLBetterPolygons; int oldGLBetterPolygons;
int oldHiresCoordinates;
}; };
#endif // VIDEOSETTINGSDIALOG_H #endif // VIDEOSETTINGSDIALOG_H

View File

@ -6,7 +6,7 @@
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>408</width> <width>427</width>
<height>262</height> <height>262</height>
</rect> </rect>
</property> </property>
@ -24,7 +24,7 @@
<enum>QLayout::SetFixedSize</enum> <enum>QLayout::SetFixedSize</enum>
</property> </property>
<property name="horizontalSpacing"> <property name="horizontalSpacing">
<number>-1</number> <number>6</number>
</property> </property>
<item row="1" column="1"> <item row="1" column="1">
<widget class="QGroupBox" name="groupBox_3"> <widget class="QGroupBox" name="groupBox_3">
@ -39,13 +39,6 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0">
<widget class="QComboBox" name="cbxGLResolution">
<property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The resolution at which the 3D graphics will be rendered. Higher resolutions improve graphics quality when the main window is enlarged, but may also cause glitches.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
</widget>
</item>
<item row="2" column="0"> <item row="2" column="0">
<widget class="QCheckBox" name="cbBetterPolygons"> <widget class="QCheckBox" name="cbBetterPolygons">
<property name="whatsThis"> <property name="whatsThis">
@ -56,6 +49,20 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="1" column="0">
<widget class="QComboBox" name="cbxGLResolution">
<property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The resolution at which the 3D graphics will be rendered. Higher resolutions improve graphics quality when the main window is enlarged, but may also cause glitches.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QCheckBox" name="cbxComputeHiResCoords">
<property name="text">
<string>Use high resolution coordinates</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
</item> </item>
@ -94,23 +101,7 @@
<string>Display settings</string> <string>Display settings</string>
</property> </property>
<layout class="QGridLayout" name="gridLayout_2"> <layout class="QGridLayout" name="gridLayout_2">
<item row="6" column="0"> <item row="7" column="1">
<widget class="QLabel" name="label_2">
<property name="sizePolicy">
<sizepolicy hsizetype="Fixed" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="text">
<string>VSync interval:</string>
</property>
</widget>
</item>
<item row="6" column="1">
<widget class="QSpinBox" name="sbVSyncInterval"> <widget class="QSpinBox" name="sbVSyncInterval">
<property name="whatsThis"> <property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string> <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
@ -123,7 +114,7 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="4" column="0" colspan="2"> <item row="5" column="0" colspan="2">
<widget class="QCheckBox" name="cbGLDisplay"> <widget class="QCheckBox" name="cbGLDisplay">
<property name="whatsThis"> <property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Use OpenGL to draw the DS screens to the main window. May result in better frame pacing. Mandatory when using the OpenGL 3D renderer.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string> <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Use OpenGL to draw the DS screens to the main window. May result in better frame pacing. Mandatory when using the OpenGL 3D renderer.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
@ -133,17 +124,7 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="5" column="0" colspan="2"> <item row="4" column="0" colspan="2">
<widget class="QCheckBox" name="cbVSync">
<property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;When using OpenGL, synchronize the video output to your monitor's refresh rate.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="text">
<string>VSync</string>
</property>
</widget>
</item>
<item row="3" column="0" colspan="2">
<spacer name="verticalSpacer"> <spacer name="verticalSpacer">
<property name="orientation"> <property name="orientation">
<enum>Qt::Vertical</enum> <enum>Qt::Vertical</enum>
@ -159,13 +140,39 @@
</property> </property>
</spacer> </spacer>
</item> </item>
<item row="6" column="0" colspan="2">
<widget class="QCheckBox" name="cbVSync">
<property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;When using OpenGL, synchronize the video output to your monitor's refresh rate.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="text">
<string>VSync</string>
</property>
</widget>
</item>
<item row="7" column="0">
<widget class="QLabel" name="label_2">
<property name="sizePolicy">
<sizepolicy hsizetype="Fixed" vsizetype="Fixed">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property>
<property name="text">
<string>VSync interval:</string>
</property>
</widget>
</item>
<item row="2" column="0" colspan="2"> <item row="2" column="0" colspan="2">
<widget class="QRadioButton" name="rb3DOpenGL"> <widget class="QRadioButton" name="rb3DOpenGL">
<property name="whatsThis"> <property name="whatsThis">
<string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The OpenGL renderer may be faster than software and supports graphical enhancements, but is more prone to glitches.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string> <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The OpenGL renderer may be faster than software and supports graphical enhancements, but is more prone to glitches.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
</property> </property>
<property name="text"> <property name="text">
<string>OpenGL</string> <string>OpenGL (Classic)</string>
</property> </property>
</widget> </widget>
</item> </item>
@ -186,6 +193,13 @@
</property> </property>
</widget> </widget>
</item> </item>
<item row="3" column="0">
<widget class="QRadioButton" name="rb3DCompute">
<property name="text">
<string>OpenGL (Compute shader)</string>
</property>
</widget>
</item>
</layout> </layout>
</widget> </widget>
</item> </item>

View File

@ -2048,6 +2048,7 @@ void MainWindow::onUpdateVideoSettings(bool glchange)
connect(emuThread, SIGNAL(windowUpdate()), panel, SLOT(repaint())); connect(emuThread, SIGNAL(windowUpdate()), panel, SLOT(repaint()));
} }
printf("update video settings\n");
videoSettingsDirty = true; videoSettingsDirty = true;
if (glchange) if (glchange)

View File

@ -175,10 +175,6 @@ bool camStarted[2];
//extern int AspectRatiosNum; //extern int AspectRatiosNum;
static bool FileExtensionInList(const QString& filename, const QStringList& extensions, Qt::CaseSensitivity cs = Qt::CaseInsensitive) static bool FileExtensionInList(const QString& filename, const QStringList& extensions, Qt::CaseSensitivity cs = Qt::CaseInsensitive)
{ {
return std::any_of(extensions.cbegin(), extensions.cend(), [&](const auto& ext) { return std::any_of(extensions.cbegin(), extensions.cend(), [&](const auto& ext) {
@ -339,10 +335,10 @@ int main(int argc, char** argv)
if (!Config::Load()) QMessageBox::critical(NULL, "melonDS", "Unable to write to config.\nPlease check the write permissions of the folder you placed melonDS in."); if (!Config::Load()) QMessageBox::critical(NULL, "melonDS", "Unable to write to config.\nPlease check the write permissions of the folder you placed melonDS in.");
#define SANITIZE(var, min, max) { var = std::clamp(var, min, max); } #define SANITIZE(var, min, max) { var = std::clamp<int>(var, min, max); }
SANITIZE(Config::ConsoleType, 0, 1); SANITIZE(Config::ConsoleType, 0, 1);
#ifdef OGLRENDERER_ENABLED #ifdef OGLRENDERER_ENABLED
SANITIZE(Config::_3DRenderer, 0, 1); // 0 is the software renderer, 1 is the OpenGL renderer SANITIZE(Config::_3DRenderer, 0, renderer3D_Max);
#else #else
SANITIZE(Config::_3DRenderer, 0, 0); SANITIZE(Config::_3DRenderer, 0, 0);
#endif #endif