Allow for more modular renderer backends (#990)

* Draft GPU3D renderer modularization
* Update sources C++ standard to C++17
  The top-level `CMakeLists.txt` is already using the C++17 standard.
* Move GLCompositor into class type
  Some other misc fixes to push towards better modularity.
* Make renderer-implementation types move-only
  These types are going to be holding onto handles of GPU-side resources and shouldn't ever be copied around.
* Fix OSX: Remove 'register' storage class specifier
  `register` has been removed in C++17... But this keyword hasn't done anything in years anyway. OSX builds consider this "warning" an error and it stops the whole build.
* Add RestartFrame to Renderer3D interface
* Move Accelerated property to Renderer3D interface
  There are points in the code base where we do `renderer != 0` to know if we are feeding an OpenGL renderer. Rather than that, we can instead just have this be a property of the renderer itself. With this pattern a renderer can just say how it wants its data to come in, rather than have everyone know that they're talking to an OpenGL renderer.
* Remove Accelerated flag from GPU
* Move 2D_Soft interface into a separate header
  Also make the current 2D engine an "owned" unique_ptr.
* Update alignment attribute to standard alignas
  Uses standardized `alignas` rather than compiler-specific attributes. https://en.cppreference.com/w/cpp/language/alignas
* Fix Clang: alignas specifier
  Alignment must be specified before the array to align the entire array. https://en.cppreference.com/w/cpp/language/alignas
* Converted Renderer3D Accelerated to variable
  This flag is checked a lot during scanline rasterization. So rather than having an expensive vtable-lookup call during mainline rendering code, it is now a public constant bool variable that is written to only once during Renderer3D initialization.
2025-07-25 23:29:55 -06:00 · 2021-02-09 14:38:51 -08:00
parent 891427c75c
commit a7029aebae
16 changed files with 1039 additions and 836 deletions
--- a/src/GPU3D_Soft.cpp
+++ b/src/GPU3D_Soft.cpp
@ -16,82 +16,43 @@
    with melonDS. If not, see http://www.gnu.org/licenses/.
 */

+#include "GPU3D_Soft.h"
+
 #include <stdio.h>
 #include <string.h>
 #include "NDS.h"
 #include "GPU.h"
 #include "Config.h"
-#include "Platform.h"


 namespace GPU3D
 {
-namespace SoftRenderer
-{
-
-// buffer dimensions are 258x194 to add a offscreen 1px border
-// which simplifies edge marking tests
-// buffer is duplicated to keep track of the two topmost pixels
-// TODO: check if the hardware can accidentally plot pixels
-// offscreen in that border
-
-const int ScanlineWidth = 258;
-const int NumScanlines = 194;
-const int BufferSize = ScanlineWidth * NumScanlines;
-const int FirstPixelOffset = ScanlineWidth + 1;
-
-u32 ColorBuffer[BufferSize * 2];
-u32 DepthBuffer[BufferSize * 2];
-u32 AttrBuffer[BufferSize * 2];
-
-// attribute buffer:
-// bit0-3: edge flags (left/right/top/bottom)
-// bit4: backfacing flag
-// bit8-12: antialiasing alpha
-// bit15: fog enable
-// bit16-21: polygon ID for translucent pixels
-// bit22: translucent flag
-// bit24-29: polygon ID for opaque pixels
-
-u8 StencilBuffer[256*2];
-bool PrevIsShadowMask;
-
-bool Enabled;
-
-bool FrameIdentical;
-
-// threading
-
-bool Threaded;
-Platform::Thread* RenderThread;
-bool RenderThreadRunning;
-bool RenderThreadRendering;
-Platform::Semaphore* Sema_RenderStart;
-Platform::Semaphore* Sema_RenderDone;
-Platform::Semaphore* Sema_ScanlineCount;

 void RenderThreadFunc();


-void StopRenderThread()
+void SoftRenderer::StopRenderThread()
 {
    if (RenderThreadRunning)
    {
        RenderThreadRunning = false;
        Platform::Semaphore_Post(Sema_RenderStart);
-        Platform::Thread_Wait(RenderThread);
-        Platform::Thread_Free(RenderThread);
+        // Platform::Thread_Wait(RenderThread);
+        // Platform::Thread_Free(RenderThread);
+        RenderThread.join();
+        
    }
 }

-void SetupRenderThread()
+void SoftRenderer::SetupRenderThread()
 {
    if (Threaded)
    {
        if (!RenderThreadRunning)
        {
            RenderThreadRunning = true;
-            RenderThread = Platform::Thread_Create(RenderThreadFunc);
+            //RenderThread = Platform::Thread_Create(RenderThreadFunc);
+            RenderThread = std::thread(&SoftRenderer::RenderThreadFunc, this);
        }

        // otherwise more than one frame can be queued up at once
@ -113,7 +74,13 @@ void SetupRenderThread()
 }


-bool Init()
+SoftRenderer::SoftRenderer()
+    : Renderer3D(false)
+{
+
+}
+
+bool SoftRenderer::Init()
 {
    Sema_RenderStart = Platform::Semaphore_Create();
    Sema_RenderDone = Platform::Semaphore_Create();
@ -126,7 +93,7 @@ bool Init()
    return true;
 }

-void DeInit()
+void SoftRenderer::DeInit()
 {
    StopRenderThread();

@ -135,7 +102,7 @@ void DeInit()
    Platform::Semaphore_Free(Sema_ScanlineCount);
 }

-void Reset()
+void SoftRenderer::Reset()
 {
    memset(ColorBuffer, 0, BufferSize * 2 * 4);
    memset(DepthBuffer, 0, BufferSize * 2 * 4);
@ -146,428 +113,13 @@ void Reset()
    SetupRenderThread();
 }

-void SetRenderSettings(GPU::RenderSettings& settings)
+void SoftRenderer::SetRenderSettings(GPU::RenderSettings& settings)
 {
    Threaded = settings.Soft_Threaded;
    SetupRenderThread();
 }

-
-
-// Notes on the interpolator:
-//
-// This is a theory on how the DS hardware interpolates values. It matches hardware output
-// in the tests I did, but the hardware may be doing it differently. You never know.
-//
-// Assuming you want to perspective-correctly interpolate a variable named A across two points
-// in a typical rasterizer, you would calculate A/W and 1/W at each point, interpolate linearly,
-// then divide A/W by 1/W to recover the correct A value.
-//
-// The DS GPU approximates interpolation by calculating a perspective-correct interpolation
-// between 0 and 1, then using the result as a factor to linearly interpolate the actual
-// vertex attributes. The factor has 9 bits of precision when interpolating along Y and
-// 8 bits along X.
-//
-// There's a special path for when the two W values are equal: it directly does linear
-// interpolation, avoiding precision loss from the aforementioned approximation.
-// Which is desirable when using the GPU to draw 2D graphics.
-
-template<int dir>
-class Interpolator
-{
-public:
-    Interpolator() {}
-    Interpolator(s32 x0, s32 x1, s32 w0, s32 w1)
-    {
-        Setup(x0, x1, w0, w1);
-    }
-
-    void Setup(s32 x0, s32 x1, s32 w0, s32 w1)
-    {
-        this->x0 = x0;
-        this->x1 = x1;
-        this->xdiff = x1 - x0;
-
-        // calculate reciprocals for linear mode and Z interpolation
-        // TODO eventually: use a faster reciprocal function?
-        if (this->xdiff != 0)
-            this->xrecip = (1<<30) / this->xdiff;
-        else
-            this->xrecip = 0;
-        this->xrecip_z = this->xrecip >> 8;
-
-        // linear mode is used if both W values are equal and have
-        // low-order bits cleared (0-6 along X, 1-6 along Y)
-        u32 mask = dir ? 0x7E : 0x7F;
-        if ((w0 == w1) && !(w0 & mask) && !(w1 & mask))
-            this->linear = true;
-        else
-            this->linear = false;
-
-        if (dir)
-        {
-            // along Y
-
-            if ((w0 & 0x1) && !(w1 & 0x1))
-            {
-                this->w0n = w0 - 1;
-                this->w0d = w0 + 1;
-                this->w1d = w1;
-            }
-            else
-            {
-                this->w0n = w0 & 0xFFFE;
-                this->w0d = w0 & 0xFFFE;
-                this->w1d = w1 & 0xFFFE;
-            }
-
-            this->shift = 9;
-        }
-        else
-        {
-            // along X
-
-            this->w0n = w0;
-            this->w0d = w0;
-            this->w1d = w1;
-
-            this->shift = 8;
-        }
-    }
-
-    void SetX(s32 x)
-    {
-        x -= x0;
-        this->x = x;
-        if (xdiff != 0 && !linear)
-        {
-            s64 num = ((s64)x * w0n) << shift;
-            s32 den = (x * w0d) + ((xdiff-x) * w1d);
-
-            // this seems to be a proper division on hardware :/
-            // I haven't been able to find cases that produce imperfect output
-            if (den == 0) yfactor = 0;
-            else          yfactor = (s32)(num / den);
-        }
-    }
-
-    s32 Interpolate(s32 y0, s32 y1)
-    {
-        if (xdiff == 0 || y0 == y1) return y0;
-
-        if (!linear)
-        {
-            // perspective-correct approx. interpolation
-            if (y0 < y1)
-                return y0 + (((y1-y0) * yfactor) >> shift);
-            else
-                return y1 + (((y0-y1) * ((1<<shift)-yfactor)) >> shift);
-        }
-        else
-        {
-            // linear interpolation
-            // checkme: the rounding bias there (3<<24) is a guess
-            if (y0 < y1)
-                return y0 + ((((s64)(y1-y0) * x * xrecip) + (3<<24)) >> 30);
-            else
-                return y1 + ((((s64)(y0-y1) * (xdiff-x) * xrecip) + (3<<24)) >> 30);
-        }
-    }
-
-    s32 InterpolateZ(s32 z0, s32 z1, bool wbuffer)
-    {
-        if (xdiff == 0 || z0 == z1) return z0;
-
-        if (wbuffer)
-        {
-            // W-buffering: perspective-correct approx. interpolation
-            if (z0 < z1)
-                return z0 + (((s64)(z1-z0) * yfactor) >> shift);
-            else
-                return z1 + (((s64)(z0-z1) * ((1<<shift)-yfactor)) >> shift);
-        }
-        else
-        {
-            // Z-buffering: linear interpolation
-            // still doesn't quite match hardware...
-            s32 base, disp, factor;
-
-            if (z0 < z1)
-            {
-                base = z0;
-                disp = z1 - z0;
-                factor = x;
-            }
-            else
-            {
-                base = z1;
-                disp = z0 - z1,
-                factor = xdiff - x;
-            }
-
-            if (dir)
-            {
-                int shift = 0;
-                while (disp > 0x3FF)
-                {
-                    disp >>= 1;
-                    shift++;
-                }
-
-                return base + ((((s64)disp * factor * xrecip_z) >> 22) << shift);
-            }
-            else
-            {
-                disp >>= 9;
-                return base + (((s64)disp * factor * xrecip_z) >> 13);
-            }
-        }
-    }
-
-private:
-    s32 x0, x1, xdiff, x;
-
-    int shift;
-    bool linear;
-
-    s32 xrecip, xrecip_z;
-    s32 w0n, w0d, w1d;
-
-    u32 yfactor;
-};
-
-
-template<int side>
-class Slope
-{
-public:
-    Slope() {}
-
-    s32 SetupDummy(s32 x0)
-    {
-        if (side)
-        {
-            dx = -0x40000;
-            x0--;
-        }
-        else
-        {
-            dx = 0;
-        }
-
-        this->x0 = x0;
-        this->xmin = x0;
-        this->xmax = x0;
-
-        Increment = 0;
-        XMajor = false;
-
-        Interp.Setup(0, 0, 0, 0);
-        Interp.SetX(0);
-
-        xcov_incr = 0;
-
-        return x0;
-    }
-
-    s32 Setup(s32 x0, s32 x1, s32 y0, s32 y1, s32 w0, s32 w1, s32 y)
-    {
-        this->x0 = x0;
-        this->y = y;
-
-        if (x1 > x0)
-        {
-            this->xmin = x0;
-            this->xmax = x1-1;
-            this->Negative = false;
-        }
-        else if (x1 < x0)
-        {
-            this->xmin = x1;
-            this->xmax = x0-1;
-            this->Negative = true;
-        }
-        else
-        {
-            this->xmin = x0;
-            if (side) this->xmin--;
-            this->xmax = this->xmin;
-            this->Negative = false;
-        }
-
-        xlen = xmax+1 - xmin;
-        ylen = y1 - y0;
-
-        // slope increment has a 18-bit fractional part
-        // note: for some reason, x/y isn't calculated directly,
-        // instead, 1/y is calculated and then multiplied by x
-        // TODO: this is still not perfect (see for example x=169 y=33)
-        if (ylen == 0)
-            Increment = 0;
-        else if (ylen == xlen)
-            Increment = 0x40000;
-        else
-        {
-            s32 yrecip = (1<<18) / ylen;
-            Increment = (x1-x0) * yrecip;
-            if (Increment < 0) Increment = -Increment;
-        }
-
-        XMajor = (Increment > 0x40000);
-
-        if (side)
-        {
-            // right
-
-            if (XMajor)              dx = Negative ? (0x20000 + 0x40000) : (Increment - 0x20000);
-            else if (Increment != 0) dx = Negative ? 0x40000 : 0;
-            else                     dx = -0x40000;
-        }
-        else
-        {
-            // left
-
-            if (XMajor)              dx = Negative ? ((Increment - 0x20000) + 0x40000) : 0x20000;
-            else if (Increment != 0) dx = Negative ? 0x40000 : 0;
-            else                     dx = 0;
-        }
-
-        dx += (y - y0) * Increment;
-
-        s32 x = XVal();
-
-        if (XMajor)
-        {
-            if (side) Interp.Setup(x0-1, x1-1, w0, w1); // checkme
-            else      Interp.Setup(x0, x1, w0, w1);
-            Interp.SetX(x);
-
-            // used for calculating AA coverage
-            xcov_incr = (ylen << 10) / xlen;
-        }
-        else
-        {
-            Interp.Setup(y0, y1, w0, w1);
-            Interp.SetX(y);
-        }
-
-        return x;
-    }
-
-    s32 Step()
-    {
-        dx += Increment;
-        y++;
-
-        s32 x = XVal();
-        if (XMajor)
-        {
-            Interp.SetX(x);
-        }
-        else
-        {
-            Interp.SetX(y);
-        }
-        return x;
-    }
-
-    s32 XVal()
-    {
-        s32 ret;
-        if (Negative) ret = x0 - (dx >> 18);
-        else          ret = x0 + (dx >> 18);
-
-        if (ret < xmin) ret = xmin;
-        else if (ret > xmax) ret = xmax;
-        return ret;
-    }
-
-    void EdgeParams_XMajor(s32* length, s32* coverage)
-    {
-        if (side ^ Negative)
-            *length = (dx >> 18) - ((dx-Increment) >> 18);
-        else
-            *length = ((dx+Increment) >> 18) - (dx >> 18);
-
-        // for X-major edges, we return the coverage
-        // for the first pixel, and the increment for
-        // further pixels on the same scanline
-        s32 startx = dx >> 18;
-        if (Negative) startx = xlen - startx;
-        if (side)     startx = startx - *length + 1;
-
-        s32 startcov = (((startx << 10) + 0x1FF) * ylen) / xlen;
-        *coverage = (1<<31) | ((startcov & 0x3FF) << 12) | (xcov_incr & 0x3FF);
-    }
-
-    void EdgeParams_YMajor(s32* length, s32* coverage)
-    {
-        *length = 1;
-
-        if (Increment == 0)
-        {
-            *coverage = 31;
-        }
-        else
-        {
-            s32 cov = ((dx >> 9) + (Increment >> 10)) >> 4;
-            if ((cov >> 5) != (dx >> 18)) cov = 31;
-            cov &= 0x1F;
-            if (!(side ^ Negative)) cov = 0x1F - cov;
-
-            *coverage = cov;
-        }
-    }
-
-    void EdgeParams(s32* length, s32* coverage)
-    {
-        if (XMajor)
-            return EdgeParams_XMajor(length, coverage);
-        else
-            return EdgeParams_YMajor(length, coverage);
-    }
-
-    s32 Increment;
-    bool Negative;
-    bool XMajor;
-    Interpolator<1> Interp;
-
-private:
-    s32 x0, xmin, xmax;
-    s32 xlen, ylen;
-    s32 dx;
-    s32 y;
-
-    s32 xcov_incr;
-    s32 ycoverage, ycov_incr;
-};
-
-struct RendererPolygon
-{
-    Polygon* PolyData;
-
-    Slope<0> SlopeL;
-    Slope<1> SlopeR;
-    s32 XL, XR;
-    u32 CurVL, CurVR;
-    u32 NextVL, NextVR;
-
-};
-
-RendererPolygon PolygonList[2048];
-
-template <typename T>
-inline T ReadVRAM_Texture(u32 addr)
-{
-    return *(T*)&GPU::VRAMFlat_Texture[addr & 0x7FFFF];
-}
-template <typename T>
-inline T ReadVRAM_TexPal(u32 addr)
-{
-    return *(T*)&GPU::VRAMFlat_TexPal[addr & 0x1FFFF];
-}
-
-void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha)
+void SoftRenderer::TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha)
 {
    u32 vramaddr = (texparam & 0xFFFF) << 3;

@ -873,7 +425,7 @@ u32 AlphaBlend(u32 srccolor, u32 dstcolor, u32 alpha)
    return srcR | (srcG << 8) | (srcB << 16) | (dstalpha << 24);
 }

-u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t)
+u32 SoftRenderer::RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t)
 {
    u8 r, g, b, a;

@ -981,7 +533,7 @@ u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t)
    return r | (g << 8) | (b << 16) | (a << 24);
 }

-void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow)
+void SoftRenderer::PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow)
 {
    u32 dstattr = AttrBuffer[pixeladdr];
    u32 attr = (polyattr & 0xE0F0) | ((polyattr >> 8) & 0xFF0000) | (1<<22) | (dstattr & 0xFF001F0F);
@ -1020,7 +572,7 @@ void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 sha
    AttrBuffer[pixeladdr] = attr;
 }

-void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y)
+void SoftRenderer::SetupPolygonLeftEdge(SoftRenderer::RendererPolygon* rp, s32 y)
 {
    Polygon* polygon = rp->PolyData;

@ -1047,7 +599,7 @@ void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y)
                              polygon->FinalW[rp->CurVL], polygon->FinalW[rp->NextVL], y);
 }

-void SetupPolygonRightEdge(RendererPolygon* rp, s32 y)
+void SoftRenderer::SetupPolygonRightEdge(SoftRenderer::RendererPolygon* rp, s32 y)
 {
    Polygon* polygon = rp->PolyData;

@ -1074,7 +626,7 @@ void SetupPolygonRightEdge(RendererPolygon* rp, s32 y)
                              polygon->FinalW[rp->CurVR], polygon->FinalW[rp->NextVR], y);
 }

-void SetupPolygon(RendererPolygon* rp, Polygon* polygon)
+void SoftRenderer::SetupPolygon(SoftRenderer::RendererPolygon* rp, Polygon* polygon)
 {
    u32 nverts = polygon->NumVertices;

@ -1127,7 +679,7 @@ void SetupPolygon(RendererPolygon* rp, Polygon* polygon)
    }
 }

-void RenderShadowMaskScanline(RendererPolygon* rp, s32 y)
+void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y)
 {
    Polygon* polygon = rp->PolyData;

@ -1340,7 +892,7 @@ void RenderShadowMaskScanline(RendererPolygon* rp, s32 y)
    rp->XR = rp->SlopeR.Step();
 }

-void RenderPolygonScanline(RendererPolygon* rp, s32 y)
+void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y)
 {
    Polygon* polygon = rp->PolyData;

@ -1755,7 +1307,7 @@ void RenderPolygonScanline(RendererPolygon* rp, s32 y)
    rp->XR = rp->SlopeR.Step();
 }

-void RenderScanline(s32 y, int npolys)
+void SoftRenderer::RenderScanline(s32 y, int npolys)
 {
    for (int i = 0; i < npolys; i++)
    {
@ -1772,8 +1324,7 @@ void RenderScanline(s32 y, int npolys)
    }
 }

-
-u32 CalculateFogDensity(u32 pixeladdr)
+u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr)
 {
    u32 z = DepthBuffer[pixeladdr];
    u32 densityid, densityfrac;
@ -1812,7 +1363,7 @@ u32 CalculateFogDensity(u32 pixeladdr)
    return density;
 }

-void ScanlineFinalPass(s32 y)
+void SoftRenderer::ScanlineFinalPass(s32 y)
 {
    // to consider:
    // clearing all polygon fog flags if the master flag isn't set?
@ -1981,7 +1532,7 @@ void ScanlineFinalPass(s32 y)
    }
 }

-void ClearBuffers()
+void SoftRenderer::ClearBuffers()
 {
    u32 clearz = ((RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF;
    u32 polyid = RenderClearAttr1 & 0x3F000000; // this sets the opaque polygonID
@ -2055,7 +1606,7 @@ void ClearBuffers()
        u32 a = (RenderClearAttr1 >> 16) & 0x1F;
        u32 color = r | (g << 8) | (b << 16) | (a << 24);

-		polyid |= (RenderClearAttr1 & 0x8000);
+        polyid |= (RenderClearAttr1 & 0x8000);

        for (int y = 0; y < ScanlineWidth*192; y+=ScanlineWidth)
        {
@ -2070,7 +1621,7 @@ void ClearBuffers()
    }
 }

-void RenderPolygons(bool threaded, Polygon** polygons, int npolys)
+void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys)
 {
    int j = 0;
    for (int i = 0; i < npolys; i++)
@ -2096,13 +1647,13 @@ void RenderPolygons(bool threaded, Polygon** polygons, int npolys)
        Platform::Semaphore_Post(Sema_ScanlineCount);
 }

-void VCount144()
+void SoftRenderer::VCount144()
 {
    if (RenderThreadRunning)
        Platform::Semaphore_Wait(Sema_RenderDone);
 }

-void RenderFrame()
+void SoftRenderer::RenderFrame()
 {
    auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture);
    auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal);
@ -2123,7 +1674,12 @@ void RenderFrame()
    }
 }

-void RenderThreadFunc()
+void SoftRenderer::RestartFrame()
+{
+    SetupRenderThread();
+}
+
+void SoftRenderer::RenderThreadFunc()
 {
    for (;;)
    {
@ -2146,7 +1702,7 @@ void RenderThreadFunc()
    }
 }

-u32* GetLine(int line)
+u32* SoftRenderer::GetLine(int line)
 {
    if (RenderThreadRunning)
    {
@ -2158,4 +1714,3 @@ u32* GetLine(int line)
 }

 }
-}