From 249687a2ce9c0fd08f5e2b2b69f0129754d00214 Mon Sep 17 00:00:00 2001
From: Jaklyy <102590697+Jaklyy@users.noreply.github.com>
Date: Sat, 24 Feb 2024 14:18:45 -0500
Subject: [PATCH] rework 4: now with proper edge marking bug emulation!

---
 src/GPU3D.h        |  30 ++---
 src/GPU3D_Soft.cpp | 273 +++++++++++++++------------------------------
 src/GPU3D_Soft.h   |   1 +
 3 files changed, 106 insertions(+), 198 deletions(-)

diff --git a/src/GPU3D.h b/src/GPU3D.h
index 27162854..8719a7e1 100644
--- a/src/GPU3D.h
+++ b/src/GPU3D.h
@@ -341,13 +341,13 @@ public:
     static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed;
 
 
-    static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair.
-    static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline
+    //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair.
+    //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline
                                                                     // and beginning reading the second scanline of a scanline pair.
-    static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline.
-    static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment.
-    static constexpr int InitGPU2DTimeout = 52128 * TimingFrac; // 51618? 51874? | when it finishes reading the first scanline.
-    static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines.
+    //static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline.
+    //static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment.
+    static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline.
+    //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines.
     static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this?
 
     // GPU 3D Rasterization Timings: For Emulating Scanline Timeout
@@ -358,22 +358,20 @@ public:
     //static constexpr int ScanlineBreak2 = 40 * TimingFrac;
     //static constexpr int FakeTiming = 2 * TimingFrac;
     //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one.
-    static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair
-    static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is,
+    //static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair
+    //static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is,
                                                          // it just cares about if its the first 50 scanlines to speedrun rendering?
     static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress
                                                         // (can be interpreted as the minimum amount of cycles for the next scanline
                                                         // pair to start after the previous pair began) (related to final pass?)
     static constexpr int ScanlinePushDelay = 242 * TimingFrac;
-    static constexpr int TimeoutIncrement = 2130 * TimingFrac;
-    static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair
-    static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains"
-    static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126)
+    //static constexpr int TimeoutIncrement = 2130 * TimingFrac;
+    //static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair
+    //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains"
+    //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126)
                                                            // (why does the next pair get more time if the previous scanline is aborted?)
     static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set
-    static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?)
-                                                    // (Amount of time before the end of the cycle a scanline must abort?)
-    static constexpr int FinishScanline = 512 * TimingFrac;
+    //static constexpr int FinishScanline = 512 * TimingFrac;
 
     // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors
 
@@ -392,6 +390,8 @@ public:
     static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1.
                                                          // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info)
     static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?)
+    static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?)
+                                                    // (Amount of time before the end of the cycle a scanline must abort?)
 
    // static constexpr int RasterTimingCap = 51116 * TimingFrac;
    // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED
diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp
index 923b8a77..d3b72e08 100644
--- a/src/GPU3D_Soft.cpp
+++ b/src/GPU3D_Soft.cpp
@@ -19,6 +19,7 @@
 #include "GPU3D_Soft.h"
 
 #include <algorithm>
+#include <initializer_list>
 #include <stdio.h>
 #include <string.h>
 #include "NDS.h"
@@ -175,6 +176,8 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter)
 
 bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter)
 {
+    DoTimings(RastDelay, timingcounter);
+
     // determine the timing impact of the first polygon's slopes.
     
     Polygon* polygon = rp->PolyData;
@@ -1457,6 +1460,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3
 
 void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter)
 {
+    *timingcounter = 0;
     bool abort = false;
     bool first = true;
     for (int i = 0; i < npolys; i++)
@@ -1466,7 +1470,7 @@ void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timing
 
         if (y == polygon->YBottom && y != polygon->YTop)
         {
-            if (!abort) abort = (first && DoTimings(FirstNull, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter);
+            if (!abort) abort = (first && DoTimings(FirstNull+RastDelay, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter);
 
             first = false;
         }
@@ -1555,6 +1559,7 @@ bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z, u32 pixel
     }
 }
 
+template <bool push>
 void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext)
 {
     // to consider:
@@ -1728,6 +1733,11 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev,
             ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24);
         }
     }
+    if constexpr (push)
+    {
+        memcpy(&FinalBuffer[y*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], ScanlineWidth*4);
+        Platform::Semaphore_Post(Sema_ScanlineCount);
+    }
 }
 
 void SoftRenderer::ClearBuffers(const GPU& gpu)
@@ -1846,190 +1856,87 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
     //init internal buffer
     ClearBuffers(gpu);
 
-    // init all this junk i need to keep track of
-    s32 rasterevents[RasterEvents_MAX];
-    rasterevents[RenderStart] = 0;
-    rasterevents[RenderFinal] = FrameLength;
-    rasterevents[PushScanline] = FrameLength;
-    rasterevents[PushScanlineP2] = FrameLength;
-    rasterevents[ScanlineRead] = InitGPU2DTimeout;
-    ScanlineTimeout = FrameLength;
-    RasterTiming = 0;
-    s32 rastertimingeven = 0;
-    s32 rastertimingodd = 0;
-    u8 scanlinesread = 0;
-    u8 scanlinesinit = 0;
-    u8 scanlinesfin = 0;
-    u8 scanlinespushed = 0;
-    u8 scanlinespushed2 = 0;
-    s16 scanlineswaitingforpush = 0;
-    s16 scanlineswaitingforread = 0;
-    u8 nextevent;
-    u16 leftovers;
-    bool evenread = false;
-    s32 timespent = 0;
-    s32 prevtimespent = 0;
-    bool edgebug = false;
-    bool prevedgebug = false;
-
-    // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame to begin
-    while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < (FrameLength-RastDelay)))
+    u32 slread[192]; // scanline read times
+    for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time?
     {
-        // check all events to find the earliest scheduled one
-        nextevent = 0;
-        for (u8 i = 1; i < RasterEvents_MAX; i++)
-        {
-            if (rasterevents[i] < rasterevents[nextevent])
-                nextevent = i;
-        }
-
-        // if all events are scheduled for after the next frame begins, ABORT
-        if (rasterevents[nextevent] >= FrameLength) break;
-
-        switch (nextevent)
-        {
-
-        // initial rendering pass (polygons, texturing, etc.) (variable cycle length)
-        case RenderStart:
-        {
-            // set current raster time to the start of the event
-            RasterTiming = rasterevents[RenderStart];
-
-            s32 rastertimingeven = 0;
-            s32 rastertimingodd = 0;
-            // scanlines are rendered in pairs of two
-            RenderScanline(gpu, scanlinesinit, j, &rastertimingeven);
-            RenderScanline(gpu, scanlinesinit+1, j, &rastertimingodd);
-            scanlinesinit += 2;
-
-            // a new scanline pair cannot begin until both scanlines are finished.
-            prevtimespent = timespent;
-            timespent = std::max(rastertimingeven, rastertimingodd);
-
-            // a new scanline pair cannot begin until the finishing pass + push is done.
-            if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen))
-                RasterTiming += FinalPassLen;
-            else
-                RasterTiming += timespent;
-
-            // 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does.
-            s32 timeoutdist = ScanlineTimeout - RasterTiming;
-            prevedgebug = edgebug;
-            if (timeoutdist < 49385) edgebug = true;
-            else edgebug = false;
-            RasterTiming += std::clamp(timeoutdist, 0, 12);
-
-            //set next scanline timeout
-            if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen + (ScanlineReadInc*evenread);//(ScanlineReadSpeed+RastDelay);
-            else ScanlineTimeout += TimeoutIncrement;
-
-            // schedule next scanline pair + the final pass of the latest pair
-            rasterevents[RenderFinal] = RasterTiming;
-            if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?)
-            else rasterevents[RenderStart] = FrameLength;
-            break;
-        }
-
-        // final rendering pass (edge marking, anti-aliasing, fog) (fixed length of 496 (maybe 500?) cycles)
-        case RenderFinal:
-        {
-            // schedule a scanline push event
-            rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay;
-
-            // if the first scanline pair was just finished only render one scanline
-            if (scanlinesfin > 0)
-            {
-                ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug);
-                scanlineswaitingforpush++;
-                scanlinesfin++;
-            }
-
-            // if the last scanline pair was just finished only render one scanline
-            if (scanlinesfin < 191)
-            {
-                ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug);
-                scanlineswaitingforpush++;
-                scanlinesfin++;
-            }
-            // unschedule final pass event
-            if (scanlinesfin != 191)
-                rasterevents[RenderFinal] = FrameLength;
-            else // schedule next final pass event to immediately after the current one
-                rasterevents[RenderFinal] += FinalPassLen;
-            break;
-        }
-
-        // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) 256?
-        case PushScanline:
-        {
-            // reschedule events if buffer is full
-            if (scanlineswaitingforread >= 48)
-            {
-                rasterevents[PushScanline] = rasterevents[ScanlineRead];
-                
-                // dont reschedule these events if they're done.
-                if (scanlinesinit < 192)
-                    rasterevents[RenderStart] =  rasterevents[ScanlineRead] + RastDelay;
-                if (scanlinesfin < 192)
-                    rasterevents[RenderFinal] = rasterevents[ScanlineRead];
-
-                break;
-            }
-
-            // if a scanline push might intersect a read determine the point at which it intersects
-            s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]);
-            leftovers = BeginPushScanline(scanlinespushed, pixelstopush);
-
-            scanlineswaitingforpush--;
-            scanlinespushed++;
-
-            // schedule the finish push event if needed
-            if (leftovers != 0) rasterevents[PushScanlineP2] = rasterevents[ScanlineRead];
-            else
-            {
-                scanlineswaitingforread++;
-                scanlinespushed2++;
-            }
-
-            if (scanlineswaitingforpush <= 0)
-                rasterevents[PushScanline] = FrameLength; // unsched event if no scanlines are waiting to be finished
-
-            break;
-        }
-
-        // 2d engine reading scanlines from the intermediary "framebuffer"
-        case ScanlineRead:
-        {
-            // read scanline from buffer
-            ReadScanline(scanlinesread);
-
-            // avoid breaking seperate thread.
-            if constexpr (threaded)
-                Platform::Semaphore_Post(Sema_ScanlineCount);
-
-            scanlinesread++;
-            scanlineswaitingforread--;
-            evenread = !evenread;
-
-            // reschedule event for one scanline later unless all scanlines have been read
-            if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc;
-            else rasterevents[ScanlineRead] = FrameLength;
-            break;
-        }
-
-        // finish pushing a scanline to the buffer if it got interrupted by the read process.
-        case PushScanlineP2:
-        {
-            FinishPushScanline(scanlinespushed2, leftovers);
-            scanlineswaitingforread++;
-            scanlinespushed2++;
-
-            // unschedule event if all partially pushed scanlines have been pushed
-            if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = FrameLength;
-            break;
-        }
-        }
+        slread[i] = time;
     }
+
+    ScanlineTimeout = FrameLength; // CHECKME
+    
+    s32 rastertimingeven; // always init to 0 at the start of a scanline render
+    s32 rastertimingodd;
+
+    s32 scanlineswaiting = 0;
+    s32 nextread = 0;
+
+    u32 timespent;
+    u32 prevtimespent;
+    // scanlines are rendered in pairs (even line first, then odd line)
+    RenderScanline(gpu, 0, j, &rastertimingeven);
+    RenderScanline(gpu, 1, j, &rastertimingodd);
+
+    RasterTiming = timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
+    RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
+
+    // if the first pair was not delayed past the first read, then later scanlines cannot be delayed either.
+    // this allows us to implement a fast path
+    //if (slread[0] - timespent + ScanlinePushDelay >= 256)
+    {
+        ScanlineTimeout = slread[1] - FinalPassLen;
+
+        RenderScanline(gpu, 2, j, &rastertimingeven);
+        RenderScanline(gpu, 3, j, &rastertimingodd);
+
+        prevtimespent = timespent;
+        RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
+        RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
+
+        ScanlineFinalPass<true>(gpu.GPU3D, 0, true, true);
+        scanlineswaiting++;
+        for (int y = 4; y < 192; y+=2)
+        {
+            ScanlineTimeout = slread[y-1] - FinalPassLen;
+
+            RenderScanline(gpu, y, j, &rastertimingeven);
+            RenderScanline(gpu, y+1, j, &rastertimingodd);
+            
+            prevtimespent = timespent;
+            RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
+            RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
+            
+            scanlineswaiting+=2;
+
+            while (scanlineswaiting >= 47)
+            {
+                if (RasterTiming < slread[nextread]) RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565?
+                scanlineswaiting--;
+                nextread++;
+            }
+
+            ScanlineFinalPass<true>(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502);
+            ScanlineFinalPass<true>(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502);
+        }
+
+        ScanlineFinalPass<true>(gpu.GPU3D, 189, timespent >= 502, timespent >= 502);
+        ScanlineFinalPass<true>(gpu.GPU3D, 190, timespent >= 502, true);
+
+        ScanlineFinalPass<true>(gpu.GPU3D, 191, true, true);
+    }
+    /*else
+    {
+        ScanlineFinalPass(gpu, 0, false, false);
+    
+        s32 pixelstopush = slread[0] - (timespent + ScanlinePushDelay);
+        if (pixelstopush > 256) pixelstopush = 256;
+        //timespent + ScanlinePushDelay + ScanlineReadSpeed > slread[0]
+
+        rastertimingeven = 0;
+        rastertimingodd = 0;
+
+        RenderScanline(gpu, 2, j, &rastertimingeven);
+        RenderScanline(gpu, 3, j, &rastertimingodd);
+    }*/
 }
 
 void SoftRenderer::VCount144(GPU& gpu)
diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h
index 6f81fae6..3814762d 100644
--- a/src/GPU3D_Soft.h
+++ b/src/GPU3D_Soft.h
@@ -472,6 +472,7 @@ private:
     void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter);
     u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const;
     bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr);
+    template <bool push>
     void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext);
     void ClearBuffers(const GPU& gpu);
     u16 BeginPushScanline(s32 y, s32 pixelstodraw);