From 9ffa04dfbc1bf187f3876864f224d404a69a3b05 Mon Sep 17 00:00:00 2001
From: Jaklyy <102590697+Jaklyy@users.noreply.github.com>
Date: Sun, 25 Feb 2024 22:41:33 -0500
Subject: [PATCH] approximate rdlines_count; implement underflow flag

---
 src/GPU.cpp        |   4 ++
 src/GPU3D.cpp      |   3 +-
 src/GPU3D.h        |   3 +-
 src/GPU3D_Soft.cpp | 115 ++++++++++++++++++++++++++-------------------
 4 files changed, 74 insertions(+), 51 deletions(-)
diff --git a/src/GPU.cpp b/src/GPU.cpp
index f23e641e..a78deba6 100644
--- a/src/GPU.cpp
+++ b/src/GPU.cpp
@@ -1041,6 +1041,10 @@ void GPU::StartScanline(u32 line) noexcept
             if (GPU3D.IsRendererAccelerated())
                 GPU3D.Blit(*this);
         }
+        else if (VCount == 183)
+        {
+            GPU3D.DispCnt |= GPU3D.RDLinesUnderflow << 12;
+        }
     }
 
     NDS.ScheduleEvent(Event_LCD, true, HBLANK_CYCLES, LCD_StartHBlank, line);
diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp
index 8706724b..a9524e88 100644
--- a/src/GPU3D.cpp
+++ b/src/GPU3D.cpp
@@ -2509,7 +2509,6 @@ void GPU3D::VBlank() noexcept
 
 void GPU3D::VCount215(GPU& gpu) noexcept
 {
-    //RDLinesTemp = 46;
     CurrentRenderer->RenderFrame(gpu);
 }
 
@@ -2647,7 +2646,7 @@ u16 GPU3D::Read16(u32 addr) noexcept
         return DispCnt;
 
     case 0x04000320:
-        return RDLines; // CHECKME: Can this always be read? Even when the gpu is powered off?
+        return RDLines; // CHECKME: Can this always be read? Even when the gpu is powered off? also check 8 bit reads
 
     case 0x04000600:
         {
diff --git a/src/GPU3D.h b/src/GPU3D.h
index 3d3b0e7f..fb779a68 100644
--- a/src/GPU3D.h
+++ b/src/GPU3D.h
@@ -246,6 +246,7 @@ public:
     bool RenderingEnabled = false;
 
     u32 DispCnt = 0;
+    bool RDLinesUnderflow = false;
     u8 RDLines = 63;
     u8 RDLinesTemp = 46;
     u8 AlphaRefVal = 0;
@@ -371,7 +372,7 @@ public:
     //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains"
     //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126)
                                                            // (why does the next pair get more time if the previous scanline is aborted?)
-    static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set
+    //static constexpr int UnderflowFlag = 2 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set
     //static constexpr int FinishScanline = 512 * TimingFrac;
 
     // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors
diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp
index 50d1104f..0600b435 100644
--- a/src/GPU3D_Soft.cpp
+++ b/src/GPU3D_Soft.cpp
@@ -1855,6 +1855,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
     
     //init internal buffer
     ClearBuffers(gpu);
+    
+    // reset scanline trackers
+    gpu.GPU3D.RDLinesUnderflow = false;
+    gpu.GPU3D.RDLinesTemp = 63;
 
     u32 slread[192]; // scanline read times
     for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time?
@@ -1872,6 +1876,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
 
     u32 timespent;
     u32 prevtimespent;
+
     // scanlines are rendered in pairs of two
     RenderScanline(gpu, 0, j, &rastertimingeven);
     RenderScanline(gpu, 1, j, &rastertimingodd);
@@ -1880,8 +1885,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
     RasterTiming = timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
     // 12 cycles at the end of a "timeout" are always used for w/e reason
     RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
-
-    gpu.GPU3D.RDLinesTemp = 46;
+    
     // if first pair was not delayed past the first read, then later scanlines cannot either
     // this allows us to implement a fast path
     //if (slread[0] - timespent + ScanlinePushDelay >= 256)
@@ -1892,13 +1896,72 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
         RenderScanline(gpu, 2, j, &rastertimingeven);
         RenderScanline(gpu, 3, j, &rastertimingodd);
 
+        // the time spent on the previous scanline pair is important for emulating the edge marking bug properly
         prevtimespent = timespent;
         RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
         RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
         
-        scanlineswaiting++;
+        // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow
+        if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true;
 
+        scanlineswaiting++;
+        
+        // simulate the process of scanlines being read from the 48 scanline buffer
         while (RasterTiming >= slread[nextread] + 565)
+        {
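+            if (RasterTiming < slread[nextread] + 565) // NOTE(review): never true in this loop; the enclosing while already requires RasterTiming >= slread[nextread] + 565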
+            {
+                RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565?
+                timespent += 571; // fixes edge marking bug emulation. not sure why this is needed?
+            }
+            scanlineswaiting--;
+            nextread++;
+            // update rdlines_count register
+            if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner?
+        }
+
+        // final passes run one scanline behind the rendered pair (lines y-3 and y-2 below), so the first final pass covers only scanline 0
+        ScanlineFinalPass<true>(gpu.GPU3D, 0, true, timespent >= 502);
+        for (int y = 4; y < 192; y+=2)
+        {
+            // update the scanline timeout
+            ScanlineTimeout = slread[y-1] - FinalPassLen;
+
+            RenderScanline(gpu, y, j, &rastertimingeven);
+            RenderScanline(gpu, y+1, j, &rastertimingodd);
+            
+            prevtimespent = timespent;
+            RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
+            RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
+            
+            // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow
+            if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true;
+
+            scanlineswaiting+=2;
+
+            // simulate the process of scanlines being read from the 48 scanline buffer
+            while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565)
+            {
+                if (RasterTiming < slread[nextread] + 565)
+                {
+                    RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565?
+                    timespent += 571; // fixes edge marking bug emulation. not sure why this is needed?
+                }
+                scanlineswaiting--;
+                nextread++;
+                // update rdlines_count register
+                if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner?
+            }
+
+            ScanlineFinalPass<true>(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502);
+            ScanlineFinalPass<true>(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502);
+        }
+            scanlineswaiting+= 2;
+            prevtimespent = timespent;
+
+        // emulate read timings one last time, since it shouldn't matter after this
+        // additionally, don't bother tracking rdlines past this point since it shouldn't be able to decrement any further (CHECKME)
+        while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565)
         {
             if (RasterTiming < slread[nextread] + 565)
             {
@@ -1909,55 +1972,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
             nextread++;
         }
 
-        ScanlineFinalPass<true>(gpu.GPU3D, 0, true, timespent >= 502);
-        for (int y = 4; y < 192; y+=2)
-        {
-            ScanlineTimeout = slread[y-1] - FinalPassLen;
-
-            RenderScanline(gpu, y, j, &rastertimingeven);
-            RenderScanline(gpu, y+1, j, &rastertimingodd);
-            
-            prevtimespent = timespent;
-            RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
-            RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
-            
-            scanlineswaiting+=2;
-
-            while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565)
-            {
-                if (RasterTiming < slread[nextread] + 565)
-                {
-                    RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565?
-                    timespent += 571; // fixes edge marking bug emulation. not sure why this is needed?
-                }
-                scanlineswaiting--;
-                nextread++;
-            }
-
-            ScanlineFinalPass<true>(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502);
-            ScanlineFinalPass<true>(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502);
-        }
-            scanlineswaiting+= 2;
-            prevtimespent = timespent;
-
-            // do this one last time to allow for edge marking bug emulation.
-            while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565)
-            {
-                if (RasterTiming < slread[nextread] + 565)
-                {
-                    RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565?
-                    timespent += 571; // fixes edge marking bug emulation. not sure why this is needed?
-                }
-                scanlineswaiting--;
-                nextread++;
-            }
-
+        // finish the last 3 scanlines
         ScanlineFinalPass<true>(gpu.GPU3D, 189, prevtimespent >= 502, timespent >= 502);
         ScanlineFinalPass<true>(gpu.GPU3D, 190, prevtimespent >= 502, true);
 
-        // skip timing emulation here since it's irrelevant, also use timespent instead of prev because we're skipping timing emulation
         ScanlineFinalPass<true>(gpu.GPU3D, 191, timespent >= 502, true);
-
     }
     /*else
     {