From 8e2c9cbff68920820a76d331092bd5078ded16b9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 8 Dec 2023 21:27:44 -0500 Subject: [PATCH 01/53] wip initial draft --- src/GPU3D.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++-- src/GPU3D.h | 19 +++++++++++++++++++ src/GPU3D_Soft.cpp | 25 ++++++++++++++++-------- src/GPU3D_Soft.h | 15 +++++++++------ 4 files changed, 90 insertions(+), 16 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 1a879abf..e8ac23b4 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -221,6 +221,12 @@ void GPU3D::Reset() noexcept DispCnt = 0; AlphaRefVal = 0; AlphaRef = 0; + + RDLines = 46; + RDLinesMin = 46; + RasterTimingCounterPrev = 0; + RasterTimingCounterOdd = 0; + RasterTimingCounterEven = 0; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -770,7 +776,40 @@ void GPU3D::StallPolygonPipeline(s32 delay, s32 nonstalldelay) noexcept } } +bool GPU3D::DoTimings(s32 cycles, bool odd) +{ + if (odd) + { + RasterTimingCounterOdd += cycles; + if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return 0; + } + else + { + RasterTimingCounterEven += cycles; + if ((RasterTimingCounterEven + RasterTimingCounterPrev) < RasterTimingCap) return 0; + } + DispCnt |= (1<<12); + return 1; +} + +void GPU3D::EndScanline(bool odd) +{ + if (!odd) + { + RasterTimingCounterPrev += std::max(RasterTimingCounterOdd, RasterTimingCounterEven); + RasterTimingCounterPrev -= PerScanlineRecup; // wip + if (RasterTimingCounterPrev < 0) RasterTimingCounterPrev = 0; + // calc is wrong, seems to round up...? + RDLines = (RasterTimingCap - RasterTimingCounterPrev) / PerScanlineTiming; + // seems to display the lowest scanline buffer count reached during the current frame. + // we also caps it to 46 here, because this reg does that too for some reason. + if (RDLines > RDLinesMin) RDLines = RDLinesMin; + if (RDLines < RDLinesMin) RDLinesMin = RDLines; + RasterTimingCounterOdd = 0; + RasterTimingCounterEven = 0; + } +} template void ClipSegment(Vertex* outbuf, Vertex* vin, Vertex* vout) @@ -2369,6 +2408,10 @@ void GPU3D::CheckFIFODMA() noexcept void GPU3D::VCount144() noexcept { + RDLinesMin = 46; + RasterTimingCounterPrev = 0; + RasterTimingCounterOdd = 0; + RasterTimingCounterEven = 0; CurrentRenderer->VCount144(); } @@ -2612,7 +2655,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return 46; // TODO, eventually + return RDLines; // IT IS TIME case 0x04000600: { @@ -2656,7 +2699,7 @@ u32 GPU3D::Read32(u32 addr) noexcept return DispCnt; case 0x04000320: - return 46; // TODO, eventually + return RDLines; // IT IS TIME case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index dda78b78..924344f7 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -25,10 +25,20 @@ #include "Savestate.h" #include "FIFO.h" + namespace melonDS { class GPU; +// numbers based on 339 poly 64-172 horiz. line poly +static constexpr int RasterTimingCap = 51116; +static constexpr int PerPolyTiming = 12; +static constexpr int PerScanlineTiming = 1064; +static constexpr int PerScanlineRecup = 2010;//1910; +//static constexpr int EmptyPolyScanline; +//static constexpr int FirstPixelTiming; +static constexpr int PerPixelTiming = 1; + struct Vertex { s32 Position[4]; @@ -114,6 +124,9 @@ public: void WriteToGXFIFO(u32 val) noexcept; + bool DoTimings(s32 cycles, bool odd); + void EndScanline(bool odd); + [[nodiscard]] bool IsRendererAccelerated() const noexcept; [[nodiscard]] Renderer3D& GetCurrentRenderer() noexcept { return *CurrentRenderer; } [[nodiscard]] const Renderer3D& GetCurrentRenderer() const noexcept { return *CurrentRenderer; } @@ -126,6 +139,7 @@ public: void Write16(u32 addr, u16 val) noexcept; void Write32(u32 addr, u32 val) noexcept; void Blit() noexcept; + private: melonDS::NDS& NDS; typedef union @@ -242,6 +256,11 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; + u32 RDLines = 0; + u32 RDLinesMin = 0; + s32 RasterTimingCounterPrev = 0; + s32 RasterTimingCounterOdd = 0; + s32 RasterTimingCounterEven = 0; u8 AlphaRefVal = 0; u8 AlphaRef = 0; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 03c6265e..1061228e 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -900,10 +900,11 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) rp->XR = rp->SlopeR.Step(); } -void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) +void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) { + if (GPU.GPU3D.DoTimings(PerPolyTiming, odd)) return; + int pixelsrendered = 0; Polygon* polygon = rp->PolyData; - u32 polyattr = (polygon->Attr & 0x3F008000); if (!polygon->FacingView) polyattr |= (1<<4); @@ -1076,10 +1077,11 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) if (xcov == 0x3FF) xcov = 0; } - if (!l_filledge) x = xlimit; - else for (; x < xlimit; x++) { + if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return; + pixelsrendered++; + if (!l_filledge) continue; u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1172,10 +1174,11 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) if (xlimit > xend+1) xlimit = xend+1; if (xlimit > 256) xlimit = 256; - if (wireframe && !edge) x = std::max(x, xlimit); - else for (; x < xlimit; x++) { + if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return; + pixelsrendered++; + if (wireframe && !edge) continue; u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1265,9 +1268,11 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) if (xcov == 0x3FF) xcov = 0; } - if (r_filledge) for (; x < xlimit; x++) { + if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return; + pixelsrendered++; + if (!r_filledge) continue; u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1360,8 +1365,11 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) void SoftRenderer::RenderScanline(s32 y, int npolys) { + bool odd = !(y % 2); for (int i = 0; i < npolys; i++) { + if (GPU.GPU3D.DoTimings(0, odd)) break; + RendererPolygon* rp = &PolygonList[i]; Polygon* polygon = rp->PolyData; @@ -1370,9 +1378,10 @@ void SoftRenderer::RenderScanline(s32 y, int npolys) if (polygon->IsShadowMask) RenderShadowMaskScanline(rp, y); else - RenderPolygonScanline(rp, y); + RenderPolygonScanline(rp, y, odd); } } + GPU.GPU3D.EndScanline(odd); } u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 2f5664e2..e5cd44eb 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -461,7 +461,7 @@ private: void SetupPolygonRightEdge(RendererPolygon* rp, s32 y); void SetupPolygon(RendererPolygon* rp, Polygon* polygon); void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); - void RenderPolygonScanline(RendererPolygon* rp, s32 y); + void RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); void RenderScanline(s32 y, int npolys); u32 CalculateFogDensity(u32 pixeladdr); void ScanlineFinalPass(s32 y); @@ -476,14 +476,17 @@ private: // TODO: check if the hardware can accidentally plot pixels // offscreen in that border - static constexpr int ScanlineWidth = 258; - static constexpr int NumScanlines = 194; + static constexpr int ScanlineWidth = 256; + static constexpr int NumScanlines = 192; + static constexpr int NumScanlinesRDLines = 192; + static constexpr int RDLinesBufferSize = ScanlineWidth * NumScanlinesRDLines; static constexpr int BufferSize = ScanlineWidth * NumScanlines; static constexpr int FirstPixelOffset = ScanlineWidth + 1; - u32 ColorBuffer[BufferSize * 2]; - u32 DepthBuffer[BufferSize * 2]; - u32 AttrBuffer[BufferSize * 2]; + u32 ColorBuffer[RDLinesBufferSize * 2]; + u32 DepthBuffer[RDLinesBufferSize * 2]; + u32 AttrBuffer[RDLinesBufferSize * 2]; + u32 FinalBuffer[BufferSize * 2]; // attribute buffer: // bit0-3: edge flags (left/right/top/bottom) From 447cd50422dd34f13d609833730e427276823cb4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Dec 2023 08:19:43 -0500 Subject: [PATCH 02/53] holds true when slopes are vertical and y > 50 --- src/GPU3D.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 924344f7..cba5cf73 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -32,12 +32,12 @@ class GPU; // numbers based on 339 poly 64-172 horiz. line poly static constexpr int RasterTimingCap = 51116; -static constexpr int PerPolyTiming = 12; -static constexpr int PerScanlineTiming = 1064; -static constexpr int PerScanlineRecup = 2010;//1910; +static constexpr int PerPolyTiming = 12; // should be correct for *most* line polygons +static constexpr int PerPixelTiming = 1; // does not apply to the first 4 pixels in a polygon (per scanline?) +static constexpr int PerScanlineTiming = 1064;// approximate currently, used to calc RDLines +static constexpr int PerScanlineRecup = 2112; // seems to check out? //static constexpr int EmptyPolyScanline; //static constexpr int FirstPixelTiming; -static constexpr int PerPixelTiming = 1; struct Vertex { From 92ca04e47920676f887ced3a1672bfe723f18fa3 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Dec 2023 10:58:08 -0500 Subject: [PATCH 03/53] i forgot i changed those-- oops no wonder edge marking was broken how did this even still work --- src/GPU3D_Soft.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index e5cd44eb..64660ac3 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -476,9 +476,9 @@ private: // TODO: check if the hardware can accidentally plot pixels // offscreen in that border - static constexpr int ScanlineWidth = 256; - static constexpr int NumScanlines = 192; - static constexpr int NumScanlinesRDLines = 192; + static constexpr int ScanlineWidth = 258; + static constexpr int NumScanlines = 194; + static constexpr int NumScanlinesRDLines = 194; static constexpr int RDLinesBufferSize = ScanlineWidth * NumScanlinesRDLines; static constexpr int BufferSize = ScanlineWidth * NumScanlines; static constexpr int FirstPixelOffset = ScanlineWidth + 1; From c45d3320d0a39c0445cae7d80823d50c51a36f09 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Dec 2023 15:56:36 -0500 Subject: [PATCH 04/53] tentative timings for "empty" polys scanlines, fix swapped polys breaking --- src/GPU3D.cpp | 8 ++--- src/GPU3D.h | 19 ++++++----- src/GPU3D_Soft.cpp | 78 +++++++++++++++++++++++++++++----------------- src/GPU3D_Soft.h | 4 ++- 4 files changed, 68 insertions(+), 41 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index e8ac23b4..72b2c78c 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -781,16 +781,16 @@ bool GPU3D::DoTimings(s32 cycles, bool odd) if (odd) { RasterTimingCounterOdd += cycles; - if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return 0; + if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return false; } else { RasterTimingCounterEven += cycles; - if ((RasterTimingCounterEven + RasterTimingCounterPrev) < RasterTimingCap) return 0; + if ((RasterTimingCounterEven + RasterTimingCounterPrev) < RasterTimingCap) return false; } DispCnt |= (1<<12); - return 1; + return true; } void GPU3D::EndScanline(bool odd) @@ -805,7 +805,7 @@ void GPU3D::EndScanline(bool odd) // seems to display the lowest scanline buffer count reached during the current frame. // we also caps it to 46 here, because this reg does that too for some reason. if (RDLines > RDLinesMin) RDLines = RDLinesMin; - if (RDLines < RDLinesMin) RDLinesMin = RDLines; + else if (RDLines < RDLinesMin) RDLinesMin = RDLines; RasterTimingCounterOdd = 0; RasterTimingCounterEven = 0; } diff --git a/src/GPU3D.h b/src/GPU3D.h index cba5cf73..4450a539 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -30,14 +30,6 @@ namespace melonDS { class GPU; -// numbers based on 339 poly 64-172 horiz. line poly -static constexpr int RasterTimingCap = 51116; -static constexpr int PerPolyTiming = 12; // should be correct for *most* line polygons -static constexpr int PerPixelTiming = 1; // does not apply to the first 4 pixels in a polygon (per scanline?) -static constexpr int PerScanlineTiming = 1064;// approximate currently, used to calc RDLines -static constexpr int PerScanlineRecup = 2112; // seems to check out? -//static constexpr int EmptyPolyScanline; -//static constexpr int FirstPixelTiming; struct Vertex { @@ -345,6 +337,17 @@ public: u32 ScrolledLine[256]; }; + // numbers based on 339 poly 64-172 horiz. line poly + static constexpr int Frac = 481; // add a fractional component if pixels is not enough precision + static constexpr int RasterTimingCap = 51116*Frac; + static constexpr int PerScanlineTiming = 1064*Frac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED + static constexpr int PerScanlineRecup = 2112*Frac; // seems to check out? + + static constexpr int PerPolyTiming = 12*Frac; // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPixelTiming = 1*Frac; // does not apply to the first 4 pixels in a polygon (per scanline?) + static constexpr int EmptyPolyScanline = 4*Frac - 14; // seems to be slightly under 4? + //static constexpr int FirstPixelTiming; + class Renderer3D { public: diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 1061228e..2cd234b6 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -672,6 +672,31 @@ void SoftRenderer::SetupPolygon(SoftRenderer::RendererPolygon* rp, Polygon* poly } } +bool SoftRenderer::Step(RendererPolygon* rp, bool abortscanline) +{ + rp->XL = rp->SlopeL.Step(); + rp->XR = rp->SlopeR.Step(); + return abortscanline; +} + +void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) +{ + Polygon* polygon = rp->PolyData; + + if (polygon->YTop != polygon->YBottom) + { + if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) + { + SetupPolygonLeftEdge(rp, y); + } + + if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) + { + SetupPolygonRightEdge(rp, y); + } + } +} + void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) { Polygon* polygon = rp->PolyData; @@ -900,10 +925,8 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) rp->XR = rp->SlopeR.Step(); } -void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) { - if (GPU.GPU3D.DoTimings(PerPolyTiming, odd)) return; - int pixelsrendered = 0; Polygon* polygon = rp->PolyData; u32 polyattr = (polygon->Attr & 0x3F008000); if (!polygon->FacingView) polyattr |= (1<<4); @@ -921,18 +944,9 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) PrevIsShadowMask = false; - if (polygon->YTop != polygon->YBottom) - { - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) - { - SetupPolygonLeftEdge(rp, y); - } - - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) - { - SetupPolygonRightEdge(rp, y); - } - } + CheckSlope(rp, y); + + if (GPU.GPU3D.DoTimings(PerPolyTiming, odd)) return Step(rp, true); Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -941,6 +955,7 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; + u8 pixelsrendered = 0; // for tracking timings xstart = rp->XL; xend = rp->XR; @@ -1076,11 +1091,12 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) xcov = (l_edgecov >> 12) & 0x3FF; if (xcov == 0x3FF) xcov = 0; } + for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return; - pixelsrendered++; + if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return Step(rp, true); + else pixelsrendered++; if (!l_filledge) continue; u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1176,8 +1192,8 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return; - pixelsrendered++; + if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return Step(rp, true); + else pixelsrendered++; if (wireframe && !edge) continue; u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1270,8 +1286,8 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return; - pixelsrendered++; + if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return Step(rp, true); + else pixelsrendered++; if (!r_filledge) continue; u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1358,27 +1374,33 @@ void SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) PlotTranslucentPixel(pixeladdr+BufferSize, color, z, polyattr, polygon->IsShadow); } } - - rp->XL = rp->SlopeL.Step(); - rp->XR = rp->SlopeR.Step(); + return Step(rp, false); } void SoftRenderer::RenderScanline(s32 y, int npolys) { bool odd = !(y % 2); + bool abort = false; for (int i = 0; i < npolys; i++) { - if (GPU.GPU3D.DoTimings(0, odd)) break; - RendererPolygon* rp = &PolygonList[i]; Polygon* polygon = rp->PolyData; - if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) + if (abort) + { + CheckSlope(rp, y); + Step(rp, NULL); + } + else if (y == polygon->YBottom && y != polygon->YTop) + { + if (GPU.GPU3D.DoTimings(EmptyPolyScanline, odd)) abort = true; + } + else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { if (polygon->IsShadowMask) RenderShadowMaskScanline(rp, y); else - RenderPolygonScanline(rp, y, odd); + if (RenderPolygonScanline(rp, y, odd)) abort = true; } } GPU.GPU3D.EndScanline(odd); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 64660ac3..11de0fef 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -460,8 +460,10 @@ private: void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y); void SetupPolygonRightEdge(RendererPolygon* rp, s32 y); void SetupPolygon(RendererPolygon* rp, Polygon* polygon); + bool Step(RendererPolygon* rp, bool abortscanline); + void CheckSlope(RendererPolygon* rp, s32 y); void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); - void RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); + bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); void RenderScanline(s32 y, int npolys); u32 CalculateFogDensity(u32 pixeladdr); void ScanlineFinalPass(s32 y); From 63a39b130e693f0df227452da52bc2488f79754d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 10 Dec 2023 12:18:46 -0500 Subject: [PATCH 05/53] refactor framebuffers to be more similar to hw allows for emulation of niche scanline glitches --- src/GPU3D.cpp | 41 --------- src/GPU3D.h | 8 +- src/GPU3D_Soft.cpp | 211 ++++++++++++++++++++++++++++----------------- src/GPU3D_Soft.h | 37 +++++--- 4 files changed, 158 insertions(+), 139 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 72b2c78c..12da23db 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -224,9 +224,6 @@ void GPU3D::Reset() noexcept RDLines = 46; RDLinesMin = 46; - RasterTimingCounterPrev = 0; - RasterTimingCounterOdd = 0; - RasterTimingCounterEven = 0; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -776,41 +773,6 @@ void GPU3D::StallPolygonPipeline(s32 delay, s32 nonstalldelay) noexcept } } -bool GPU3D::DoTimings(s32 cycles, bool odd) -{ - if (odd) - { - RasterTimingCounterOdd += cycles; - if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return false; - } - else - { - RasterTimingCounterEven += cycles; - if ((RasterTimingCounterEven + RasterTimingCounterPrev) < RasterTimingCap) return false; - } - - DispCnt |= (1<<12); - return true; -} - -void GPU3D::EndScanline(bool odd) -{ - if (!odd) - { - RasterTimingCounterPrev += std::max(RasterTimingCounterOdd, RasterTimingCounterEven); - RasterTimingCounterPrev -= PerScanlineRecup; // wip - if (RasterTimingCounterPrev < 0) RasterTimingCounterPrev = 0; - // calc is wrong, seems to round up...? - RDLines = (RasterTimingCap - RasterTimingCounterPrev) / PerScanlineTiming; - // seems to display the lowest scanline buffer count reached during the current frame. - // we also caps it to 46 here, because this reg does that too for some reason. - if (RDLines > RDLinesMin) RDLines = RDLinesMin; - else if (RDLines < RDLinesMin) RDLinesMin = RDLines; - RasterTimingCounterOdd = 0; - RasterTimingCounterEven = 0; - } -} - template void ClipSegment(Vertex* outbuf, Vertex* vin, Vertex* vout) { @@ -2409,9 +2371,6 @@ void GPU3D::CheckFIFODMA() noexcept void GPU3D::VCount144() noexcept { RDLinesMin = 46; - RasterTimingCounterPrev = 0; - RasterTimingCounterOdd = 0; - RasterTimingCounterEven = 0; CurrentRenderer->VCount144(); } diff --git a/src/GPU3D.h b/src/GPU3D.h index 4450a539..8c4b22ca 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -116,9 +116,6 @@ public: void WriteToGXFIFO(u32 val) noexcept; - bool DoTimings(s32 cycles, bool odd); - void EndScanline(bool odd); - [[nodiscard]] bool IsRendererAccelerated() const noexcept; [[nodiscard]] Renderer3D& GetCurrentRenderer() noexcept { return *CurrentRenderer; } [[nodiscard]] const Renderer3D& GetCurrentRenderer() const noexcept { return *CurrentRenderer; } @@ -250,9 +247,6 @@ public: u32 DispCnt = 0; u32 RDLines = 0; u32 RDLinesMin = 0; - s32 RasterTimingCounterPrev = 0; - s32 RasterTimingCounterOdd = 0; - s32 RasterTimingCounterEven = 0; u8 AlphaRefVal = 0; u8 AlphaRef = 0; @@ -342,7 +336,7 @@ public: static constexpr int RasterTimingCap = 51116*Frac; static constexpr int PerScanlineTiming = 1064*Frac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED static constexpr int PerScanlineRecup = 2112*Frac; // seems to check out? - + static constexpr int PerRightSlope = 1*Frac; static constexpr int PerPolyTiming = 12*Frac; // should be correct for *most* line polygons and polygons with vertical slopes static constexpr int PerPixelTiming = 1*Frac; // does not apply to the first 4 pixels in a polygon (per scanline?) static constexpr int EmptyPolyScanline = 4*Frac - 14; // seems to be slightly under 4? diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 2cd234b6..04544b84 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -112,6 +112,43 @@ void SoftRenderer::SetThreaded(bool threaded) noexcept } } +template +bool SoftRenderer::DoTimings(s32 cycles) +{ + if constexpr (odd) + { + RasterTimingCounterOdd += cycles; + if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return false; + } + else + { + RasterTimingCounterEven += cycles; + if ((RasterTimingCounterEven + RasterTimingCounterPrev) < RasterTimingCap) return false; + } + + GPU.GPU3D.DispCnt |= (1<<12); + return true; +} + +template +void SoftRenderer::EndScanline() +{ + if constexpr (!odd) + { + RasterTimingCounterPrev += std::max(RasterTimingCounterOdd, RasterTimingCounterEven); + RasterTimingCounterPrev -= PerScanlineRecup; // wip + if (RasterTimingCounterPrev < 0) RasterTimingCounterPrev = 0; + // calc is wrong, seems to round up...? + GPU.GPU3D.RDLines = (RasterTimingCap - RasterTimingCounterPrev) / PerScanlineTiming; + // seems to display the lowest scanline buffer count reached during the current frame. + // we also caps it to 46 here, because this reg does that too for some reason. + if (GPU.GPU3D.RDLines > GPU.GPU3D.RDLinesMin) GPU.GPU3D.RDLines = GPU.GPU3D.RDLinesMin; + else if (GPU.GPU3D.RDLines < GPU.GPU3D.RDLinesMin) GPU.GPU3D.RDLinesMin = GPU.GPU3D.RDLines; + RasterTimingCounterOdd = 0; + RasterTimingCounterEven = 0; + } +} + void SoftRenderer::TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) { u32 vramaddr = (texparam & 0xFFFF) << 3; @@ -925,7 +962,8 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) rp->XR = rp->SlopeR.Step(); } -bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) +template +bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) { Polygon* polygon = rp->PolyData; u32 polyattr = (polygon->Attr & 0x3F008000); @@ -946,7 +984,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) CheckSlope(rp, y); - if (GPU.GPU3D.DoTimings(PerPolyTiming, odd)) return Step(rp, true); + if (DoTimings(PerPolyTiming)) return Step(rp, true); Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -955,7 +993,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; - u8 pixelsrendered = 0; // for tracking timings + u16 pixelsrendered = 0; // for tracking timings xstart = rp->XL; xend = rp->XR; @@ -1095,10 +1133,12 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return Step(rp, true); - else pixelsrendered++; + if (pixelsrendered >= 4 && DoTimings(PerPixelTiming)) return Step(rp, true); + pixelsrendered++; + if (!l_filledge) continue; - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + + u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1192,10 +1232,12 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return Step(rp, true); - else pixelsrendered++; + if (pixelsrendered >= 4 && DoTimings(PerPixelTiming)) return Step(rp, true); + pixelsrendered++; + if (wireframe && !edge) continue; - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + + u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1283,13 +1325,15 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) xcov = (r_edgecov >> 12) & 0x3FF; if (xcov == 0x3FF) xcov = 0; } - + //if (rp->SlopeR.Increment != 0 && DoTimings(PerRightSlope, odd)) return Step(rp, true); // should be fine to not immediately return? might be wrong for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && GPU.GPU3D.DoTimings(PerPixelTiming, odd)) return Step(rp, true); - else pixelsrendered++; + if (pixelsrendered >= 4 && DoTimings(PerPixelTiming)) return Step(rp, true); + pixelsrendered++; + if (!r_filledge) continue; - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + + u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1377,9 +1421,9 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) return Step(rp, false); } -void SoftRenderer::RenderScanline(s32 y, int npolys) +template +bool SoftRenderer::RenderScanline(s32 y, int npolys) { - bool odd = !(y % 2); bool abort = false; for (int i = 0; i < npolys; i++) { @@ -1393,17 +1437,20 @@ void SoftRenderer::RenderScanline(s32 y, int npolys) } else if (y == polygon->YBottom && y != polygon->YTop) { - if (GPU.GPU3D.DoTimings(EmptyPolyScanline, odd)) abort = true; + if (DoTimings(EmptyPolyScanline)) abort = true; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { if (polygon->IsShadowMask) - RenderShadowMaskScanline(rp, y); + ;//RenderShadowMaskScanline(rp, y); else - if (RenderPolygonScanline(rp, y, odd)) abort = true; + if (RenderPolygonScanline(rp, y)) abort = true; } } - GPU.GPU3D.EndScanline(odd); + + BufferOffset = (BufferOffset + 1) & 0x7; // loop if == 8 + EndScanline(); + return abort; } u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) @@ -1445,20 +1492,28 @@ u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) return density; } -void SoftRenderer::ScanlineFinalPass(s32 y) +template +void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late) { // to consider: // clearing all polygon fog flags if the master flag isn't set? // merging all final pass loops into one? + u8 tempoffset; + if constexpr (finish) + tempoffset = (BufferOffset - 2 + (!odd)); + else + tempoffset = (BufferOffset - 4 + (!odd)); - if (GPU.GPU3D.RenderDispCnt & (1<<5)) + if (tempoffset > 7) tempoffset -= 0xF8; // handle underflows + + /*if (GPU.GPU3D.RenderDispCnt & (1<<5)) { // edge marking // only applied to topmost pixels for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (tempoffset * ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1482,7 +1537,7 @@ void SoftRenderer::ScanlineFinalPass(s32 y) AttrBuffer[pixeladdr] = (AttrBuffer[pixeladdr] & 0xFFFFE0FF) | 0x00001000; } } - } + }*/ if (GPU.GPU3D.RenderDispCnt & (1<<7)) { @@ -1506,7 +1561,7 @@ void SoftRenderer::ScanlineFinalPass(s32 y) for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (tempoffset * ScanlineWidth) + x; u32 density, srccolor, srcR, srcG, srcB, srcA; u32 attr = AttrBuffer[pixeladdr]; @@ -1571,7 +1626,7 @@ void SoftRenderer::ScanlineFinalPass(s32 y) for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (tempoffset * ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1613,47 +1668,32 @@ void SoftRenderer::ScanlineFinalPass(s32 y) ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24); } } + + if (late) + { + memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[rdbufferoffset*ScanlineWidth], 4 * ScanlineWidth); + memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[tempoffset*ScanlineWidth], 4 * ScanlineWidth); + } + else + { + memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[tempoffset*ScanlineWidth], 4 * ScanlineWidth); + memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[rdbufferoffset*ScanlineWidth], 4 * ScanlineWidth); + } } -void SoftRenderer::ClearBuffers() +void SoftRenderer::ClearBuffers(s32 y) { u32 clearz = ((GPU.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; u32 polyid = GPU.GPU3D.RenderClearAttr1 & 0x3F000000; // this sets the opaque polygonID - // fill screen borders for edge marking - - for (int x = 0; x < ScanlineWidth; x++) - { - ColorBuffer[x] = 0; - DepthBuffer[x] = clearz; - AttrBuffer[x] = polyid; - } - - for (int x = ScanlineWidth; x < ScanlineWidth*193; x+=ScanlineWidth) - { - ColorBuffer[x] = 0; - DepthBuffer[x] = clearz; - AttrBuffer[x] = polyid; - ColorBuffer[x+257] = 0; - DepthBuffer[x+257] = clearz; - AttrBuffer[x+257] = polyid; - } - - for (int x = ScanlineWidth*193; x < ScanlineWidth*194; x++) - { - ColorBuffer[x] = 0; - DepthBuffer[x] = clearz; - AttrBuffer[x] = polyid; - } - // clear the screen if (GPU.GPU3D.RenderDispCnt & (1<<14)) { u8 xoff = (GPU.GPU3D.RenderClearAttr2 >> 16) & 0xFF; - u8 yoff = (GPU.GPU3D.RenderClearAttr2 >> 24) & 0xFF; + u8 yoff = ((GPU.GPU3D.RenderClearAttr2 >> 24) & 0xFF) + y; - for (int y = 0; y < ScanlineWidth*192; y+=ScanlineWidth) + for (int i = 0; i < 2; i++) { for (int x = 0; x < 256; x++) { @@ -1669,7 +1709,7 @@ void SoftRenderer::ClearBuffers() u32 z = ((val3 & 0x7FFF) * 0x200) + 0x1FF; - u32 pixeladdr = FirstPixelOffset + y + x; + u32 pixeladdr = ((BufferOffset+i) * ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = z; AttrBuffer[pixeladdr] = polyid | (val3 & 0x8000); @@ -1690,12 +1730,12 @@ void SoftRenderer::ClearBuffers() u32 color = r | (g << 8) | (b << 16) | (a << 24); polyid |= (GPU.GPU3D.RenderClearAttr1 & 0x8000); - - for (int y = 0; y < ScanlineWidth*192; y+=ScanlineWidth) + + for (int i = 0; i < 2; i++) { for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + y + x; + u32 pixeladdr = ((BufferOffset+i) * ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = clearz; AttrBuffer[pixeladdr] = polyid; @@ -1712,19 +1752,38 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) if (polygons[i]->Degenerate) continue; SetupPolygon(&PolygonList[j++], polygons[i]); } + + s32 y = 0; + s8 prevbufferline = -2; + bool latebuffer[192] = {}; - RenderScanline(0, j); + for (u8 quarter = 0; quarter < 4; quarter++) + for (u8 bufferline = 0; bufferline < 48; bufferline += 2) + { + ClearBuffers(y); + latebuffer[y] = RenderScanline(y, j); + latebuffer[y+1] = RenderScanline(y+1, j); + - for (s32 y = 1; y < 192; y++) - { - RenderScanline(y, j); - ScanlineFinalPass(y-1); + if (prevbufferline >= 0) + { + if (!latebuffer[y-2]) latebuffer[y] = false; + + if (!latebuffer[y-1]) latebuffer[y+1] = false; - if (threaded) - Platform::Semaphore_Post(Sema_ScanlineCount); - } + ScanlineFinalPass(y-2, prevbufferline, latebuffer[y-2]); + ScanlineFinalPass(y-1, prevbufferline+1, latebuffer[y-1]); + } - ScanlineFinalPass(191); + y += 2; + prevbufferline = bufferline; + + if (threaded) + Platform::Semaphore_Post(Sema_ScanlineCount); + } + + ScanlineFinalPass(190, prevbufferline, latebuffer[190]); + ScanlineFinalPass(191, prevbufferline+1, latebuffer[191]); if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); @@ -1734,6 +1793,10 @@ void SoftRenderer::VCount144() { if (RenderThreadRunning.load(std::memory_order_relaxed) && !GPU.GPU3D.AbortFrame) Platform::Semaphore_Wait(Sema_RenderDone); + + RasterTimingCounterPrev = 0; + RasterTimingCounterOdd = 0; + RasterTimingCounterEven = 0; } void SoftRenderer::RenderFrame() @@ -1750,11 +1813,7 @@ void SoftRenderer::RenderFrame() { Platform::Semaphore_Post(Sema_RenderStart); } - else if (!FrameIdentical) - { - ClearBuffers(); - RenderPolygons(false, &GPU.GPU3D.RenderPolygonRAM[0], GPU.GPU3D.RenderNumPolygons); - } + else if (!FrameIdentical) RenderPolygons(false, &GPU.GPU3D.RenderPolygonRAM[0], GPU.GPU3D.RenderNumPolygons); } void SoftRenderer::RestartFrame() @@ -1774,11 +1833,7 @@ void SoftRenderer::RenderThreadFunc() { Platform::Semaphore_Post(Sema_ScanlineCount, 192); } - else - { - ClearBuffers(); - RenderPolygons(true, &GPU.GPU3D.RenderPolygonRAM[0], GPU.GPU3D.RenderNumPolygons); - } + else RenderPolygons(true, &GPU.GPU3D.RenderPolygonRAM[0], GPU.GPU3D.RenderNumPolygons); Platform::Semaphore_Post(Sema_RenderDone); RenderThreadRendering = false; @@ -1793,7 +1848,7 @@ u32* SoftRenderer::GetLine(int line) Platform::Semaphore_Wait(Sema_ScanlineCount); } - return &ColorBuffer[(line * ScanlineWidth) + FirstPixelOffset]; + return &FinalBuffer[(line * ScanlineWidth) + FirstPixelOffset]; } } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 11de0fef..aac79076 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -454,6 +454,8 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; + template bool DoTimings(s32 cycles); + template void EndScanline(); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -463,14 +465,19 @@ private: bool Step(RendererPolygon* rp, bool abortscanline); void CheckSlope(RendererPolygon* rp, s32 y); void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); - bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); - void RenderScanline(s32 y, int npolys); + template bool RenderPolygonScanline(RendererPolygon* rp, s32 y); + template bool RenderScanline(s32 y, int npolys); u32 CalculateFogDensity(u32 pixeladdr); - void ScanlineFinalPass(s32 y); - void ClearBuffers(); + template void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late); + void ClearBuffers(s32 y); void RenderPolygons(bool threaded, Polygon** polygons, int npolys); void RenderThreadFunc(); + + // counters for scanline rasterization timings + s32 RasterTimingCounterPrev = 0; + s32 RasterTimingCounterOdd = 0; + s32 RasterTimingCounterEven = 0; // buffer dimensions are 258x194 to add a offscreen 1px border // which simplifies edge marking tests @@ -478,17 +485,21 @@ private: // TODO: check if the hardware can accidentally plot pixels // offscreen in that border - static constexpr int ScanlineWidth = 258; - static constexpr int NumScanlines = 194; - static constexpr int NumScanlinesRDLines = 194; - static constexpr int RDLinesBufferSize = ScanlineWidth * NumScanlinesRDLines; + static constexpr int ScanlineWidth = 256; + static constexpr int NumScanlines = 192; + static constexpr int NumScanlinesRD = 48; + static constexpr int NumScanlinesInternal = 8; + static constexpr int InternalBufferSize = ScanlineWidth * NumScanlinesInternal; + static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; static constexpr int BufferSize = ScanlineWidth * NumScanlines; - static constexpr int FirstPixelOffset = ScanlineWidth + 1; + static constexpr int FirstPixelOffset = 0; - u32 ColorBuffer[RDLinesBufferSize * 2]; - u32 DepthBuffer[RDLinesBufferSize * 2]; - u32 AttrBuffer[RDLinesBufferSize * 2]; - u32 FinalBuffer[BufferSize * 2]; + u32 ColorBuffer[InternalBufferSize * 2]; + u32 DepthBuffer[InternalBufferSize * 2]; + u32 AttrBuffer[InternalBufferSize * 2]; + u8 BufferOffset; + u32 RDBuffer[RDBufferSize]; + u32 FinalBuffer[BufferSize]; // attribute buffer: // bit0-3: edge flags (left/right/top/bottom) From 785fab024fbc71ed698364da62607596f48522d2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 10 Dec 2023 13:03:54 -0500 Subject: [PATCH 06/53] dont use templates bigger code <<< slower code --- src/GPU3D_Soft.cpp | 47 +++++++++++++++++++++------------------------- src/GPU3D_Soft.h | 10 +++++----- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 04544b84..4aaf00ea 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -112,10 +112,9 @@ void SoftRenderer::SetThreaded(bool threaded) noexcept } } -template -bool SoftRenderer::DoTimings(s32 cycles) +bool SoftRenderer::DoTimings(s32 cycles, bool odd) { - if constexpr (odd) + if (odd) { RasterTimingCounterOdd += cycles; if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return false; @@ -130,10 +129,9 @@ bool SoftRenderer::DoTimings(s32 cycles) return true; } -template -void SoftRenderer::EndScanline() +void SoftRenderer::EndScanline(bool odd) { - if constexpr (!odd) + if (!odd) { RasterTimingCounterPrev += std::max(RasterTimingCounterOdd, RasterTimingCounterEven); RasterTimingCounterPrev -= PerScanlineRecup; // wip @@ -962,8 +960,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) rp->XR = rp->SlopeR.Step(); } -template -bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) +bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; u32 polyattr = (polygon->Attr & 0x3F008000); @@ -984,7 +981,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) CheckSlope(rp, y); - if (DoTimings(PerPolyTiming)) return Step(rp, true); + if (DoTimings(PerPolyTiming, odd)) return Step(rp, true); Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -1133,7 +1130,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && DoTimings(PerPixelTiming)) return Step(rp, true); + if (pixelsrendered >= 4 && DoTimings(PerPixelTiming, odd)) return Step(rp, true); pixelsrendered++; if (!l_filledge) continue; @@ -1232,7 +1229,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && DoTimings(PerPixelTiming)) return Step(rp, true); + if (pixelsrendered >= 4 && DoTimings(PerPixelTiming, odd)) return Step(rp, true); pixelsrendered++; if (wireframe && !edge) continue; @@ -1328,7 +1325,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) //if (rp->SlopeR.Increment != 0 && DoTimings(PerRightSlope, odd)) return Step(rp, true); // should be fine to not immediately return? might be wrong for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && DoTimings(PerPixelTiming)) return Step(rp, true); + if (pixelsrendered >= 4 && DoTimings(PerPixelTiming, odd)) return Step(rp, true); pixelsrendered++; if (!r_filledge) continue; @@ -1421,8 +1418,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y) return Step(rp, false); } -template -bool SoftRenderer::RenderScanline(s32 y, int npolys) +bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) { bool abort = false; for (int i = 0; i < npolys; i++) @@ -1437,19 +1433,19 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys) } else if (y == polygon->YBottom && y != polygon->YTop) { - if (DoTimings(EmptyPolyScanline)) abort = true; + if (DoTimings(EmptyPolyScanline, odd)) abort = true; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { if (polygon->IsShadowMask) ;//RenderShadowMaskScanline(rp, y); else - if (RenderPolygonScanline(rp, y)) abort = true; + if (RenderPolygonScanline(rp, y, odd)) abort = true; } } BufferOffset = (BufferOffset + 1) & 0x7; // loop if == 8 - EndScanline(); + EndScanline(odd); return abort; } @@ -1492,14 +1488,13 @@ u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) return density; } -template -void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late) +void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool odd, bool finish) { // to consider: // clearing all polygon fog flags if the master flag isn't set? // merging all final pass loops into one? u8 tempoffset; - if constexpr (finish) + if (finish) tempoffset = (BufferOffset - 2 + (!odd)); else tempoffset = (BufferOffset - 4 + (!odd)); @@ -1761,8 +1756,8 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) for (u8 bufferline = 0; bufferline < 48; bufferline += 2) { ClearBuffers(y); - latebuffer[y] = RenderScanline(y, j); - latebuffer[y+1] = RenderScanline(y+1, j); + latebuffer[y] = RenderScanline(y, j, true); + latebuffer[y+1] = RenderScanline(y+1, j, false); if (prevbufferline >= 0) @@ -1771,8 +1766,8 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) if (!latebuffer[y-1]) latebuffer[y+1] = false; - ScanlineFinalPass(y-2, prevbufferline, latebuffer[y-2]); - ScanlineFinalPass(y-1, prevbufferline+1, latebuffer[y-1]); + ScanlineFinalPass(y-2, prevbufferline, latebuffer[y-2], true, false); + ScanlineFinalPass(y-1, prevbufferline+1, latebuffer[y-1], false, false); } y += 2; @@ -1782,8 +1777,8 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) Platform::Semaphore_Post(Sema_ScanlineCount); } - ScanlineFinalPass(190, prevbufferline, latebuffer[190]); - ScanlineFinalPass(191, prevbufferline+1, latebuffer[191]); + ScanlineFinalPass(190, prevbufferline, latebuffer[190], true, true); + ScanlineFinalPass(191, prevbufferline+1, latebuffer[191], false, true); if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index aac79076..d9b925f3 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -454,8 +454,8 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; - template bool DoTimings(s32 cycles); - template void EndScanline(); + bool DoTimings(s32 cycles, bool odd); + void EndScanline(bool odd); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -465,10 +465,10 @@ private: bool Step(RendererPolygon* rp, bool abortscanline); void CheckSlope(RendererPolygon* rp, s32 y); void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); - template bool RenderPolygonScanline(RendererPolygon* rp, s32 y); - template bool RenderScanline(s32 y, int npolys); + bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); + bool RenderScanline(s32 y, int npolys, bool odd); u32 CalculateFogDensity(u32 pixeladdr); - template void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late); + void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool odd, bool finish); void ClearBuffers(s32 y); void RenderPolygons(bool threaded, Polygon** polygons, int npolys); From 2bf033e0bcc70026524a8a75cb2bf48c7a3ea496 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 10 Dec 2023 18:41:17 -0500 Subject: [PATCH 07/53] optimize per pixel timing counting --- src/GPU3D.h | 16 ++++----- src/GPU3D_Soft.cpp | 88 ++++++++++++++++++++++++++++++---------------- src/GPU3D_Soft.h | 3 +- 3 files changed, 67 insertions(+), 40 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 8c4b22ca..284e101f 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -332,14 +332,14 @@ public: }; // numbers based on 339 poly 64-172 horiz. line poly - static constexpr int Frac = 481; // add a fractional component if pixels is not enough precision - static constexpr int RasterTimingCap = 51116*Frac; - static constexpr int PerScanlineTiming = 1064*Frac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED - static constexpr int PerScanlineRecup = 2112*Frac; // seems to check out? - static constexpr int PerRightSlope = 1*Frac; - static constexpr int PerPolyTiming = 12*Frac; // should be correct for *most* line polygons and polygons with vertical slopes - static constexpr int PerPixelTiming = 1*Frac; // does not apply to the first 4 pixels in a polygon (per scanline?) - static constexpr int EmptyPolyScanline = 4*Frac - 14; // seems to be slightly under 4? + static constexpr int RasterFrac = 481; // add a fractional component if pixels is not enough precision + static constexpr int RasterTimingCap = 51116*RasterFrac; + static constexpr int PerScanlineTiming = 1064*RasterFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED + static constexpr int PerScanlineRecup = 2112*RasterFrac; // seems to check out? + static constexpr int PerRightSlope = 1*RasterFrac; + static constexpr int PerPolyTiming = 12*RasterFrac; // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPixelTiming = 1*RasterFrac; // does not apply to the first 4 pixels in a polygon (per scanline?) + static constexpr int EmptyPolyScanline = 4*RasterFrac - 14; // seems to be slightly under 4? //static constexpr int FirstPixelTiming; class Renderer3D diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 4aaf00ea..306d7db1 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -129,6 +129,29 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) return true; } +s32 SoftRenderer::DoTimingsPixels(u32 pixels, bool odd) +{ + // return the difference between the old span and the new span + if (pixels <= 4) return 0; + + u32 pixeltiming = (pixels - 4) * RasterFrac; + + if (odd) + { + u32 rasterend = RasterTimingCap - (RasterTimingCounterOdd + RasterTimingCounterPrev); + pixeltiming = rasterend - pixeltiming; + } + else + { + u32 rasterend = RasterTimingCap - (RasterTimingCounterEven + RasterTimingCounterPrev); + pixeltiming = rasterend - pixeltiming; + } + if (pixeltiming > 0) return 0; + + GPU.GPU3D.DispCnt |= (1<<12); + return pixels - (((pixeltiming + (RasterFrac-1)) / RasterFrac) + 4); +} + void SoftRenderer::EndScanline(bool odd) { if (!odd) @@ -707,11 +730,10 @@ void SoftRenderer::SetupPolygon(SoftRenderer::RendererPolygon* rp, Polygon* poly } } -bool SoftRenderer::Step(RendererPolygon* rp, bool abortscanline) +void SoftRenderer::Step(RendererPolygon* rp) { rp->XL = rp->SlopeL.Step(); rp->XR = rp->SlopeR.Step(); - return abortscanline; } void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) @@ -981,7 +1003,11 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) CheckSlope(rp, y); - if (DoTimings(PerPolyTiming, odd)) return Step(rp, true); + if (DoTimings(PerPolyTiming, odd)) + { + Step(rp); + return true; + } Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -990,7 +1016,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; - u16 pixelsrendered = 0; // for tracking timings + bool abortscanline = false; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -1109,31 +1135,38 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) int edge; s32 x = xstart; - Interpolator<0> interpX(xstart, xend+1, wl, wr); + xend += 1; + Interpolator<0> interpX(xstart, xend, wl, wr); if (x < 0) x = 0; s32 xlimit; s32 xcov = 0; + if (xend > 256) xend = 256; + + // determine if the span can be rendered within the time allotted to the scanline + s32 diff = DoTimingsPixels(xend-x, odd); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } // part 1: left edge edge = yedge | 0x1; xlimit = xstart+l_edgelen; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + if (xlimit > xend) xlimit = xend; if (l_edgecov & (1<<31)) { xcov = (l_edgecov >> 12) & 0x3FF; if (xcov == 0x3FF) xcov = 0; } - - for (; x < xlimit; x++) + + if (!l_filledge) x = xlimit; + else for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && DoTimings(PerPixelTiming, odd)) return Step(rp, true); - pixelsrendered++; - - if (!l_filledge) continue; u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1223,16 +1256,12 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) // part 2: polygon inside edge = yedge; - xlimit = xend-r_edgelen+1; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; - - for (; x < xlimit; x++) + xlimit = xend-r_edgelen; + if (xlimit > xend) xlimit = xend; + + if (wireframe && !edge) x = std::max(x, xlimit); + else for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && DoTimings(PerPixelTiming, odd)) return Step(rp, true); - pixelsrendered++; - - if (wireframe && !edge) continue; u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1315,20 +1344,16 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) // part 3: right edge edge = yedge | 0x2; - xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend; if (r_edgecov & (1<<31)) { xcov = (r_edgecov >> 12) & 0x3FF; if (xcov == 0x3FF) xcov = 0; } - //if (rp->SlopeR.Increment != 0 && DoTimings(PerRightSlope, odd)) return Step(rp, true); // should be fine to not immediately return? might be wrong + + if (r_filledge) for (; x < xlimit; x++) { - if (pixelsrendered >= 4 && DoTimings(PerPixelTiming, odd)) return Step(rp, true); - pixelsrendered++; - - if (!r_filledge) continue; u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; @@ -1415,7 +1440,8 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) PlotTranslucentPixel(pixeladdr+BufferSize, color, z, polyattr, polygon->IsShadow); } } - return Step(rp, false); + Step(rp); + return abortscanline; } bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) @@ -1429,7 +1455,7 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) if (abort) { CheckSlope(rp, y); - Step(rp, NULL); + Step(rp); } else if (y == polygon->YBottom && y != polygon->YTop) { diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index d9b925f3..43037281 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -455,6 +455,7 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; bool DoTimings(s32 cycles, bool odd); + s32 DoTimingsPixels(u32 pixels, bool odd); void EndScanline(bool odd); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); @@ -462,7 +463,7 @@ private: void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y); void SetupPolygonRightEdge(RendererPolygon* rp, s32 y); void SetupPolygon(RendererPolygon* rp, Polygon* polygon); - bool Step(RendererPolygon* rp, bool abortscanline); + void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); From 0d6a8e0fb97b50674ac662edb9aff8187c80751d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 10 Dec 2023 19:22:30 -0500 Subject: [PATCH 08/53] ok this one actually works --- src/GPU3D_Soft.cpp | 25 +++++++++++++++++-------- src/GPU3D_Soft.h | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 306d7db1..42117698 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -129,27 +129,36 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) return true; } -s32 SoftRenderer::DoTimingsPixels(u32 pixels, bool odd) +u32 SoftRenderer::DoTimingsPixels(u32 pixels, bool odd) { // return the difference between the old span and the new span if (pixels <= 4) return 0; - u32 pixeltiming = (pixels - 4) * RasterFrac; + u32 pixelsremain = pixels-4; + u32 timinglimit = RasterTimingCap - RasterTimingCounterPrev; + //todo: do this without a for loop somehow. if (odd) { - u32 rasterend = RasterTimingCap - (RasterTimingCounterOdd + RasterTimingCounterPrev); - pixeltiming = rasterend - pixeltiming; + for (; pixelsremain > 0; pixelsremain--) + { + RasterTimingCounterOdd += RasterFrac; + if (RasterTimingCounterOdd >= timinglimit) break; + } } else { - u32 rasterend = RasterTimingCap - (RasterTimingCounterEven + RasterTimingCounterPrev); - pixeltiming = rasterend - pixeltiming; + for (; pixelsremain > 0; pixelsremain--) + { + RasterTimingCounterEven += RasterFrac; + if (RasterTimingCounterEven >= timinglimit) break; + } } - if (pixeltiming > 0) return 0; + + if (pixelsremain <= 0) return 0; GPU.GPU3D.DispCnt |= (1<<12); - return pixels - (((pixeltiming + (RasterFrac-1)) / RasterFrac) + 4); + return pixelsremain; } void SoftRenderer::EndScanline(bool odd) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 43037281..1e6846c2 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -455,7 +455,7 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; bool DoTimings(s32 cycles, bool odd); - s32 DoTimingsPixels(u32 pixels, bool odd); + u32 DoTimingsPixels(u32 pixels, bool odd); void EndScanline(bool odd); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); From 24eecec50f74d410d864435ac47c81b1818f00e4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 12 Dec 2023 00:01:26 -0500 Subject: [PATCH 09/53] implement first draft of improved timing structure --- src/GPU3D.h | 27 ++++++--- src/GPU3D_Soft.cpp | 142 +++++++++++++++++++-------------------------- src/GPU3D_Soft.h | 14 ++--- 3 files changed, 86 insertions(+), 97 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 284e101f..e3e4cc09 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -331,15 +331,24 @@ public: u32 ScrolledLine[256]; }; - // numbers based on 339 poly 64-172 horiz. line poly - static constexpr int RasterFrac = 481; // add a fractional component if pixels is not enough precision - static constexpr int RasterTimingCap = 51116*RasterFrac; - static constexpr int PerScanlineTiming = 1064*RasterFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED - static constexpr int PerScanlineRecup = 2112*RasterFrac; // seems to check out? - static constexpr int PerRightSlope = 1*RasterFrac; - static constexpr int PerPolyTiming = 12*RasterFrac; // should be correct for *most* line polygons and polygons with vertical slopes - static constexpr int PerPixelTiming = 1*RasterFrac; // does not apply to the first 4 pixels in a polygon (per scanline?) - static constexpr int EmptyPolyScanline = 4*RasterFrac - 14; // seems to be slightly under 4? + // rasteriztion timing constants + static constexpr int TimingFrac = 1; // add a fractional component if pixels is not enough precision + + static constexpr int GPU2DSpeedWithinPair = 296 * TimingFrac; + static constexpr int GPU2DSpeedOutsidePair = 948 * TimingFrac; + static constexpr int ScanlinePairLength = 2130 * TimingFrac; + static constexpr int ScanlineTimeout = 2126 * TimingFrac; + static constexpr int InitGPU2DTimeout = 51618 * TimingFrac; + static constexpr int ScanlineBreak = 4 * TimingFrac; + + static constexpr int PerPolyTiming = 12 * TimingFrac; // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPixelTiming = 1 * TimingFrac; // does not apply to the first 4 pixels in a polygon (per scanline?) + + // static constexpr int RasterTimingCap = 51116 * TimingFrac; + static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED + static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation + static constexpr int PerRightSlope = 1 * TimingFrac; + static constexpr int EmptyPolyScanline = 4 * TimingFrac - 14; // seems to be slightly under 4? //static constexpr int FirstPixelTiming; class Renderer3D diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 42117698..47bfa3d1 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -114,16 +114,14 @@ void SoftRenderer::SetThreaded(bool threaded) noexcept bool SoftRenderer::DoTimings(s32 cycles, bool odd) { - if (odd) - { - RasterTimingCounterOdd += cycles; - if ((RasterTimingCounterOdd + RasterTimingCounterPrev) < RasterTimingCap) return false; - } - else - { - RasterTimingCounterEven += cycles; - if ((RasterTimingCounterEven + RasterTimingCounterPrev) < RasterTimingCap) return false; - } + // add timings to a counter and check if underflowed. + + s32* counter; + if (odd) counter = &RasterTimingOdd; + else counter = &RasterTimingEven; + + *counter += cycles; + if (RasterTiming - *counter) return false; GPU.GPU3D.DispCnt |= (1<<12); return true; @@ -131,28 +129,22 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) u32 SoftRenderer::DoTimingsPixels(u32 pixels, bool odd) { - // return the difference between the old span and the new span + // calculate and return the difference between the old span and the new span, while adding timings to the timings counter + + // pixels dont count towards timings if they're the first 4 pixels in a scanline (for some reason?) if (pixels <= 4) return 0; - + u32 pixelsremain = pixels-4; - u32 timinglimit = RasterTimingCap - RasterTimingCounterPrev; + s32* counter; + if (odd) counter = &RasterTimingOdd; + else counter = &RasterTimingEven; + //todo: do this without a for loop somehow. - if (odd) + for (; pixelsremain > 0; pixelsremain--) { - for (; pixelsremain > 0; pixelsremain--) - { - RasterTimingCounterOdd += RasterFrac; - if (RasterTimingCounterOdd >= timinglimit) break; - } - } - else - { - for (; pixelsremain > 0; pixelsremain--) - { - RasterTimingCounterEven += RasterFrac; - if (RasterTimingCounterEven >= timinglimit) break; - } + *counter += TimingFrac; + if (!(RasterTiming - *counter)) break; } if (pixelsremain <= 0) return 0; @@ -165,17 +157,7 @@ void SoftRenderer::EndScanline(bool odd) { if (!odd) { - RasterTimingCounterPrev += std::max(RasterTimingCounterOdd, RasterTimingCounterEven); - RasterTimingCounterPrev -= PerScanlineRecup; // wip - if (RasterTimingCounterPrev < 0) RasterTimingCounterPrev = 0; - // calc is wrong, seems to round up...? - GPU.GPU3D.RDLines = (RasterTimingCap - RasterTimingCounterPrev) / PerScanlineTiming; - // seems to display the lowest scanline buffer count reached during the current frame. - // we also caps it to 46 here, because this reg does that too for some reason. - if (GPU.GPU3D.RDLines > GPU.GPU3D.RDLinesMin) GPU.GPU3D.RDLines = GPU.GPU3D.RDLinesMin; - else if (GPU.GPU3D.RDLines < GPU.GPU3D.RDLinesMin) GPU.GPU3D.RDLinesMin = GPU.GPU3D.RDLines; - RasterTimingCounterOdd = 0; - RasterTimingCounterEven = 0; + RasterTiming += std::max(RasterTimingOdd, RasterTimingEven); } } @@ -1176,8 +1158,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) if (!l_filledge) x = xlimit; else for (; x < xlimit; x++) { - - u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1271,8 +1252,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) if (wireframe && !edge) x = std::max(x, xlimit); else for (; x < xlimit; x++) { - - u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1363,8 +1343,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) if (r_filledge) for (; x < xlimit; x++) { - - u32 pixeladdr = (BufferOffset * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1479,7 +1458,6 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) } } - BufferOffset = (BufferOffset + 1) & 0x7; // loop if == 8 EndScanline(odd); return abort; } @@ -1523,18 +1501,11 @@ u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) return density; } -void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool odd, bool finish) +void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd) { // to consider: // clearing all polygon fog flags if the master flag isn't set? // merging all final pass loops into one? - u8 tempoffset; - if (finish) - tempoffset = (BufferOffset - 2 + (!odd)); - else - tempoffset = (BufferOffset - 4 + (!odd)); - - if (tempoffset > 7) tempoffset -= 0xF8; // handle underflows /*if (GPU.GPU3D.RenderDispCnt & (1<<5)) { @@ -1591,7 +1562,7 @@ void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool o for (int x = 0; x < 256; x++) { - u32 pixeladdr = (tempoffset * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; u32 density, srccolor, srcR, srcG, srcB, srcA; u32 attr = AttrBuffer[pixeladdr]; @@ -1656,7 +1627,7 @@ void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool o for (int x = 0; x < 256; x++) { - u32 pixeladdr = (tempoffset * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1699,19 +1670,20 @@ void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool o } } - if (late) + // if the first two scanlines are late then it's delayed by 48 scanlines + if (false)//late) { memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[rdbufferoffset*ScanlineWidth], 4 * ScanlineWidth); - memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[tempoffset*ScanlineWidth], 4 * ScanlineWidth); + memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * ScanlineWidth); } else { - memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[tempoffset*ScanlineWidth], 4 * ScanlineWidth); + memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * ScanlineWidth); memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[rdbufferoffset*ScanlineWidth], 4 * ScanlineWidth); } } -void SoftRenderer::ClearBuffers(s32 y) +void SoftRenderer::ClearBuffers() { u32 clearz = ((GPU.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; u32 polyid = GPU.GPU3D.RenderClearAttr1 & 0x3F000000; // this sets the opaque polygonID @@ -1721,9 +1693,9 @@ void SoftRenderer::ClearBuffers(s32 y) if (GPU.GPU3D.RenderDispCnt & (1<<14)) { u8 xoff = (GPU.GPU3D.RenderClearAttr2 >> 16) & 0xFF; - u8 yoff = ((GPU.GPU3D.RenderClearAttr2 >> 24) & 0xFF) + y; + u8 yoff = ((GPU.GPU3D.RenderClearAttr2 >> 24) & 0xFF); - for (int i = 0; i < 2; i++) + for (int y = 0; y < 192; y++) { for (int x = 0; x < 256; x++) { @@ -1739,7 +1711,7 @@ void SoftRenderer::ClearBuffers(s32 y) u32 z = ((val3 & 0x7FFF) * 0x200) + 0x1FF; - u32 pixeladdr = ((BufferOffset+i) * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = z; AttrBuffer[pixeladdr] = polyid | (val3 & 0x8000); @@ -1761,11 +1733,11 @@ void SoftRenderer::ClearBuffers(s32 y) polyid |= (GPU.GPU3D.RenderClearAttr1 & 0x8000); - for (int i = 0; i < 2; i++) + for (int y = 0; y < 192; y++) { for (int x = 0; x < 256; x++) { - u32 pixeladdr = ((BufferOffset+i) * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = clearz; AttrBuffer[pixeladdr] = polyid; @@ -1785,24 +1757,36 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) s32 y = 0; s8 prevbufferline = -2; - bool latebuffer[192] = {}; + + u8 buffersize = 0; + RasterTiming = (ScanlinePairLength * 24); + RasterTimingOdd = 0; + RasterTimingEven = 0; + ClearBuffers(); for (u8 quarter = 0; quarter < 4; quarter++) for (u8 bufferline = 0; bufferline < 48; bufferline += 2) { - ClearBuffers(y); - latebuffer[y] = RenderScanline(y, j, true); - latebuffer[y+1] = RenderScanline(y+1, j, false); - + RasterTiming += (ScanlineTimeout); + + if (buffersize >= 50) RasterTiming = (ScanlinePairLength * 23) + ScanlineTimeout; + + RenderScanline(y, j, true); + RenderScanline(y+1, j, false); + RasterTiming += ScanlineBreak; + + u32* RDLinesReg = &GPU.GPU3D.RDLines; + *RDLinesReg = 0; + for (int i = RasterTiming; i > ScanlinePairLength / 2; i -= ScanlinePairLength / 2) *RDLinesReg += 1; + // seems to display the lowest scanline buffer count reached during the current frame. + // we also caps it to 46 here, because this reg does that too for some reason. + if (*RDLinesReg > GPU.GPU3D.RDLinesMin) *RDLinesReg = GPU.GPU3D.RDLinesMin; + else if (*RDLinesReg < GPU.GPU3D.RDLinesMin) GPU.GPU3D.RDLinesMin = *RDLinesReg; if (prevbufferline >= 0) { - if (!latebuffer[y-2]) latebuffer[y] = false; - - if (!latebuffer[y-1]) latebuffer[y+1] = false; - - ScanlineFinalPass(y-2, prevbufferline, latebuffer[y-2], true, false); - ScanlineFinalPass(y-1, prevbufferline+1, latebuffer[y-1], false, false); + ScanlineFinalPass(y-2, prevbufferline, true); + ScanlineFinalPass(y-1, prevbufferline+1, false); } y += 2; @@ -1812,8 +1796,8 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) Platform::Semaphore_Post(Sema_ScanlineCount); } - ScanlineFinalPass(190, prevbufferline, latebuffer[190], true, true); - ScanlineFinalPass(191, prevbufferline+1, latebuffer[191], false, true); + ScanlineFinalPass(190, prevbufferline, true); + ScanlineFinalPass(191, prevbufferline+1, false); if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); @@ -1823,10 +1807,6 @@ void SoftRenderer::VCount144() { if (RenderThreadRunning.load(std::memory_order_relaxed) && !GPU.GPU3D.AbortFrame) Platform::Semaphore_Wait(Sema_RenderDone); - - RasterTimingCounterPrev = 0; - RasterTimingCounterOdd = 0; - RasterTimingCounterEven = 0; } void SoftRenderer::RenderFrame() diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 1e6846c2..5628d73e 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -469,16 +469,17 @@ private: bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); bool RenderScanline(s32 y, int npolys, bool odd); u32 CalculateFogDensity(u32 pixeladdr); - void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool late, bool odd, bool finish); - void ClearBuffers(s32 y); + void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd); + void ClearBuffers(); void RenderPolygons(bool threaded, Polygon** polygons, int npolys); void RenderThreadFunc(); // counters for scanline rasterization timings - s32 RasterTimingCounterPrev = 0; - s32 RasterTimingCounterOdd = 0; - s32 RasterTimingCounterEven = 0; + s32 RasterTiming = 0; + //s32 RasterTimingCounterPrev = 0; + s32 RasterTimingOdd = 0; + s32 RasterTimingEven = 0; // buffer dimensions are 258x194 to add a offscreen 1px border // which simplifies edge marking tests @@ -489,7 +490,7 @@ private: static constexpr int ScanlineWidth = 256; static constexpr int NumScanlines = 192; static constexpr int NumScanlinesRD = 48; - static constexpr int NumScanlinesInternal = 8; + static constexpr int NumScanlinesInternal = 192; static constexpr int InternalBufferSize = ScanlineWidth * NumScanlinesInternal; static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; static constexpr int BufferSize = ScanlineWidth * NumScanlines; @@ -498,7 +499,6 @@ private: u32 ColorBuffer[InternalBufferSize * 2]; u32 DepthBuffer[InternalBufferSize * 2]; u32 AttrBuffer[InternalBufferSize * 2]; - u8 BufferOffset; u32 RDBuffer[RDBufferSize]; u32 FinalBuffer[BufferSize]; From a46316d71f187717ad8f45fcc3c42a6ed3a70c6d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 14 Dec 2023 15:18:39 -0500 Subject: [PATCH 10/53] improved timings for the first 50 scanlines --- src/GPU3D.cpp | 9 ++--- src/GPU3D.h | 21 +++++++---- src/GPU3D_Soft.cpp | 93 ++++++++++++++++++++++++++++++---------------- src/GPU3D_Soft.h | 3 +- 4 files changed, 79 insertions(+), 47 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 12da23db..6fb24979 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -222,8 +222,7 @@ void GPU3D::Reset() noexcept AlphaRefVal = 0; AlphaRef = 0; - RDLines = 46; - RDLinesMin = 46; + RDLinesDisplay = 46; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -2370,7 +2369,7 @@ void GPU3D::CheckFIFODMA() noexcept void GPU3D::VCount144() noexcept { - RDLinesMin = 46; + RDLinesDisplay = 46; CurrentRenderer->VCount144(); } @@ -2614,7 +2613,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLines; // IT IS TIME + return RDLinesDisplay; // IT IS TIME case 0x04000600: { @@ -2658,7 +2657,7 @@ u32 GPU3D::Read32(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLines; // IT IS TIME + return RDLinesDisplay; // IT IS TIME case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index e3e4cc09..57553782 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -245,8 +245,7 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; - u32 RDLines = 0; - u32 RDLinesMin = 0; + u32 RDLinesDisplay = 0; u8 AlphaRefVal = 0; u8 AlphaRef = 0; @@ -334,13 +333,21 @@ public: // rasteriztion timing constants static constexpr int TimingFrac = 1; // add a fractional component if pixels is not enough precision + // GPU 2D read timings, for emulating race conditions static constexpr int GPU2DSpeedWithinPair = 296 * TimingFrac; - static constexpr int GPU2DSpeedOutsidePair = 948 * TimingFrac; - static constexpr int ScanlinePairLength = 2130 * TimingFrac; - static constexpr int ScanlineTimeout = 2126 * TimingFrac; + static constexpr int GPU2DSpeedOutsidePair = 810 * TimingFrac; + static constexpr int GPU2DSpeedReadScanline = 256 * TimingFrac; static constexpr int InitGPU2DTimeout = 51618 * TimingFrac; - static constexpr int ScanlineBreak = 4 * TimingFrac; + // GPU 3D rasterization timings, for emulating the timeout + static constexpr int ScanlinePairLength = 2130 * TimingFrac; + static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? + static constexpr int ScanlineBreak = 4 * TimingFrac; + static constexpr int ScanlineBreak2 = 40 * TimingFrac; + static constexpr int IncrementStrange = 1618 * TimingFrac; // 1882? 1442? 1618?? + static constexpr int FreeTiming = 440 * TimingFrac; + + // GPU 3D rasterization timings II, for counting each element with timing characteristics static constexpr int PerPolyTiming = 12 * TimingFrac; // should be correct for *most* line polygons and polygons with vertical slopes static constexpr int PerPixelTiming = 1 * TimingFrac; // does not apply to the first 4 pixels in a polygon (per scanline?) @@ -348,7 +355,7 @@ public: static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation static constexpr int PerRightSlope = 1 * TimingFrac; - static constexpr int EmptyPolyScanline = 4 * TimingFrac - 14; // seems to be slightly under 4? + static constexpr int EmptyPolyScanline = 4 * TimingFrac;// - 14; // seems to be slightly under 4? //static constexpr int FirstPixelTiming; class Renderer3D diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 47bfa3d1..7cb8002a 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -121,44 +121,43 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) else counter = &RasterTimingEven; *counter += cycles; - if (RasterTiming - *counter) return false; + if (RasterTiming - *counter > 0) return false; GPU.GPU3D.DispCnt |= (1<<12); return true; } -u32 SoftRenderer::DoTimingsPixels(u32 pixels, bool odd) +u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter // pixels dont count towards timings if they're the first 4 pixels in a scanline (for some reason?) if (pixels <= 4) return 0; - u32 pixelsremain = pixels-4; + pixels -= 4; s32* counter; if (odd) counter = &RasterTimingOdd; else counter = &RasterTimingEven; - //todo: do this without a for loop somehow. - for (; pixelsremain > 0; pixelsremain--) + //todo: figure out a faster way to support TimingFrac > 1 without using a for loop somehow. + if constexpr (TimingFrac > 1) + for (; pixels > 0; pixels--) + { + *counter += TimingFrac; + if ((RasterTiming - *counter) <= 0) break; + } + else { - *counter += TimingFrac; - if (!(RasterTiming - *counter)) break; + *counter += pixels; + pixels = -(RasterTiming - *counter); + if (pixels > 0) *counter -= pixels; } - if (pixelsremain <= 0) return 0; + if (pixels <= 0) return 0; GPU.GPU3D.DispCnt |= (1<<12); - return pixelsremain; -} - -void SoftRenderer::EndScanline(bool odd) -{ - if (!odd) - { - RasterTiming += std::max(RasterTimingOdd, RasterTimingEven); - } + return pixels; } void SoftRenderer::TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) @@ -1458,7 +1457,6 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) } } - EndScanline(odd); return abort; } @@ -1758,30 +1756,59 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) s32 y = 0; s8 prevbufferline = -2; - u8 buffersize = 0; - RasterTiming = (ScanlinePairLength * 24); - RasterTimingOdd = 0; - RasterTimingEven = 0; - + s8 buffersize = 0; + RasterTiming = InitialTiming; + s32 timingadvance = InitialTiming; + bool abort = false; + //u32* RDLinesReg = &GPU.GPU3D.RDLines; ClearBuffers(); for (u8 quarter = 0; quarter < 4; quarter++) for (u8 bufferline = 0; bufferline < 48; bufferline += 2) { - RasterTiming += (ScanlineTimeout); + RasterTimingOdd = 0; + RasterTimingEven = 0; + + if (buffersize > 48) + { + RasterTiming = ScanlinePairLength * 23; + timingadvance = 0; + buffersize = 48; + } + if (!abort) RasterTiming += IncrementStrange; + else RasterTiming += ScanlineTimeout; - if (buffersize >= 50) RasterTiming = (ScanlinePairLength * 23) + ScanlineTimeout; + abort = RenderScanline(y, j, true); + abort = RenderScanline(y+1, j, false); - RenderScanline(y, j, true); - RenderScanline(y+1, j, false); - RasterTiming += ScanlineBreak; + buffersize += 2; + //RasterTiming += ScanlineBreak; + s32 timespent = std::max(RasterTimingOdd, RasterTimingEven); - u32* RDLinesReg = &GPU.GPU3D.RDLines; - *RDLinesReg = 0; - for (int i = RasterTiming; i > ScanlinePairLength / 2; i -= ScanlinePairLength / 2) *RDLinesReg += 1; + /*if (timespent > FreeTiming) + { + abort = true; + timespent -= FreeTiming; + } + else if (!abort) + { + abort = false; + timespent -= FreeTiming; + }*/ + //if (!abort) + timespent -= FreeTiming; + + if (timespent > 0) + { + RasterTiming -= timespent; + timingadvance -= timespent; + } + + if (timingadvance < 0) for (s32 i = (ScanlinePairLength / 2) * buffersize; i > RasterTiming + (ScanlinePairLength / 2); i -= ScanlinePairLength / 2) buffersize -= 1; + if (buffersize < 0) buffersize = 0; + // seems to display the lowest scanline buffer count reached during the current frame. // we also caps it to 46 here, because this reg does that too for some reason. - if (*RDLinesReg > GPU.GPU3D.RDLinesMin) *RDLinesReg = GPU.GPU3D.RDLinesMin; - else if (*RDLinesReg < GPU.GPU3D.RDLinesMin) GPU.GPU3D.RDLinesMin = *RDLinesReg; + if (quarter >= 1 && buffersize < GPU.GPU3D.RDLinesDisplay) GPU.GPU3D.RDLinesDisplay = buffersize; if (prevbufferline >= 0) { diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 5628d73e..01187a8a 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -455,8 +455,7 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; bool DoTimings(s32 cycles, bool odd); - u32 DoTimingsPixels(u32 pixels, bool odd); - void EndScanline(bool odd); + u32 DoTimingsPixels(s32 pixels, bool odd); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); From 2217a34d3912d71a87e91cf90b04fb49cd329808 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 14 Dec 2023 23:00:12 -0500 Subject: [PATCH 11/53] misc improvements --- src/GPU3D.h | 19 ++++++++++++------- src/GPU3D_Soft.cpp | 30 +++++++++++++++++------------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 57553782..2dfacdc0 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -341,15 +341,20 @@ public: // GPU 3D rasterization timings, for emulating the timeout static constexpr int ScanlinePairLength = 2130 * TimingFrac; - static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? - static constexpr int ScanlineBreak = 4 * TimingFrac; - static constexpr int ScanlineBreak2 = 40 * TimingFrac; - static constexpr int IncrementStrange = 1618 * TimingFrac; // 1882? 1442? 1618?? - static constexpr int FreeTiming = 440 * TimingFrac; + //static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? + //static constexpr int ScanlineBreak = 4 * TimingFrac; + //static constexpr int ScanlineBreak2 = 40 * TimingFrac; + static constexpr int ScanlineIncrement = 1618 * TimingFrac; // how much to increment per scanline pair + static constexpr int AbortIncrement = 12 * TimingFrac; // how much extra to increment after an aborted scanline (total 1630) + static constexpr int FreeTiming = 496 * TimingFrac; // every scanline has a free 496 pixels worth of timing for some reason. + static constexpr int InitialTiming = 48688 * TimingFrac; // add 1618*2 to get the timeout of the second scanline pair + static constexpr int Post50Max = 51116 * TimingFrac; // for some reason it doesn't care about how full it actually is, it just cares about if its the first 50 scanlines to speedrun rendering? // GPU 3D rasterization timings II, for counting each element with timing characteristics - static constexpr int PerPolyTiming = 12 * TimingFrac; // should be correct for *most* line polygons and polygons with vertical slopes - static constexpr int PerPixelTiming = 1 * TimingFrac; // does not apply to the first 4 pixels in a polygon (per scanline?) + static constexpr int FirstPolyScanline = 0 * TimingFrac; + static constexpr int PerPolyScanline = 12 * TimingFrac; // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 pixel = 1 pixel + static constexpr int NumFreePixels = 4; // First 4 pixels in a polygon scanline are free (for some reason) // static constexpr int RasterTimingCap = 51116 * TimingFrac; static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 7cb8002a..af23132d 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -993,7 +993,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) CheckSlope(rp, y); - if (DoTimings(PerPolyTiming, odd)) + if (DoTimings(PerPolyScanline, odd)) { Step(rp); return true; @@ -1439,18 +1439,19 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) RendererPolygon* rp = &PolygonList[i]; Polygon* polygon = rp->PolyData; - if (abort) - { - CheckSlope(rp, y); - Step(rp); - } - else if (y == polygon->YBottom && y != polygon->YTop) + if (y == polygon->YBottom && y != polygon->YTop) { if (DoTimings(EmptyPolyScanline, odd)) abort = true; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - if (polygon->IsShadowMask) + if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + if (abort) + { + CheckSlope(rp, y); + Step(rp); + } + else if (polygon->IsShadowMask) ;//RenderShadowMaskScanline(rp, y); else if (RenderPolygonScanline(rp, y, odd)) abort = true; @@ -1768,17 +1769,18 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) RasterTimingOdd = 0; RasterTimingEven = 0; - if (buffersize > 48) + RasterTiming += ScanlineIncrement; + if (abort) RasterTiming += AbortIncrement; // if previous scanline was aborted, allow an extra 12 pixels worth of timing + + if (y >= 50) { - RasterTiming = ScanlinePairLength * 23; + if (RasterTiming > Post50Max) RasterTiming = Post50Max; timingadvance = 0; buffersize = 48; } - if (!abort) RasterTiming += IncrementStrange; - else RasterTiming += ScanlineTimeout; abort = RenderScanline(y, j, true); - abort = RenderScanline(y+1, j, false); + abort |= RenderScanline(y+1, j, false); buffersize += 2; //RasterTiming += ScanlineBreak; @@ -1795,6 +1797,8 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) timespent -= FreeTiming; }*/ //if (!abort) + //if (buffersize > 48) timespent -= PerScanlineRecup; + /*else*/ timespent -= FreeTiming; if (timespent > 0) From 1054011c90973278f8df1e658d59c7187f8805c4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 19 Dec 2023 22:52:54 -0500 Subject: [PATCH 12/53] wip --- src/GPU3D.cpp | 8 +- src/GPU3D.h | 53 +++++++----- src/GPU3D_Soft.cpp | 203 ++++++++++++++++++++++++++++----------------- src/GPU3D_Soft.h | 6 +- 4 files changed, 170 insertions(+), 100 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 6fb24979..056d5735 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -222,7 +222,7 @@ void GPU3D::Reset() noexcept AlphaRefVal = 0; AlphaRef = 0; - RDLinesDisplay = 46; + RDLines = 46; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -2369,7 +2369,7 @@ void GPU3D::CheckFIFODMA() noexcept void GPU3D::VCount144() noexcept { - RDLinesDisplay = 46; + RDLines = 46; CurrentRenderer->VCount144(); } @@ -2613,7 +2613,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLinesDisplay; // IT IS TIME + return RDLines; // IT IS TIME case 0x04000600: { @@ -2657,7 +2657,7 @@ u32 GPU3D::Read32(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLinesDisplay; // IT IS TIME + return RDLines; // IT IS TIME case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index 2dfacdc0..6413935e 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -245,7 +245,7 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; - u32 RDLinesDisplay = 0; + u32 RDLines = 0; u8 AlphaRefVal = 0; u8 AlphaRef = 0; @@ -334,34 +334,47 @@ public: static constexpr int TimingFrac = 1; // add a fractional component if pixels is not enough precision // GPU 2D read timings, for emulating race conditions - static constexpr int GPU2DSpeedWithinPair = 296 * TimingFrac; - static constexpr int GPU2DSpeedOutsidePair = 810 * TimingFrac; - static constexpr int GPU2DSpeedReadScanline = 256 * TimingFrac; - static constexpr int InitGPU2DTimeout = 51618 * TimingFrac; + static constexpr int GPU2DSpeedWithinPair = 296 * TimingFrac; // the delay between finishing reading the first scanline and beginning reading the second scanline of a scanline pair. + static constexpr int GPU2DSpeedOutsidePair = 810 * TimingFrac; // the delay between finishing reading a pair and beginning reading a new pair. + static constexpr int GPU2DReadScanline = 256 * TimingFrac; // the time it takes to read a scanline. + static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // notably the same as the scanline increment. + static constexpr int InitGPU2DTimeout = 50000 * TimingFrac; // 51618? | when it starts reading the first scanline. + static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 48 * TimingFrac; // time to read 48 scanlines. // GPU 3D rasterization timings, for emulating the timeout - static constexpr int ScanlinePairLength = 2130 * TimingFrac; + + //static constexpr int ScanlinePairLength = 2130 * TimingFrac; //static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? //static constexpr int ScanlineBreak = 4 * TimingFrac; //static constexpr int ScanlineBreak2 = 40 * TimingFrac; - static constexpr int ScanlineIncrement = 1618 * TimingFrac; // how much to increment per scanline pair - static constexpr int AbortIncrement = 12 * TimingFrac; // how much extra to increment after an aborted scanline (total 1630) - static constexpr int FreeTiming = 496 * TimingFrac; // every scanline has a free 496 pixels worth of timing for some reason. - static constexpr int InitialTiming = 48688 * TimingFrac; // add 1618*2 to get the timeout of the second scanline pair - static constexpr int Post50Max = 51116 * TimingFrac; // for some reason it doesn't care about how full it actually is, it just cares about if its the first 50 scanlines to speedrun rendering? + //static constexpr int FakeTiming = 2 * TimingFrac; + //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. + static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair + static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, it just cares about if its the first 50 scanlines to speedrun rendering? + static constexpr int FreeTiming = 496 * TimingFrac; // 496 | every scanline has a free 496 pixels worth of timing for some reason. + static constexpr int ScanlineIncrement = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair + static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 1630) // GPU 3D rasterization timings II, for counting each element with timing characteristics - static constexpr int FirstPolyScanline = 0 * TimingFrac; - static constexpr int PerPolyScanline = 12 * TimingFrac; // should be correct for *most* line polygons and polygons with vertical slopes - static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 pixel = 1 pixel - static constexpr int NumFreePixels = 4; // First 4 pixels in a polygon scanline are free (for some reason) + + //static constexpr int FirstPolyScanline = 0 * TimingFrac; + static constexpr int PerPolyScanline = 12 * TimingFrac; // 12 | should be 12, but 14 is "correct" // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 | 1 pixel = 1 pixel + static constexpr int NumFreePixels = 4; // 4 | First 4 pixels in a polygon scanline are free (for some reason) + static constexpr int MinToStartPoly = 2 * TimingFrac; // 1 | if there is not 1 cycle remaining, do not bother rendering polygon (CHECKME: I dont think this should decrement timings by anything?) + static constexpr int EmptyPolyScanline = 4 * TimingFrac; // - 14; // 4 | seems to be slightly under 4 px? + + // GPU 3D rasterization timing III, for first polygon exclusive timing characteristics + // should be done first, as these are "async" pre-calcs of polygon attributes + + static constexpr int FirstVSlope = 0 * TimingFrac; // 1 | the first polygon in a scanline having two vertical slopes adds 1 to timings...? + static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; - static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED - static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation - static constexpr int PerRightSlope = 1 * TimingFrac; - static constexpr int EmptyPolyScanline = 4 * TimingFrac;// - 14; // seems to be slightly under 4? - //static constexpr int FirstPixelTiming; + // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED + // static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation + // static constexpr int PerRightSlope = 1 * TimingFrac; + // static constexpr int FirstPixelTiming; class Renderer3D { diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index af23132d..688785d0 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -127,20 +127,32 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) return true; } +bool SoftRenderer::CheckTimings(s32 cycles, bool odd) +{ + // check if there are 'cycles' amount of cycles remaining. + + s32* counter; + if (odd) counter = &RasterTimingOdd; + else counter = &RasterTimingEven; + + if (RasterTiming - *counter >= cycles) return true; + else return false; +} + u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter // pixels dont count towards timings if they're the first 4 pixels in a scanline (for some reason?) - if (pixels <= 4) return 0; + if (pixels <= NumFreePixels) return 0; - pixels -= 4; + pixels -= NumFreePixels; s32* counter; if (odd) counter = &RasterTimingOdd; else counter = &RasterTimingEven; - //todo: figure out a faster way to support TimingFrac > 1 without using a for loop somehow. + //todo: figure out a faster way to support TimingFrac > 1 without using a for loop somehow. (fingers crossed we dont have to!) if constexpr (TimingFrac > 1) for (; pixels > 0; pixels--) { @@ -160,6 +172,26 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) return pixels; } +bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd) +{ + // determine the timing impact of the first polygon's slopes. + + Polygon* polygon = rp->PolyData; + + if (polygon->YTop == polygon->YBottom) return false; + if (y == polygon->YTop) return false; + + s32* counter; + if (odd) counter = &RasterTimingOdd; + else counter = &RasterTimingEven; + + if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *counter += 1; + + if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *counter += 1; + + return DoTimings(2, odd); // CHECKME: does this need to be done time its incremented here? +} + void SoftRenderer::TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) { u32 vramaddr = (texparam & 0xFFFF) << 3; @@ -744,7 +776,7 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } -void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) +bool SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; @@ -766,19 +798,8 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) memset(&StencilBuffer[256 * (y&0x1)], 0, 256); PrevIsShadowMask = true; - - if (polygon->YTop != polygon->YBottom) - { - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) - { - SetupPolygonLeftEdge(rp, y); - } - - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) - { - SetupPolygonRightEdge(rp, y); - } - } + + CheckSlope(rp, y); Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -787,6 +808,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; + bool abortscanline; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -870,7 +892,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // similarly, we can perform alpha test early (checkme) if (wireframe) polyalpha = 31; - if (polyalpha <= GPU.GPU3D.RenderAlphaRef) return; + if (polyalpha <= GPU.GPU3D.RenderAlphaRef) return false; // TODO: check how this impacts timings? // in wireframe mode, there are special rules for equal Z (TODO) @@ -880,10 +902,23 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) int edge; s32 x = xstart; - Interpolator<0> interpX(xstart, xend+1, wl, wr); + xend += 1; + Interpolator<0> interpX(xstart, xend, wl, wr); if (x < 0) x = 0; s32 xlimit; + if (xend > 256) xend = 256; + + // determine if the span can be rendered within the time allotted to the scanline + // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. + s32 diff = DoTimingsPixels(xend-x, odd); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } + else abortscanline = false; // for shadow masks: set stencil bits where the depth test fails. // draw nothing. @@ -891,8 +926,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // part 1: left edge edge = yedge | 0x1; xlimit = xstart+l_edgelen; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + if (xlimit > xend) xlimit = xend; if (!l_filledge) x = xlimit; else @@ -918,9 +952,8 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // part 2: polygon inside edge = yedge; - xlimit = xend-r_edgelen+1; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend-r_edgelen; + if (xlimit > xend) xlimit = xend; if (wireframe && !edge) x = std::max(x, xlimit); else for (; x < xlimit; x++) { @@ -944,8 +977,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // part 3: right edge edge = yedge | 0x2; - xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend; if (r_filledge) for (; x < xlimit; x++) @@ -967,9 +999,9 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) StencilBuffer[256*(y&0x1) + x] |= 0x2; } } - - rp->XL = rp->SlopeL.Step(); - rp->XR = rp->SlopeR.Step(); + + Step(rp); + return abortscanline; } bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) @@ -993,12 +1025,6 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) CheckSlope(rp, y); - if (DoTimings(PerPolyScanline, odd)) - { - Step(rp); - return true; - } - Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; bool l_filledge, r_filledge; @@ -1006,7 +1032,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; - bool abortscanline = false; // to abort the rest of the scanline after finishing this polygon + bool abortscanline; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -1142,6 +1168,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) r_edgelen -= diff; abortscanline = true; } + else abortscanline = false; // part 1: left edge edge = yedge | 0x1; @@ -1434,6 +1461,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) { bool abort = false; + bool first = true; for (int i = 0; i < npolys; i++) { RendererPolygon* rp = &PolygonList[i]; @@ -1441,20 +1469,29 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) if (y == polygon->YBottom && y != polygon->YTop) { - if (DoTimings(EmptyPolyScanline, odd)) abort = true; + if (!abort) abort = (first && DoTimings(FirstNull, odd)) || DoTimings(EmptyPolyScanline, odd); + + first = false; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + + if (!abort) abort = (first && DoTimingsSlopes(rp, y, odd)) // incorrect. needs research; behavior is strange... + || DoTimings(PerPolyScanline, odd) + || (!CheckTimings(MinToStartPoly, odd)); + if (abort) { CheckSlope(rp, y); Step(rp); } else if (polygon->IsShadowMask) - ;//RenderShadowMaskScanline(rp, y); + abort = RenderShadowMaskScanline(rp, y, odd); else - if (RenderPolygonScanline(rp, y, odd)) abort = true; + abort = RenderPolygonScanline(rp, y, odd); + + first = false; } } @@ -1500,7 +1537,7 @@ u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) return density; } -void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd) +void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone) { // to consider: // clearing all polygon fog flags if the master flag isn't set? @@ -1759,10 +1796,13 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) s8 buffersize = 0; RasterTiming = InitialTiming; - s32 timingadvance = InitialTiming; bool abort = false; - //u32* RDLinesReg = &GPU.GPU3D.RDLines; ClearBuffers(); + s32 gpu2dtracking = InitGPU2DTimeout; + s32 gpu2dfreetime = InitGPU2DTimeout; + s32 prev2dtime; + bool readodd = true; + for (u8 quarter = 0; quarter < 4; quarter++) for (u8 bufferline = 0; bufferline < 48; bufferline += 2) { @@ -1770,13 +1810,19 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) RasterTimingEven = 0; RasterTiming += ScanlineIncrement; + gpu2dtracking += GPU2DReadSLPair; if (abort) RasterTiming += AbortIncrement; // if previous scanline was aborted, allow an extra 12 pixels worth of timing if (y >= 50) { - if (RasterTiming > Post50Max) RasterTiming = Post50Max; - timingadvance = 0; - buffersize = 48; + gpu2dfreetime = 0; + if (RasterTiming > Post50Max) + { + s32 temp = RasterTiming - Post50Max; + RasterTiming = Post50Max; + gpu2dtracking -= temp; + } + if (buffersize > 48) buffersize = 48; } abort = RenderScanline(y, j, true); @@ -1785,50 +1831,59 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) buffersize += 2; //RasterTiming += ScanlineBreak; s32 timespent = std::max(RasterTimingOdd, RasterTimingEven); - - /*if (timespent > FreeTiming) - { - abort = true; - timespent -= FreeTiming; - } - else if (!abort) - { - abort = false; - timespent -= FreeTiming; - }*/ - //if (!abort) - //if (buffersize > 48) timespent -= PerScanlineRecup; - /*else*/ + timespent -= FreeTiming; - if (timespent > 0) - { - RasterTiming -= timespent; - timingadvance -= timespent; - } + // measure scanlines being read here. + gpu2dtracking -= timespent; + gpu2dfreetime -= timespent; - if (timingadvance < 0) for (s32 i = (ScanlinePairLength / 2) * buffersize; i > RasterTiming + (ScanlinePairLength / 2); i -= ScanlinePairLength / 2) buffersize -= 1; - if (buffersize < 0) buffersize = 0; + if (timespent > 0) RasterTiming -= timespent; + + //if (RasterTiming < 0) RasterTiming = 0; + if (gpu2dfreetime <= 0) + { + buffersize = 0; + if (gpu2dtracking > 0) + { + s32 i = gpu2dtracking; + while (true) + { + s32 comp = GPU2DReadSLPair/2; + //if (readodd) comp = GPU2DSpeedOutsidePair + GPU2DReadScanline; + //else comp = GPU2DSpeedWithinPair + GPU2DReadScanline; - // seems to display the lowest scanline buffer count reached during the current frame. - // we also caps it to 46 here, because this reg does that too for some reason. - if (quarter >= 1 && buffersize < GPU.GPU3D.RDLinesDisplay) GPU.GPU3D.RDLinesDisplay = buffersize; + if (i < comp) break; + + i -= comp; + buffersize++; + //readodd = !readodd; + } + + if (i > 0) buffersize++; + } + + // seems to display the lowest scanline buffer count reached during the current frame. + // we also caps it to 46 here, because this reg does that too for some reason. + if (GPU.GPU3D.RDLines > buffersize) GPU.GPU3D.RDLines = buffersize; + } if (prevbufferline >= 0) { - ScanlineFinalPass(y-2, prevbufferline, true); - ScanlineFinalPass(y-1, prevbufferline+1, false); + ScanlineFinalPass(y-2, prevbufferline, true, prev2dtime); + ScanlineFinalPass(y-1, prevbufferline+1, false, prev2dtime); } y += 2; prevbufferline = bufferline; + prev2dtime = gpu2dtracking; if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); } - ScanlineFinalPass(190, prevbufferline, true); - ScanlineFinalPass(191, prevbufferline+1, false); + ScanlineFinalPass(190, prevbufferline, true, prev2dtime); + ScanlineFinalPass(191, prevbufferline+1, false, prev2dtime); if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 01187a8a..4b9b31eb 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -455,7 +455,9 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; bool DoTimings(s32 cycles, bool odd); + bool CheckTimings(s32 cycles, bool odd); u32 DoTimingsPixels(s32 pixels, bool odd); + bool DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -464,11 +466,11 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon); void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); + bool RenderShadowMaskScanline(RendererPolygon* rp, s32 y, bool odd); bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); bool RenderScanline(s32 y, int npolys, bool odd); u32 CalculateFogDensity(u32 pixeladdr); - void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd); + void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone); void ClearBuffers(); void RenderPolygons(bool threaded, Polygon** polygons, int npolys); From 8cc42490ded8119f30551adff9dd1f0ab71a0618 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 20 Dec 2023 21:51:33 -0500 Subject: [PATCH 13/53] fix build but also sw renderer crashes now --- src/GPU3D.h | 2 +- src/GPU3D_Soft.cpp | 44 ++++++++++++++++++++++---------------------- src/GPU3D_Soft.h | 16 ++++++++-------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index ce8fedf3..fd568883 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -366,7 +366,7 @@ public: // GPU 3D rasterization timing III, for first polygon exclusive timing characteristics // should be done first, as these are "async" pre-calcs of polygon attributes - static constexpr int FirstVSlope = 0 * TimingFrac; // 1 | the first polygon in a scanline having two vertical slopes adds 1 to timings...? + static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. (see DoTimingsSlopes in GPU3D_Soft.cpp for more info) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 7a5722db..89bd84bc 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -114,7 +114,7 @@ void SoftRenderer::SetThreaded(bool threaded, GPU& gpu) noexcept } } -bool SoftRenderer::DoTimings(s32 cycles, bool odd) +bool SoftRenderer::DoTimings(GPU3D& gpu3d, s32 cycles, bool odd) { // add timings to a counter and check if underflowed. @@ -125,7 +125,7 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) *counter += cycles; if (RasterTiming - *counter > 0) return false; - GPU.GPU3D.DispCnt |= (1<<12); + gpu3d.DispCnt |= (1<<12); return true; } @@ -141,7 +141,7 @@ bool SoftRenderer::CheckTimings(s32 cycles, bool odd) else return false; } -u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) +u32 SoftRenderer::DoTimingsPixels(GPU3D& gpu3d, s32 pixels, bool odd) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter @@ -170,31 +170,31 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) if (pixels <= 0) return 0; - GPU.GPU3D.DispCnt |= (1<<12); + gpu3d.DispCnt |= (1<<12); return pixels; } -bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::DoTimingsSlopes(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) { // determine the timing impact of the first polygon's slopes. Polygon* polygon = rp->PolyData; - if (polygon->YTop == polygon->YBottom) return false; + if (polygon->YTop == polygon->YBottom) return false; // 0 px tall line polygons do not have slopes, and thus no timing penalty if (y == polygon->YTop) return false; s32* counter; if (odd) counter = &RasterTimingOdd; else counter = &RasterTimingEven; - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *counter += 1; + if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *counter += FirstPerSlope; - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *counter += 1; + if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *counter += FirstPerSlope; - return DoTimings(2, odd); // CHECKME: does this need to be done time its incremented here? + return DoTimings(gpu3d, FirstPerSlope*2, odd); // CHECKME: does this need to be done every time its incremented here? does this even need to be done *at all?* } -void SoftRenderer::TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) +void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const { u32 vramaddr = (texparam & 0xFFFF) << 3; @@ -778,7 +778,7 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } -bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; @@ -913,7 +913,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* // determine if the span can be rendered within the time allotted to the scanline // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. - s32 diff = DoTimingsPixels(xend-x, odd); + s32 diff = DoTimingsPixels(gpu3d, xend-x, odd); if (diff != 0) { xend -= diff; @@ -1006,7 +1006,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* return abortscanline; } -bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderPolygonScanline(GPU& gpu, RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; u32 polyattr = (polygon->Attr & 0x3F008000); @@ -1163,7 +1163,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 if (xend > 256) xend = 256; // determine if the span can be rendered within the time allotted to the scanline - s32 diff = DoTimingsPixels(xend-x, odd); + s32 diff = DoTimingsPixels(gpu.GPU3D, xend-x, odd); if (diff != 0) { xend -= diff; @@ -1460,7 +1460,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 return abortscanline; } -bool SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd) +bool SoftRenderer::RenderScanline(GPU& gpu, s32 y, int npolys, bool odd) { bool abort = false; bool first = true; @@ -1471,16 +1471,16 @@ bool SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd) if (y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && DoTimings(FirstNull, odd)) || DoTimings(EmptyPolyScanline, odd); + if (!abort) abort = (first && DoTimings(gpu.GPU3D, FirstNull, odd)) || DoTimings(gpu.GPU3D, EmptyPolyScanline, odd); first = false; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + //if (y == polygon->YTop) if(DoTimings(gpu.GPU3D, FirstPolyScanline, odd)) abort = true; - if (!abort) abort = (first && DoTimingsSlopes(rp, y, odd)) // incorrect. needs research; behavior is strange... - || DoTimings(PerPolyScanline, odd) + if (!abort) abort = (first && DoTimingsSlopes(gpu.GPU3D, rp, y, odd)) // incorrect. needs research; behavior is strange... + || DoTimings(gpu.GPU3D, PerPolyScanline, odd) || (!CheckTimings(MinToStartPoly, odd)); if (abort) @@ -1784,7 +1784,7 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } } -void SoftRenderer::RenderPolygons(const GPU& gpu, bool threaded, Polygon** polygons, int npolys) +void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, int npolys) { int j = 0; for (int i = 0; i < npolys; i++) @@ -1799,7 +1799,7 @@ void SoftRenderer::RenderPolygons(const GPU& gpu, bool threaded, Polygon** polyg s8 buffersize = 0; RasterTiming = InitialTiming; bool abort = false; - ClearBuffers(); + ClearBuffers(gpu); s32 gpu2dtracking = InitGPU2DTimeout; s32 gpu2dfreetime = InitGPU2DTimeout; s32 prev2dtime; @@ -1867,7 +1867,7 @@ void SoftRenderer::RenderPolygons(const GPU& gpu, bool threaded, Polygon** polyg // seems to display the lowest scanline buffer count reached during the current frame. // we also caps it to 46 here, because this reg does that too for some reason. - if (GPU.GPU3D.RDLines > buffersize) GPU.GPU3D.RDLines = buffersize; + if (gpu.GPU3D.RDLines > buffersize) gpu.GPU3D.RDLines = buffersize; } for (s32 y = 1; y < 192; y++) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index df6e1d94..7f062cf1 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -453,10 +453,10 @@ private: }; RendererPolygon PolygonList[2048]; - bool DoTimings(s32 cycles, bool odd); + bool DoTimings(GPU3D& gpu3d, s32 cycles, bool odd); bool CheckTimings(s32 cycles, bool odd); - u32 DoTimingsPixels(s32 pixels, bool odd); - bool DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd); + u32 DoTimingsPixels(GPU3D& gpu3d, s32 pixels, bool odd); + bool DoTimingsSlopes(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -465,13 +465,13 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); - bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, bool odd); - bool RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd); + bool RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); + bool RenderPolygonScanline(GPU& gpu, RendererPolygon* rp, s32 y, bool odd); + bool RenderScanline(GPU& gpu, s32 y, int npolys, bool odd); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone); - void ClearBuffers((const GPU& gpu); - void RenderPolygons(const GPU& gpu, bool threaded, Polygon** polygons, int npolys); + void ClearBuffers(const GPU& gpu); + void RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, int npolys); void RenderThreadFunc(GPU& gpu); From 6cee0a7ad780ff74b49036e6a489db72360094c4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 20 Dec 2023 23:15:07 -0500 Subject: [PATCH 14/53] no idea how that one slipped in --- src/GPU3D_Soft.cpp | 4 ++-- src/GPU3D_Soft.h | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 89bd84bc..98101184 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1870,7 +1870,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, i if (gpu.GPU3D.RDLines > buffersize) gpu.GPU3D.RDLines = buffersize; } - for (s32 y = 1; y < 192; y++) + if (prevbufferline >= 0) { ScanlineFinalPass(gpu.GPU3D, y-2, prevbufferline, true, prev2dtime); ScanlineFinalPass(gpu.GPU3D, y-1, prevbufferline+1, false, prev2dtime); @@ -1880,7 +1880,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, i prevbufferline = bufferline; prev2dtime = gpu2dtracking; - if (threaded) + if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 7f062cf1..ac3d8e72 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -490,15 +490,13 @@ private: static constexpr int ScanlineWidth = 256; static constexpr int NumScanlines = 192; static constexpr int NumScanlinesRD = 48; - static constexpr int NumScanlinesInternal = 192; - static constexpr int InternalBufferSize = ScanlineWidth * NumScanlinesInternal; static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; static constexpr int BufferSize = ScanlineWidth * NumScanlines; static constexpr int FirstPixelOffset = 0; - u32 ColorBuffer[InternalBufferSize * 2]; - u32 DepthBuffer[InternalBufferSize * 2]; - u32 AttrBuffer[InternalBufferSize * 2]; + u32 ColorBuffer[BufferSize * 2]; + u32 DepthBuffer[BufferSize * 2]; + u32 AttrBuffer[BufferSize * 2]; u32 RDBuffer[RDBufferSize]; u32 FinalBuffer[BufferSize]; From 78da2846e6e7e0765febc28550b72b2040723ee1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 23 Dec 2023 00:38:39 -0500 Subject: [PATCH 15/53] wip - rewrite 3 - scheduler edition --- src/GPU3D.h | 59 +++++--- src/GPU3D_Soft.cpp | 355 ++++++++++++++++++++++++++++++++++----------- src/GPU3D_Soft.h | 43 ++++-- 3 files changed, 333 insertions(+), 124 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 7e8048f0..03dfa4f0 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -329,18 +329,25 @@ public: u32 ScrolledLine[256]; }; - // rasteriztion timing constants + // Rasterization Timing Constants + static constexpr int TimingFrac = 1; // add a fractional component if pixels is not enough precision - // GPU 2D read timings, for emulating race conditions + // GPU 2D Read Timings: For Emulating Buffer Read/Write Race Conditions + static constexpr int DelayBetweenReads = 809 * TimingFrac; + static constexpr int ScanlineReadSpeed = 256 * TimingFrac; + static constexpr int ScanlineReadInc = DelayBetweenReads + ReadScanline; - static constexpr int GPU2DSpeedWithinPair = 296 * TimingFrac; // the delay between finishing reading the first scanline and beginning reading the second scanline of a scanline pair. - static constexpr int GPU2DSpeedOutsidePair = 810 * TimingFrac; // the delay between finishing reading a pair and beginning reading a new pair. - static constexpr int GPU2DReadScanline = 256 * TimingFrac; // the time it takes to read a scanline. - static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // notably the same as the scanline increment. - static constexpr int InitGPU2DTimeout = 50000 * TimingFrac; // 51618? | when it starts reading the first scanline. - static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 48 * TimingFrac; // time to read 48 scanlines. - // GPU 3D rasterization timings, for emulating the timeout + + static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. + static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline + // and beginning reading the second scanline of a scanline pair. + static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. + static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. + static constexpr int InitGPU2DTimeout = 51874 * TimingFrac; // 51618? | when it starts reading the first scanline. + static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. + + // GPU 3D Rasterization Timings: For Emulating Scanline Timeout //static constexpr int ScanlinePairLength = 2130 * TimingFrac; //static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? @@ -349,24 +356,36 @@ public: //static constexpr int FakeTiming = 2 * TimingFrac; //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair - static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, it just cares about if its the first 50 scanlines to speedrun rendering? - static constexpr int FreeTiming = 496 * TimingFrac; // 496 | every scanline has a free 496 pixels worth of timing for some reason. - static constexpr int ScanlineIncrement = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair - static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 1630) + static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, + // it just cares about if its the first 50 scanlines to speedrun rendering? + static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress + // (can be interpreted as the minimum amount of cycles for the next scanline + // pair to start after the previous pair began) (related to final pass?) + static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair + static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" + static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) + // (why does the next pair get more time if the previous scanline is aborted?) + static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set + static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) + // (Amount of time before the end of the cycle a scanline must abort?) + static constexpr int FinishScanline = 512 * TimingFrac; - // GPU 3D rasterization timings II, for counting each element with timing characteristics + // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors //static constexpr int FirstPolyScanline = 0 * TimingFrac; - static constexpr int PerPolyScanline = 12 * TimingFrac; // 12 | should be 12, but 14 is "correct" // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPolyScanline = 12 * TimingFrac; // 12 | The basic timing cost for polygons. Applies per polygon per scanline. static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 | 1 pixel = 1 pixel static constexpr int NumFreePixels = 4; // 4 | First 4 pixels in a polygon scanline are free (for some reason) - static constexpr int MinToStartPoly = 2 * TimingFrac; // 1 | if there is not 1 cycle remaining, do not bother rendering polygon (CHECKME: I dont think this should decrement timings by anything?) - static constexpr int EmptyPolyScanline = 4 * TimingFrac; // - 14; // 4 | seems to be slightly under 4 px? + static constexpr int MinToStartPoly = 2 * TimingFrac; // 1 | if there aren't 2 (why two?) cycles remaining after the polygon timing penalty, + // do not bother rendering the polygon (CHECKME: I dont think this should decrement timings by anything?) + static constexpr int EmptyPolyScanline = 4 * TimingFrac; // 4 | the ignored "empty" bottom-most scanline of a polygon + // which shouldn't be rendered for some reason has timing characteristics. - // GPU 3D rasterization timing III, for first polygon exclusive timing characteristics - // should be done first, as these are "async" pre-calcs of polygon attributes + // GPU 3D Rasterization Timings III, For First Polygon "Pre-Calc" Timings + // should be added before other timings, as these are "async" pre-calcs of polygon attributes - static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. (see DoTimingsSlopes in GPU3D_Soft.cpp for more info) + static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. + // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 98101184..10bbd053 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -114,7 +114,7 @@ void SoftRenderer::SetThreaded(bool threaded, GPU& gpu) noexcept } } -bool SoftRenderer::DoTimings(GPU3D& gpu3d, s32 cycles, bool odd) +bool SoftRenderer::DoTimings(s32 cycles, bool odd) { // add timings to a counter and check if underflowed. @@ -123,9 +123,8 @@ bool SoftRenderer::DoTimings(GPU3D& gpu3d, s32 cycles, bool odd) else counter = &RasterTimingEven; *counter += cycles; - if (RasterTiming - *counter > 0) return false; + if (RasterTiming + *counter > ScanlineTimeout) return false; - gpu3d.DispCnt |= (1<<12); return true; } @@ -137,11 +136,11 @@ bool SoftRenderer::CheckTimings(s32 cycles, bool odd) if (odd) counter = &RasterTimingOdd; else counter = &RasterTimingEven; - if (RasterTiming - *counter >= cycles) return true; + if (ScanlineTimeout - (RasterTiming + *counter) >= cycles) return true; else return false; } -u32 SoftRenderer::DoTimingsPixels(GPU3D& gpu3d, s32 pixels, bool odd) +u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter @@ -154,27 +153,18 @@ u32 SoftRenderer::DoTimingsPixels(GPU3D& gpu3d, s32 pixels, bool odd) if (odd) counter = &RasterTimingOdd; else counter = &RasterTimingEven; - //todo: figure out a faster way to support TimingFrac > 1 without using a for loop somehow. (fingers crossed we dont have to!) - if constexpr (TimingFrac > 1) - for (; pixels > 0; pixels--) - { - *counter += TimingFrac; - if ((RasterTiming - *counter) <= 0) break; - } - else - { - *counter += pixels; - pixels = -(RasterTiming - *counter); - if (pixels > 0) *counter -= pixels; - } - - if (pixels <= 0) return 0; + *counter += pixels; + pixels = -(ScanlineTimeout - (RasterTiming + *counter)); - gpu3d.DispCnt |= (1<<12); - return pixels; + if (pixels > 0) + { + *counter -= pixels; + return pixels; + } + else return 0; } -bool SoftRenderer::DoTimingsSlopes(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd) { // determine the timing impact of the first polygon's slopes. @@ -191,7 +181,7 @@ bool SoftRenderer::DoTimingsSlopes(GPU3D& gpu3d, RendererPolygon* rp, s32 y, boo if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *counter += FirstPerSlope; - return DoTimings(gpu3d, FirstPerSlope*2, odd); // CHECKME: does this need to be done every time its incremented here? does this even need to be done *at all?* + return DoTimings(FirstPerSlope*2, odd); // CHECKME: does this need to be done every time its incremented here? does this even need to be done *at all?* } void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const @@ -778,7 +768,7 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } -bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; @@ -913,7 +903,7 @@ bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s // determine if the span can be rendered within the time allotted to the scanline // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. - s32 diff = DoTimingsPixels(gpu3d, xend-x, odd); + s32 diff = DoTimingsPixels(xend-x, odd); if (diff != 0) { xend -= diff; @@ -934,7 +924,7 @@ bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s else for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; interpX.SetX(x); @@ -959,7 +949,7 @@ bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s if (wireframe && !edge) x = std::max(x, xlimit); else for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; interpX.SetX(x); @@ -984,7 +974,7 @@ bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s if (r_filledge) for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; interpX.SetX(x); @@ -1006,7 +996,7 @@ bool SoftRenderer::RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s return abortscanline; } -bool SoftRenderer::RenderPolygonScanline(GPU& gpu, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; u32 polyattr = (polygon->Attr & 0x3F008000); @@ -1163,7 +1153,7 @@ bool SoftRenderer::RenderPolygonScanline(GPU& gpu, RendererPolygon* rp, s32 y, b if (xend > 256) xend = 256; // determine if the span can be rendered within the time allotted to the scanline - s32 diff = DoTimingsPixels(gpu.GPU3D, xend-x, odd); + s32 diff = DoTimingsPixels(xend-x, odd); if (diff != 0) { xend -= diff; @@ -1460,7 +1450,7 @@ bool SoftRenderer::RenderPolygonScanline(GPU& gpu, RendererPolygon* rp, s32 y, b return abortscanline; } -bool SoftRenderer::RenderScanline(GPU& gpu, s32 y, int npolys, bool odd) +bool SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd) { bool abort = false; bool first = true; @@ -1471,16 +1461,16 @@ bool SoftRenderer::RenderScanline(GPU& gpu, s32 y, int npolys, bool odd) if (y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && DoTimings(gpu.GPU3D, FirstNull, odd)) || DoTimings(gpu.GPU3D, EmptyPolyScanline, odd); + if (!abort) abort = (first && DoTimings(FirstNull, odd)) || DoTimings(EmptyPolyScanline, odd); first = false; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - //if (y == polygon->YTop) if(DoTimings(gpu.GPU3D, FirstPolyScanline, odd)) abort = true; + //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; - if (!abort) abort = (first && DoTimingsSlopes(gpu.GPU3D, rp, y, odd)) // incorrect. needs research; behavior is strange... - || DoTimings(gpu.GPU3D, PerPolyScanline, odd) + if (!abort) abort = (first && DoTimingsSlopes(rp, y, odd)) // incorrect. needs research; behavior is strange... + || DoTimings(PerPolyScanline, odd) || (!CheckTimings(MinToStartPoly, odd)); if (abort) @@ -1539,7 +1529,7 @@ u32 SoftRenderer::CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const return density; } -void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone) +void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) { // to consider: // clearing all polygon fog flags if the master flag isn't set? @@ -1707,18 +1697,6 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, u8 rdbufferoffse ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24); } } - - // if the first two scanlines are late then it's delayed by 48 scanlines - if (false)//late) - { - memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[rdbufferoffset*ScanlineWidth], 4 * ScanlineWidth); - memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * ScanlineWidth); - } - else - { - memcpy(&RDBuffer[rdbufferoffset*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * ScanlineWidth); - memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[rdbufferoffset*ScanlineWidth], 4 * ScanlineWidth); - } } void SoftRenderer::ClearBuffers(const GPU& gpu) @@ -1784,7 +1762,50 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } } -void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, int npolys) +u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) +{ + // push the finished scanline to the appropriate frame buffers. + // if a scanline is late enough to intersect with the 2d engine read time it will be partially drawn + u16 start; + if (pixelstodraw > 256) + { + start = 0; + pixelstodraw = 256; + } + else if (pixelstodraw <= 0) + { + return 256; + } + else + { + start = ScanlineWidth - pixelstodraw; + + // it seems to read in pairs of two every two cycles? looks jittery + bool jitter = pixelstodraw % 2; + // chcckme: + & - might be backwards + pixelstodraw += jitter; + start -= jitter; + } + bufferpos = y % 48; + memcpy(&RDBuffer[bufferpos*ScanlineWidth+start], &ColorBuffer[y*ScanlineWidth+start], 4 * pixelstodraw); + return start; +} + +void SoftRenderer::ReadScanline(s32 y) +{ + memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[bufferpos*ScanlineWidth], 4 * ScanlineWidth); +} + +void SoftRenderer::FinishPushScanline(s32 y s32 pixelsremain) +{ + if (pixelsremain = 0) return; + + bufferpos = y % 48; + memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * pixelsremain); +} + +template +void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { int j = 0; for (int i = 0; i < npolys; i++) @@ -1792,16 +1813,186 @@ void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, i if (polygons[i]->Degenerate) continue; SetupPolygon(&PolygonList[j++], polygons[i]); } + + ClearBuffers(gpu); + s32 rasterevents[RasterEvents_MAX]; s32 y = 0; + s32 yold; + rasterevents[RenderStart] = 0; + rasterevents[RenderFinal] = INT_MAX/2; + rasterevents[RenderFinalP2] = INT_MAX; + rasterevents[ScanlineWrite] = INT_MAX; + rasterevents[ScanlineRead] = InitGPU2DTimeout; + ScanlineTimeout = INT_MAX; + RasterTiming = 0; + RasterTimingEven = 0; + RasterTimingOdd = 0; + u8 scanlinesread = 0 + u8 scanlinesrendered; + s8 scanlineswaiting = 0; + u8 nextevent; + u16 leftoversa; + u16 leftoversb; + bool finalunsched = true; + + while (scanlinesread < 192) + { + nextevent = 0; + for (int i = 1; i < RasterEvents_MAX - finalunsched; i++) + { + if (rasterevents[nextevent] > rasterevents[i]) + nextevent = i; + } + + switch (nextevent) + { + case RenderStart: + + bool abort = RenderScanline(gpu, y, j, true); + abort |= RenderScanline(gpu, y+1, j, false); + + timespent = std::max(RasterTimingEven, RasterTimingOdd); + RasterTiming += timespent; + if ((RasterTiming + timespent) < (rasterevents[RenderFinal]+FinalPassLen)) + RasterTiming += FinalPassLen; + else + RasterTiming += timespent; + + s32 timeoutdist = ScanlineTimeout - RasterTiming; + RasterTiming += std::clamp(timeoutdist, 0, 12); + + rasterevents[RenderFinal] = RasterTiming; + rasterevents[RenderScanline] = RasterTiming+RastDelay; + finalunsched = false; + break; + + case RenderFinal: + + if (y > 2) + { + ScanlineFinalPass(gpu.GPU3D, y-1); + leftoversa = BeginPushScanline(y-1, (rasterevents[ScanlineRead] - ScanlineReadSpeed) - (rasterevents[RenderFinal] + FinalPassLen)); + + if (leftoversa != 0) + { + rasterevents[RenderFinalP2] = rasterevents[ScanlineRead] - ScanlineReadSpeed; + yold = y; + } + else + { + scanlineswaiting++; + scanlinesrendered++; + } + } + if (y < 192) + { + ScanlineFinalPass(gpu.GPU3D, y); + leftoversb = BeginPushScanline(y, (rasterevents[ScanlineRead] + DelayBetweenReads) - (rasterevents[RenderFinal] + FinalPassLen)); + + if (leftoversb != 0) + { + rasterevents[RenderFinalP2] = rasterevents[ScanlineRead] + DelaybetweenReads; + yold = y; + } + else + { + scanlineswaiting++; + scanlinesrendered++; + } + + finalunsched = true; + } + else + { + rasterevents[RenderFinal] += FinalPassLen; + } + + y += 2; + break; + + case ScanlineRead: + + ReadScanline(scanlinesread); + rasterevents[ScanlineRead] += ScanlineIncrement; + + if constexpr (threaded) + Platform::Semaphore_Post(Sema_ScanlineCount); + + scanlinesread++; + scanlineswaiting--; + break; + + case RenderFinalP2: + + if (y > 2) + { + FinishPushScanline(yold-1, leftoversa); + scanlineswaiting++; + scanlinesrendered++; + } + if (y < 192) + { + FinishPushScanline(yold, leftoversb); + scanlineswaiting++; + scanlinesrendered++; + } + + rasterevents[RenderFinalP2] = INT_MAX; + break; + } + } +} + /*ScanlineRead = InitGPU2DTimeout; + ScanlineTimeout = INT_MAX; + RasterTiming = 0; + s32 prevscanlineread; + s32 prevrastertiming; + + for (y = 0; y < 192; y += 2) + { + RasterTimingEven = 0; + RasterTimingOdd = 0; + // scanlines are rendered in pairs simultaneously + bool abort = RenderScanline(gpu, y, j, true); + abort |= RenderScanline(gpu, y+1, j, false); + + timespent = std::max(RasterTimingEven, RasterTimingOdd); + if (timespend > FreeTiming) + RasterTiming += timespent; + + // the next loop begins + if (y!=0) + { + // finish second scanline from 2 pairs back + ScanlineFinalPass(gpu.GPU3D, y-1); + PushScanline(y-1, (ScanlineRead+GPU2DReadScanline)-(RasterTiming+FinishScanline)); + ScanlineRead += ScanlineReadInc; + if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); + } + // finish previous first scanline + ScanlineFinalPass(gpu.GPU3D, y); + PushScanline(y, (ScanlineRead+GPU2DReadScanline)-(RasterTiming+FinishScanline)); + ScanlineRead += ScanlineReadInc; + if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); + y += 2; + } + RasterTiming += + // one more loop just to finish off the final scanline + ScanlineFinalPass(gpu.GPU3D, 191); + PushScanline(191, (ScanlineRead+GPU2DReadScanline)-(RasterTiming+FinishScanline)); + if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); + */ + + + /*s32 y = 0; s8 prevbufferline = -2; s8 buffersize = 0; - RasterTiming = InitialTiming; + RasterTiming = INT_MAX/2; bool abort = false; ClearBuffers(gpu); s32 gpu2dtracking = InitGPU2DTimeout; - s32 gpu2dfreetime = InitGPU2DTimeout; s32 prev2dtime; bool readodd = true; @@ -1811,13 +2002,14 @@ void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, i RasterTimingOdd = 0; RasterTimingEven = 0; + if (y == 2) RasterTiming = InitialTiming; + RasterTiming += ScanlineIncrement; gpu2dtracking += GPU2DReadSLPair; if (abort) RasterTiming += AbortIncrement; // if previous scanline was aborted, allow an extra 12 pixels worth of timing if (y >= 50) { - gpu2dfreetime = 0; if (RasterTiming > Post50Max) { s32 temp = RasterTiming - Post50Max; @@ -1834,62 +2026,49 @@ void SoftRenderer::RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, i //RasterTiming += ScanlineBreak; s32 timespent = std::max(RasterTimingOdd, RasterTimingEven); + if (RasterTiming - timespent <= UnderflowFlag) gpu.GPU3D.DispCnt |= (1<<12); // checkme: should this flag set itself every frame a "underflowed" frame is rendered, even if said frame is duplicated? + timespent -= FreeTiming; - // measure scanlines being read here. - gpu2dtracking -= timespent; - gpu2dfreetime -= timespent; + if (timespent > 0) + { + RasterTiming -= timespent; + gpu2dtracking -= timespent; + } - if (timespent > 0) RasterTiming -= timespent; //if (RasterTiming < 0) RasterTiming = 0; - if (gpu2dfreetime <= 0) - { - buffersize = 0; - if (gpu2dtracking > 0) - { - s32 i = gpu2dtracking; - while (true) - { - s32 comp = GPU2DReadSLPair/2; - //if (readodd) comp = GPU2DSpeedOutsidePair + GPU2DReadScanline; - //else comp = GPU2DSpeedWithinPair + GPU2DReadScanline; - - if (i < comp) break; - - i -= comp; - buffersize++; - //readodd = !readodd; - } + buffersize = 0; + for (int i = gpu2dtracking; i > 0; i -= GPU2DReadSLPair/2) buffersize++; - if (i > 0) buffersize++; - } - - // seems to display the lowest scanline buffer count reached during the current frame. - // we also caps it to 46 here, because this reg does that too for some reason. - if (gpu.GPU3D.RDLines > buffersize) gpu.GPU3D.RDLines = buffersize; - } + if (buffersize < gpu.GPU3D.RDLines) gpu.GPU3D.RDLines = buffersize; if (prevbufferline >= 0) { ScanlineFinalPass(gpu.GPU3D, y-2, prevbufferline, true, prev2dtime); ScanlineFinalPass(gpu.GPU3D, y-1, prevbufferline+1, false, prev2dtime); + if (threaded) + { + Platform::Semaphore_Post(Sema_ScanlineCount); + Platform::Semaphore_Post(Sema_ScanlineCount); + } } y += 2; prevbufferline = bufferline; prev2dtime = gpu2dtracking; - if (threaded) - Platform::Semaphore_Post(Sema_ScanlineCount); } ScanlineFinalPass(gpu.GPU3D, 190, prevbufferline, true, prev2dtime); ScanlineFinalPass(gpu.GPU3D, 191, prevbufferline+1, false, prev2dtime); if (threaded) + { Platform::Semaphore_Post(Sema_ScanlineCount); -} + Platform::Semaphore_Post(Sema_ScanlineCount); + } +}*/ void SoftRenderer::VCount144(GPU& gpu) { @@ -1911,7 +2090,7 @@ void SoftRenderer::RenderFrame(GPU& gpu) { Platform::Semaphore_Post(Sema_RenderStart); } - else if (!FrameIdentical) RenderPolygons(gpu, false, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else if (!FrameIdentical) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); } void SoftRenderer::RestartFrame(GPU& gpu) @@ -1931,7 +2110,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) { Platform::Semaphore_Post(Sema_ScanlineCount, 192); } - else RenderPolygons(gpu, true, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); Platform::Semaphore_Post(Sema_RenderDone); RenderThreadRendering = false; @@ -1946,7 +2125,7 @@ u32* SoftRenderer::GetLine(int line) Platform::Semaphore_Wait(Sema_ScanlineCount); } - return &FinalBuffer[(line * ScanlineWidth) + FirstPixelOffset]; + return &FinalBuffer[line * ScanlineWidth]; } } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index ac3d8e72..69aeeb1d 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -453,10 +453,10 @@ private: }; RendererPolygon PolygonList[2048]; - bool DoTimings(GPU3D& gpu3d, s32 cycles, bool odd); + bool DoTimings(s32 cycles, bool odd); bool CheckTimings(s32 cycles, bool odd); - u32 DoTimingsPixels(GPU3D& gpu3d, s32 pixels, bool odd); - bool DoTimingsSlopes(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); + u32 DoTimingsPixels(s32 pixels, bool odd); + bool DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -465,21 +465,31 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - bool RenderShadowMaskScanline(GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); - bool RenderPolygonScanline(GPU& gpu, RendererPolygon* rp, s32 y, bool odd); - bool RenderScanline(GPU& gpu, s32 y, int npolys, bool odd); + bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); + bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, bool odd); + bool RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone); void ClearBuffers(const GPU& gpu); - void RenderPolygons(GPU& gpu, bool threaded, Polygon** polygons, int npolys); + template void RenderPolygons(GPU& gpu, Polygon** polygons, int npolys); + voi PushScanline(s32 y, s32 pixelstodraw); void RenderThreadFunc(GPU& gpu); // counters for scanline rasterization timings - s32 RasterTiming = 0; - //s32 RasterTimingCounterPrev = 0; - s32 RasterTimingOdd = 0; - s32 RasterTimingEven = 0; + s32 ScanlineTimeout; + s32 RasterTiming; + s32 RasterTimingOdd; + s32 RasterTimingEven; + + enum + { + RenderStart = 0, + ScanlineRead, + RenderFinalP2, + RenderFinal, + RasterEvents_MAX, + }; // buffer dimensions are 258x194 to add a offscreen 1px border // which simplifies edge marking tests @@ -488,17 +498,18 @@ private: // offscreen in that border static constexpr int ScanlineWidth = 256; - static constexpr int NumScanlines = 192; + static constexpr int NumScanlinesIntBuf = 192; static constexpr int NumScanlinesRD = 48; + static constexpr int NumScanlinesFinal = 192; + static constexpr int BufferSize = ScanlineWidth * NumScanlinesIntBuf; static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; - static constexpr int BufferSize = ScanlineWidth * NumScanlines; - static constexpr int FirstPixelOffset = 0; + static constexpr int FinalBufferSize = ScanlineWidth * NumScanlinesFinal; u32 ColorBuffer[BufferSize * 2]; u32 DepthBuffer[BufferSize * 2]; u32 AttrBuffer[BufferSize * 2]; - u32 RDBuffer[RDBufferSize]; - u32 FinalBuffer[BufferSize]; + u32 RDBuffer[RDBufferSize]; // is this buffer ever initialized by hw before writing to it? what is its initial value? can you transfer 3d framebuffer data between games? + u32 FinalBuffer[FinalBufferSize]; // attribute buffer: // bit0-3: edge flags (left/right/top/bottom) From bf26b6817d444bba8752a645639aa56e4872b63c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 23 Dec 2023 21:26:49 -0500 Subject: [PATCH 16/53] partially rendering --- src/GPU3D.h | 5 +- src/GPU3D_Soft.cpp | 370 +++++++++++++++++---------------------------- src/GPU3D_Soft.h | 25 +-- 3 files changed, 151 insertions(+), 249 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 03dfa4f0..3c3da750 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -336,7 +336,7 @@ public: // GPU 2D Read Timings: For Emulating Buffer Read/Write Race Conditions static constexpr int DelayBetweenReads = 809 * TimingFrac; static constexpr int ScanlineReadSpeed = 256 * TimingFrac; - static constexpr int ScanlineReadInc = DelayBetweenReads + ReadScanline; + static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. @@ -344,8 +344,9 @@ public: // and beginning reading the second scanline of a scanline pair. static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. - static constexpr int InitGPU2DTimeout = 51874 * TimingFrac; // 51618? | when it starts reading the first scanline. + static constexpr int InitGPU2DTimeout = 51874 * TimingFrac; // 51618? | when it finishes reading the first scanline. static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. + static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? // GPU 3D Rasterization Timings: For Emulating Scanline Timeout diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 10bbd053..458537d9 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -114,33 +114,22 @@ void SoftRenderer::SetThreaded(bool threaded, GPU& gpu) noexcept } } -bool SoftRenderer::DoTimings(s32 cycles, bool odd) +bool SoftRenderer::DoTimings(s32 cycles, s32* timingcounter) { // add timings to a counter and check if underflowed. - - s32* counter; - if (odd) counter = &RasterTimingOdd; - else counter = &RasterTimingEven; - - *counter += cycles; - if (RasterTiming + *counter > ScanlineTimeout) return false; - - return true; + *timingcounter += cycles; + if (RasterTiming + *timingcounter <= ScanlineTimeout) return false; + else return true; } -bool SoftRenderer::CheckTimings(s32 cycles, bool odd) +bool SoftRenderer::CheckTimings(s32 cycles, s32* timingcounter) { // check if there are 'cycles' amount of cycles remaining. - - s32* counter; - if (odd) counter = &RasterTimingOdd; - else counter = &RasterTimingEven; - - if (ScanlineTimeout - (RasterTiming + *counter) >= cycles) return true; + if (RasterTiming + *timingcounter <= ScanlineTimeout - cycles) return true; else return false; } -u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) +u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter @@ -148,23 +137,19 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) if (pixels <= NumFreePixels) return 0; pixels -= NumFreePixels; - - s32* counter; - if (odd) counter = &RasterTimingOdd; - else counter = &RasterTimingEven; - *counter += pixels; - pixels = -(ScanlineTimeout - (RasterTiming + *counter)); + *timingcounter += pixels; + pixels = -(ScanlineTimeout - (RasterTiming + *timingcounter)); if (pixels > 0) { - *counter -= pixels; + *timingcounter -= pixels; return pixels; } else return 0; } -bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter) { // determine the timing impact of the first polygon's slopes. @@ -173,15 +158,11 @@ bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd) if (polygon->YTop == polygon->YBottom) return false; // 0 px tall line polygons do not have slopes, and thus no timing penalty if (y == polygon->YTop) return false; - s32* counter; - if (odd) counter = &RasterTimingOdd; - else counter = &RasterTimingEven; + if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *timingcounter += FirstPerSlope; - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *counter += FirstPerSlope; + if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *timingcounter += FirstPerSlope; - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *counter += FirstPerSlope; - - return DoTimings(FirstPerSlope*2, odd); // CHECKME: does this need to be done every time its incremented here? does this even need to be done *at all?* + return DoTimings(FirstPerSlope*2, timingcounter); // CHECKME: does this need to be done every time its incremented here? does this even need to be done *at all?* } void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const @@ -768,7 +749,7 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } -bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; @@ -903,7 +884,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* // determine if the span can be rendered within the time allotted to the scanline // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. - s32 diff = DoTimingsPixels(xend-x, odd); + s32 diff = DoTimingsPixels(xend-x, timingcounter); if (diff != 0) { xend -= diff; @@ -996,7 +977,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* return abortscanline; } -bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, bool odd) +bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; u32 polyattr = (polygon->Attr & 0x3F008000); @@ -1153,7 +1134,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 if (xend > 256) xend = 256; // determine if the span can be rendered within the time allotted to the scanline - s32 diff = DoTimingsPixels(xend-x, odd); + s32 diff = DoTimingsPixels(xend-x, timingcounter); if (diff != 0) { xend -= diff; @@ -1450,7 +1431,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 return abortscanline; } -bool SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd) +void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter) { bool abort = false; bool first = true; @@ -1461,17 +1442,17 @@ bool SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd) if (y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && DoTimings(FirstNull, odd)) || DoTimings(EmptyPolyScanline, odd); + if (!abort) abort = (first && DoTimings(FirstNull, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); first = false; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, timingcounter)) abort = true; - if (!abort) abort = (first && DoTimingsSlopes(rp, y, odd)) // incorrect. needs research; behavior is strange... - || DoTimings(PerPolyScanline, odd) - || (!CheckTimings(MinToStartPoly, odd)); + if (!abort) abort = (first && DoTimingsSlopes(rp, y, timingcounter)) // incorrect. needs research; behavior is strange... + || DoTimings(PerPolyScanline, timingcounter) + || (!CheckTimings(MinToStartPoly, timingcounter)); if (abort) { @@ -1479,15 +1460,15 @@ bool SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd) Step(rp); } else if (polygon->IsShadowMask) - abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, odd); + abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, timingcounter); else - abort = RenderPolygonScanline(gpu, rp, y, odd); + abort = RenderPolygonScanline(gpu, rp, y, timingcounter); first = false; } } - return abort; + return; } u32 SoftRenderer::CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const @@ -1766,8 +1747,9 @@ u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) { // push the finished scanline to the appropriate frame buffers. // if a scanline is late enough to intersect with the 2d engine read time it will be partially drawn + /* u16 start; - if (pixelstodraw > 256) + if (pixelstodraw >= 256) { start = 0; pixelstodraw = 256; @@ -1786,21 +1768,26 @@ u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) pixelstodraw += jitter; start -= jitter; } - bufferpos = y % 48; + u8 bufferpos = y % 48; memcpy(&RDBuffer[bufferpos*ScanlineWidth+start], &ColorBuffer[y*ScanlineWidth+start], 4 * pixelstodraw); return start; + */ + u8 bufferpos = y % 48; + memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4*ScanlineWidth); + return 0; } void SoftRenderer::ReadScanline(s32 y) { + u8 bufferpos = y % 48; memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[bufferpos*ScanlineWidth], 4 * ScanlineWidth); } -void SoftRenderer::FinishPushScanline(s32 y s32 pixelsremain) +void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) { if (pixelsremain = 0) return; - bufferpos = y % 48; + u8 bufferpos = y % 48; memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * pixelsremain); } @@ -1821,27 +1808,27 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 yold; rasterevents[RenderStart] = 0; rasterevents[RenderFinal] = INT_MAX/2; - rasterevents[RenderFinalP2] = INT_MAX; - rasterevents[ScanlineWrite] = INT_MAX; + rasterevents[PushScanline] = INT_MAX/2; + rasterevents[PushScanlineP2] = INT_MAX/2; rasterevents[ScanlineRead] = InitGPU2DTimeout; - ScanlineTimeout = INT_MAX; + ScanlineTimeout = INT_MAX/2; RasterTiming = 0; - RasterTimingEven = 0; - RasterTimingOdd = 0; - u8 scanlinesread = 0 - u8 scanlinesrendered; + s32 rastertimingeven = 0; + s32 rastertimingodd = 0; + u8 scanlinesread = 0; + u8 scanlinespushed = 0; + s8 scanlinesrendered = 0; s8 scanlineswaiting = 0; u8 nextevent; - u16 leftoversa; - u16 leftoversb; - bool finalunsched = true; + bool doa, dob, fina; + u16 leftoversa, leftoversb; - while (scanlinesread < 192) + while ((scanlinesread < 192 || scanlinespushed < 192) && (RasterTiming < FrameLength)) { nextevent = 0; - for (int i = 1; i < RasterEvents_MAX - finalunsched; i++) + for (s32 i = 1; i < RasterEvents_MAX; i++) { - if (rasterevents[nextevent] > rasterevents[i]) + if (rasterevents[i] < rasterevents[nextevent]) nextevent = i; } @@ -1849,68 +1836,107 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { case RenderStart: - bool abort = RenderScanline(gpu, y, j, true); - abort |= RenderScanline(gpu, y+1, j, false); + RasterTiming = rasterevents[RenderStart]; + + { + s32 rastertimingeven = 0; + s32 rastertimingodd = 0; + RenderScanline(gpu, y, j, &rastertimingeven); + RenderScanline(gpu, y+1, j, &rastertimingodd); + + s32 timespent = std::max(rastertimingeven, rastertimingodd); - timespent = std::max(RasterTimingEven, RasterTimingOdd); - RasterTiming += timespent; if ((RasterTiming + timespent) < (rasterevents[RenderFinal]+FinalPassLen)) RasterTiming += FinalPassLen; else RasterTiming += timespent; - + + s32 timeoutdist = ScanlineTimeout - RasterTiming; RasterTiming += std::clamp(timeoutdist, 0, 12); - + } rasterevents[RenderFinal] = RasterTiming; - rasterevents[RenderScanline] = RasterTiming+RastDelay; - finalunsched = false; + if (y < 190) rasterevents[RenderStart] = RasterTiming+RastDelay; + else rasterevents[RenderStart] = INT_MAX/2; break; case RenderFinal: + + rasterevents[PushScanline] = rasterevents[RenderFinal] + RastDelay; - if (y > 2) + if (y >= 2) { ScanlineFinalPass(gpu.GPU3D, y-1); - leftoversa = BeginPushScanline(y-1, (rasterevents[ScanlineRead] - ScanlineReadSpeed) - (rasterevents[RenderFinal] + FinalPassLen)); - - if (leftoversa != 0) - { - rasterevents[RenderFinalP2] = rasterevents[ScanlineRead] - ScanlineReadSpeed; - yold = y; - } - else - { - scanlineswaiting++; - scanlinesrendered++; - } - } - if (y < 192) - { - ScanlineFinalPass(gpu.GPU3D, y); - leftoversb = BeginPushScanline(y, (rasterevents[ScanlineRead] + DelayBetweenReads) - (rasterevents[RenderFinal] + FinalPassLen)); - - if (leftoversb != 0) - { - rasterevents[RenderFinalP2] = rasterevents[ScanlineRead] + DelaybetweenReads; - yold = y; - } - else - { - scanlineswaiting++; - scanlinesrendered++; - } - - finalunsched = true; + scanlinesrendered++; + doa = true; } else { - rasterevents[RenderFinal] += FinalPassLen; + doa = false; } + if (y < 192) + { + ScanlineFinalPass(gpu.GPU3D, y); + scanlinesrendered++; + rasterevents[RenderFinal] = INT_MAX/2; + } + else + rasterevents[RenderFinal] += FinalPassLen; + y += 2; break; + case PushScanline: + if (scanlineswaiting >= 48) + { + //reschedule events if buffer is full + rasterevents[PushScanline] = rasterevents[ScanlineRead]; + rasterevents[RenderStart] = ((y >= 190) ? INT_MAX/2 : rasterevents[ScanlineRead] + RastDelay); + rasterevents[RenderFinal] = ((y >= 192) ? INT_MAX/2 : rasterevents[ScanlineRead]); + break; + } + + if (doa) + { + leftoversa = BeginPushScanline(scanlinespushed, 256);//(rasterevents[ScanlineRead] - ScanlineReadSpeed) - (rasterevents[PushScanline] + FinalPassLen)); + scanlinesrendered--; + + if (leftoversa != 0) + { + rasterevents[PushScanlineP2] = rasterevents[ScanlineRead] - ScanlineReadSpeed; // todo: fix this + fina = true; + } + else + { + scanlineswaiting++; + scanlinespushed++; + } + } + else + { + leftoversb = BeginPushScanline(scanlinespushed, 256);//(rasterevents[ScanlineRead] + DelayBetweenReads) - (rasterevents[PushScanline] + FinalPassLen)); + scanlinesrendered--; + + if (leftoversb != 0) + { + rasterevents[PushScanlineP2] = rasterevents[ScanlineRead] + DelayBetweenReads; // todo: fix this + fina = false; + } + else + { + scanlineswaiting++; + scanlinespushed++; + } + } + + if (scanlinesrendered <= 0) + rasterevents[PushScanline] = INT_MAX/2; + else + doa = !doa; + + break; + case ScanlineRead: ReadScanline(scanlinesread); @@ -1923,152 +1949,26 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlineswaiting--; break; - case RenderFinalP2: + case PushScanlineP2: - if (y > 2) + if (fina) { FinishPushScanline(yold-1, leftoversa); scanlineswaiting++; - scanlinesrendered++; + scanlinespushed++; } - if (y < 192) + else { FinishPushScanline(yold, leftoversb); scanlineswaiting++; - scanlinesrendered++; + scanlinespushed++; } - rasterevents[RenderFinalP2] = INT_MAX; + rasterevents[PushScanlineP2] = INT_MAX/2; break; } } } - /*ScanlineRead = InitGPU2DTimeout; - ScanlineTimeout = INT_MAX; - RasterTiming = 0; - s32 prevscanlineread; - s32 prevrastertiming; - - for (y = 0; y < 192; y += 2) - { - RasterTimingEven = 0; - RasterTimingOdd = 0; - // scanlines are rendered in pairs simultaneously - bool abort = RenderScanline(gpu, y, j, true); - abort |= RenderScanline(gpu, y+1, j, false); - - timespent = std::max(RasterTimingEven, RasterTimingOdd); - if (timespend > FreeTiming) - RasterTiming += timespent; - - // the next loop begins - if (y!=0) - { - // finish second scanline from 2 pairs back - ScanlineFinalPass(gpu.GPU3D, y-1); - PushScanline(y-1, (ScanlineRead+GPU2DReadScanline)-(RasterTiming+FinishScanline)); - ScanlineRead += ScanlineReadInc; - if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); - } - // finish previous first scanline - ScanlineFinalPass(gpu.GPU3D, y); - PushScanline(y, (ScanlineRead+GPU2DReadScanline)-(RasterTiming+FinishScanline)); - ScanlineRead += ScanlineReadInc; - if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); - y += 2; - } - RasterTiming += - // one more loop just to finish off the final scanline - ScanlineFinalPass(gpu.GPU3D, 191); - PushScanline(191, (ScanlineRead+GPU2DReadScanline)-(RasterTiming+FinishScanline)); - if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); - */ - - - /*s32 y = 0; - s8 prevbufferline = -2; - - s8 buffersize = 0; - RasterTiming = INT_MAX/2; - bool abort = false; - ClearBuffers(gpu); - s32 gpu2dtracking = InitGPU2DTimeout; - s32 prev2dtime; - bool readodd = true; - - for (u8 quarter = 0; quarter < 4; quarter++) - for (u8 bufferline = 0; bufferline < 48; bufferline += 2) - { - RasterTimingOdd = 0; - RasterTimingEven = 0; - - if (y == 2) RasterTiming = InitialTiming; - - RasterTiming += ScanlineIncrement; - gpu2dtracking += GPU2DReadSLPair; - if (abort) RasterTiming += AbortIncrement; // if previous scanline was aborted, allow an extra 12 pixels worth of timing - - if (y >= 50) - { - if (RasterTiming > Post50Max) - { - s32 temp = RasterTiming - Post50Max; - RasterTiming = Post50Max; - gpu2dtracking -= temp; - } - if (buffersize > 48) buffersize = 48; - } - - abort = RenderScanline(gpu, y, j, true); - abort |= RenderScanline(gpu, y+1, j, false); - - buffersize += 2; - //RasterTiming += ScanlineBreak; - s32 timespent = std::max(RasterTimingOdd, RasterTimingEven); - - if (RasterTiming - timespent <= UnderflowFlag) gpu.GPU3D.DispCnt |= (1<<12); // checkme: should this flag set itself every frame a "underflowed" frame is rendered, even if said frame is duplicated? - - timespent -= FreeTiming; - - if (timespent > 0) - { - RasterTiming -= timespent; - gpu2dtracking -= timespent; - } - - - //if (RasterTiming < 0) RasterTiming = 0; - buffersize = 0; - for (int i = gpu2dtracking; i > 0; i -= GPU2DReadSLPair/2) buffersize++; - - if (buffersize < gpu.GPU3D.RDLines) gpu.GPU3D.RDLines = buffersize; - - if (prevbufferline >= 0) - { - ScanlineFinalPass(gpu.GPU3D, y-2, prevbufferline, true, prev2dtime); - ScanlineFinalPass(gpu.GPU3D, y-1, prevbufferline+1, false, prev2dtime); - if (threaded) - { - Platform::Semaphore_Post(Sema_ScanlineCount); - Platform::Semaphore_Post(Sema_ScanlineCount); - } - } - - y += 2; - prevbufferline = bufferline; - prev2dtime = gpu2dtracking; - - } - - ScanlineFinalPass(gpu.GPU3D, 190, prevbufferline, true, prev2dtime); - ScanlineFinalPass(gpu.GPU3D, 191, prevbufferline+1, false, prev2dtime); - - if (threaded) - { - Platform::Semaphore_Post(Sema_ScanlineCount); - Platform::Semaphore_Post(Sema_ScanlineCount); - } -}*/ void SoftRenderer::VCount144(GPU& gpu) { diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 69aeeb1d..33576e3b 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -453,10 +453,10 @@ private: }; RendererPolygon PolygonList[2048]; - bool DoTimings(s32 cycles, bool odd); - bool CheckTimings(s32 cycles, bool odd); - u32 DoTimingsPixels(s32 pixels, bool odd); - bool DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd); + bool DoTimings(s32 cycles, s32* timingcounter); + bool CheckTimings(s32 cycles, s32* timingcounter); + u32 DoTimingsPixels(s32 pixels, s32* timingcounter); + bool DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -465,28 +465,29 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, bool odd); - bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, bool odd); - bool RenderScanline(const GPU& gpu, s32 y, int npolys, bool odd); + bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); + bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); + void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; - void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone); + void ScanlineFinalPass(const GPU3D& gpu3d, s32 y); void ClearBuffers(const GPU& gpu); + u16 BeginPushScanline(s32 y, s32 pixelstodraw); + void ReadScanline(s32 y); + void FinishPushScanline(s32 y, s32 pixelsremain); template void RenderPolygons(GPU& gpu, Polygon** polygons, int npolys); - voi PushScanline(s32 y, s32 pixelstodraw); void RenderThreadFunc(GPU& gpu); // counters for scanline rasterization timings s32 ScanlineTimeout; s32 RasterTiming; - s32 RasterTimingOdd; - s32 RasterTimingEven; enum { RenderStart = 0, ScanlineRead, - RenderFinalP2, + PushScanline, + PushScanlineP2, RenderFinal, RasterEvents_MAX, }; From c05c79321a7503f43fe7a0ad61baf0255aab0827 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 23 Dec 2023 22:24:09 -0500 Subject: [PATCH 17/53] it works again! --- src/GPU3D_Soft.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 458537d9..574562be 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1817,7 +1817,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 rastertimingodd = 0; u8 scanlinesread = 0; u8 scanlinespushed = 0; - s8 scanlinesrendered = 0; + s16 scanlinesrendered = 0; s8 scanlineswaiting = 0; u8 nextevent; bool doa, dob, fina; @@ -1846,23 +1846,24 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 timespent = std::max(rastertimingeven, rastertimingodd); - if ((RasterTiming + timespent) < (rasterevents[RenderFinal]+FinalPassLen)) + if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen)) RasterTiming += FinalPassLen; else RasterTiming += timespent; - - + s32 timeoutdist = ScanlineTimeout - RasterTiming; RasterTiming += std::clamp(timeoutdist, 0, 12); } + rasterevents[RenderFinal] = RasterTiming; - if (y < 190) rasterevents[RenderStart] = RasterTiming+RastDelay; - else rasterevents[RenderStart] = INT_MAX/2; + rasterevents[RenderStart] = RasterTiming+RastDelay; + //if (y < 190) rasterevents[RenderStart] = RasterTiming+RastDelay; + //else rasterevents[RenderStart] = INT_MAX/2; break; case RenderFinal: - rasterevents[PushScanline] = rasterevents[RenderFinal] + RastDelay; + rasterevents[PushScanline] = rasterevents[RenderFinal] + 4; if (y >= 2) { @@ -1875,7 +1876,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) doa = false; } - if (y < 192) + if (y <= 192) { ScanlineFinalPass(gpu.GPU3D, y); scanlinesrendered++; @@ -1892,8 +1893,8 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { //reschedule events if buffer is full rasterevents[PushScanline] = rasterevents[ScanlineRead]; - rasterevents[RenderStart] = ((y >= 190) ? INT_MAX/2 : rasterevents[ScanlineRead] + RastDelay); - rasterevents[RenderFinal] = ((y >= 192) ? INT_MAX/2 : rasterevents[ScanlineRead]); + rasterevents[RenderStart] = /*((y >= 190) ? INT_MAX/2 :*/ rasterevents[ScanlineRead] + RastDelay; + rasterevents[RenderFinal] = /*((y >= 192) ? INT_MAX/2 :*/ rasterevents[ScanlineRead]; break; } @@ -1940,7 +1941,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) case ScanlineRead: ReadScanline(scanlinesread); - rasterevents[ScanlineRead] += ScanlineIncrement; + rasterevents[ScanlineRead] += ScanlineReadInc; if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); From fb5b2c299cb633269446985f977065fbf704e854 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 24 Dec 2023 17:39:33 -0500 Subject: [PATCH 18/53] new feature: crashes --- src/GPU3D.h | 4 +- src/GPU3D_Soft.cpp | 151 +++++++++++++++++++++------------------------ 2 files changed, 72 insertions(+), 83 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 3c3da750..685ece5d 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -344,7 +344,7 @@ public: // and beginning reading the second scanline of a scanline pair. static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. - static constexpr int InitGPU2DTimeout = 51874 * TimingFrac; // 51618? | when it finishes reading the first scanline. + static constexpr int InitGPU2DTimeout = 52128 * TimingFrac; // 51618? 51874? | when it finishes reading the first scanline. static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? @@ -362,6 +362,8 @@ public: static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress // (can be interpreted as the minimum amount of cycles for the next scanline // pair to start after the previous pair began) (related to final pass?) + static constexpr int ScanlinePushDelay = 242 * TimingFrac; + static constexpr int TimeoutIncrement = 2130 * TimingFrac; static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 574562be..ac3d09da 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1747,18 +1747,14 @@ u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) { // push the finished scanline to the appropriate frame buffers. // if a scanline is late enough to intersect with the 2d engine read time it will be partially drawn - /* + u16 start; - if (pixelstodraw >= 256) + if (pixelstodraw >= 256) // if scheduled after or 256 cycles before a scanline read render full scanline { start = 0; pixelstodraw = 256; } - else if (pixelstodraw <= 0) - { - return 256; - } - else + else // render partial scanline { start = ScanlineWidth - pixelstodraw; @@ -1771,10 +1767,6 @@ u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) u8 bufferpos = y % 48; memcpy(&RDBuffer[bufferpos*ScanlineWidth+start], &ColorBuffer[y*ScanlineWidth+start], 4 * pixelstodraw); return start; - */ - u8 bufferpos = y % 48; - memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4*ScanlineWidth); - return 0; } void SoftRenderer::ReadScanline(s32 y) @@ -1801,11 +1793,12 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) SetupPolygon(&PolygonList[j++], polygons[i]); } + //init internal buffer ClearBuffers(gpu); + // init all this junk i need to keep track of s32 rasterevents[RasterEvents_MAX]; s32 y = 0; - s32 yold; rasterevents[RenderStart] = 0; rasterevents[RenderFinal] = INT_MAX/2; rasterevents[PushScanline] = INT_MAX/2; @@ -1817,14 +1810,16 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 rastertimingodd = 0; u8 scanlinesread = 0; u8 scanlinespushed = 0; + u8 scanlinespushed2 = 0; s16 scanlinesrendered = 0; - s8 scanlineswaiting = 0; + s16 scanlineswaiting = 0; u8 nextevent; - bool doa, dob, fina; - u16 leftoversa, leftoversb; + u16 leftovers; - while ((scanlinesread < 192 || scanlinespushed < 192) && (RasterTiming < FrameLength)) + // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame should begin + while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < FrameLength)) { + // check all events to find the earliest scheduled one nextevent = 0; for (s32 i = 1; i < RasterEvents_MAX; i++) { @@ -1832,117 +1827,115 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) nextevent = i; } + // if all events are scheduled for after the next frame begins, ABORT + if (rasterevents[nextevent] >= FrameLength) break; + switch (nextevent) { - case RenderStart: + // initial rendering pass (polygons, texturing, etc.) (variable cycle length) + case RenderStart: + // set current raster time to the start of the event RasterTiming = rasterevents[RenderStart]; { s32 rastertimingeven = 0; s32 rastertimingodd = 0; + // scanlines are rendered in pairs of two RenderScanline(gpu, y, j, &rastertimingeven); RenderScanline(gpu, y+1, j, &rastertimingodd); + // a new scanline pair cannot begin until both scanlines are finished. s32 timespent = std::max(rastertimingeven, rastertimingodd); + // a new scanline pair cannot begin until the finishing pass + push is done. if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen)) RasterTiming += FinalPassLen; else RasterTiming += timespent; - + + // 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does. s32 timeoutdist = ScanlineTimeout - RasterTiming; RasterTiming += std::clamp(timeoutdist, 0, 12); } - + if (ScanlineTimeout == INT_MAX/2) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen; + else ScanlineTimeout += TimeoutIncrement; + // schedule next scanline pair + the final pass of the latest pair rasterevents[RenderFinal] = RasterTiming; - rasterevents[RenderStart] = RasterTiming+RastDelay; - //if (y < 190) rasterevents[RenderStart] = RasterTiming+RastDelay; - //else rasterevents[RenderStart] = INT_MAX/2; + if (y < 190) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) + else rasterevents[RenderStart] = INT_MAX/2; break; - case RenderFinal: - - rasterevents[PushScanline] = rasterevents[RenderFinal] + 4; + // final rendering pass (edge marking, anti-aliasing, fog) (fixed length of 496 (maybe 500?) cycles) + case RenderFinal: + + // schedule a scanline push event + rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay; + + // if the first scanline pair was just finished only render one scanline if (y >= 2) { ScanlineFinalPass(gpu.GPU3D, y-1); scanlinesrendered++; - doa = true; - } - else - { - doa = false; } + // if the last scanline pair was just finished only render one scanline if (y <= 192) { ScanlineFinalPass(gpu.GPU3D, y); scanlinesrendered++; + // unschedule final pass event rasterevents[RenderFinal] = INT_MAX/2; } - else + else // schedule next final pass event to immediately after the current one rasterevents[RenderFinal] += FinalPassLen; - + + // increment y for main rendering passes y += 2; break; + + // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) case PushScanline: + + // reschedule events if buffer is full if (scanlineswaiting >= 48) { - //reschedule events if buffer is full rasterevents[PushScanline] = rasterevents[ScanlineRead]; - rasterevents[RenderStart] = /*((y >= 190) ? INT_MAX/2 :*/ rasterevents[ScanlineRead] + RastDelay; - rasterevents[RenderFinal] = /*((y >= 192) ? INT_MAX/2 :*/ rasterevents[ScanlineRead]; + // dont reschedule these events if they're done. + rasterevents[RenderStart] = ((y > 190) ? INT_MAX/2 : rasterevents[ScanlineRead] + RastDelay); + rasterevents[RenderFinal] = ((y > 194) ? INT_MAX/2 : rasterevents[ScanlineRead]); break; } - if (doa) - { - leftoversa = BeginPushScanline(scanlinespushed, 256);//(rasterevents[ScanlineRead] - ScanlineReadSpeed) - (rasterevents[PushScanline] + FinalPassLen)); - scanlinesrendered--; - - if (leftoversa != 0) - { - rasterevents[PushScanlineP2] = rasterevents[ScanlineRead] - ScanlineReadSpeed; // todo: fix this - fina = true; - } - else - { - scanlineswaiting++; - scanlinespushed++; - } - } + leftovers = BeginPushScanline(scanlinespushed, (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaiting)) - rasterevents[PushScanline]); + scanlinesrendered--; + scanlinespushed++; + + // schedule the finish push event if needed + if (leftovers != 0) rasterevents[PushScanlineP2] = rasterevents[ScanlineRead]; else { - leftoversb = BeginPushScanline(scanlinespushed, 256);//(rasterevents[ScanlineRead] + DelayBetweenReads) - (rasterevents[PushScanline] + FinalPassLen)); - scanlinesrendered--; - - if (leftoversb != 0) - { - rasterevents[PushScanlineP2] = rasterevents[ScanlineRead] + DelayBetweenReads; // todo: fix this - fina = false; - } - else - { - scanlineswaiting++; - scanlinespushed++; - } + scanlineswaiting++; + scanlinespushed2++; } if (scanlinesrendered <= 0) - rasterevents[PushScanline] = INT_MAX/2; - else - doa = !doa; + rasterevents[PushScanline] = INT_MAX/2; // unsched event if no scanlines are waiting to be finished break; - case ScanlineRead: + // 2d engine reading scanlines from the intermediary "framebuffer" + case ScanlineRead: + + // read scanline from buffer ReadScanline(scanlinesread); + // reschedule event for one scanline later rasterevents[ScanlineRead] += ScanlineReadInc; + // avoid breaking seperate thread. if constexpr (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); @@ -1950,22 +1943,16 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlineswaiting--; break; + + // finish pushing a scanline to the buffer if it got interrupted by the read process. case PushScanlineP2: - if (fina) - { - FinishPushScanline(yold-1, leftoversa); - scanlineswaiting++; - scanlinespushed++; - } - else - { - FinishPushScanline(yold, leftoversb); - scanlineswaiting++; - scanlinespushed++; - } - - rasterevents[PushScanlineP2] = INT_MAX/2; + FinishPushScanline(scanlinespushed2, leftovers); + scanlineswaiting++; + scanlinespushed2++; + + // unschedule event if all partially pushed scanlines have been pushed + if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = INT_MAX/2; break; } } From f239d0cf0dfcdcbfe045c3676f73e9b4b07aa522 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 24 Dec 2023 20:27:24 -0500 Subject: [PATCH 19/53] fix a crash and scanlines being incorrectly partially read --- src/GPU3D_Soft.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index ac3d09da..78d19966 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1817,7 +1817,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) u16 leftovers; // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame should begin - while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < FrameLength)) + while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < (FrameLength-RastDelay))) { // check all events to find the earliest scheduled one nextevent = 0; @@ -1932,8 +1932,6 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // read scanline from buffer ReadScanline(scanlinesread); - // reschedule event for one scanline later - rasterevents[ScanlineRead] += ScanlineReadInc; // avoid breaking seperate thread. if constexpr (threaded) @@ -1941,6 +1939,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlinesread++; scanlineswaiting--; + + // reschedule event for one scanline later unless all scanlines have been read + if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; + else rasterevents[ScanlineRead] = INT_MAX/2; break; From ee3e38aed31795e354846d43ebb2908619ef5bb0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 25 Dec 2023 13:10:32 -0500 Subject: [PATCH 20/53] fix bottom scanline bugging out use a method of tracking progress through rendering that's less prone to me messing it up --- src/GPU3D_Soft.cpp | 66 ++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 78d19966..b537522d 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1798,7 +1798,6 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // init all this junk i need to keep track of s32 rasterevents[RasterEvents_MAX]; - s32 y = 0; rasterevents[RenderStart] = 0; rasterevents[RenderFinal] = INT_MAX/2; rasterevents[PushScanline] = INT_MAX/2; @@ -1809,10 +1808,12 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 rastertimingeven = 0; s32 rastertimingodd = 0; u8 scanlinesread = 0; + u8 scanlinesinit = 0; + u8 scanlinesfin = 0; u8 scanlinespushed = 0; u8 scanlinespushed2 = 0; - s16 scanlinesrendered = 0; - s16 scanlineswaiting = 0; + s16 scanlineswaitingforpush = 0; + s16 scanlineswaitingforread = 0; u8 nextevent; u16 leftovers; @@ -1835,6 +1836,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // initial rendering pass (polygons, texturing, etc.) (variable cycle length) case RenderStart: + // set current raster time to the start of the event RasterTiming = rasterevents[RenderStart]; @@ -1842,9 +1844,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 rastertimingeven = 0; s32 rastertimingodd = 0; // scanlines are rendered in pairs of two - RenderScanline(gpu, y, j, &rastertimingeven); - RenderScanline(gpu, y+1, j, &rastertimingodd); - + RenderScanline(gpu, scanlinesinit, j, &rastertimingeven); + RenderScanline(gpu, scanlinesinit+1, j, &rastertimingodd); + scanlinesinit += 2; + // a new scanline pair cannot begin until both scanlines are finished. s32 timespent = std::max(rastertimingeven, rastertimingodd); @@ -1858,11 +1861,14 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 timeoutdist = ScanlineTimeout - RasterTiming; RasterTiming += std::clamp(timeoutdist, 0, 12); } + + //set next scanline timeout if (ScanlineTimeout == INT_MAX/2) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen; else ScanlineTimeout += TimeoutIncrement; + // schedule next scanline pair + the final pass of the latest pair rasterevents[RenderFinal] = RasterTiming; - if (y < 190) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) + if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) else rasterevents[RenderStart] = INT_MAX/2; break; @@ -1874,25 +1880,25 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay; // if the first scanline pair was just finished only render one scanline - if (y >= 2) + if (scanlinesfin > 0) { - ScanlineFinalPass(gpu.GPU3D, y-1); - scanlinesrendered++; + ScanlineFinalPass(gpu.GPU3D, scanlinesfin); + scanlineswaitingforpush++; + scanlinesfin++; } // if the last scanline pair was just finished only render one scanline - if (y <= 192) + if (scanlinesfin < 191) { - ScanlineFinalPass(gpu.GPU3D, y); - scanlinesrendered++; - // unschedule final pass event - rasterevents[RenderFinal] = INT_MAX/2; + ScanlineFinalPass(gpu.GPU3D, scanlinesfin); + scanlineswaitingforpush++; + scanlinesfin++; } + // unschedule final pass event + if (scanlinesfin != 191) + rasterevents[RenderFinal] = INT_MAX/2; else // schedule next final pass event to immediately after the current one rasterevents[RenderFinal] += FinalPassLen; - - // increment y for main rendering passes - y += 2; break; @@ -1900,28 +1906,32 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) case PushScanline: // reschedule events if buffer is full - if (scanlineswaiting >= 48) + if (scanlineswaitingforread >= 48) { rasterevents[PushScanline] = rasterevents[ScanlineRead]; + // dont reschedule these events if they're done. - rasterevents[RenderStart] = ((y > 190) ? INT_MAX/2 : rasterevents[ScanlineRead] + RastDelay); - rasterevents[RenderFinal] = ((y > 194) ? INT_MAX/2 : rasterevents[ScanlineRead]); + if (scanlinesinit < 192) + rasterevents[RenderStart] = rasterevents[ScanlineRead] + RastDelay; + if (scanlinesfin < 192) + rasterevents[RenderFinal] = rasterevents[ScanlineRead]; + break; } - leftovers = BeginPushScanline(scanlinespushed, (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaiting)) - rasterevents[PushScanline]); - scanlinesrendered--; + leftovers = BeginPushScanline(scanlinespushed, (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); + scanlineswaitingforpush--; scanlinespushed++; - + // schedule the finish push event if needed if (leftovers != 0) rasterevents[PushScanlineP2] = rasterevents[ScanlineRead]; else { - scanlineswaiting++; + scanlineswaitingforread++; scanlinespushed2++; } - if (scanlinesrendered <= 0) + if (scanlineswaitingforpush <= 0) rasterevents[PushScanline] = INT_MAX/2; // unsched event if no scanlines are waiting to be finished break; @@ -1938,7 +1948,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) Platform::Semaphore_Post(Sema_ScanlineCount); scanlinesread++; - scanlineswaiting--; + scanlineswaitingforread--; // reschedule event for one scanline later unless all scanlines have been read if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; @@ -1950,7 +1960,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) case PushScanlineP2: FinishPushScanline(scanlinespushed2, leftovers); - scanlineswaiting++; + scanlineswaitingforread++; scanlinespushed2++; // unschedule event if all partially pushed scanlines have been pushed From 4f3b99f5c4c1d106a4c16af6c937e923ab41cbe9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 25 Dec 2023 15:26:48 -0500 Subject: [PATCH 21/53] fix another crash + bug w/ scanline delay --- src/GPU3D_Soft.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index b537522d..e1311966 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1747,9 +1747,9 @@ u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) { // push the finished scanline to the appropriate frame buffers. // if a scanline is late enough to intersect with the 2d engine read time it will be partially drawn - + u16 start; - if (pixelstodraw >= 256) // if scheduled after or 256 cycles before a scanline read render full scanline + if (pixelstodraw >= 256 || pixelstodraw <= 0) // if scheduled after or 256 cycles before a scanline read render full scanline { start = 0; pixelstodraw = 256; @@ -1919,7 +1919,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) break; } - leftovers = BeginPushScanline(scanlinespushed, (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); + { + s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); + leftovers = BeginPushScanline(scanlinespushed, pixelstopush); + } scanlineswaitingforpush--; scanlinespushed++; From bffc529c04d6f36694bda2567e43f813f5dfd0ce Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 25 Dec 2023 19:24:35 -0500 Subject: [PATCH 22/53] meh --- src/GPU3D_Soft.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index e1311966..4e0e7b5d 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1760,7 +1760,6 @@ u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) // it seems to read in pairs of two every two cycles? looks jittery bool jitter = pixelstodraw % 2; - // chcckme: + & - might be backwards pixelstodraw += jitter; start -= jitter; } @@ -1822,7 +1821,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { // check all events to find the earliest scheduled one nextevent = 0; - for (s32 i = 1; i < RasterEvents_MAX; i++) + for (u8 i = 1; i < RasterEvents_MAX; i++) { if (rasterevents[i] < rasterevents[nextevent]) nextevent = i; @@ -1920,6 +1919,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) } { + // if a scanline push might intersect a read determine the point at which it intersects s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); leftovers = BeginPushScanline(scanlinespushed, pixelstopush); } From 4cb2c23ad6ed5f0d0723f6bdaa54d9b8589d24b4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 25 Dec 2023 19:39:54 -0500 Subject: [PATCH 23/53] fine linux --- src/GPU3D_Soft.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 4e0e7b5d..bd82a607 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1798,11 +1798,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // init all this junk i need to keep track of s32 rasterevents[RasterEvents_MAX]; rasterevents[RenderStart] = 0; - rasterevents[RenderFinal] = INT_MAX/2; - rasterevents[PushScanline] = INT_MAX/2; - rasterevents[PushScanlineP2] = INT_MAX/2; + rasterevents[RenderFinal] = FrameLength; + rasterevents[PushScanline] = FrameLength; + rasterevents[PushScanlineP2] = FrameLength; rasterevents[ScanlineRead] = InitGPU2DTimeout; - ScanlineTimeout = INT_MAX/2; + ScanlineTimeout = FrameLength; RasterTiming = 0; s32 rastertimingeven = 0; s32 rastertimingodd = 0; @@ -1862,13 +1862,13 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) } //set next scanline timeout - if (ScanlineTimeout == INT_MAX/2) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen; + if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen; else ScanlineTimeout += TimeoutIncrement; // schedule next scanline pair + the final pass of the latest pair rasterevents[RenderFinal] = RasterTiming; if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) - else rasterevents[RenderStart] = INT_MAX/2; + else rasterevents[RenderStart] = FrameLength; break; @@ -1895,7 +1895,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) } // unschedule final pass event if (scanlinesfin != 191) - rasterevents[RenderFinal] = INT_MAX/2; + rasterevents[RenderFinal] = FrameLength; else // schedule next final pass event to immediately after the current one rasterevents[RenderFinal] += FinalPassLen; break; @@ -1935,7 +1935,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) } if (scanlineswaitingforpush <= 0) - rasterevents[PushScanline] = INT_MAX/2; // unsched event if no scanlines are waiting to be finished + rasterevents[PushScanline] = FrameLength; // unsched event if no scanlines are waiting to be finished break; @@ -1955,7 +1955,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // reschedule event for one scanline later unless all scanlines have been read if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; - else rasterevents[ScanlineRead] = INT_MAX/2; + else rasterevents[ScanlineRead] = FrameLength; break; @@ -1967,7 +1967,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlinespushed2++; // unschedule event if all partially pushed scanlines have been pushed - if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = INT_MAX/2; + if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = FrameLength; break; } } From ae934021e5e9deb90fa4b6298cd02b34699df553 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 13 Feb 2024 21:38:53 -0500 Subject: [PATCH 24/53] improve scanline timeout slightly i hope finally touch this again after 2 months --- src/GPU3D_Soft.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index c4b93383..4b3c78d4 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1859,11 +1859,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // initial rendering pass (polygons, texturing, etc.) (variable cycle length) case RenderStart: - + { // set current raster time to the start of the event RasterTiming = rasterevents[RenderStart]; - { s32 rastertimingeven = 0; s32 rastertimingodd = 0; // scanlines are rendered in pairs of two @@ -1883,10 +1882,9 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does. s32 timeoutdist = ScanlineTimeout - RasterTiming; RasterTiming += std::clamp(timeoutdist, 0, 12); - } //set next scanline timeout - if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen; + if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - (ScanlineReadSpeed+RastDelay); else ScanlineTimeout += TimeoutIncrement; // schedule next scanline pair + the final pass of the latest pair @@ -1894,11 +1892,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) else rasterevents[RenderStart] = FrameLength; break; - + } // final rendering pass (edge marking, anti-aliasing, fog) (fixed length of 496 (maybe 500?) cycles) case RenderFinal: - + { // schedule a scanline push event rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay; @@ -1923,11 +1921,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) else // schedule next final pass event to immediately after the current one rasterevents[RenderFinal] += FinalPassLen; break; - + } // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) case PushScanline: - + { // reschedule events if buffer is full if (scanlineswaitingforread >= 48) { @@ -1942,11 +1940,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) break; } - { // if a scanline push might intersect a read determine the point at which it intersects s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); leftovers = BeginPushScanline(scanlinespushed, pixelstopush); - } + scanlineswaitingforpush--; scanlinespushed++; @@ -1962,11 +1959,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) rasterevents[PushScanline] = FrameLength; // unsched event if no scanlines are waiting to be finished break; - + } // 2d engine reading scanlines from the intermediary "framebuffer" case ScanlineRead: - + { // read scanline from buffer ReadScanline(scanlinesread); @@ -1981,11 +1978,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; else rasterevents[ScanlineRead] = FrameLength; break; - + } // finish pushing a scanline to the buffer if it got interrupted by the read process. case PushScanlineP2: - + { FinishPushScanline(scanlinespushed2, leftovers); scanlineswaitingforread++; scanlinespushed2++; @@ -1994,6 +1991,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = FrameLength; break; } + } } } From 3256e054fa5a0ddc3f589d77fbcbd065597c0b4f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:55:34 -0500 Subject: [PATCH 25/53] wip --- src/GPU3D_Soft.cpp | 67 +++++++++++++++++++++++++++++++++++----------- src/GPU3D_Soft.h | 3 ++- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 4b3c78d4..923b8a77 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1534,31 +1534,57 @@ u32 SoftRenderer::CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const return density; } -void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) +bool SoftRenderer::CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr) +{ + if ((polyid != AttrBuffer[pixeladdr] >> 24) && (z < DepthBuffer[pixeladdr])) return true; + else return false; +} + +bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z, u32 pixeladdr) +{ + if (gpu3d.RenderDispCnt & (1<<14)) + { + return true; + } + else + { + u32 clearz = ((gpu3d.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; + + if ((polyid != gpu3d.RenderClearAttr1>>24) && (z < clearz)) return true; + else return false; + } +} + +void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext) { // to consider: // clearing all polygon fog flags if the master flag isn't set? // merging all final pass loops into one? - /*if (gpu3d.RenderDispCnt & (1<<5)) + if (gpu3d.RenderDispCnt & (1<<5)) { // edge marking // only applied to topmost pixels for (int x = 0; x < 256; x++) { - u32 pixeladdr = (tempoffset * ScanlineWidth) + x; + u32 pixeladdr = (y * ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; u32 polyid = attr >> 24; // opaque polygon IDs are used for edgemarking u32 z = DepthBuffer[pixeladdr]; + bool doit = false; - if (((polyid != (AttrBuffer[pixeladdr-1] >> 24)) && (z < DepthBuffer[pixeladdr-1])) || - ((polyid != (AttrBuffer[pixeladdr+1] >> 24)) && (z < DepthBuffer[pixeladdr+1])) || - ((polyid != (AttrBuffer[pixeladdr-ScanlineWidth] >> 24)) && (z < DepthBuffer[pixeladdr-ScanlineWidth])) || - ((polyid != (AttrBuffer[pixeladdr+ScanlineWidth] >> 24)) && (z < DepthBuffer[pixeladdr+ScanlineWidth]))) + if ((checkprev && (x == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr+1)) || + (checknext && (x == 255) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr+1)) || + ((y == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr-ScanlineWidth)) || + ((y == 191) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr+ScanlineWidth)) || + ((x != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) || + ((x != 255) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) || + ((y != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-ScanlineWidth)) || + ((y != 191) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+ScanlineWidth))) { u16 edgecolor = gpu3d.RenderEdgeTable[polyid >> 3]; u32 edgeR = (edgecolor << 1) & 0x3E; if (edgeR) edgeR++; @@ -1571,7 +1597,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) AttrBuffer[pixeladdr] = (AttrBuffer[pixeladdr] & 0xFFFFE0FF) | 0x00001000; } } - }*/ + } if (gpu3d.RenderDispCnt & (1<<7)) { @@ -1706,7 +1732,6 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) void SoftRenderer::ClearBuffers(const GPU& gpu) { - u32 clearz = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; u32 polyid = gpu.GPU3D.RenderClearAttr1 & 0x3F000000; // this sets the opaque polygonID // clear the screen @@ -1745,6 +1770,8 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } else { + u32 clearz = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; + // TODO: confirm color conversion u32 r = (gpu.GPU3D.RenderClearAttr1 << 1) & 0x3E; if (r) r++; u32 g = (gpu.GPU3D.RenderClearAttr1 >> 4) & 0x3E; if (g) g++; @@ -1839,8 +1866,13 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s16 scanlineswaitingforread = 0; u8 nextevent; u16 leftovers; + bool evenread = false; + s32 timespent = 0; + s32 prevtimespent = 0; + bool edgebug = false; + bool prevedgebug = false; - // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame should begin + // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame to begin while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < (FrameLength-RastDelay))) { // check all events to find the earliest scheduled one @@ -1871,7 +1903,8 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlinesinit += 2; // a new scanline pair cannot begin until both scanlines are finished. - s32 timespent = std::max(rastertimingeven, rastertimingodd); + prevtimespent = timespent; + timespent = std::max(rastertimingeven, rastertimingodd); // a new scanline pair cannot begin until the finishing pass + push is done. if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen)) @@ -1881,10 +1914,13 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does. s32 timeoutdist = ScanlineTimeout - RasterTiming; + prevedgebug = edgebug; + if (timeoutdist < 49385) edgebug = true; + else edgebug = false; RasterTiming += std::clamp(timeoutdist, 0, 12); //set next scanline timeout - if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - (ScanlineReadSpeed+RastDelay); + if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen + (ScanlineReadInc*evenread);//(ScanlineReadSpeed+RastDelay); else ScanlineTimeout += TimeoutIncrement; // schedule next scanline pair + the final pass of the latest pair @@ -1903,7 +1939,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // if the first scanline pair was just finished only render one scanline if (scanlinesfin > 0) { - ScanlineFinalPass(gpu.GPU3D, scanlinesfin); + ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug); scanlineswaitingforpush++; scanlinesfin++; } @@ -1911,7 +1947,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // if the last scanline pair was just finished only render one scanline if (scanlinesfin < 191) { - ScanlineFinalPass(gpu.GPU3D, scanlinesfin); + ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug); scanlineswaitingforpush++; scanlinesfin++; } @@ -1923,7 +1959,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) break; } - // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) + // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) 256? case PushScanline: { // reschedule events if buffer is full @@ -1973,6 +2009,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlinesread++; scanlineswaitingforread--; + evenread = !evenread; // reschedule event for one scanline later unless all scanlines have been read if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 3429f951..6f81fae6 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -471,7 +471,8 @@ private: bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; - void ScanlineFinalPass(const GPU3D& gpu3d, s32 y); + bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); + void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ClearBuffers(const GPU& gpu); u16 BeginPushScanline(s32 y, s32 pixelstodraw); void ReadScanline(s32 y); From 249687a2ce9c0fd08f5e2b2b69f0129754d00214 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 24 Feb 2024 14:18:45 -0500 Subject: [PATCH 26/53] rework 4: now with proper edge marking bug emulation! --- src/GPU3D.h | 30 ++--- src/GPU3D_Soft.cpp | 273 +++++++++++++++------------------------------ src/GPU3D_Soft.h | 1 + 3 files changed, 106 insertions(+), 198 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 27162854..8719a7e1 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -341,13 +341,13 @@ public: static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; - static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. - static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline + //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. + //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline // and beginning reading the second scanline of a scanline pair. - static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. - static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. - static constexpr int InitGPU2DTimeout = 52128 * TimingFrac; // 51618? 51874? | when it finishes reading the first scanline. - static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. + //static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. + //static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. + static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. + //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? // GPU 3D Rasterization Timings: For Emulating Scanline Timeout @@ -358,22 +358,20 @@ public: //static constexpr int ScanlineBreak2 = 40 * TimingFrac; //static constexpr int FakeTiming = 2 * TimingFrac; //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. - static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair - static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, + //static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair + //static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, // it just cares about if its the first 50 scanlines to speedrun rendering? static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress // (can be interpreted as the minimum amount of cycles for the next scanline // pair to start after the previous pair began) (related to final pass?) static constexpr int ScanlinePushDelay = 242 * TimingFrac; - static constexpr int TimeoutIncrement = 2130 * TimingFrac; - static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair - static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" - static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) + //static constexpr int TimeoutIncrement = 2130 * TimingFrac; + //static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair + //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" + //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) // (why does the next pair get more time if the previous scanline is aborted?) static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set - static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) - // (Amount of time before the end of the cycle a scanline must abort?) - static constexpr int FinishScanline = 512 * TimingFrac; + //static constexpr int FinishScanline = 512 * TimingFrac; // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors @@ -392,6 +390,8 @@ public: static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) + static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) + // (Amount of time before the end of the cycle a scanline must abort?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 923b8a77..d3b72e08 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -19,6 +19,7 @@ #include "GPU3D_Soft.h" #include +#include #include #include #include "NDS.h" @@ -175,6 +176,8 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter) { + DoTimings(RastDelay, timingcounter); + // determine the timing impact of the first polygon's slopes. Polygon* polygon = rp->PolyData; @@ -1457,6 +1460,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter) { + *timingcounter = 0; bool abort = false; bool first = true; for (int i = 0; i < npolys; i++) @@ -1466,7 +1470,7 @@ void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timing if (y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && DoTimings(FirstNull, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); + if (!abort) abort = (first && DoTimings(FirstNull+RastDelay, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); first = false; } @@ -1555,6 +1559,7 @@ bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z, u32 pixel } } +template void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext) { // to consider: @@ -1728,6 +1733,11 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24); } } + if constexpr (push) + { + memcpy(&FinalBuffer[y*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], ScanlineWidth*4); + Platform::Semaphore_Post(Sema_ScanlineCount); + } } void SoftRenderer::ClearBuffers(const GPU& gpu) @@ -1846,190 +1856,87 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) //init internal buffer ClearBuffers(gpu); - // init all this junk i need to keep track of - s32 rasterevents[RasterEvents_MAX]; - rasterevents[RenderStart] = 0; - rasterevents[RenderFinal] = FrameLength; - rasterevents[PushScanline] = FrameLength; - rasterevents[PushScanlineP2] = FrameLength; - rasterevents[ScanlineRead] = InitGPU2DTimeout; - ScanlineTimeout = FrameLength; - RasterTiming = 0; - s32 rastertimingeven = 0; - s32 rastertimingodd = 0; - u8 scanlinesread = 0; - u8 scanlinesinit = 0; - u8 scanlinesfin = 0; - u8 scanlinespushed = 0; - u8 scanlinespushed2 = 0; - s16 scanlineswaitingforpush = 0; - s16 scanlineswaitingforread = 0; - u8 nextevent; - u16 leftovers; - bool evenread = false; - s32 timespent = 0; - s32 prevtimespent = 0; - bool edgebug = false; - bool prevedgebug = false; - - // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame to begin - while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < (FrameLength-RastDelay))) + u32 slread[192]; // scanline read times + for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time? { - // check all events to find the earliest scheduled one - nextevent = 0; - for (u8 i = 1; i < RasterEvents_MAX; i++) - { - if (rasterevents[i] < rasterevents[nextevent]) - nextevent = i; - } - - // if all events are scheduled for after the next frame begins, ABORT - if (rasterevents[nextevent] >= FrameLength) break; - - switch (nextevent) - { - - // initial rendering pass (polygons, texturing, etc.) (variable cycle length) - case RenderStart: - { - // set current raster time to the start of the event - RasterTiming = rasterevents[RenderStart]; - - s32 rastertimingeven = 0; - s32 rastertimingodd = 0; - // scanlines are rendered in pairs of two - RenderScanline(gpu, scanlinesinit, j, &rastertimingeven); - RenderScanline(gpu, scanlinesinit+1, j, &rastertimingodd); - scanlinesinit += 2; - - // a new scanline pair cannot begin until both scanlines are finished. - prevtimespent = timespent; - timespent = std::max(rastertimingeven, rastertimingodd); - - // a new scanline pair cannot begin until the finishing pass + push is done. - if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen)) - RasterTiming += FinalPassLen; - else - RasterTiming += timespent; - - // 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does. - s32 timeoutdist = ScanlineTimeout - RasterTiming; - prevedgebug = edgebug; - if (timeoutdist < 49385) edgebug = true; - else edgebug = false; - RasterTiming += std::clamp(timeoutdist, 0, 12); - - //set next scanline timeout - if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen + (ScanlineReadInc*evenread);//(ScanlineReadSpeed+RastDelay); - else ScanlineTimeout += TimeoutIncrement; - - // schedule next scanline pair + the final pass of the latest pair - rasterevents[RenderFinal] = RasterTiming; - if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) - else rasterevents[RenderStart] = FrameLength; - break; - } - - // final rendering pass (edge marking, anti-aliasing, fog) (fixed length of 496 (maybe 500?) cycles) - case RenderFinal: - { - // schedule a scanline push event - rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay; - - // if the first scanline pair was just finished only render one scanline - if (scanlinesfin > 0) - { - ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug); - scanlineswaitingforpush++; - scanlinesfin++; - } - - // if the last scanline pair was just finished only render one scanline - if (scanlinesfin < 191) - { - ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug); - scanlineswaitingforpush++; - scanlinesfin++; - } - // unschedule final pass event - if (scanlinesfin != 191) - rasterevents[RenderFinal] = FrameLength; - else // schedule next final pass event to immediately after the current one - rasterevents[RenderFinal] += FinalPassLen; - break; - } - - // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) 256? - case PushScanline: - { - // reschedule events if buffer is full - if (scanlineswaitingforread >= 48) - { - rasterevents[PushScanline] = rasterevents[ScanlineRead]; - - // dont reschedule these events if they're done. - if (scanlinesinit < 192) - rasterevents[RenderStart] = rasterevents[ScanlineRead] + RastDelay; - if (scanlinesfin < 192) - rasterevents[RenderFinal] = rasterevents[ScanlineRead]; - - break; - } - - // if a scanline push might intersect a read determine the point at which it intersects - s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); - leftovers = BeginPushScanline(scanlinespushed, pixelstopush); - - scanlineswaitingforpush--; - scanlinespushed++; - - // schedule the finish push event if needed - if (leftovers != 0) rasterevents[PushScanlineP2] = rasterevents[ScanlineRead]; - else - { - scanlineswaitingforread++; - scanlinespushed2++; - } - - if (scanlineswaitingforpush <= 0) - rasterevents[PushScanline] = FrameLength; // unsched event if no scanlines are waiting to be finished - - break; - } - - // 2d engine reading scanlines from the intermediary "framebuffer" - case ScanlineRead: - { - // read scanline from buffer - ReadScanline(scanlinesread); - - // avoid breaking seperate thread. - if constexpr (threaded) - Platform::Semaphore_Post(Sema_ScanlineCount); - - scanlinesread++; - scanlineswaitingforread--; - evenread = !evenread; - - // reschedule event for one scanline later unless all scanlines have been read - if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; - else rasterevents[ScanlineRead] = FrameLength; - break; - } - - // finish pushing a scanline to the buffer if it got interrupted by the read process. - case PushScanlineP2: - { - FinishPushScanline(scanlinespushed2, leftovers); - scanlineswaitingforread++; - scanlinespushed2++; - - // unschedule event if all partially pushed scanlines have been pushed - if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = FrameLength; - break; - } - } + slread[i] = time; } + + ScanlineTimeout = FrameLength; // CHECKME + + s32 rastertimingeven; // always init to 0 at the start of a scanline render + s32 rastertimingodd; + + s32 scanlineswaiting = 0; + s32 nextread = 0; + + u32 timespent; + u32 prevtimespent; + // scanlines are rendered in pairs of two + RenderScanline(gpu, 0, j, &rastertimingeven); + RenderScanline(gpu, 1, j, &rastertimingodd); + + RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + // if first pair was not delayed past the first read, then later scanlines cannot either + // this allows us to implement a fast path + //if (slread[0] - timespent + ScanlinePushDelay >= 256) + { + ScanlineTimeout = slread[1] - FinalPassLen; + + RenderScanline(gpu, 2, j, &rastertimingeven); + RenderScanline(gpu, 3, j, &rastertimingodd); + + prevtimespent = timespent; + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + ScanlineFinalPass(gpu.GPU3D, 0, true, true); + scanlineswaiting++; + for (int y = 4; y < 192; y+=2) + { + ScanlineTimeout = slread[y-1] - FinalPassLen; + + RenderScanline(gpu, y, j, &rastertimingeven); + RenderScanline(gpu, y+1, j, &rastertimingodd); + + prevtimespent = timespent; + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + scanlineswaiting+=2; + + while (scanlineswaiting >= 47) + { + if (RasterTiming < slread[nextread]) RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + scanlineswaiting--; + nextread++; + } + + ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502); + } + + ScanlineFinalPass(gpu.GPU3D, 189, timespent >= 502, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, 190, timespent >= 502, true); + + ScanlineFinalPass(gpu.GPU3D, 191, true, true); + } + /*else + { + ScanlineFinalPass(gpu, 0, false, false); + + s32 pixelstopush = slread[0] - (timespent + ScanlinePushDelay); + if (pixelstopush > 256) pixelstopush = 256; + //timespent + ScanlinePushDelay + ScanlineReadSpeed > slread[0] + + rastertimingeven = 0; + rastertimingodd = 0; + + RenderScanline(gpu, 2, j, &rastertimingeven); + RenderScanline(gpu, 3, j, &rastertimingodd); + }*/ } void SoftRenderer::VCount144(GPU& gpu) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 6f81fae6..3814762d 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -472,6 +472,7 @@ private: void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); + template void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ClearBuffers(const GPU& gpu); u16 BeginPushScanline(s32 y, s32 pixelstodraw); From 9219a084c4c9297ca3712f85ae629a3a2d70a6c0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 25 Feb 2024 16:45:22 -0500 Subject: [PATCH 27/53] improve edge marking bug accuracy also begin groundwork for rdlines_count register emulation --- src/GPU3D.cpp | 10 ++++++---- src/GPU3D.h | 3 ++- src/GPU3D_Soft.cpp | 50 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 5cf6426f..8706724b 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -241,7 +241,8 @@ void GPU3D::Reset() noexcept AlphaRefVal = 0; AlphaRef = 0; - RDLines = 46; + RDLines = 63; // defaults to 63 for one frame? (CHECKME: when does it reset?) + RDLinesTemp = 46; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -2401,7 +2402,6 @@ void GPU3D::CheckFIFODMA() noexcept void GPU3D::VCount144(GPU& gpu) noexcept { - RDLines = 46; CurrentRenderer->VCount144(gpu); } @@ -2431,6 +2431,7 @@ bool YSort(Polygon* a, Polygon* b) void GPU3D::VBlank() noexcept { + RDLines = RDLinesTemp; if (GeometryEnabled) { if (RenderingEnabled) @@ -2508,6 +2509,7 @@ void GPU3D::VBlank() noexcept void GPU3D::VCount215(GPU& gpu) noexcept { + //RDLinesTemp = 46; CurrentRenderer->RenderFrame(gpu); } @@ -2645,7 +2647,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLines; // IT IS TIME + return RDLines; // CHECKME: Can this always be read? Even when the gpu is powered off? case 0x04000600: { @@ -2689,7 +2691,7 @@ u32 GPU3D::Read32(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLines; // IT IS TIME + return RDLines; case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index 8719a7e1..3d3b0e7f 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -246,7 +246,8 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; - u32 RDLines = 0; + u8 RDLines = 63; + u8 RDLinesTemp = 46; u8 AlphaRefVal = 0; u8 AlphaRef = 0; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index d3b72e08..50d1104f 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1876,13 +1876,17 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) RenderScanline(gpu, 0, j, &rastertimingeven); RenderScanline(gpu, 1, j, &rastertimingodd); + // it can't proceed to the next scanline unless all others steps are done (both scanlines in the pair, and final pass) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + // 12 cycles at the end of a "timeout" are always used for w/e reason RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + gpu.GPU3D.RDLinesTemp = 46; // if first pair was not delayed past the first read, then later scanlines cannot either // this allows us to implement a fast path //if (slread[0] - timespent + ScanlinePushDelay >= 256) { + // begin scanline timeout ScanlineTimeout = slread[1] - FinalPassLen; RenderScanline(gpu, 2, j, &rastertimingeven); @@ -1891,9 +1895,21 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) prevtimespent = timespent; RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - - ScanlineFinalPass(gpu.GPU3D, 0, true, true); + scanlineswaiting++; + + while (RasterTiming >= slread[nextread] + 565) + { + if (RasterTiming < slread[nextread] + 565) + { + RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + } + scanlineswaiting--; + nextread++; + } + + ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= 502); for (int y = 4; y < 192; y+=2) { ScanlineTimeout = slread[y-1] - FinalPassLen; @@ -1907,9 +1923,13 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlineswaiting+=2; - while (scanlineswaiting >= 47) + while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) { - if (RasterTiming < slread[nextread]) RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + if (RasterTiming < slread[nextread] + 565) + { + RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + } scanlineswaiting--; nextread++; } @@ -1917,11 +1937,27 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502); ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502); } + scanlineswaiting+= 2; + prevtimespent = timespent; - ScanlineFinalPass(gpu.GPU3D, 189, timespent >= 502, timespent >= 502); - ScanlineFinalPass(gpu.GPU3D, 190, timespent >= 502, true); + // do this one last time to allow for edge marking bug emulation. + while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) + { + if (RasterTiming < slread[nextread] + 565) + { + RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + } + scanlineswaiting--; + nextread++; + } + + ScanlineFinalPass(gpu.GPU3D, 189, prevtimespent >= 502, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, 190, prevtimespent >= 502, true); + + // skip timing emulation here since it's irrelevant, also use timespent instead of prev because we're skipping timing emulation + ScanlineFinalPass(gpu.GPU3D, 191, timespent >= 502, true); - ScanlineFinalPass(gpu.GPU3D, 191, true, true); } /*else { From 9ffa04dfbc1bf187f3876864f224d404a69a3b05 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 25 Feb 2024 22:41:33 -0500 Subject: [PATCH 28/53] approximate rdlines_count; implement underflow flag --- src/GPU.cpp | 4 ++ src/GPU3D.cpp | 3 +- src/GPU3D.h | 3 +- src/GPU3D_Soft.cpp | 115 ++++++++++++++++++++++++++------------------- 4 files changed, 74 insertions(+), 51 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index f23e641e..a78deba6 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -1041,6 +1041,10 @@ void GPU::StartScanline(u32 line) noexcept if (GPU3D.IsRendererAccelerated()) GPU3D.Blit(*this); } + else if (VCount == 183) + { + GPU3D.DispCnt |= GPU3D.RDLinesUnderflow << 12; + } } NDS.ScheduleEvent(Event_LCD, true, HBLANK_CYCLES, LCD_StartHBlank, line); diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 8706724b..a9524e88 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -2509,7 +2509,6 @@ void GPU3D::VBlank() noexcept void GPU3D::VCount215(GPU& gpu) noexcept { - //RDLinesTemp = 46; CurrentRenderer->RenderFrame(gpu); } @@ -2647,7 +2646,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLines; // CHECKME: Can this always be read? Even when the gpu is powered off? + return RDLines; // CHECKME: Can this always be read? Even when the gpu is powered off? also check 8 bit reads case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index 3d3b0e7f..fb779a68 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -246,6 +246,7 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; + bool RDLinesUnderflow = false; u8 RDLines = 63; u8 RDLinesTemp = 46; u8 AlphaRefVal = 0; @@ -371,7 +372,7 @@ public: //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) // (why does the next pair get more time if the previous scanline is aborted?) - static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set + //static constexpr int UnderflowFlag = 2 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set //static constexpr int FinishScanline = 512 * TimingFrac; // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 50d1104f..0600b435 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1855,6 +1855,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) //init internal buffer ClearBuffers(gpu); + + // reset scanline trackers + gpu.GPU3D.RDLinesUnderflow = false; + gpu.GPU3D.RDLinesTemp = 63; u32 slread[192]; // scanline read times for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time? @@ -1872,6 +1876,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) u32 timespent; u32 prevtimespent; + // scanlines are rendered in pairs of two RenderScanline(gpu, 0, j, &rastertimingeven); RenderScanline(gpu, 1, j, &rastertimingodd); @@ -1880,8 +1885,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); // 12 cycles at the end of a "timeout" are always used for w/e reason RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - - gpu.GPU3D.RDLinesTemp = 46; + // if first pair was not delayed past the first read, then later scanlines cannot either // this allows us to implement a fast path //if (slread[0] - timespent + ScanlinePushDelay >= 256) @@ -1892,13 +1896,72 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) RenderScanline(gpu, 2, j, &rastertimingeven); RenderScanline(gpu, 3, j, &rastertimingodd); + // the time spent on the previous scanline pair is important for emulating the edge marking bug properly prevtimespent = timespent; RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - scanlineswaiting++; + // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow + if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; + scanlineswaiting++; + + // simulate the process of scanlines being read from the 48 scanline buffer while (RasterTiming >= slread[nextread] + 565) + { + if (RasterTiming < slread[nextread] + 565) + { + RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + } + scanlineswaiting--; + nextread++; + // update rdlines_count register + if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? + } + + // final pass pairs are the previous scanline pair offset -1 scanline, thus we start with only building one + ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= 502); + for (int y = 4; y < 192; y+=2) + { + //update sl timeout + ScanlineTimeout = slread[y-1] - FinalPassLen; + + RenderScanline(gpu, y, j, &rastertimingeven); + RenderScanline(gpu, y+1, j, &rastertimingodd); + + prevtimespent = timespent; + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow + if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; + + scanlineswaiting+=2; + + // simulate the process of scanlines being read from the 48 scanline buffer + while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) + { + if (RasterTiming < slread[nextread] + 565) + { + RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + } + scanlineswaiting--; + nextread++; + // update rdlines_count register + if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? + } + + ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502); + } + scanlineswaiting+= 2; + prevtimespent = timespent; + + // emulate read timings one last time, since it shouldn't matter after this + // additionally dont bother tracking rdlines anymore since it shouldn't be able to decrement anymore (CHECKME) + while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) { if (RasterTiming < slread[nextread] + 565) { @@ -1909,55 +1972,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) nextread++; } - ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= 502); - for (int y = 4; y < 192; y+=2) - { - ScanlineTimeout = slread[y-1] - FinalPassLen; - - RenderScanline(gpu, y, j, &rastertimingeven); - RenderScanline(gpu, y+1, j, &rastertimingodd); - - prevtimespent = timespent; - RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); - RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - - scanlineswaiting+=2; - - while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) - { - if (RasterTiming < slread[nextread] + 565) - { - RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? - timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? - } - scanlineswaiting--; - nextread++; - } - - ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502); - ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502); - } - scanlineswaiting+= 2; - prevtimespent = timespent; - - // do this one last time to allow for edge marking bug emulation. - while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) - { - if (RasterTiming < slread[nextread] + 565) - { - RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? - timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? - } - scanlineswaiting--; - nextread++; - } - + // finish the last 3 scanlines ScanlineFinalPass(gpu.GPU3D, 189, prevtimespent >= 502, timespent >= 502); ScanlineFinalPass(gpu.GPU3D, 190, prevtimespent >= 502, true); - // skip timing emulation here since it's irrelevant, also use timespent instead of prev because we're skipping timing emulation ScanlineFinalPass(gpu.GPU3D, 191, timespent >= 502, true); - } /*else { From 56e506ef9ad7a12eba6c7c9c04eae3d8ce46777d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:25:49 -0500 Subject: [PATCH 29/53] misc cleanup --- src/GPU.cpp | 2 +- src/GPU3D.cpp | 2 +- src/GPU3D_Soft.cpp | 5 ++--- src/GPU3D_Soft.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index a78deba6..c1f8e2a1 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -1043,7 +1043,7 @@ void GPU::StartScanline(u32 line) noexcept } else if (VCount == 183) { - GPU3D.DispCnt |= GPU3D.RDLinesUnderflow << 12; + GPU3D.DispCnt |= GPU3D.RDLinesUnderflow << 12; // CHECKME: does this get set *exactly* at vcount 183? earlier? later? } } diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index a9524e88..17f826fc 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -242,7 +242,7 @@ void GPU3D::Reset() noexcept AlphaRef = 0; RDLines = 63; // defaults to 63 for one frame? (CHECKME: when does it reset?) - RDLinesTemp = 46; + RDLinesTemp = 63; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 0600b435..d23a9131 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1843,7 +1843,6 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * pixelsremain); } -template void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { int j = 0; @@ -2015,7 +2014,7 @@ void SoftRenderer::RenderFrame(GPU& gpu) // "Render thread, you're up! Get moving." Platform::Semaphore_Post(Sema_RenderStart); } - else if (!FrameIdentical) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else if (!FrameIdentical) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); } void SoftRenderer::RestartFrame(GPU& gpu) @@ -2043,7 +2042,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) { // If no rendering is needed, just say we're done. Platform::Semaphore_Post(Sema_ScanlineCount, 192); } - else RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); // Tell the main thread that we're done rendering // and that it's safe to access the GPU state again. diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 3814762d..c11d6846 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -478,7 +478,7 @@ private: u16 BeginPushScanline(s32 y, s32 pixelstodraw); void ReadScanline(s32 y); void FinishPushScanline(s32 y, s32 pixelsremain); - template void RenderPolygons(GPU& gpu, Polygon** polygons, int npolys); + void RenderPolygons(GPU& gpu, Polygon** polygons, int npolys); void RenderThreadFunc(GPU& gpu); From bbbd56877d563e3e9b4d112bad3032d0e36f43a0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 5 Mar 2024 17:46:13 -0500 Subject: [PATCH 30/53] minor tweaks to edge marking bug handling for some reason it does not check against the depth bitmap when enabled? --- src/GPU3D_Soft.cpp | 28 +++++++++++++--------------- src/GPU3D_Soft.h | 4 ++-- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index d23a9131..561e9741 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1544,19 +1544,17 @@ bool SoftRenderer::CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr) else return false; } -bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z, u32 pixeladdr) +bool SoftRenderer::CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z) { - if (gpu3d.RenderDispCnt & (1<<14)) - { - return true; - } - else + // for some reason it never checks against the bitmap clear plane? + if (polyid != gpu3d.RenderClearAttr1>>24) { u32 clearz = ((gpu3d.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; - if ((polyid != gpu3d.RenderClearAttr1>>24) && (z < clearz)) return true; + if (z < clearz) return true; else return false; } + else return false; } template @@ -1582,14 +1580,14 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, u32 z = DepthBuffer[pixeladdr]; bool doit = false; - if ((checkprev && (x == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr+1)) || - (checknext && (x == 255) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr+1)) || - ((y == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr-ScanlineWidth)) || - ((y == 191) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z, pixeladdr+ScanlineWidth)) || - ((x != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) || - ((x != 255) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) || - ((y != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-ScanlineWidth)) || - ((y != 191) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+ScanlineWidth))) + if ((checkprev && (x == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // left + (checknext && (x == 255) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // right + ((y == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // top + ((y == 191) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // bottom + ((x != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) || // left + ((x != 255) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) || // right + ((y != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-ScanlineWidth)) || // top + ((y != 191) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+ScanlineWidth))) // bottom { u16 edgecolor = gpu3d.RenderEdgeTable[polyid >> 3]; u32 edgeR = (edgecolor << 1) & 0x3E; if (edgeR) edgeR++; diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index c11d6846..0743178f 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -472,8 +472,8 @@ private: void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); - template - void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); + bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z); + template void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ClearBuffers(const GPU& gpu); u16 BeginPushScanline(s32 y, s32 pixelstodraw); void ReadScanline(s32 y); From 52e097d97c5a6abfe9c4865ff203b991086357fe Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:28:20 -0500 Subject: [PATCH 31/53] Improve(?) edge marking check fixes a bug makes the code 200% uglier to look at though --- src/GPU3D_Soft.cpp | 51 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 561e9741..e50a0c1b 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1578,17 +1578,50 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, u32 polyid = attr >> 24; // opaque polygon IDs are used for edgemarking u32 z = DepthBuffer[pixeladdr]; - bool doit = false; - if ((checkprev && (x == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // left - (checknext && (x == 255) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // right - ((y == 0) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // top - ((y == 191) && CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) || // bottom - ((x != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) || // left - ((x != 255) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) || // right - ((y != 0) && CheckEdgeMarkingPixel(polyid, z, pixeladdr-ScanlineWidth)) || // top - ((y != 191) && CheckEdgeMarkingPixel(polyid, z, pixeladdr+ScanlineWidth))) // bottom + // check the pixel to the left + if (x == 0) { + // edge marking bug emulation + if (checkprev) + { + if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-(ScanlineWidth+1))) goto pass; // checks the right edge of the scanline 2 scanlines ago + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) goto pass; // normal check + + // check the pixel to the right + if (x == 255) + { + // edge marking bug emulation + if (checknext) + { + if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+(ScanlineWidth+1))) goto pass; // checks the left edge of the scanline 2 scanlines ahead + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) goto pass; // normal check + + // check the pixel above + if (y == 0) + { + // edge marking bug emulation + if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-ScanlineWidth)) goto pass; // normal check + + // check the pixel below + if (y == 191) + { + // edge marking bug emulation + if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+ScanlineWidth)) goto pass; // normal check + + if (false) + { + pass: u16 edgecolor = gpu3d.RenderEdgeTable[polyid >> 3]; u32 edgeR = (edgecolor << 1) & 0x3E; if (edgeR) edgeR++; u32 edgeG = (edgecolor >> 4) & 0x3E; if (edgeG) edgeG++; From 7f73dc35f94f7f17b82ead3f4b9bac33125edce7 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 5 Mar 2024 21:22:51 -0500 Subject: [PATCH 32/53] minor cleanup --- src/GPU3D.h | 13 ++++++++++++- src/GPU3D_Soft.cpp | 34 ++++++++++++++-------------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index fb779a68..ee5409ea 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -342,7 +342,6 @@ public: static constexpr int ScanlineReadSpeed = 256 * TimingFrac; static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; - //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline // and beginning reading the second scanline of a scanline pair. @@ -351,6 +350,18 @@ public: static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? + + // compile-time list of scanline read times + // these *should* always occur at the same point in each frame, so it shouldn't matter if we make them fixed + constexpr std::array SLRead = []() constexpr { + std::array readtime {}; + + for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) + { + readtime[i] = time; + } + return readtime; + }(); // GPU 3D Rasterization Timings: For Emulating Scanline Timeout diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index e50a0c1b..b9772d89 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1890,12 +1890,6 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) gpu.GPU3D.RDLinesUnderflow = false; gpu.GPU3D.RDLinesTemp = 63; - u32 slread[192]; // scanline read times - for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time? - { - slread[i] = time; - } - ScanlineTimeout = FrameLength; // CHECKME s32 rastertimingeven; // always init to 0 at the start of a scanline render @@ -1918,10 +1912,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // if first pair was not delayed past the first read, then later scanlines cannot either // this allows us to implement a fast path - //if (slread[0] - timespent + ScanlinePushDelay >= 256) + //if (SLRead[0] - timespent + ScanlinePushDelay >= 256) { // begin scanline timeout - ScanlineTimeout = slread[1] - FinalPassLen; + ScanlineTimeout = SLRead[1] - FinalPassLen; RenderScanline(gpu, 2, j, &rastertimingeven); RenderScanline(gpu, 3, j, &rastertimingodd); @@ -1937,11 +1931,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlineswaiting++; // simulate the process of scanlines being read from the 48 scanline buffer - while (RasterTiming >= slread[nextread] + 565) + while (RasterTiming >= SLRead[nextread] + 565) { - if (RasterTiming < slread[nextread] + 565) + if (RasterTiming < SLRead[nextread] + 565) { - RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + RasterTiming += timespent = (SLRead[nextread] + 565) - RasterTiming; // why + 565? timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? } scanlineswaiting--; @@ -1955,7 +1949,7 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) for (int y = 4; y < 192; y+=2) { //update sl timeout - ScanlineTimeout = slread[y-1] - FinalPassLen; + ScanlineTimeout = SLRead[y-1] - FinalPassLen; RenderScanline(gpu, y, j, &rastertimingeven); RenderScanline(gpu, y+1, j, &rastertimingodd); @@ -1970,11 +1964,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) scanlineswaiting+=2; // simulate the process of scanlines being read from the 48 scanline buffer - while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + 565) { - if (RasterTiming < slread[nextread] + 565) + if (RasterTiming < SLRead[nextread] + 565) { - RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + RasterTiming += timespent = (SLRead[nextread] + 565) - RasterTiming; // why + 565? timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? } scanlineswaiting--; @@ -1991,11 +1985,11 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // emulate read timings one last time, since it shouldn't matter after this // additionally dont bother tracking rdlines anymore since it shouldn't be able to decrement anymore (CHECKME) - while (scanlineswaiting >= 47 || RasterTiming >= slread[nextread] + 565) + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + 565) { - if (RasterTiming < slread[nextread] + 565) + if (RasterTiming < SLRead[nextread] + 565) { - RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + RasterTiming += timespent = (SLRead[nextread] + 565) - RasterTiming; // why + 565? timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? } scanlineswaiting--; @@ -2012,9 +2006,9 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { ScanlineFinalPass(gpu, 0, false, false); - s32 pixelstopush = slread[0] - (timespent + ScanlinePushDelay); + s32 pixelstopush = SLRead[0] - (timespent + ScanlinePushDelay); if (pixelstopush > 256) pixelstopush = 256; - //timespent + ScanlinePushDelay + ScanlineReadSpeed > slread[0] + //timespent + ScanlinePushDelay + ScanlineReadSpeed > SLRead[0] rastertimingeven = 0; rastertimingodd = 0; From 246fa18ab6dca17ae198efa95f599954665b6517 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 6 Mar 2024 07:39:52 -0500 Subject: [PATCH 33/53] return false if underflowed misc cleanup --- src/GPU3D.h | 2 +- src/GPU3D_Soft.cpp | 27 +++++++++++++++------------ src/GPU3D_Soft.h | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index ee5409ea..11e175f0 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -403,7 +403,7 @@ public: static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) - static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) + static constexpr int FirstPolyDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) // (Amount of time before the end of the cycle a scanline must abort?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index b9772d89..8bbf891b 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -141,10 +141,10 @@ void SoftRenderer::SetThreaded(bool threaded, GPU& gpu) noexcept bool SoftRenderer::DoTimings(s32 cycles, s32* timingcounter) { - // add timings to a counter and check if underflowed. + // add timings to a counter and return false if underflowed. *timingcounter += cycles; - if (RasterTiming + *timingcounter <= ScanlineTimeout) return false; - else return true; + if (RasterTiming + *timingcounter <= ScanlineTimeout) return true; + else return false; } bool SoftRenderer::CheckTimings(s32 cycles, s32* timingcounter) @@ -158,7 +158,7 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter - // pixels dont count towards timings if they're the first 4 pixels in a scanline (for some reason?) + // pixels dont count towards timings if they're the first 4 pixels in a polygon scanline (for some reason?) if (pixels <= NumFreePixels) return 0; pixels -= NumFreePixels; @@ -174,16 +174,19 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) else return 0; } -bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter) +bool SoftRenderer::DoTimingsFirstPoly(RendererPolygon* rp, s32 y, s32* timingcounter) { - DoTimings(RastDelay, timingcounter); + // The first polygon in each scanline has an additional timing penalty (presumably due to pipelining?) + + // First polygon has a cost of 4 cycles + if (!DoTimings(FirstPolyDelay, timingcounter)) return false; // determine the timing impact of the first polygon's slopes. Polygon* polygon = rp->PolyData; - if (polygon->YTop == polygon->YBottom) return false; // 0 px tall line polygons do not have slopes, and thus no timing penalty - if (y == polygon->YTop) return false; + if (polygon->YTop == polygon->YBottom) return true; // 0 px tall line polygons do not have slopes, and thus no timing penalty + if (y == polygon->YTop) return true; if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *timingcounter += FirstPerSlope; @@ -1470,16 +1473,16 @@ void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timing if (y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && DoTimings(FirstNull+RastDelay, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); + if (!abort) abort = (first && !DoTimings(FirstNull+EmptyPolyScanline, timingcounter)) || !DoTimings(EmptyPolyScanline, timingcounter); first = false; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, timingcounter)) abort = true; + //if (y == polygon->YTop) if(!DoTimings(FirstPolyScanline, timingcounter)) abort = true; - if (!abort) abort = (first && DoTimingsSlopes(rp, y, timingcounter)) // incorrect. needs research; behavior is strange... - || DoTimings(PerPolyScanline, timingcounter) + if (!abort) abort = (first && !DoTimingsFirstPoly(rp, y, timingcounter)) // incorrect. needs research; behavior is strange... + || !DoTimings(PerPolyScanline, timingcounter) || (!CheckTimings(MinToStartPoly, timingcounter)); if (abort) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 0743178f..20aaf4bf 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -458,7 +458,7 @@ private: bool DoTimings(s32 cycles, s32* timingcounter); bool CheckTimings(s32 cycles, s32* timingcounter); u32 DoTimingsPixels(s32 pixels, s32* timingcounter); - bool DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter); + bool DoTimingsFirstPoly(RendererPolygon* rp, s32 y, s32* timingcounter); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); From b32f519c5a0bc25e35338098d3fa64bb7eda7a52 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 6 Mar 2024 08:33:21 -0500 Subject: [PATCH 34/53] more cleanup + "fix" RDLines_Count fix feels wrong, but i can't prove it either way yet. --- src/GPU3D.h | 32 ++++++++++++++++------ src/GPU3D_Soft.cpp | 67 +++++++++++++++++++++++++++++++--------------- 2 files changed, 69 insertions(+), 30 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 11e175f0..eb671ed0 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -341,19 +341,19 @@ public: static constexpr int DelayBetweenReads = 809 * TimingFrac; static constexpr int ScanlineReadSpeed = 256 * TimingFrac; static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; + static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. + static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline // and beginning reading the second scanline of a scanline pair. //static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. //static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. - static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. - static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? // compile-time list of scanline read times // these *should* always occur at the same point in each frame, so it shouldn't matter if we make them fixed - constexpr std::array SLRead = []() constexpr { + static constexpr std::array SLRead = []() constexpr { std::array readtime {}; for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) @@ -363,6 +363,20 @@ public: return readtime; }(); + static constexpr int Arbitrary = 565; // extra value after the scanline is read at which the cutoff of a scanline should be...? + // idk why this is needed. im probably doing something wrong. + + // the point at which rdlines decrements not sure why it's different...? + static constexpr std::array RDDecrement = []() constexpr { + std::array dec {}; + + for (int i = 0; i < 192; i++) + { + dec[i] = SLRead[i] + Arbitrary - 39 - (!(i % 2)); + } + return dec; + }(); + // GPU 3D Rasterization Timings: For Emulating Scanline Timeout //static constexpr int ScanlinePairLength = 2130 * TimingFrac; @@ -373,16 +387,18 @@ public: //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. //static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair //static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, - // it just cares about if its the first 50 scanlines to speedrun rendering? + // it just cares about if its the first 50 scanlines to speedrun rendering? static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress - // (can be interpreted as the minimum amount of cycles for the next scanline - // pair to start after the previous pair began) (related to final pass?) + // (can be interpreted as the minimum amount of cycles for the next scanline + // pair to start after the previous pair began) (related to final pass?) static constexpr int ScanlinePushDelay = 242 * TimingFrac; + static constexpr int EMGlitchThreshhold = 502 * TimingFrac; // The threshold for the edge marking glitch behavior to change. + static constexpr int EMFixNum = 571 * TimingFrac; // Arbitrary value added to fix edge marking glitch, not sure why it's needed? //static constexpr int TimeoutIncrement = 2130 * TimingFrac; //static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) - // (why does the next pair get more time if the previous scanline is aborted?) + // (why does the next pair get more time if the previous scanline is aborted?) //static constexpr int UnderflowFlag = 2 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set //static constexpr int FinishScanline = 512 * TimingFrac; @@ -404,7 +420,7 @@ public: // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) static constexpr int FirstPolyDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) - // (Amount of time before the end of the cycle a scanline must abort?) + // (Amount of time before the end of the cycle a scanline must abort?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 8bbf891b..60258e17 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1900,6 +1900,8 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 scanlineswaiting = 0; s32 nextread = 0; + s32 slwaitingrd = 0; + s32 nextreadrd = 0; u32 timespent; u32 prevtimespent; @@ -1932,23 +1934,33 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; scanlineswaiting++; + slwaitingrd++; // simulate the process of scanlines being read from the 48 scanline buffer - while (RasterTiming >= SLRead[nextread] + 565) + while (RasterTiming >= SLRead[nextread] + Arbitrary) { - if (RasterTiming < SLRead[nextread] + 565) + if (RasterTiming < SLRead[nextread] + Arbitrary) { - RasterTiming += timespent = (SLRead[nextread] + 565) - RasterTiming; // why + 565? - timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; // why + 565? + timespent += EMFixNum; // fixes edge marking bug emulation. not sure why this is needed? } scanlineswaiting--; nextread++; // update rdlines_count register - if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? + //if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? + } + + // feels wrong, needs improvement. + while (RasterTiming >= RDDecrement[nextreadrd]) + { + slwaitingrd--; + nextreadrd++; + // update rdlines_count register + if (gpu.GPU3D.RDLinesTemp > slwaitingrd) gpu.GPU3D.RDLinesTemp = slwaitingrd; } // final pass pairs are the previous scanline pair offset -1 scanline, thus we start with only building one - ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= EMGlitchThreshhold); for (int y = 4; y < 192; y+=2) { //update sl timeout @@ -1964,46 +1976,57 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; - scanlineswaiting+=2; + scanlineswaiting += 2; + slwaitingrd += 2; // simulate the process of scanlines being read from the 48 scanline buffer - while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + 565) + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary) { - if (RasterTiming < SLRead[nextread] + 565) + if (RasterTiming < SLRead[nextread] + Arbitrary) { - RasterTiming += timespent = (SLRead[nextread] + 565) - RasterTiming; // why + 565? - timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; // why + 565? + timespent += EMFixNum; // fixes edge marking bug emulation. not sure why this is needed? } scanlineswaiting--; nextread++; // update rdlines_count register - if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? + //if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? } - ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502); - ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502); + // feels wrong, needs improvement. + while (RasterTiming >= RDDecrement[nextreadrd]) + { + slwaitingrd--; + nextreadrd++; + // update rdlines_count register + if (gpu.GPU3D.RDLinesTemp > slwaitingrd) gpu.GPU3D.RDLinesTemp = slwaitingrd; + } + + ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= EMGlitchThreshhold || y-3 == 1, timespent >= EMGlitchThreshhold); + ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= EMGlitchThreshhold, timespent >= EMGlitchThreshhold); } - scanlineswaiting+= 2; + scanlineswaiting += 2; + slwaitingrd += 2; prevtimespent = timespent; // emulate read timings one last time, since it shouldn't matter after this // additionally dont bother tracking rdlines anymore since it shouldn't be able to decrement anymore (CHECKME) - while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + 565) + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary) { - if (RasterTiming < SLRead[nextread] + 565) + if (RasterTiming < SLRead[nextread] + Arbitrary) { - RasterTiming += timespent = (SLRead[nextread] + 565) - RasterTiming; // why + 565? - timespent += 571; // fixes edge marking bug emulation. not sure why this is needed? + RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; // why + 565? + timespent += EMFixNum; // fixes edge marking bug emulation. not sure why this is needed? } scanlineswaiting--; nextread++; } // finish the last 3 scanlines - ScanlineFinalPass(gpu.GPU3D, 189, prevtimespent >= 502, timespent >= 502); - ScanlineFinalPass(gpu.GPU3D, 190, prevtimespent >= 502, true); + ScanlineFinalPass(gpu.GPU3D, 189, prevtimespent >= EMGlitchThreshhold, timespent >= EMGlitchThreshhold); + ScanlineFinalPass(gpu.GPU3D, 190, prevtimespent >= EMGlitchThreshhold, true); - ScanlineFinalPass(gpu.GPU3D, 191, timespent >= 502, true); + ScanlineFinalPass(gpu.GPU3D, 191, timespent >= EMGlitchThreshhold, true); } /*else { From e1cbadbe6084227a2c570d2c218f987de5d9e443 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:23:03 -0400 Subject: [PATCH 35/53] attempt at some cleanup --- src/GPU3D_Soft.cpp | 169 +++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 106 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 60258e17..bb801612 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1877,6 +1877,43 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * pixelsremain); } +#define RDLINES_COUNT_INCREMENT\ + /* feels wrong, needs improvement */\ + while (RasterTiming >= RDDecrement[nextreadrd])\ + {\ + slwaitingrd--;\ + nextreadrd++;\ + /* update rdlines_count register */\ + if (gpu.GPU3D.RDLinesTemp > slwaitingrd) gpu.GPU3D.RDLinesTemp = slwaitingrd;\ + } + +#define SCANLINE_BUFFER_SIM\ + /* simulate the process of scanlines being read from the 48 scanline buffer */\ + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary)\ + {\ + if (RasterTiming < SLRead[nextread] + Arbitrary)\ + {\ + RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; /* why + 565? */\ + timespent += EMFixNum; /* fixes edge marking bug emulation. not sure why this is needed? */\ + }\ + scanlineswaiting--;\ + nextread++;\ + } + +#define RENDER_SCANLINES(y)\ + /* update sl timeout */\ + ScanlineTimeout = SLRead[y-1] - FinalPassLen;\ + \ + RenderScanline(gpu, y, j, &rastertimingeven);\ + RenderScanline(gpu, y+1, j, &rastertimingodd);\ + \ + prevtimespent = timespent;\ + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen});\ + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);\ + \ + /* set the underflow flag if one of the scanlines came within 14 cycles of visible underflow */\ + if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; + void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) { int j = 0; @@ -1885,26 +1922,19 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) if (polygons[i]->Degenerate) continue; SetupPolygon(&PolygonList[j++], polygons[i]); } - + //init internal buffer ClearBuffers(gpu); - + // reset scanline trackers gpu.GPU3D.RDLinesUnderflow = false; gpu.GPU3D.RDLinesTemp = 63; - ScanlineTimeout = FrameLength; // CHECKME - - s32 rastertimingeven; // always init to 0 at the start of a scanline render - s32 rastertimingodd; + s32 rastertimingeven, rastertimingodd; // always init to 0 at the start of a scanline render + s32 scanlineswaiting = 0, slwaitingrd = 0; + s32 nextread = 0, nextreadrd = 0; + u32 timespent, prevtimespent; - s32 scanlineswaiting = 0; - s32 nextread = 0; - s32 slwaitingrd = 0; - s32 nextreadrd = 0; - - u32 timespent; - u32 prevtimespent; // scanlines are rendered in pairs of two RenderScanline(gpu, 0, j, &rastertimingeven); @@ -1914,113 +1944,46 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); // 12 cycles at the end of a "timeout" are always used for w/e reason RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - + // if first pair was not delayed past the first read, then later scanlines cannot either // this allows us to implement a fast path //if (SLRead[0] - timespent + ScanlinePushDelay >= 256) { - // begin scanline timeout - ScanlineTimeout = SLRead[1] - FinalPassLen; - - RenderScanline(gpu, 2, j, &rastertimingeven); - RenderScanline(gpu, 3, j, &rastertimingodd); - - // the time spent on the previous scanline pair is important for emulating the edge marking bug properly - prevtimespent = timespent; - RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); - RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - - // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow - if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; + RENDER_SCANLINES(2) scanlineswaiting++; slwaitingrd++; - - // simulate the process of scanlines being read from the 48 scanline buffer - while (RasterTiming >= SLRead[nextread] + Arbitrary) - { - if (RasterTiming < SLRead[nextread] + Arbitrary) - { - RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; // why + 565? - timespent += EMFixNum; // fixes edge marking bug emulation. not sure why this is needed? - } - scanlineswaiting--; - nextread++; - // update rdlines_count register - //if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? - } - // feels wrong, needs improvement. - while (RasterTiming >= RDDecrement[nextreadrd]) - { - slwaitingrd--; - nextreadrd++; - // update rdlines_count register - if (gpu.GPU3D.RDLinesTemp > slwaitingrd) gpu.GPU3D.RDLinesTemp = slwaitingrd; - } + SCANLINE_BUFFER_SIM + + RDLINES_COUNT_INCREMENT // final pass pairs are the previous scanline pair offset -1 scanline, thus we start with only building one ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= EMGlitchThreshhold); + + // main loop for (int y = 4; y < 192; y+=2) { - //update sl timeout - ScanlineTimeout = SLRead[y-1] - FinalPassLen; - - RenderScanline(gpu, y, j, &rastertimingeven); - RenderScanline(gpu, y+1, j, &rastertimingodd); - - prevtimespent = timespent; - RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); - RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); - - // set the underflow flag if one of the scanlines came within 14 cycles of visible underflow - if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; + RENDER_SCANLINES(y) scanlineswaiting += 2; slwaitingrd += 2; - // simulate the process of scanlines being read from the 48 scanline buffer - while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary) - { - if (RasterTiming < SLRead[nextread] + Arbitrary) - { - RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; // why + 565? - timespent += EMFixNum; // fixes edge marking bug emulation. not sure why this is needed? - } - scanlineswaiting--; - nextread++; - // update rdlines_count register - //if (gpu.GPU3D.RDLinesTemp > scanlineswaiting) gpu.GPU3D.RDLinesTemp = scanlineswaiting; // TODO: not accurate, rdlines appears to update early in some manner? - } + SCANLINE_BUFFER_SIM - // feels wrong, needs improvement. - while (RasterTiming >= RDDecrement[nextreadrd]) - { - slwaitingrd--; - nextreadrd++; - // update rdlines_count register - if (gpu.GPU3D.RDLinesTemp > slwaitingrd) gpu.GPU3D.RDLinesTemp = slwaitingrd; - } + RDLINES_COUNT_INCREMENT ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= EMGlitchThreshhold || y-3 == 1, timespent >= EMGlitchThreshhold); ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= EMGlitchThreshhold, timespent >= EMGlitchThreshhold); } - scanlineswaiting += 2; - slwaitingrd += 2; - prevtimespent = timespent; + + scanlineswaiting += 2; + slwaitingrd += 2; + prevtimespent = timespent; // emulate read timings one last time, since it shouldn't matter after this // additionally dont bother tracking rdlines anymore since it shouldn't be able to decrement anymore (CHECKME) - while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary) - { - if (RasterTiming < SLRead[nextread] + Arbitrary) - { - RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; // why + 565? - timespent += EMFixNum; // fixes edge marking bug emulation. not sure why this is needed? - } - scanlineswaiting--; - nextread++; - } + SCANLINE_BUFFER_SIM // finish the last 3 scanlines ScanlineFinalPass(gpu.GPU3D, 189, prevtimespent >= EMGlitchThreshhold, timespent >= EMGlitchThreshhold); @@ -2030,20 +1993,14 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) } /*else { - ScanlineFinalPass(gpu, 0, false, false); - - s32 pixelstopush = SLRead[0] - (timespent + ScanlinePushDelay); - if (pixelstopush > 256) pixelstopush = 256; - //timespent + ScanlinePushDelay + ScanlineReadSpeed > SLRead[0] - - rastertimingeven = 0; - rastertimingodd = 0; - - RenderScanline(gpu, 2, j, &rastertimingeven); - RenderScanline(gpu, 3, j, &rastertimingodd); + Coming soon^tm to a melonDS near you }*/ } +#undef RENDER_SCANLINES +#undef SCANLINE_BUFFER_SIM +#undef RDLINES_COUNT_INCREMENT + void SoftRenderer::VCount144(GPU& gpu) { if (RenderThreadRunning.load(std::memory_order_relaxed) && !gpu.GPU3D.AbortFrame) From 75956b43c46989a6ebc8ecc7caf70d3b169121e3 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 4 Apr 2024 19:36:17 -0400 Subject: [PATCH 36/53] better implement when the line count reg is/isnt updated --- src/GPU3D.cpp | 11 +++++++---- src/GPU3D.h | 6 ++++-- src/GPU3D_Soft.cpp | 25 ++++++++++++++++++++----- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 17f826fc..47f5fc52 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -240,9 +240,6 @@ void GPU3D::Reset() noexcept DispCnt = 0; AlphaRefVal = 0; AlphaRef = 0; - - RDLines = 63; // defaults to 63 for one frame? (CHECKME: when does it reset?) - RDLinesTemp = 63; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -257,6 +254,8 @@ void GPU3D::Reset() noexcept ResetRenderingState(); + RDLines = 63; + AbortFrame = false; Timestamp = 0; @@ -570,6 +569,7 @@ void GPU3D::SetEnabled(bool geometry, bool rendering) noexcept RenderingEnabled = rendering; if (!rendering) ResetRenderingState(); + else RDLinesTemp = 63; // resets to 63 when the rasterizer is toggled on } @@ -2431,11 +2431,14 @@ bool YSort(Polygon* a, Polygon* b) void GPU3D::VBlank() noexcept { - RDLines = RDLinesTemp; + if (RenderingEnabled) + RDLines = RDLinesTemp; + if (GeometryEnabled) { if (RenderingEnabled) { + RDLines = RDLinesTemp; if (FlushRequest) { if (NumPolygons) diff --git a/src/GPU3D.h b/src/GPU3D.h index eb671ed0..b1684ead 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -246,9 +246,11 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; + bool RDLinesUnderflow = false; - u8 RDLines = 63; - u8 RDLinesTemp = 46; + u8 RDLines = 0; + u8 RDLinesTemp = 0; + u8 AlphaRefVal = 0; u8 AlphaRef = 0; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index bb801612..3d71db85 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1923,9 +1923,6 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) SetupPolygon(&PolygonList[j++], polygons[i]); } - //init internal buffer - ClearBuffers(gpu); - // reset scanline trackers gpu.GPU3D.RDLinesUnderflow = false; gpu.GPU3D.RDLinesTemp = 63; @@ -2022,7 +2019,16 @@ void SoftRenderer::RenderFrame(GPU& gpu) // "Render thread, you're up! Get moving." Platform::Semaphore_Post(Sema_RenderStart); } - else if (!FrameIdentical) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else if (!FrameIdentical) + { + //init internal buffer + ClearBuffers(gpu); + + if (gpu.GPU3D.RenderNumPolygons > 0) + RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else + memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + } } void SoftRenderer::RestartFrame(GPU& gpu) @@ -2050,7 +2056,16 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) { // If no rendering is needed, just say we're done. Platform::Semaphore_Post(Sema_ScanlineCount, 192); } - else RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else + { + //init internal buffer + ClearBuffers(gpu); + + if (gpu.GPU3D.RenderNumPolygons > 0) + RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else + memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + } // Tell the main thread that we're done rendering // and that it's safe to access the GPU state again. From a51747b2530d3c93988acc782e122bd35de27d2d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:46:57 -0400 Subject: [PATCH 37/53] fix a bug i introduced, also fix one i didn't --- src/GPU3D.cpp | 2 ++ src/GPU3D_Soft.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 47f5fc52..faaaf10b 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -182,6 +182,8 @@ void GPU3D::ResetRenderingState() noexcept RenderClearAttr1 = 0x3F000000; RenderClearAttr2 = 0x00007FFF; + + RenderFrameIdentical = false; } void GPU3D::Reset() noexcept diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 3d71db85..d68d44da 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -2064,7 +2064,10 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) if (gpu.GPU3D.RenderNumPolygons > 0) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); else + { memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + Platform::Semaphore_Post(Sema_ScanlineCount, 192); + } } // Tell the main thread that we're done rendering From cdc7b01701d96482c0f5fe6e4bc08f06fe8de469 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 12 Apr 2024 12:51:44 -0400 Subject: [PATCH 38/53] fix up a few more things --- src/GPU3D.cpp | 38 ++++++++++++++++++++++++++++++-------- src/GPU3D.h | 6 +++++- src/GPU3D_Soft.cpp | 4 ++-- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index faaaf10b..b329cc77 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -237,7 +237,7 @@ void GPU3D::Reset() noexcept TotalParams = 0; GeometryEnabled = false; - RenderingEnabled = false; + RenderingEnabled = 0; DispCnt = 0; AlphaRefVal = 0; @@ -550,12 +550,16 @@ void GPU3D::DoSavestate(Savestate* file) noexcept file->Bool32(&AbortFrame); file->Bool32(&GeometryEnabled); - file->Bool32(&RenderingEnabled); + file->Var8(&RenderingEnabled); file->Var32(&PolygonMode); file->Var32(&PolygonAttr); file->Var32(&CurPolygonAttr); file->Var32(&TexParam); file->Var32(&TexPalette); + + file->Var8(&RDLines); + file->Var8(&RDLinesTemp); + RenderFrameIdentical = false; if (softRenderer && softRenderer->IsThreaded()) { @@ -568,10 +572,19 @@ void GPU3D::DoSavestate(Savestate* file) noexcept void GPU3D::SetEnabled(bool geometry, bool rendering) noexcept { GeometryEnabled = geometry; - RenderingEnabled = rendering; - - if (!rendering) ResetRenderingState(); - else RDLinesTemp = 63; // resets to 63 when the rasterizer is toggled on + if (rendering) + { + if (RenderingEnabled == 0) + { + RenderingEnabled = 1; + RDLinesTemp = 63; // CHECKME + } + } + else + { + ResetRenderingState(); + RenderingEnabled = 0; + } } @@ -2438,11 +2451,11 @@ void GPU3D::VBlank() noexcept if (GeometryEnabled) { - if (RenderingEnabled) + if (RenderingEnabled >= 3) { - RDLines = RDLinesTemp; if (FlushRequest) { + swap: if (NumPolygons) { // separate translucent polygons from opaque ones @@ -2496,6 +2509,15 @@ void GPU3D::VBlank() noexcept RenderClearAttr1 = ClearAttr1; RenderClearAttr2 = ClearAttr2; } + else if (RenderingEnabled != 0) + { + if (FlushRequest) + { + RenderingEnabled++; + if (RenderingEnabled >= 3) + goto swap; + } + } if (FlushRequest) { diff --git a/src/GPU3D.h b/src/GPU3D.h index b1684ead..f3725fa0 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -243,7 +243,11 @@ public: u32 TotalParams = 0; bool GeometryEnabled = false; - bool RenderingEnabled = false; + // 0 = powered off + // 1 = powered on, inactive + // 2 = one swap buffers, inactive + // 3 = two swap buffers, active; + u8 RenderingEnabled = 0; u32 DispCnt = 0; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index d68d44da..5da6cd78 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -2024,7 +2024,7 @@ void SoftRenderer::RenderFrame(GPU& gpu) //init internal buffer ClearBuffers(gpu); - if (gpu.GPU3D.RenderNumPolygons > 0) + if (gpu.GPU3D.RenderingEnabled >= 3) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); else memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); @@ -2061,7 +2061,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) //init internal buffer ClearBuffers(gpu); - if (gpu.GPU3D.RenderNumPolygons > 0) + if (gpu.GPU3D.RenderingEnabled >= 3) RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); else { From 95faca402a0bab96677b3f10475c3cee67b7bff4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Apr 2024 06:16:32 -0400 Subject: [PATCH 39/53] accuracy toggle + some attempt at understanding slopes --- src/GPU3D_Soft.cpp | 189 +++++++++++++++++++++++++++++++-------------- src/GPU3D_Soft.h | 13 ++-- 2 files changed, 141 insertions(+), 61 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 5da6cd78..da6acbc0 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -174,25 +174,71 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) else return 0; } -bool SoftRenderer::DoTimingsFirstPoly(RendererPolygon* rp, s32 y, s32* timingcounter) +void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, s32* timingcountereven, s32*timingcounterodd) { - // The first polygon in each scanline has an additional timing penalty (presumably due to pipelining?) + // TODO: actually figure this out - // First polygon has a cost of 4 cycles - if (!DoTimings(FirstPolyDelay, timingcounter)) return false; + // The First Polygon in each scanline pair has some additional timing penalties (presumably due to pipelining of the rasterizer) - // determine the timing impact of the first polygon's slopes. + bool fixeddelay = false; + bool perslope = false; + bool etc = false; + + for (int i = 0; i < npolys; i++) + { + RendererPolygon* rp = &PolygonList[i]; + Polygon* polygon = rp->PolyData; + + if (y >= polygon->YTop && y <= polygon->YBottom) + { + fixeddelay = true; + break; + if (y == polygon->YBottom) break; + if (y == polygon->YTop) {perslope = true; break;} + /*else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || + (y == polygon->Vertices[rp->NextVR]->FinalPosition[1] || y == polygon->Vertices[rp->CurVR]->FinalPosition[1])) + { + perslope = true; + } + else */etc = true; + break; + } + } + + y++; + for (int i = 0; i < npolys; i++) + { + RendererPolygon* rp = &PolygonList[i]; + Polygon* polygon = rp->PolyData; + + if (y >= polygon->YTop && y <= polygon->YBottom) + { + fixeddelay = true; + break; + if (y == polygon->YBottom) break; + if (y == polygon->YTop) {perslope = true; break;} + /*else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || + (y == polygon->Vertices[rp->NextVR]->FinalPosition[1] || y == polygon->Vertices[rp->CurVR]->FinalPosition[1])) + { + perslope = true; + } + else */etc = true; + break; + } + } - Polygon* polygon = rp->PolyData; - - if (polygon->YTop == polygon->YBottom) return true; // 0 px tall line polygons do not have slopes, and thus no timing penalty - if (y == polygon->YTop) return true; - - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *timingcounter += FirstPerSlope; - - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *timingcounter += FirstPerSlope; - - return DoTimings(FirstPerSlope*2, timingcounter); // CHECKME: does this need to be done every time its incremented here? does this even need to be done *at all?* + *timingcountereven = fixeddelay*FirstPolyDelay;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd = fixeddelay*FirstPolyDelay;/// + perslope*FirstPerSlope + etc*2; + if (!perslope) + { + *timingcountereven += etc*2;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd += etc*2;/// + perslope*FirstPerSlope + etc*2; + } + else + { + *timingcountereven += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd += perslope*FirstPerSlope;/// + perslope*FirstPerSlope + etc*2; + } } void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const @@ -779,6 +825,7 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } +template bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; @@ -912,16 +959,20 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* s32 xlimit; if (xend > 256) xend = 256; - // determine if the span can be rendered within the time allotted to the scanline - // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. - s32 diff = DoTimingsPixels(xend-x, timingcounter); - if (diff != 0) + if (accuracy) { - xend -= diff; - r_edgelen -= diff; - abortscanline = true; + // determine if the span can be rendered within the time allotted to the scanline + // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. + s32 diff = DoTimingsPixels(xend-x, timingcounter); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } + else abortscanline = false; } - else abortscanline = false; + else abortscanline = true; // for shadow masks: set stencil bits where the depth test fails. // draw nothing. @@ -1007,6 +1058,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* return abortscanline; } +template bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; @@ -1163,13 +1215,17 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 s32 xcov = 0; if (xend > 256) xend = 256; - // determine if the span can be rendered within the time allotted to the scanline - s32 diff = DoTimingsPixels(xend-x, timingcounter); - if (diff != 0) + if (accuracy) { - xend -= diff; - r_edgelen -= diff; - abortscanline = true; + // determine if the span can be rendered within the time allotted to the scanline + s32 diff = DoTimingsPixels(xend-x, timingcounter); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } + else abortscanline = false; } else abortscanline = false; @@ -1461,45 +1517,35 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 return abortscanline; } +template void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter) { - *timingcounter = 0; bool abort = false; - bool first = true; for (int i = 0; i < npolys; i++) { RendererPolygon* rp = &PolygonList[i]; Polygon* polygon = rp->PolyData; - if (y == polygon->YBottom && y != polygon->YTop) + if (accuracy && y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && !DoTimings(FirstNull+EmptyPolyScanline, timingcounter)) || !DoTimings(EmptyPolyScanline, timingcounter); - - first = false; + if (!abort) abort = !DoTimings(EmptyPolyScanline, timingcounter); } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) - { - //if (y == polygon->YTop) if(!DoTimings(FirstPolyScanline, timingcounter)) abort = true; - - if (!abort) abort = (first && !DoTimingsFirstPoly(rp, y, timingcounter)) // incorrect. needs research; behavior is strange... - || !DoTimings(PerPolyScanline, timingcounter) - || (!CheckTimings(MinToStartPoly, timingcounter)); + { + if (accuracy && !abort) abort = (!DoTimings(PerPolyScanline, timingcounter) + || !CheckTimings(MinToStartPoly, timingcounter)); - if (abort) + if (accuracy && abort) { CheckSlope(rp, y); Step(rp); } else if (polygon->IsShadowMask) - abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, timingcounter); + abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, timingcounter); else - abort = RenderPolygonScanline(gpu, rp, y, timingcounter); - - first = false; + abort = RenderPolygonScanline(gpu, rp, y, timingcounter); } } - - return; } u32 SoftRenderer::CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const @@ -1904,8 +1950,9 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) /* update sl timeout */\ ScanlineTimeout = SLRead[y-1] - FinalPassLen;\ \ - RenderScanline(gpu, y, j, &rastertimingeven);\ - RenderScanline(gpu, y+1, j, &rastertimingodd);\ + FindFirstPolyDoTimings(j, y, &rastertimingeven, &rastertimingodd);\ + RenderScanline(gpu, y, j, &rastertimingeven);\ + RenderScanline(gpu, y+1, j, &rastertimingodd);\ \ prevtimespent = timespent;\ RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen});\ @@ -1914,7 +1961,27 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) /* set the underflow flag if one of the scanlines came within 14 cycles of visible underflow */\ if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; -void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) +void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) +{ + int j = 0; + for (int i = 0; i < npolys; i++) + { + if (polygons[i]->Degenerate) continue; + SetupPolygon(&PolygonList[j++], polygons[i]); + } + int dummy; + RenderScanline(gpu, 0, j, &dummy); + + for (s32 y = 1; y < 192; y++) + { + RenderScanline(gpu, y, j, &dummy); + ScanlineFinalPass(gpu.GPU3D, y-1, true, true); + } + + ScanlineFinalPass(gpu.GPU3D, 191, true, true); +} + +void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys) { int j = 0; for (int i = 0; i < npolys; i++) @@ -1932,10 +1999,10 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) s32 nextread = 0, nextreadrd = 0; u32 timespent, prevtimespent; - + FindFirstPolyDoTimings(j, 0, &rastertimingeven, &rastertimingodd); // scanlines are rendered in pairs of two - RenderScanline(gpu, 0, j, &rastertimingeven); - RenderScanline(gpu, 1, j, &rastertimingodd); + RenderScanline(gpu, 0, j, &rastertimingeven); + RenderScanline(gpu, 1, j, &rastertimingodd); // it can't proceed to the next scanline unless all others steps are done (both scanlines in the pair, and final pass) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); @@ -2025,7 +2092,12 @@ void SoftRenderer::RenderFrame(GPU& gpu) ClearBuffers(gpu); if (gpu.GPU3D.RenderingEnabled >= 3) - RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + { + if (Accuracy) + RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else + RenderPolygonsFast(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + } else memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); } @@ -2062,7 +2134,12 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) ClearBuffers(gpu); if (gpu.GPU3D.RenderingEnabled >= 3) - RenderPolygons(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + { + if (Accuracy) + RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + else + RenderPolygonsFast(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + } else { memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 20aaf4bf..a9a15787 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -458,7 +458,7 @@ private: bool DoTimings(s32 cycles, s32* timingcounter); bool CheckTimings(s32 cycles, s32* timingcounter); u32 DoTimingsPixels(s32 pixels, s32* timingcounter); - bool DoTimingsFirstPoly(RendererPolygon* rp, s32 y, s32* timingcounter); + void FindFirstPolyDoTimings(int npolys, s32 y, s32* timingcountereven, s32*timingcounterodd); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -467,9 +467,9 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); - bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); - void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); + template bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); + template bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); + template void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z); @@ -478,7 +478,8 @@ private: u16 BeginPushScanline(s32 y, s32 pixelstodraw); void ReadScanline(s32 y); void FinishPushScanline(s32 y, s32 pixelsremain); - void RenderPolygons(GPU& gpu, Polygon** polygons, int npolys); + void RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys); + void RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys); void RenderThreadFunc(GPU& gpu); @@ -532,6 +533,8 @@ private: bool FrameIdentical; + bool Accuracy = true; // TODO + // threading bool Threaded; From 668c493bf429051a85c9a6eea84064188394a376 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Apr 2024 06:29:46 -0400 Subject: [PATCH 40/53] temp hack cuz lazyy --- src/GPU3D_Soft.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index da6acbc0..d3b64347 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -97,7 +97,7 @@ void SoftRenderer::EnableRenderThread() } SoftRenderer::SoftRenderer(bool threaded) noexcept - : Renderer3D(false), Threaded(threaded) + : Renderer3D(false), Accuracy(threaded) { Sema_RenderStart = Platform::Semaphore_Create(); Sema_RenderDone = Platform::Semaphore_Create(); @@ -106,6 +106,7 @@ SoftRenderer::SoftRenderer(bool threaded) noexcept RenderThreadRunning = false; RenderThreadRendering = false; RenderThread = nullptr; + Threaded = true; } SoftRenderer::~SoftRenderer() From 39a569bd3538257712c5638816216ca915fcfe73 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Apr 2024 08:00:29 -0400 Subject: [PATCH 41/53] Revert "temp hack cuz lazyy" This reverts commit 668c493bf429051a85c9a6eea84064188394a376. --- src/GPU3D_Soft.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index d3b64347..da6acbc0 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -97,7 +97,7 @@ void SoftRenderer::EnableRenderThread() } SoftRenderer::SoftRenderer(bool threaded) noexcept - : Renderer3D(false), Accuracy(threaded) + : Renderer3D(false), Threaded(threaded) { Sema_RenderStart = Platform::Semaphore_Create(); Sema_RenderDone = Platform::Semaphore_Create(); @@ -106,7 +106,6 @@ SoftRenderer::SoftRenderer(bool threaded) noexcept RenderThreadRunning = false; RenderThreadRendering = false; RenderThread = nullptr; - Threaded = true; } SoftRenderer::~SoftRenderer() From 520f7a0f3a18c82a230e951c64dd0231f6946025 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Apr 2024 10:41:01 -0400 Subject: [PATCH 42/53] small optimization --- src/GPU3D_Soft.cpp | 34 ++++++++++++++++++---------------- src/GPU3D_Soft.h | 4 ++-- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index da6acbc0..d707336b 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -174,7 +174,7 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) else return 0; } -void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, s32* timingcountereven, s32*timingcounterodd) +void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, int* firstpolyodd, s32* timingcountereven, s32*timingcounterodd) { // TODO: actually figure this out @@ -184,9 +184,9 @@ void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, s32* timingcountere bool perslope = false; bool etc = false; - for (int i = 0; i < npolys; i++) + for (*firstpolyeven = 0; *firstpolyeven < npolys; (*firstpolyeven)++) { - RendererPolygon* rp = &PolygonList[i]; + RendererPolygon* rp = &PolygonList[*firstpolyeven]; Polygon* polygon = rp->PolyData; if (y >= polygon->YTop && y <= polygon->YBottom) @@ -206,9 +206,9 @@ void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, s32* timingcountere } y++; - for (int i = 0; i < npolys; i++) + for (*firstpolyodd = 0; *firstpolyodd < npolys; (*firstpolyodd)++) { - RendererPolygon* rp = &PolygonList[i]; + RendererPolygon* rp = &PolygonList[*firstpolyodd]; Polygon* polygon = rp->PolyData; if (y >= polygon->YTop && y <= polygon->YBottom) @@ -1518,12 +1518,12 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 } template -void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter) +void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter) { bool abort = false; - for (int i = 0; i < npolys; i++) + for (; firstpoly < npolys; firstpoly++) { - RendererPolygon* rp = &PolygonList[i]; + RendererPolygon* rp = &PolygonList[firstpoly]; Polygon* polygon = rp->PolyData; if (accuracy && y == polygon->YBottom && y != polygon->YTop) @@ -1950,9 +1950,9 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) /* update sl timeout */\ ScanlineTimeout = SLRead[y-1] - FinalPassLen;\ \ - FindFirstPolyDoTimings(j, y, &rastertimingeven, &rastertimingodd);\ - RenderScanline(gpu, y, j, &rastertimingeven);\ - RenderScanline(gpu, y+1, j, &rastertimingodd);\ + FindFirstPolyDoTimings(j, y, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd);\ + RenderScanline(gpu, y, firstpolyeven, j, &rastertimingeven);\ + RenderScanline(gpu, y+1, firstpolyodd, j, &rastertimingodd);\ \ prevtimespent = timespent;\ RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen});\ @@ -1969,12 +1969,13 @@ void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) if (polygons[i]->Degenerate) continue; SetupPolygon(&PolygonList[j++], polygons[i]); } + int dummy; - RenderScanline(gpu, 0, j, &dummy); + RenderScanline(gpu, 0, 0, j, &dummy); for (s32 y = 1; y < 192; y++) { - RenderScanline(gpu, y, j, &dummy); + RenderScanline(gpu, y, 0, j, &dummy); ScanlineFinalPass(gpu.GPU3D, y-1, true, true); } @@ -1998,11 +1999,12 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys s32 scanlineswaiting = 0, slwaitingrd = 0; s32 nextread = 0, nextreadrd = 0; u32 timespent, prevtimespent; + int firstpolyeven, firstpolyodd; - FindFirstPolyDoTimings(j, 0, &rastertimingeven, &rastertimingodd); + FindFirstPolyDoTimings(j, 0, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd); // scanlines are rendered in pairs of two - RenderScanline(gpu, 0, j, &rastertimingeven); - RenderScanline(gpu, 1, j, &rastertimingodd); + RenderScanline(gpu, 0, firstpolyeven, j, &rastertimingeven); + RenderScanline(gpu, 1, firstpolyodd, j, &rastertimingodd); // it can't proceed to the next scanline unless all others steps are done (both scanlines in the pair, and final pass) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index a9a15787..0c4baf79 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -458,7 +458,7 @@ private: bool DoTimings(s32 cycles, s32* timingcounter); bool CheckTimings(s32 cycles, s32* timingcounter); u32 DoTimingsPixels(s32 pixels, s32* timingcounter); - void FindFirstPolyDoTimings(int npolys, s32 y, s32* timingcountereven, s32*timingcounterodd); + void FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, int* firstpolyodd, s32* timingcountereven, s32*timingcounterodd); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -469,7 +469,7 @@ private: void CheckSlope(RendererPolygon* rp, s32 y); template bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); template bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); - template void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); + template void RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z); From 1c24fe03c23ea434360a0e300f26e085e613fe30 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Apr 2024 12:56:38 -0400 Subject: [PATCH 43/53] improve when 3d dispcnt underflow flag updates --- src/GPU.cpp | 13 ++++++++----- src/GPU3D.cpp | 3 ++- src/GPU3D.h | 2 +- src/GPU3D_Soft.cpp | 10 +++++----- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index c1f8e2a1..1b272cc9 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -1002,6 +1002,13 @@ void GPU::StartScanline(u32 line) noexcept NDS.ScheduleEvent(Event_DisplayFIFO, false, 32, 0, 0); } + if (VCount == GPU3D.UnderflowFlagVCount) + { + // appears to get set the vcount before the underflow occured? + // probably gets updated the instant the underflow happened, which might be annoying to work out with precision. + GPU3D.DispCnt |= (1<<12); + } + if (VCount == 262) { // frame end @@ -1017,7 +1024,7 @@ void GPU::StartScanline(u32 line) noexcept // and games might already start to modify texture memory. // That doesn't matter for us because we cache the entire // texture memory anyway and only update it before the start - //of the next frame. + // of the next frame. // So we can give the rasteriser a bit more headroom GPU3D.VCount144(*this); @@ -1041,10 +1048,6 @@ void GPU::StartScanline(u32 line) noexcept if (GPU3D.IsRendererAccelerated()) GPU3D.Blit(*this); } - else if (VCount == 183) - { - GPU3D.DispCnt |= GPU3D.RDLinesUnderflow << 12; // CHECKME: does this get set *exactly* at vcount 183? earlier? later? - } } NDS.ScheduleEvent(Event_LCD, true, HBLANK_CYCLES, LCD_StartHBlank, line); diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index b329cc77..4c177e4c 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -255,7 +255,8 @@ void GPU3D::Reset() noexcept ClearAttr2 = 0x00007FFF; ResetRenderingState(); - + + UnderflowFlagVCount = -1; RDLines = 63; AbortFrame = false; diff --git a/src/GPU3D.h b/src/GPU3D.h index f3725fa0..d35894d6 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -251,7 +251,7 @@ public: u32 DispCnt = 0; - bool RDLinesUnderflow = false; + u16 UnderflowFlagVCount = 0; u8 RDLines = 0; u8 RDLinesTemp = 0; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index d707336b..c3b162d1 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -228,16 +228,16 @@ void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, } *timingcountereven = fixeddelay*FirstPolyDelay;// + perslope*FirstPerSlope + etc*2; - *timingcounterodd = fixeddelay*FirstPolyDelay;/// + perslope*FirstPerSlope + etc*2; + *timingcounterodd = fixeddelay*FirstPolyDelay;// + perslope*FirstPerSlope + etc*2; if (!perslope) { *timingcountereven += etc*2;// + perslope*FirstPerSlope + etc*2; - *timingcounterodd += etc*2;/// + perslope*FirstPerSlope + etc*2; + *timingcounterodd += etc*2;// + perslope*FirstPerSlope + etc*2; } else { *timingcountereven += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; - *timingcounterodd += perslope*FirstPerSlope;/// + perslope*FirstPerSlope + etc*2; + *timingcounterodd += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; } } @@ -1959,7 +1959,7 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);\ \ /* set the underflow flag if one of the scanlines came within 14 cycles of visible underflow */\ - if (ScanlineTimeout <= RasterTiming) gpu.GPU3D.RDLinesUnderflow = true; + if ((ScanlineTimeout <= RasterTiming) && (gpu.GPU3D.UnderflowFlagVCount == (u16)-1)) gpu.GPU3D.UnderflowFlagVCount = y-1; void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) { @@ -1992,7 +1992,7 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys } // reset scanline trackers - gpu.GPU3D.RDLinesUnderflow = false; + gpu.GPU3D.UnderflowFlagVCount = -1; gpu.GPU3D.RDLinesTemp = 63; ScanlineTimeout = FrameLength; // CHECKME s32 rastertimingeven, rastertimingodd; // always init to 0 at the start of a scanline render From 36f555db338e259ebe7f6b148b3b3a49aa99c3b9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 20 Apr 2024 12:13:26 -0400 Subject: [PATCH 44/53] approximate time of dispcnt underflow bit update --- src/GPU.cpp | 36 +++++++++++++++++++++++------------- src/GPU.h | 1 + src/GPU3D.cpp | 4 ++++ src/GPU3D.h | 2 ++ src/GPU3D_Soft.cpp | 8 +++++--- src/GPU3D_Soft.h | 1 + 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index 1b272cc9..bb3d57fd 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -30,15 +30,17 @@ namespace melonDS using Platform::Log; using Platform::LogLevel; -#define LINE_CYCLES (355*6) +#define LINE_CYCLES (355*6) #define HBLANK_CYCLES (48+(256*6)) #define FRAME_CYCLES (LINE_CYCLES * 263) +#define READ_CYCLES (520) // CHECKME: Probably off by a little bit enum { LCD_StartHBlank = 0, LCD_StartScanline, LCD_FinishFrame, + LCD_ReadScanline, }; @@ -73,6 +75,8 @@ GPU::GPU(melonDS::NDS& nds, std::unique_ptr&& renderer3d, std::uniqu NDS.RegisterEventFunc(Event_LCD, LCD_StartHBlank, MemberEventFunc(GPU, StartHBlank)); NDS.RegisterEventFunc(Event_LCD, LCD_StartScanline, MemberEventFunc(GPU, StartScanline)); NDS.RegisterEventFunc(Event_LCD, LCD_FinishFrame, MemberEventFunc(GPU, FinishFrame)); + NDS.RegisterEventFunc(Event_LCD, LCD_ReadScanline, MemberEventFunc(GPU, ReadScanline)); + NDS.RegisterEventFunc(Event_DisplayFIFO, 0, MemberEventFunc(GPU, DisplayFIFO)); NDS.RegisterEventFunc(Event_DisplayFIFO, 0, MemberEventFunc(GPU, DisplayFIFO)); InitFramebuffers(); @@ -85,6 +89,7 @@ GPU::~GPU() noexcept NDS.UnregisterEventFunc(Event_LCD, LCD_StartHBlank); NDS.UnregisterEventFunc(Event_LCD, LCD_StartScanline); NDS.UnregisterEventFunc(Event_LCD, LCD_FinishFrame); + NDS.UnregisterEventFunc(Event_LCD, LCD_ReadScanline); NDS.UnregisterEventFunc(Event_DisplayFIFO, 0); } @@ -910,11 +915,10 @@ void GPU::StartHBlank(u32 line) noexcept if (DispStat[0] & (1<<4)) NDS.SetIRQ(0, IRQ_HBlank); if (DispStat[1] & (1<<4)) NDS.SetIRQ(1, IRQ_HBlank); - - if (VCount < 262) - NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES), LCD_StartScanline, line+1); + if (VCount == 262 || VCount < 191) // this is probably wrong, but i haven't dug deep enough to prove it yet + NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES - READ_CYCLES), LCD_ReadScanline, line); else - NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES), LCD_FinishFrame, line+1); + NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES), LCD_StartScanline, line+1); } void GPU::FinishFrame(u32 lines) noexcept @@ -949,6 +953,19 @@ void GPU::BlankFrame() noexcept TotalScanlines = 263; } +void GPU::ReadScanline(u32 line) noexcept +{ + int scanline; + scanline = (VCount == 262 ? 0 : (line+1)); + GPU3D.ScanlineSync(scanline); + if (GPU3D.UnderflowFlagVCount == scanline) GPU3D.DispCnt |= (1<<12); + + if (VCount != 262) + NDS.ScheduleEvent(Event_LCD, true, READ_CYCLES, LCD_StartScanline, line+1); + else + NDS.ScheduleEvent(Event_LCD, true, READ_CYCLES, LCD_FinishFrame, line+1); +} + void GPU::StartScanline(u32 line) noexcept { if (line == 0) @@ -1002,13 +1019,6 @@ void GPU::StartScanline(u32 line) noexcept NDS.ScheduleEvent(Event_DisplayFIFO, false, 32, 0, 0); } - if (VCount == GPU3D.UnderflowFlagVCount) - { - // appears to get set the vcount before the underflow occured? - // probably gets updated the instant the underflow happened, which might be annoying to work out with precision. - GPU3D.DispCnt |= (1<<12); - } - if (VCount == 262) { // frame end @@ -1020,7 +1030,7 @@ void GPU::StartScanline(u32 line) noexcept { if (VCount == 192) { - // in reality rendering already finishes at line 144 + // in reality rendering already finishes at line 144 (can take up to ~191 depending on load) // and games might already start to modify texture memory. // That doesn't matter for us because we cache the entire // texture memory anyway and only update it before the start diff --git a/src/GPU.h b/src/GPU.h index 780d5e01..e1f4b89d 100644 --- a/src/GPU.h +++ b/src/GPU.h @@ -506,6 +506,7 @@ public: void BlankFrame() noexcept; void StartScanline(u32 line) noexcept; void StartHBlank(u32 line) noexcept; + void ReadScanline(u32 line) noexcept; void DisplayFIFO(u32 x) noexcept; diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 4c177e4c..cbd2721c 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -2547,6 +2547,10 @@ void GPU3D::SetRenderXPos(u16 xpos) noexcept RenderXPos = xpos & 0x01FF; } +void GPU3D::ScanlineSync(int line) noexcept +{ + CurrentRenderer->ScanlineSync(line); +} u32* GPU3D::GetLine(int line) noexcept { diff --git a/src/GPU3D.h b/src/GPU3D.h index d35894d6..ada40fb1 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -114,6 +114,7 @@ public: void SetRenderXPos(u16 xpos) noexcept; [[nodiscard]] u16 GetRenderXPos() const noexcept { return RenderXPos; } + void ScanlineSync(int line) noexcept; u32* GetLine(int line) noexcept; void WriteToGXFIFO(u32 val) noexcept; @@ -454,6 +455,7 @@ public: virtual void RenderFrame(GPU& gpu) = 0; virtual void RestartFrame(GPU& gpu) {}; virtual u32* GetLine(int line) = 0; + virtual void ScanlineSync(int line) {}; virtual void Blit(const GPU& gpu) {}; virtual void PrepareCaptureFrame() {} protected: diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index c3b162d1..d99bbba6 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1959,7 +1959,7 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);\ \ /* set the underflow flag if one of the scanlines came within 14 cycles of visible underflow */\ - if ((ScanlineTimeout <= RasterTiming) && (gpu.GPU3D.UnderflowFlagVCount == (u16)-1)) gpu.GPU3D.UnderflowFlagVCount = y-1; + if ((ScanlineTimeout <= RasterTiming) && (gpu.GPU3D.UnderflowFlagVCount == (u16)-1)) gpu.GPU3D.UnderflowFlagVCount = y - (y&1 ? 0 : 1); void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) { @@ -2156,8 +2156,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) RenderThreadRendering = false; } } - -u32* SoftRenderer::GetLine(int line) +void SoftRenderer::ScanlineSync(int line) { if (RenderThreadRunning.load(std::memory_order_relaxed)) { @@ -2167,7 +2166,10 @@ u32* SoftRenderer::GetLine(int line) // so we don't need to wait for a specific row) Platform::Semaphore_Wait(Sema_ScanlineCount); } +} +u32* SoftRenderer::GetLine(int line) +{ return &FinalBuffer[line * ScanlineWidth]; } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 0c4baf79..4b1a8e52 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -40,6 +40,7 @@ public: void RenderFrame(GPU& gpu) override; void RestartFrame(GPU& gpu) override; u32* GetLine(int line) override; + void ScanlineSync(int line) override; void SetupRenderThread(GPU& gpu); void EnableRenderThread(); From 424c5755ea5db98b0b02eb6a4023aa47118867f3 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 21 Apr 2024 11:31:12 -0400 Subject: [PATCH 45/53] nvm ill just shove it into hblank --- src/GPU.cpp | 28 ++++++++-------------------- src/GPU.h | 1 - src/GPU3D_Soft.cpp | 10 +++++++++- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index bb3d57fd..acbc9f17 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -33,14 +33,12 @@ using Platform::LogLevel; #define LINE_CYCLES (355*6) #define HBLANK_CYCLES (48+(256*6)) #define FRAME_CYCLES (LINE_CYCLES * 263) -#define READ_CYCLES (520) // CHECKME: Probably off by a little bit enum { LCD_StartHBlank = 0, LCD_StartScanline, LCD_FinishFrame, - LCD_ReadScanline, }; @@ -75,7 +73,6 @@ GPU::GPU(melonDS::NDS& nds, std::unique_ptr&& renderer3d, std::uniqu NDS.RegisterEventFunc(Event_LCD, LCD_StartHBlank, MemberEventFunc(GPU, StartHBlank)); NDS.RegisterEventFunc(Event_LCD, LCD_StartScanline, MemberEventFunc(GPU, StartScanline)); NDS.RegisterEventFunc(Event_LCD, LCD_FinishFrame, MemberEventFunc(GPU, FinishFrame)); - NDS.RegisterEventFunc(Event_LCD, LCD_ReadScanline, MemberEventFunc(GPU, ReadScanline)); NDS.RegisterEventFunc(Event_DisplayFIFO, 0, MemberEventFunc(GPU, DisplayFIFO)); NDS.RegisterEventFunc(Event_DisplayFIFO, 0, MemberEventFunc(GPU, DisplayFIFO)); @@ -89,7 +86,6 @@ GPU::~GPU() noexcept NDS.UnregisterEventFunc(Event_LCD, LCD_StartHBlank); NDS.UnregisterEventFunc(Event_LCD, LCD_StartScanline); NDS.UnregisterEventFunc(Event_LCD, LCD_FinishFrame); - NDS.UnregisterEventFunc(Event_LCD, LCD_ReadScanline); NDS.UnregisterEventFunc(Event_DisplayFIFO, 0); } @@ -883,6 +879,11 @@ void GPU::StartHBlank(u32 line) noexcept { DispStat[0] |= (1<<1); DispStat[1] |= (1<<1); + + // not the correct timing, but... close enough i guess? + int scanline = (VCount == 262 ? 0 : (line+1)); + GPU3D.ScanlineSync(scanline); + if (GPU3D.UnderflowFlagVCount == scanline) GPU3D.DispCnt |= (1<<12); if (VCount < 192) { @@ -915,10 +916,10 @@ void GPU::StartHBlank(u32 line) noexcept if (DispStat[0] & (1<<4)) NDS.SetIRQ(0, IRQ_HBlank); if (DispStat[1] & (1<<4)) NDS.SetIRQ(1, IRQ_HBlank); - if (VCount == 262 || VCount < 191) // this is probably wrong, but i haven't dug deep enough to prove it yet - NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES - READ_CYCLES), LCD_ReadScanline, line); - else + if (VCount < 262) NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES), LCD_StartScanline, line+1); + else + NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES), LCD_FinishFrame, line+1); } void GPU::FinishFrame(u32 lines) noexcept @@ -953,19 +954,6 @@ void GPU::BlankFrame() noexcept TotalScanlines = 263; } -void GPU::ReadScanline(u32 line) noexcept -{ - int scanline; - scanline = (VCount == 262 ? 0 : (line+1)); - GPU3D.ScanlineSync(scanline); - if (GPU3D.UnderflowFlagVCount == scanline) GPU3D.DispCnt |= (1<<12); - - if (VCount != 262) - NDS.ScheduleEvent(Event_LCD, true, READ_CYCLES, LCD_StartScanline, line+1); - else - NDS.ScheduleEvent(Event_LCD, true, READ_CYCLES, LCD_FinishFrame, line+1); -} - void GPU::StartScanline(u32 line) noexcept { if (line == 0) diff --git a/src/GPU.h b/src/GPU.h index e1f4b89d..780d5e01 100644 --- a/src/GPU.h +++ b/src/GPU.h @@ -506,7 +506,6 @@ public: void BlankFrame() noexcept; void StartScanline(u32 line) noexcept; void StartHBlank(u32 line) noexcept; - void ReadScanline(u32 line) noexcept; void DisplayFIFO(u32 x) noexcept; diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index d99bbba6..eca3fa5b 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -2158,7 +2158,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) } void SoftRenderer::ScanlineSync(int line) { - if (RenderThreadRunning.load(std::memory_order_relaxed)) + if (Accuracy && RenderThreadRunning.load(std::memory_order_relaxed)) { if (line < 192) // We need a scanline, so let's wait for the render thread to finish it. @@ -2170,6 +2170,14 @@ void SoftRenderer::ScanlineSync(int line) u32* SoftRenderer::GetLine(int line) { + if (!Accuracy && RenderThreadRunning.load(std::memory_order_relaxed)) + { + if (line < 192) + // We need a scanline, so let's wait for the render thread to finish it. + // (both threads process scanlines from top-to-bottom, + // so we don't need to wait for a specific row) + Platform::Semaphore_Wait(Sema_ScanlineCount); + } return &FinalBuffer[line * ScanlineWidth]; } From 1aa86967b5cbdcb0121db9f2abb8996644f31f55 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 21 Apr 2024 14:48:46 -0400 Subject: [PATCH 46/53] small fix --- src/GPU.cpp | 4 ++-- src/GPU3D_Soft.cpp | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index acbc9f17..53069784 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -880,9 +880,9 @@ void GPU::StartHBlank(u32 line) noexcept DispStat[0] |= (1<<1); DispStat[1] |= (1<<1); - // not the correct timing, but... close enough i guess? + // TODO: not quite the correct update time, but... close enough i guess? int scanline = (VCount == 262 ? 0 : (line+1)); - GPU3D.ScanlineSync(scanline); + if (!(scanline & 1)) GPU3D.ScanlineSync(scanline); if (GPU3D.UnderflowFlagVCount == scanline) GPU3D.DispCnt |= (1<<12); if (VCount < 192) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index eca3fa5b..fc2f0c13 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -2158,18 +2158,21 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) } void SoftRenderer::ScanlineSync(int line) { + // only used in accurate mode (timings must be emulated) if (Accuracy && RenderThreadRunning.load(std::memory_order_relaxed)) { if (line < 192) - // We need a scanline, so let's wait for the render thread to finish it. - // (both threads process scanlines from top-to-bottom, - // so we don't need to wait for a specific row) + { + // wait for two scanlines here, since scanlines render in pairs. Platform::Semaphore_Wait(Sema_ScanlineCount); + Platform::Semaphore_Wait(Sema_ScanlineCount); + } } } u32* SoftRenderer::GetLine(int line) { + // only wait in in-accurate mode (we've already waited for scanlines in accurate mode) if (!Accuracy && RenderThreadRunning.load(std::memory_order_relaxed)) { if (line < 192) From 896df08c5cb46f6d166cde230fb291319418e1a0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:05:45 -0400 Subject: [PATCH 47/53] clean up --- src/GPU3D.h | 32 ++--------------- src/GPU3D_Soft.cpp | 87 ++++++++++++---------------------------------- 2 files changed, 24 insertions(+), 95 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index ada40fb1..16d5b2de 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -350,14 +350,7 @@ public: static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? - - //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. - //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline - // and beginning reading the second scanline of a scanline pair. - //static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. - //static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. - //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. - + // compile-time list of scanline read times // these *should* always occur at the same point in each frame, so it shouldn't matter if we make them fixed static constexpr std::array SLRead = []() constexpr { @@ -373,7 +366,7 @@ public: static constexpr int Arbitrary = 565; // extra value after the scanline is read at which the cutoff of a scanline should be...? // idk why this is needed. im probably doing something wrong. - // the point at which rdlines decrements not sure why it's different...? + // the point at which rdlines decrements. not sure why it's different...? static constexpr std::array RDDecrement = []() constexpr { std::array dec {}; @@ -386,28 +379,12 @@ public: // GPU 3D Rasterization Timings: For Emulating Scanline Timeout - //static constexpr int ScanlinePairLength = 2130 * TimingFrac; - //static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? - //static constexpr int ScanlineBreak = 4 * TimingFrac; - //static constexpr int ScanlineBreak2 = 40 * TimingFrac; - //static constexpr int FakeTiming = 2 * TimingFrac; - //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. - //static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair - //static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, - // it just cares about if its the first 50 scanlines to speedrun rendering? static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress // (can be interpreted as the minimum amount of cycles for the next scanline // pair to start after the previous pair began) (related to final pass?) static constexpr int ScanlinePushDelay = 242 * TimingFrac; static constexpr int EMGlitchThreshhold = 502 * TimingFrac; // The threshold for the edge marking glitch behavior to change. static constexpr int EMFixNum = 571 * TimingFrac; // Arbitrary value added to fix edge marking glitch, not sure why it's needed? - //static constexpr int TimeoutIncrement = 2130 * TimingFrac; - //static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair - //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" - //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) - // (why does the next pair get more time if the previous scanline is aborted?) - //static constexpr int UnderflowFlag = 2 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set - //static constexpr int FinishScanline = 512 * TimingFrac; // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors @@ -429,11 +406,6 @@ public: static constexpr int FirstPolyDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) // (Amount of time before the end of the cycle a scanline must abort?) - // static constexpr int RasterTimingCap = 51116 * TimingFrac; - // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED - // static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation - // static constexpr int PerRightSlope = 1 * TimingFrac; - // static constexpr int FirstPixelTiming; class Renderer3D { diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index fc2f0c13..36495703 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1632,11 +1632,9 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, if (x == 0) { // edge marking bug emulation - if (checkprev) - { - if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane - } - else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-(ScanlineWidth+1))) goto pass; // checks the right edge of the scanline 2 scanlines ago + if (checkprev ? CheckEdgeMarkingClearPlane(gpu3d, polyid, z) : // check against the clear plane + CheckEdgeMarkingPixel(polyid, z, pixeladdr-1 - ScanlineWidth)) // checks the right edge of the scanline 2 scanlines ago + goto pass; } else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) goto pass; // normal check @@ -1644,11 +1642,9 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, if (x == 255) { // edge marking bug emulation - if (checknext) - { - if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane - } - else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+(ScanlineWidth+1))) goto pass; // checks the left edge of the scanline 2 scanlines ahead + if (checknext ? CheckEdgeMarkingClearPlane(gpu3d, polyid, z) : // check against the clear plane + CheckEdgeMarkingPixel(polyid, z, pixeladdr+1 + ScanlineWidth)) // checks the left edge of the scanline 2 scanlines ahead + goto pass; } else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) goto pass; // normal check @@ -1884,43 +1880,24 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } } -u16 SoftRenderer::BeginPushScanline(s32 y, s32 pixelstodraw) +void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) { - // push the finished scanline to the appropriate frame buffers. - // if a scanline is late enough to intersect with the 2d engine read time it will be partially drawn - - u16 start; - if (pixelstodraw >= 256 || pixelstodraw <= 0) // if scheduled after or 256 cycles before a scanline read render full scanline + int j = 0; + for (int i = 0; i < npolys; i++) { - start = 0; - pixelstodraw = 256; + if (polygons[i]->Degenerate) continue; + SetupPolygon(&PolygonList[j++], polygons[i]); } - else // render partial scanline + + RenderScanline(gpu, 0, 0, j, nullptr); + + for (s32 y = 1; y < 192; y++) { - start = ScanlineWidth - pixelstodraw; - - // it seems to read in pairs of two every two cycles? looks jittery - bool jitter = pixelstodraw % 2; - pixelstodraw += jitter; - start -= jitter; + RenderScanline(gpu, y, 0, j, nullptr); + ScanlineFinalPass(gpu.GPU3D, y-1, true, true); } - u8 bufferpos = y % 48; - memcpy(&RDBuffer[bufferpos*ScanlineWidth+start], &ColorBuffer[y*ScanlineWidth+start], 4 * pixelstodraw); - return start; -} -void SoftRenderer::ReadScanline(s32 y) -{ - u8 bufferpos = y % 48; - memcpy(&FinalBuffer[y*ScanlineWidth], &RDBuffer[bufferpos*ScanlineWidth], 4 * ScanlineWidth); -} - -void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) -{ - if (pixelsremain = 0) return; - - u8 bufferpos = y % 48; - memcpy(&RDBuffer[bufferpos*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], 4 * pixelsremain); + ScanlineFinalPass(gpu.GPU3D, 191, true, true); } #define RDLINES_COUNT_INCREMENT\ @@ -1937,10 +1914,11 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) /* simulate the process of scanlines being read from the 48 scanline buffer */\ while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary)\ {\ - if (RasterTiming < SLRead[nextread] + Arbitrary)\ + if (RasterTiming < SLRead[nextread] + Arbitrary) /* why + 565? */\ {\ - RasterTiming += timespent = (SLRead[nextread] + Arbitrary) - RasterTiming; /* why + 565? */\ + timespent = (SLRead[nextread] + Arbitrary) - RasterTiming;\ timespent += EMFixNum; /* fixes edge marking bug emulation. not sure why this is needed? */\ + RasterTiming = (SLRead[nextread] + Arbitrary);\ }\ scanlineswaiting--;\ nextread++;\ @@ -1961,27 +1939,6 @@ void SoftRenderer::FinishPushScanline(s32 y, s32 pixelsremain) /* set the underflow flag if one of the scanlines came within 14 cycles of visible underflow */\ if ((ScanlineTimeout <= RasterTiming) && (gpu.GPU3D.UnderflowFlagVCount == (u16)-1)) gpu.GPU3D.UnderflowFlagVCount = y - (y&1 ? 0 : 1); -void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) -{ - int j = 0; - for (int i = 0; i < npolys; i++) - { - if (polygons[i]->Degenerate) continue; - SetupPolygon(&PolygonList[j++], polygons[i]); - } - - int dummy; - RenderScanline(gpu, 0, 0, j, &dummy); - - for (s32 y = 1; y < 192; y++) - { - RenderScanline(gpu, y, 0, j, &dummy); - ScanlineFinalPass(gpu.GPU3D, y-1, true, true); - } - - ScanlineFinalPass(gpu.GPU3D, 191, true, true); -} - void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys) { int j = 0; @@ -1994,7 +1951,7 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys // reset scanline trackers gpu.GPU3D.UnderflowFlagVCount = -1; gpu.GPU3D.RDLinesTemp = 63; - ScanlineTimeout = FrameLength; // CHECKME + ScanlineTimeout = 0x7FFFFFFF; // CHECKME: first scanline pair timeout. s32 rastertimingeven, rastertimingodd; // always init to 0 at the start of a scanline render s32 scanlineswaiting = 0, slwaitingrd = 0; s32 nextread = 0, nextreadrd = 0; From 57e590269f82a0e4ced9990cb4c24bf24011e2ca Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 24 Apr 2024 00:27:50 -0400 Subject: [PATCH 48/53] cleanup more misc stuff --- src/GPU3D.h | 10 +++------- src/GPU3D_Soft.cpp | 38 +++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 16d5b2de..7a3d9efa 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -348,7 +348,7 @@ public: static constexpr int DelayBetweenReads = 809 * TimingFrac; static constexpr int ScanlineReadSpeed = 256 * TimingFrac; static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; - static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. + static constexpr int InitGPU2DTimeout = (51875+565) * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? // compile-time list of scanline read times @@ -363,8 +363,7 @@ public: return readtime; }(); - static constexpr int Arbitrary = 565; // extra value after the scanline is read at which the cutoff of a scanline should be...? - // idk why this is needed. im probably doing something wrong. + static constexpr int PreReadCutoff = 565; // time before a read that a scanline is cutoff. // the point at which rdlines decrements. not sure why it's different...? static constexpr std::array RDDecrement = []() constexpr { @@ -372,7 +371,7 @@ public: for (int i = 0; i < 192; i++) { - dec[i] = SLRead[i] + Arbitrary - 39 - (!(i % 2)); + dec[i] = SLRead[i] - 39 - (!(i % 2)); } return dec; }(); @@ -400,9 +399,6 @@ public: // GPU 3D Rasterization Timings III, For First Polygon "Pre-Calc" Timings // should be added before other timings, as these are "async" pre-calcs of polygon attributes - static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. - // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) - static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) static constexpr int FirstPolyDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) // (Amount of time before the end of the cycle a scanline must abort?) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 36495703..51865276 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -193,15 +193,15 @@ void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, { fixeddelay = true; break; - if (y == polygon->YBottom) break; + /*if (y == polygon->YBottom) break; if (y == polygon->YTop) {perslope = true; break;} - /*else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || + else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || (y == polygon->Vertices[rp->NextVR]->FinalPosition[1] || y == polygon->Vertices[rp->CurVR]->FinalPosition[1])) { perslope = true; } - else */etc = true; - break; + else etc = true; + break;*/ } } @@ -215,21 +215,21 @@ void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, { fixeddelay = true; break; - if (y == polygon->YBottom) break; + /*if (y == polygon->YBottom) break; if (y == polygon->YTop) {perslope = true; break;} - /*else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || + else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || (y == polygon->Vertices[rp->NextVR]->FinalPosition[1] || y == polygon->Vertices[rp->CurVR]->FinalPosition[1])) { perslope = true; } - else */etc = true; - break; + else etc = true; + break;*/ } } - *timingcountereven = fixeddelay*FirstPolyDelay;// + perslope*FirstPerSlope + etc*2; - *timingcounterodd = fixeddelay*FirstPolyDelay;// + perslope*FirstPerSlope + etc*2; - if (!perslope) + *timingcountereven = fixeddelay ? FirstPolyDelay : 0;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd = fixeddelay ? FirstPolyDelay : 0;// + perslope*FirstPerSlope + etc*2; + /*if (!perslope) { *timingcountereven += etc*2;// + perslope*FirstPerSlope + etc*2; *timingcounterodd += etc*2;// + perslope*FirstPerSlope + etc*2; @@ -238,7 +238,7 @@ void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, { *timingcountereven += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; *timingcounterodd += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; - } + }*/ } void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const @@ -1912,13 +1912,13 @@ void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) #define SCANLINE_BUFFER_SIM\ /* simulate the process of scanlines being read from the 48 scanline buffer */\ - while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread] + Arbitrary)\ + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread])\ {\ - if (RasterTiming < SLRead[nextread] + Arbitrary) /* why + 565? */\ + if (RasterTiming < SLRead[nextread])\ {\ - timespent = (SLRead[nextread] + Arbitrary) - RasterTiming;\ + timespent = SLRead[nextread] - RasterTiming;\ timespent += EMFixNum; /* fixes edge marking bug emulation. not sure why this is needed? */\ - RasterTiming = (SLRead[nextread] + Arbitrary);\ + RasterTiming = SLRead[nextread];\ }\ scanlineswaiting--;\ nextread++;\ @@ -1926,7 +1926,7 @@ void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) #define RENDER_SCANLINES(y)\ /* update sl timeout */\ - ScanlineTimeout = SLRead[y-1] - FinalPassLen;\ + ScanlineTimeout = SLRead[y-1] - (PreReadCutoff+FinalPassLen);\ \ FindFirstPolyDoTimings(j, y, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd);\ RenderScanline(gpu, y, firstpolyeven, j, &rastertimingeven);\ @@ -1951,7 +1951,7 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys // reset scanline trackers gpu.GPU3D.UnderflowFlagVCount = -1; gpu.GPU3D.RDLinesTemp = 63; - ScanlineTimeout = 0x7FFFFFFF; // CHECKME: first scanline pair timeout. + ScanlineTimeout = SLRead[2] - (PreReadCutoff+FinalPassLen+4); // TEMP: should be infinity, but i dont want it to break due to not being set up to handle this properly. //0x7FFFFFFF; // CHECKME: first scanline pair timeout. s32 rastertimingeven, rastertimingodd; // always init to 0 at the start of a scanline render s32 scanlineswaiting = 0, slwaitingrd = 0; s32 nextread = 0, nextreadrd = 0; @@ -1966,7 +1966,7 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys // it can't proceed to the next scanline unless all others steps are done (both scanlines in the pair, and final pass) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); // 12 cycles at the end of a "timeout" are always used for w/e reason - RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); // should probably just be += 12 tbh but i'll leave it for now // if first pair was not delayed past the first read, then later scanlines cannot either // this allows us to implement a fast path From 635bfa0c29b255548ffae95361bd3129cb04c3b6 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 24 Apr 2024 07:59:38 -0400 Subject: [PATCH 49/53] even more cleanup! --- src/GPU.cpp | 2 +- src/GPU3D.cpp | 4 +++- src/GPU3D_Soft.cpp | 32 ++++++++++++++++---------------- src/GPU3D_Soft.h | 16 +++------------- 4 files changed, 23 insertions(+), 31 deletions(-) diff --git a/src/GPU.cpp b/src/GPU.cpp index 53069784..30a2d2af 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -74,7 +74,6 @@ GPU::GPU(melonDS::NDS& nds, std::unique_ptr&& renderer3d, std::uniqu NDS.RegisterEventFunc(Event_LCD, LCD_StartScanline, MemberEventFunc(GPU, StartScanline)); NDS.RegisterEventFunc(Event_LCD, LCD_FinishFrame, MemberEventFunc(GPU, FinishFrame)); NDS.RegisterEventFunc(Event_DisplayFIFO, 0, MemberEventFunc(GPU, DisplayFIFO)); - NDS.RegisterEventFunc(Event_DisplayFIFO, 0, MemberEventFunc(GPU, DisplayFIFO)); InitFramebuffers(); } @@ -916,6 +915,7 @@ void GPU::StartHBlank(u32 line) noexcept if (DispStat[0] & (1<<4)) NDS.SetIRQ(0, IRQ_HBlank); if (DispStat[1] & (1<<4)) NDS.SetIRQ(1, IRQ_HBlank); + if (VCount < 262) NDS.ScheduleEvent(Event_LCD, true, (LINE_CYCLES - HBLANK_CYCLES), LCD_StartScanline, line+1); else diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index cbd2721c..bf05f97d 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -821,6 +821,8 @@ void GPU3D::StallPolygonPipeline(s32 delay, s32 nonstalldelay) noexcept } } + + template void ClipSegment(Vertex* outbuf, Vertex* vin, Vertex* vout) { @@ -2678,7 +2680,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLines; // CHECKME: Can this always be read? Even when the gpu is powered off? also check 8 bit reads + return RDLines; case 0x04000600: { diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 51865276..88b50f84 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -972,7 +972,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* } else abortscanline = false; } - else abortscanline = true; + // note: if accuracy mode isn't enabled the abort flag never gets set, this is fine, because it also never gets used by fast mode. // for shadow masks: set stencil bits where the depth test fails. // draw nothing. @@ -1053,7 +1053,7 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* StencilBuffer[256*(y&0x1) + x] |= 0x2; } } - + Step(rp); return abortscanline; } @@ -1227,7 +1227,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 } else abortscanline = false; } - else abortscanline = false; + // note: if accuracy mode isn't enabled the abort flag never gets set, this is fine, because it also never gets used by fast mode. // part 1: left edge edge = yedge | 0x1; @@ -1238,12 +1238,11 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 xcov = (l_edgecov >> 12) & 0x3FF; if (xcov == 0x3FF) xcov = 0; } - - + if (!l_filledge) x = xlimit; else for (; x < xlimit; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1337,7 +1336,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 if (wireframe && !edge) x = std::max(x, xlimit); else for (; x < xlimit; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1424,11 +1423,11 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 xcov = (r_edgecov >> 12) & 0x3FF; if (xcov == 0x3FF) xcov = 0; } - + if (r_filledge) for (; x < xlimit; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1620,7 +1619,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, for (int x = 0; x < 256; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1702,7 +1701,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, for (int x = 0; x < 256; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 density, srccolor, srcR, srcG, srcB, srcA; u32 attr = AttrBuffer[pixeladdr]; @@ -1767,7 +1766,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, for (int x = 0; x < 256; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1843,7 +1842,7 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) u32 z = ((val3 & 0x7FFF) * 0x200) + 0x1FF; - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = z; AttrBuffer[pixeladdr] = polyid | (val3 & 0x8000); @@ -1866,12 +1865,12 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) u32 color = r | (g << 8) | (b << 16) | (a << 24); polyid |= (gpu.GPU3D.RenderClearAttr1 & 0x8000); - + for (int y = 0; y < 192; y++) { for (int x = 0; x < 256; x++) { - u32 pixeladdr = (y * ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = clearz; AttrBuffer[pixeladdr] = polyid; @@ -1882,6 +1881,7 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) { + gpu.GPU3D.RDLinesTemp = 46; // dumb way of making sure it gets updated to a "normal" value when the gpu starts rasterizing. int j = 0; for (int i = 0; i < npolys; i++) { @@ -2138,7 +2138,7 @@ u32* SoftRenderer::GetLine(int line) // so we don't need to wait for a specific row) Platform::Semaphore_Wait(Sema_ScanlineCount); } - return &FinalBuffer[line * ScanlineWidth]; + return &FinalBuffer[line*ScanlineWidth]; } } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 4b1a8e52..1760cc27 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -488,16 +488,6 @@ private: s32 ScanlineTimeout; s32 RasterTiming; - enum - { - RenderStart = 0, - ScanlineRead, - PushScanline, - PushScanlineP2, - RenderFinal, - RasterEvents_MAX, - }; - // buffer dimensions are 258x194 to add a offscreen 1px border // which simplifies edge marking tests // buffer is duplicated to keep track of the two topmost pixels @@ -506,16 +496,16 @@ private: static constexpr int ScanlineWidth = 256; static constexpr int NumScanlinesIntBuf = 192; - static constexpr int NumScanlinesRD = 48; + //static constexpr int NumScanlinesRD = 48; static constexpr int NumScanlinesFinal = 192; static constexpr int BufferSize = ScanlineWidth * NumScanlinesIntBuf; - static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; + //static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; static constexpr int FinalBufferSize = ScanlineWidth * NumScanlinesFinal; u32 ColorBuffer[BufferSize * 2]; u32 DepthBuffer[BufferSize * 2]; u32 AttrBuffer[BufferSize * 2]; - u32 RDBuffer[RDBufferSize]; // is this buffer ever initialized by hw before writing to it? what is its initial value? can you transfer 3d framebuffer data between games? + //u32 RDBuffer[RDBufferSize]; // is this buffer ever initialized by hw before writing to it? what is its initial value? can you transfer 3d framebuffer data between games? u32 FinalBuffer[FinalBufferSize]; // attribute buffer: From bb20a0b1d2818e5ced3e6ff4cdddffe567610c1a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 24 Apr 2024 08:04:41 -0400 Subject: [PATCH 50/53] try to avoid some memcpys in fast mode --- src/GPU3D_Soft.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 88b50f84..ece6f15b 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1894,10 +1894,12 @@ void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) for (s32 y = 1; y < 192; y++) { RenderScanline(gpu, y, 0, j, nullptr); - ScanlineFinalPass(gpu.GPU3D, y-1, true, true); + ScanlineFinalPass(gpu.GPU3D, y-1, true, true); + Platform::Semaphore_Post(Sema_ScanlineCount); } - ScanlineFinalPass(gpu.GPU3D, 191, true, true); + ScanlineFinalPass(gpu.GPU3D, 191, true, true); + Platform::Semaphore_Post(Sema_ScanlineCount); } #define RDLINES_COUNT_INCREMENT\ @@ -2058,7 +2060,7 @@ void SoftRenderer::RenderFrame(GPU& gpu) RenderPolygonsFast(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); } else - memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + if (Accuracy) memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); } } @@ -2101,7 +2103,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) } else { - memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + if (Accuracy) memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); Platform::Semaphore_Post(Sema_ScanlineCount, 192); } } @@ -2130,15 +2132,19 @@ void SoftRenderer::ScanlineSync(int line) u32* SoftRenderer::GetLine(int line) { // only wait in in-accurate mode (we've already waited for scanlines in accurate mode) - if (!Accuracy && RenderThreadRunning.load(std::memory_order_relaxed)) + if (!Accuracy) { - if (line < 192) - // We need a scanline, so let's wait for the render thread to finish it. - // (both threads process scanlines from top-to-bottom, - // so we don't need to wait for a specific row) - Platform::Semaphore_Wait(Sema_ScanlineCount); + if (RenderThreadRunning.load(std::memory_order_relaxed)) + { + if (line < 192) + // We need a scanline, so let's wait for the render thread to finish it. + // (both threads process scanlines from top-to-bottom, + // so we don't need to wait for a specific row) + Platform::Semaphore_Wait(Sema_ScanlineCount); + } + return &ColorBuffer[line*ScanlineWidth]; } - return &FinalBuffer[line*ScanlineWidth]; + else return &FinalBuffer[line*ScanlineWidth]; } } From 9b106d064d7fb8ebadb4ba66fbb49d8f3758fcb4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 24 Apr 2024 09:53:34 -0400 Subject: [PATCH 51/53] im dumb --- src/GPU3D_Soft.cpp | 1 + src/GPU3D_Soft.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index ece6f15b..e6e3a4e2 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -1953,6 +1953,7 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys // reset scanline trackers gpu.GPU3D.UnderflowFlagVCount = -1; gpu.GPU3D.RDLinesTemp = 63; + RasterTiming = 0; ScanlineTimeout = SLRead[2] - (PreReadCutoff+FinalPassLen+4); // TEMP: should be infinity, but i dont want it to break due to not being set up to handle this properly. //0x7FFFFFFF; // CHECKME: first scanline pair timeout. s32 rastertimingeven, rastertimingodd; // always init to 0 at the start of a scanline render s32 scanlineswaiting = 0, slwaitingrd = 0; diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 1760cc27..98623804 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -476,9 +476,6 @@ private: bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z); template void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ClearBuffers(const GPU& gpu); - u16 BeginPushScanline(s32 y, s32 pixelstodraw); - void ReadScanline(s32 y); - void FinishPushScanline(s32 y, s32 pixelsremain); void RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys); void RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys); From 72ffe6b29716060880209e0c0295add2657cb9a9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 27 Apr 2024 12:37:48 -0400 Subject: [PATCH 52/53] fix bugs with negative viewports it it renders out of bounds pixels........... --- src/GPU3D_Soft.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index e6e3a4e2..086bb885 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -957,7 +957,6 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* if (x < 0) x = 0; s32 xlimit; - if (xend > 256) xend = 256; if (accuracy) { @@ -973,6 +972,13 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* else abortscanline = false; } // note: if accuracy mode isn't enabled the abort flag never gets set, this is fine, because it also never gets used by fast mode. + + // we cap it to 256 *after* counting the cycles, because yes, it tries to render oob pixels. + if (xend > 256) + { + r_edgelen += 256 - xend; + xend = 256; + } // for shadow masks: set stencil bits where the depth test fails. // draw nothing. @@ -1213,7 +1219,6 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 s32 xlimit; s32 xcov = 0; - if (xend > 256) xend = 256; if (accuracy) { @@ -1228,6 +1233,13 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 else abortscanline = false; } // note: if accuracy mode isn't enabled the abort flag never gets set, this is fine, because it also never gets used by fast mode. + + // we cap it to 256 *after* counting the cycles, because yes, it tries to render oob pixels. + if (xend > 256) + { + r_edgelen += 256 - xend; + xend = 256; + } // part 1: left edge edge = yedge | 0x1; From 0b85038586cccc472d070c498bac85cffefa1827 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 9 May 2024 09:45:36 -0400 Subject: [PATCH 53/53] remove accuracy toggle doesn't seem necessary --- src/GPU3D_Soft.cpp | 113 ++++++++++++--------------------------------- src/GPU3D_Soft.h | 8 ++-- 2 files changed, 32 insertions(+), 89 deletions(-) diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 086bb885..a561154c 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -825,7 +825,6 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } -template bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; @@ -958,20 +957,15 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* if (x < 0) x = 0; s32 xlimit; - if (accuracy) + // determine if the span can be rendered within the time allotted to the scanline + s32 diff = DoTimingsPixels(xend-x, timingcounter); + if (diff != 0) { - // determine if the span can be rendered within the time allotted to the scanline - // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. - s32 diff = DoTimingsPixels(xend-x, timingcounter); - if (diff != 0) - { - xend -= diff; - r_edgelen -= diff; - abortscanline = true; - } - else abortscanline = false; + xend -= diff; + r_edgelen -= diff; + abortscanline = true; } - // note: if accuracy mode isn't enabled the abort flag never gets set, this is fine, because it also never gets used by fast mode. + else abortscanline = false; // we cap it to 256 *after* counting the cycles, because yes, it tries to render oob pixels. if (xend > 256) @@ -1064,7 +1058,6 @@ bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* return abortscanline; } -template bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; @@ -1220,19 +1213,15 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 s32 xcov = 0; - if (accuracy) + // determine if the span can be rendered within the time allotted to the scanline + s32 diff = DoTimingsPixels(xend-x, timingcounter); + if (diff != 0) { - // determine if the span can be rendered within the time allotted to the scanline - s32 diff = DoTimingsPixels(xend-x, timingcounter); - if (diff != 0) - { - xend -= diff; - r_edgelen -= diff; - abortscanline = true; - } - else abortscanline = false; + xend -= diff; + r_edgelen -= diff; + abortscanline = true; } - // note: if accuracy mode isn't enabled the abort flag never gets set, this is fine, because it also never gets used by fast mode. + else abortscanline = false; // we cap it to 256 *after* counting the cycles, because yes, it tries to render oob pixels. if (xend > 256) @@ -1528,7 +1517,6 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 return abortscanline; } -template void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter) { bool abort = false; @@ -1537,24 +1525,24 @@ void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npol RendererPolygon* rp = &PolygonList[firstpoly]; Polygon* polygon = rp->PolyData; - if (accuracy && y == polygon->YBottom && y != polygon->YTop) + if (y == polygon->YBottom && y != polygon->YTop) { if (!abort) abort = !DoTimings(EmptyPolyScanline, timingcounter); } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - if (accuracy && !abort) abort = (!DoTimings(PerPolyScanline, timingcounter) + if (!abort) abort = (!DoTimings(PerPolyScanline, timingcounter) || !CheckTimings(MinToStartPoly, timingcounter)); - if (accuracy && abort) + if (abort) { CheckSlope(rp, y); Step(rp); } else if (polygon->IsShadowMask) - abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, timingcounter); + abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, timingcounter); else - abort = RenderPolygonScanline(gpu, rp, y, timingcounter); + abort = RenderPolygonScanline(gpu, rp, y, timingcounter); } } } @@ -1891,29 +1879,6 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } } -void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) -{ - gpu.GPU3D.RDLinesTemp = 46; // dumb way of making sure it gets updated to a "normal" value when the gpu starts rasterizing. - int j = 0; - for (int i = 0; i < npolys; i++) - { - if (polygons[i]->Degenerate) continue; - SetupPolygon(&PolygonList[j++], polygons[i]); - } - - RenderScanline(gpu, 0, 0, j, nullptr); - - for (s32 y = 1; y < 192; y++) - { - RenderScanline(gpu, y, 0, j, nullptr); - ScanlineFinalPass(gpu.GPU3D, y-1, true, true); - Platform::Semaphore_Post(Sema_ScanlineCount); - } - - ScanlineFinalPass(gpu.GPU3D, 191, true, true); - Platform::Semaphore_Post(Sema_ScanlineCount); -} - #define RDLINES_COUNT_INCREMENT\ /* feels wrong, needs improvement */\ while (RasterTiming >= RDDecrement[nextreadrd])\ @@ -1943,8 +1908,8 @@ void SoftRenderer::RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys) ScanlineTimeout = SLRead[y-1] - (PreReadCutoff+FinalPassLen);\ \ FindFirstPolyDoTimings(j, y, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd);\ - RenderScanline(gpu, y, firstpolyeven, j, &rastertimingeven);\ - RenderScanline(gpu, y+1, firstpolyodd, j, &rastertimingodd);\ + RenderScanline(gpu, y, firstpolyeven, j, &rastertimingeven);\ + RenderScanline(gpu, y+1, firstpolyodd, j, &rastertimingodd);\ \ prevtimespent = timespent;\ RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen});\ @@ -1975,8 +1940,8 @@ void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys FindFirstPolyDoTimings(j, 0, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd); // scanlines are rendered in pairs of two - RenderScanline(gpu, 0, firstpolyeven, j, &rastertimingeven); - RenderScanline(gpu, 1, firstpolyodd, j, &rastertimingodd); + RenderScanline(gpu, 0, firstpolyeven, j, &rastertimingeven); + RenderScanline(gpu, 1, firstpolyodd, j, &rastertimingodd); // it can't proceed to the next scanline unless all others steps are done (both scanlines in the pair, and final pass) RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); @@ -2067,13 +2032,9 @@ void SoftRenderer::RenderFrame(GPU& gpu) if (gpu.GPU3D.RenderingEnabled >= 3) { - if (Accuracy) - RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); - else - RenderPolygonsFast(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); } - else - if (Accuracy) memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + else memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); } } @@ -2109,14 +2070,11 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) if (gpu.GPU3D.RenderingEnabled >= 3) { - if (Accuracy) - RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); - else - RenderPolygonsFast(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); } else { - if (Accuracy) memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); Platform::Semaphore_Post(Sema_ScanlineCount, 192); } } @@ -2131,7 +2089,7 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) void SoftRenderer::ScanlineSync(int line) { // only used in accurate mode (timings must be emulated) - if (Accuracy && RenderThreadRunning.load(std::memory_order_relaxed)) + if (RenderThreadRunning.load(std::memory_order_relaxed)) { if (line < 192) { @@ -2144,20 +2102,7 @@ void SoftRenderer::ScanlineSync(int line) u32* SoftRenderer::GetLine(int line) { - // only wait in in-accurate mode (we've already waited for scanlines in accurate mode) - if (!Accuracy) - { - if (RenderThreadRunning.load(std::memory_order_relaxed)) - { - if (line < 192) - // We need a scanline, so let's wait for the render thread to finish it. - // (both threads process scanlines from top-to-bottom, - // so we don't need to wait for a specific row) - Platform::Semaphore_Wait(Sema_ScanlineCount); - } - return &ColorBuffer[line*ScanlineWidth]; - } - else return &FinalBuffer[line*ScanlineWidth]; + return &FinalBuffer[line*ScanlineWidth]; } } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 98623804..43eac27f 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -468,9 +468,9 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - template bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); - template bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); - template void RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter); + bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); + bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); + void RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z); @@ -521,8 +521,6 @@ private: bool FrameIdentical; - bool Accuracy = true; // TODO - // threading bool Threaded;