From 1054011c90973278f8df1e658d59c7187f8805c4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 19 Dec 2023 22:52:54 -0500 Subject: [PATCH] wip --- src/GPU3D.cpp | 8 +- src/GPU3D.h | 53 +++++++----- src/GPU3D_Soft.cpp | 203 ++++++++++++++++++++++++++++----------------- src/GPU3D_Soft.h | 6 +- 4 files changed, 170 insertions(+), 100 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 6fb24979..056d5735 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -222,7 +222,7 @@ void GPU3D::Reset() noexcept AlphaRefVal = 0; AlphaRef = 0; - RDLinesDisplay = 46; + RDLines = 46; memset(ToonTable, 0, sizeof(ToonTable)); memset(EdgeTable, 0, sizeof(EdgeTable)); @@ -2369,7 +2369,7 @@ void GPU3D::CheckFIFODMA() noexcept void GPU3D::VCount144() noexcept { - RDLinesDisplay = 46; + RDLines = 46; CurrentRenderer->VCount144(); } @@ -2613,7 +2613,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLinesDisplay; // IT IS TIME + return RDLines; // IT IS TIME case 0x04000600: { @@ -2657,7 +2657,7 @@ u32 GPU3D::Read32(u32 addr) noexcept return DispCnt; case 0x04000320: - return RDLinesDisplay; // IT IS TIME + return RDLines; // IT IS TIME case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index 2dfacdc0..6413935e 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -245,7 +245,7 @@ public: bool RenderingEnabled = false; u32 DispCnt = 0; - u32 RDLinesDisplay = 0; + u32 RDLines = 0; u8 AlphaRefVal = 0; u8 AlphaRef = 0; @@ -334,34 +334,47 @@ public: static constexpr int TimingFrac = 1; // add a fractional component if pixels is not enough precision // GPU 2D read timings, for emulating race conditions - static constexpr int GPU2DSpeedWithinPair = 296 * TimingFrac; - static constexpr int GPU2DSpeedOutsidePair = 810 * TimingFrac; - static constexpr int GPU2DSpeedReadScanline = 256 * TimingFrac; - static constexpr int InitGPU2DTimeout = 51618 * TimingFrac; + static constexpr int GPU2DSpeedWithinPair = 296 * 
TimingFrac; // the delay between finishing reading the first scanline and beginning reading the second scanline of a scanline pair. + static constexpr int GPU2DSpeedOutsidePair = 810 * TimingFrac; // the delay between finishing reading a pair and beginning reading a new pair. + static constexpr int GPU2DReadScanline = 256 * TimingFrac; // the time it takes to read a scanline. + static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // notably the same as the scanline increment. + static constexpr int InitGPU2DTimeout = 50000 * TimingFrac; // 51618? | when it starts reading the first scanline. + static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 48 * TimingFrac; // time to read 48 scanlines. // GPU 3D rasterization timings, for emulating the timeout - static constexpr int ScanlinePairLength = 2130 * TimingFrac; + + //static constexpr int ScanlinePairLength = 2130 * TimingFrac; //static constexpr int ScanlineTimeout = 1686 * TimingFrac; // 2126? 1686? //static constexpr int ScanlineBreak = 4 * TimingFrac; //static constexpr int ScanlineBreak2 = 40 * TimingFrac; - static constexpr int ScanlineIncrement = 1618 * TimingFrac; // how much to increment per scanline pair - static constexpr int AbortIncrement = 12 * TimingFrac; // how much extra to increment after an aborted scanline (total 1630) - static constexpr int FreeTiming = 496 * TimingFrac; // every scanline has a free 496 pixels worth of timing for some reason. - static constexpr int InitialTiming = 48688 * TimingFrac; // add 1618*2 to get the timeout of the second scanline pair - static constexpr int Post50Max = 51116 * TimingFrac; // for some reason it doesn't care about how full it actually is, it just cares about if its the first 50 scanlines to speedrun rendering? + //static constexpr int FakeTiming = 2 * TimingFrac; + //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. 
+ static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair + static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, it just cares about if its the first 50 scanlines to speedrun rendering? + static constexpr int FreeTiming = 496 * TimingFrac; // 496 | every scanline has a free 496 pixels worth of timing for some reason. + static constexpr int ScanlineIncrement = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair + static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 1630) // GPU 3D rasterization timings II, for counting each element with timing characteristics - static constexpr int FirstPolyScanline = 0 * TimingFrac; - static constexpr int PerPolyScanline = 12 * TimingFrac; // should be correct for *most* line polygons and polygons with vertical slopes - static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 pixel = 1 pixel - static constexpr int NumFreePixels = 4; // First 4 pixels in a polygon scanline are free (for some reason) + + //static constexpr int FirstPolyScanline = 0 * TimingFrac; + static constexpr int PerPolyScanline = 12 * TimingFrac; // 12 | should be 12, but 14 is "correct" // should be correct for *most* line polygons and polygons with vertical slopes + static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 | 1 pixel = 1 pixel + static constexpr int NumFreePixels = 4; // 4 | First 4 pixels in a polygon scanline are free (for some reason) + static constexpr int MinToStartPoly = 2 * TimingFrac; // 1 | if there is not 1 cycle remaining, do not bother rendering polygon (CHECKME: I dont think this should decrement timings by anything?) + static constexpr int EmptyPolyScanline = 4 * TimingFrac; // - 14; // 4 | seems to be slightly under 4 px? 
+ + // GPU 3D rasterization timing III, for first polygon exclusive timing characteristics + // should be done first, as these are "async" pre-calcs of polygon attributes + + static constexpr int FirstVSlope = 0 * TimingFrac; // 1 | the first polygon in a scanline having two vertical slopes adds 1 to timings...? + static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; - static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED - static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation - static constexpr int PerRightSlope = 1 * TimingFrac; - static constexpr int EmptyPolyScanline = 4 * TimingFrac;// - 14; // seems to be slightly under 4? - //static constexpr int FirstPixelTiming; + // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED + // static constexpr int PerScanlineRecup = 2112 * TimingFrac; // seems to check out? // should be the "free" time the gpu has to do the calculation + // static constexpr int PerRightSlope = 1 * TimingFrac; + // static constexpr int FirstPixelTiming; class Renderer3D { diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index af23132d..688785d0 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -127,20 +127,32 @@ bool SoftRenderer::DoTimings(s32 cycles, bool odd) return true; } +bool SoftRenderer::CheckTimings(s32 cycles, bool odd) +{ + // check if there are 'cycles' amount of cycles remaining. 
+ + s32* counter; + if (odd) counter = &RasterTimingOdd; + else counter = &RasterTimingEven; + + if (RasterTiming - *counter >= cycles) return true; + else return false; +} + u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) { // calculate and return the difference between the old span and the new span, while adding timings to the timings counter // pixels dont count towards timings if they're the first 4 pixels in a scanline (for some reason?) - if (pixels <= 4) return 0; + if (pixels <= NumFreePixels) return 0; - pixels -= 4; + pixels -= NumFreePixels; s32* counter; if (odd) counter = &RasterTimingOdd; else counter = &RasterTimingEven; - //todo: figure out a faster way to support TimingFrac > 1 without using a for loop somehow. + //todo: figure out a faster way to support TimingFrac > 1 without using a for loop somehow. (fingers crossed we don't have to!) if constexpr (TimingFrac > 1) for (; pixels > 0; pixels--) { @@ -160,6 +172,26 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, bool odd) return pixels; } +bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd) +{ + // determine the timing impact of the first polygon's slopes. + + Polygon* polygon = rp->PolyData; + + if (polygon->YTop == polygon->YBottom) return false; + if (y == polygon->YTop) return false; + + s32* counter; + if (odd) counter = &RasterTimingOdd; + else counter = &RasterTimingEven; + + if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) *counter += 1; + + if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) *counter += 1; + + return DoTimings(2, odd); // CHECKME: does this need to be done every time it's incremented here?
+} + void SoftRenderer::TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) { u32 vramaddr = (texparam & 0xFFFF) << 3; @@ -744,7 +776,7 @@ void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) } } -void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) +bool SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y, bool odd) { Polygon* polygon = rp->PolyData; @@ -766,19 +798,8 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) memset(&StencilBuffer[256 * (y&0x1)], 0, 256); PrevIsShadowMask = true; - - if (polygon->YTop != polygon->YBottom) - { - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) - { - SetupPolygonLeftEdge(rp, y); - } - - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) - { - SetupPolygonRightEdge(rp, y); - } - } + + CheckSlope(rp, y); Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -787,6 +808,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; + bool abortscanline; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -870,7 +892,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // similarly, we can perform alpha test early (checkme) if (wireframe) polyalpha = 31; - if (polyalpha <= GPU.GPU3D.RenderAlphaRef) return; + if (polyalpha <= GPU.GPU3D.RenderAlphaRef) return false; // TODO: check how this impacts timings? 
// in wireframe mode, there are special rules for equal Z (TODO) @@ -880,10 +902,23 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) int edge; s32 x = xstart; - Interpolator<0> interpX(xstart, xend+1, wl, wr); + xend += 1; + Interpolator<0> interpX(xstart, xend, wl, wr); if (x < 0) x = 0; s32 xlimit; + if (xend > 256) xend = 256; + + // determine if the span can be rendered within the time allotted to the scanline + // TODO: verify the timing characteristics of shadow masks are the same as regular polygons. + s32 diff = DoTimingsPixels(xend-x, odd); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } + else abortscanline = false; // for shadow masks: set stencil bits where the depth test fails. // draw nothing. @@ -891,8 +926,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // part 1: left edge edge = yedge | 0x1; xlimit = xstart+l_edgelen; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + if (xlimit > xend) xlimit = xend; if (!l_filledge) x = xlimit; else @@ -918,9 +952,8 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // part 2: polygon inside edge = yedge; - xlimit = xend-r_edgelen+1; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend-r_edgelen; + if (xlimit > xend) xlimit = xend; if (wireframe && !edge) x = std::max(x, xlimit); else for (; x < xlimit; x++) { @@ -944,8 +977,7 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) // part 3: right edge edge = yedge | 0x2; - xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend; if (r_filledge) for (; x < xlimit; x++) @@ -967,9 +999,9 @@ void SoftRenderer::RenderShadowMaskScanline(RendererPolygon* rp, s32 y) StencilBuffer[256*(y&0x1) + x] |= 0x2; } } - - rp->XL = rp->SlopeL.Step(); - rp->XR = rp->SlopeR.Step(); + + Step(rp); + return abortscanline; } bool 
SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) @@ -993,12 +1025,6 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) CheckSlope(rp, y); - if (DoTimings(PerPolyScanline, odd)) - { - Step(rp); - return true; - } - Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; bool l_filledge, r_filledge; @@ -1006,7 +1032,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; - bool abortscanline = false; // to abort the rest of the scanline after finishing this polygon + bool abortscanline; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -1142,6 +1168,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) r_edgelen -= diff; abortscanline = true; } + else abortscanline = false; // part 1: left edge edge = yedge | 0x1; @@ -1434,6 +1461,7 @@ bool SoftRenderer::RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd) bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) { bool abort = false; + bool first = true; for (int i = 0; i < npolys; i++) { RendererPolygon* rp = &PolygonList[i]; @@ -1441,20 +1469,29 @@ bool SoftRenderer::RenderScanline(s32 y, int npolys, bool odd) if (y == polygon->YBottom && y != polygon->YTop) { - if (DoTimings(EmptyPolyScanline, odd)) abort = true; + if (!abort) abort = (first && DoTimings(FirstNull, odd)) || DoTimings(EmptyPolyScanline, odd); + + first = false; } else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) { - if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + //if (y == polygon->YTop) if(DoTimings(FirstPolyScanline, odd)) abort = true; + + if (!abort) abort = (first && DoTimingsSlopes(rp, y, odd)) // incorrect. needs research; behavior is strange... 
+ || DoTimings(PerPolyScanline, odd) + || (!CheckTimings(MinToStartPoly, odd)); + if (abort) { CheckSlope(rp, y); Step(rp); } else if (polygon->IsShadowMask) - ;//RenderShadowMaskScanline(rp, y); + abort = RenderShadowMaskScanline(rp, y, odd); else - if (RenderPolygonScanline(rp, y, odd)) abort = true; + abort = RenderPolygonScanline(rp, y, odd); + + first = false; } } @@ -1500,7 +1537,7 @@ u32 SoftRenderer::CalculateFogDensity(u32 pixeladdr) return density; } -void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd) +void SoftRenderer::ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone) { // to consider: // clearing all polygon fog flags if the master flag isn't set? @@ -1759,10 +1796,13 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) s8 buffersize = 0; RasterTiming = InitialTiming; - s32 timingadvance = InitialTiming; bool abort = false; - //u32* RDLinesReg = &GPU.GPU3D.RDLines; ClearBuffers(); + s32 gpu2dtracking = InitGPU2DTimeout; + s32 gpu2dfreetime = InitGPU2DTimeout; + s32 prev2dtime; + bool readodd = true; + for (u8 quarter = 0; quarter < 4; quarter++) for (u8 bufferline = 0; bufferline < 48; bufferline += 2) { @@ -1770,13 +1810,19 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) RasterTimingEven = 0; RasterTiming += ScanlineIncrement; + gpu2dtracking += GPU2DReadSLPair; if (abort) RasterTiming += AbortIncrement; // if previous scanline was aborted, allow an extra 12 pixels worth of timing if (y >= 50) { - if (RasterTiming > Post50Max) RasterTiming = Post50Max; - timingadvance = 0; - buffersize = 48; + gpu2dfreetime = 0; + if (RasterTiming > Post50Max) + { + s32 temp = RasterTiming - Post50Max; + RasterTiming = Post50Max; + gpu2dtracking -= temp; + } + if (buffersize > 48) buffersize = 48; } abort = RenderScanline(y, j, true); @@ -1785,50 +1831,59 @@ void SoftRenderer::RenderPolygons(bool threaded, Polygon** polygons, int npolys) buffersize += 2; 
//RasterTiming += ScanlineBreak; s32 timespent = std::max(RasterTimingOdd, RasterTimingEven); - - /*if (timespent > FreeTiming) - { - abort = true; - timespent -= FreeTiming; - } - else if (!abort) - { - abort = false; - timespent -= FreeTiming; - }*/ - //if (!abort) - //if (buffersize > 48) timespent -= PerScanlineRecup; - /*else*/ + timespent -= FreeTiming; - if (timespent > 0) - { - RasterTiming -= timespent; - timingadvance -= timespent; - } + // measure scanlines being read here. + gpu2dtracking -= timespent; + gpu2dfreetime -= timespent; - if (timingadvance < 0) for (s32 i = (ScanlinePairLength / 2) * buffersize; i > RasterTiming + (ScanlinePairLength / 2); i -= ScanlinePairLength / 2) buffersize -= 1; - if (buffersize < 0) buffersize = 0; + if (timespent > 0) RasterTiming -= timespent; + + //if (RasterTiming < 0) RasterTiming = 0; + if (gpu2dfreetime <= 0) + { + buffersize = 0; + if (gpu2dtracking > 0) + { + s32 i = gpu2dtracking; + while (true) + { + s32 comp = GPU2DReadSLPair/2; + //if (readodd) comp = GPU2DSpeedOutsidePair + GPU2DReadScanline; + //else comp = GPU2DSpeedWithinPair + GPU2DReadScanline; - // seems to display the lowest scanline buffer count reached during the current frame. - // we also caps it to 46 here, because this reg does that too for some reason. - if (quarter >= 1 && buffersize < GPU.GPU3D.RDLinesDisplay) GPU.GPU3D.RDLinesDisplay = buffersize; + if (i < comp) break; + + i -= comp; + buffersize++; + //readodd = !readodd; + } + + if (i > 0) buffersize++; + } + + // seems to display the lowest scanline buffer count reached during the current frame. + // we also cap it to 46 here, because this reg does that too for some reason.
+ if (GPU.GPU3D.RDLines > buffersize) GPU.GPU3D.RDLines = buffersize; + } if (prevbufferline >= 0) { - ScanlineFinalPass(y-2, prevbufferline, true); - ScanlineFinalPass(y-1, prevbufferline+1, false); + ScanlineFinalPass(y-2, prevbufferline, true, prev2dtime); + ScanlineFinalPass(y-1, prevbufferline+1, false, prev2dtime); } y += 2; prevbufferline = bufferline; + prev2dtime = gpu2dtracking; if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); } - ScanlineFinalPass(190, prevbufferline, true); - ScanlineFinalPass(191, prevbufferline+1, false); + ScanlineFinalPass(190, prevbufferline, true, prev2dtime); + ScanlineFinalPass(191, prevbufferline+1, false, prev2dtime); if (threaded) Platform::Semaphore_Post(Sema_ScanlineCount); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 01187a8a..4b9b31eb 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -455,7 +455,9 @@ private: melonDS::GPU& GPU; RendererPolygon PolygonList[2048]; bool DoTimings(s32 cycles, bool odd); + bool CheckTimings(s32 cycles, bool odd); u32 DoTimingsPixels(s32 pixels, bool odd); + bool DoTimingsSlopes(RendererPolygon* rp, s32 y, bool odd); void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t); void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); @@ -464,11 +466,11 @@ private: void SetupPolygon(RendererPolygon* rp, Polygon* polygon); void Step(RendererPolygon* rp); void CheckSlope(RendererPolygon* rp, s32 y); - void RenderShadowMaskScanline(RendererPolygon* rp, s32 y); + bool RenderShadowMaskScanline(RendererPolygon* rp, s32 y, bool odd); bool RenderPolygonScanline(RendererPolygon* rp, s32 y, bool odd); bool RenderScanline(s32 y, int npolys, bool odd); u32 CalculateFogDensity(u32 pixeladdr); - void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd); + void ScanlineFinalPass(s32 y, u8 rdbufferoffset, bool odd, s32 uhohzone); void ClearBuffers(); void 
RenderPolygons(bool threaded, Polygon** polygons, int npolys);