From 249687a2ce9c0fd08f5e2b2b69f0129754d00214 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 24 Feb 2024 14:18:45 -0500 Subject: [PATCH] rework 4: now with proper edge marking bug emulation! --- src/GPU3D.h | 30 ++--- src/GPU3D_Soft.cpp | 273 +++++++++++++++------------------------------ src/GPU3D_Soft.h | 1 + 3 files changed, 106 insertions(+), 198 deletions(-) diff --git a/src/GPU3D.h b/src/GPU3D.h index 27162854..8719a7e1 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -341,13 +341,13 @@ public: static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; - static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. - static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline + //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. + //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline // and beginning reading the second scanline of a scanline pair. - static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. - static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. - static constexpr int InitGPU2DTimeout = 52128 * TimingFrac; // 51618? 51874? | when it finishes reading the first scanline. - static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. + //static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. + //static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. + static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. + //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? // GPU 3D Rasterization Timings: For Emulating Scanline Timeout @@ -358,22 +358,20 @@ public: //static constexpr int ScanlineBreak2 = 40 * TimingFrac; //static constexpr int FakeTiming = 2 * TimingFrac; //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. - static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair - static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, + //static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair + //static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, // it just cares about if its the first 50 scanlines to speedrun rendering? static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress // (can be interpreted as the minimum amount of cycles for the next scanline // pair to start after the previous pair began) (related to final pass?) static constexpr int ScanlinePushDelay = 242 * TimingFrac; - static constexpr int TimeoutIncrement = 2130 * TimingFrac; - static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair - static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" - static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) + //static constexpr int TimeoutIncrement = 2130 * TimingFrac; + //static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair + //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" + //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) // (why does the next pair get more time if the previous scanline is aborted?) static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set - static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) - // (Amount of time before the end of the cycle a scanline must abort?) - static constexpr int FinishScanline = 512 * TimingFrac; + //static constexpr int FinishScanline = 512 * TimingFrac; // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors @@ -392,6 +390,8 @@ public: static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) + static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) + // (Amount of time before the end of the cycle a scanline must abort?) // static constexpr int RasterTimingCap = 51116 * TimingFrac; // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 923b8a77..d3b72e08 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -19,6 +19,7 @@ #include "GPU3D_Soft.h" #include +#include #include #include #include "NDS.h" @@ -175,6 +176,8 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter) { + DoTimings(RastDelay, timingcounter); + // determine the timing impact of the first polygon's slopes. Polygon* polygon = rp->PolyData; @@ -1457,6 +1460,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter) { + *timingcounter = 0; bool abort = false; bool first = true; for (int i = 0; i < npolys; i++) @@ -1466,7 +1470,7 @@ void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timing if (y == polygon->YBottom && y != polygon->YTop) { - if (!abort) abort = (first && DoTimings(FirstNull, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); + if (!abort) abort = (first && DoTimings(FirstNull+RastDelay, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); first = false; } @@ -1555,6 +1559,7 @@ bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z, u32 pixel } } +template void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext) { // to consider: @@ -1728,6 +1733,11 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24); } } + if constexpr (push) + { + memcpy(&FinalBuffer[y*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], ScanlineWidth*4); + Platform::Semaphore_Post(Sema_ScanlineCount); + } } void SoftRenderer::ClearBuffers(const GPU& gpu) @@ -1846,190 +1856,87 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys) //init internal buffer ClearBuffers(gpu); - // init all this junk i need to keep track of - s32 rasterevents[RasterEvents_MAX]; - rasterevents[RenderStart] = 0; - rasterevents[RenderFinal] = FrameLength; - rasterevents[PushScanline] = FrameLength; - rasterevents[PushScanlineP2] = FrameLength; - rasterevents[ScanlineRead] = InitGPU2DTimeout; - ScanlineTimeout = FrameLength; - RasterTiming = 0; - s32 rastertimingeven = 0; - s32 rastertimingodd = 0; - u8 scanlinesread = 0; - u8 scanlinesinit = 0; - u8 scanlinesfin = 0; - u8 scanlinespushed = 0; - u8 scanlinespushed2 = 0; - s16 scanlineswaitingforpush = 0; - s16 scanlineswaitingforread = 0; - u8 nextevent; - u16 leftovers; - bool evenread = false; - s32 timespent = 0; - s32 prevtimespent = 0; - bool edgebug = false; - bool prevedgebug = false; - - // until all scanlines have been pushed and read continue looping... CHECKME: unless its time for the next 3d frame to begin - while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < (FrameLength-RastDelay))) + u32 slread[192]; // scanline read times + for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time? { - // check all events to find the earliest scheduled one - nextevent = 0; - for (u8 i = 1; i < RasterEvents_MAX; i++) - { - if (rasterevents[i] < rasterevents[nextevent]) - nextevent = i; - } - - // if all events are scheduled for after the next frame begins, ABORT - if (rasterevents[nextevent] >= FrameLength) break; - - switch (nextevent) - { - - // initial rendering pass (polygons, texturing, etc.) (variable cycle length) - case RenderStart: - { - // set current raster time to the start of the event - RasterTiming = rasterevents[RenderStart]; - - s32 rastertimingeven = 0; - s32 rastertimingodd = 0; - // scanlines are rendered in pairs of two - RenderScanline(gpu, scanlinesinit, j, &rastertimingeven); - RenderScanline(gpu, scanlinesinit+1, j, &rastertimingodd); - scanlinesinit += 2; - - // a new scanline pair cannot begin until both scanlines are finished. - prevtimespent = timespent; - timespent = std::max(rastertimingeven, rastertimingodd); - - // a new scanline pair cannot begin until the finishing pass + push is done. - if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen)) - RasterTiming += FinalPassLen; - else - RasterTiming += timespent; - - // 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does. - s32 timeoutdist = ScanlineTimeout - RasterTiming; - prevedgebug = edgebug; - if (timeoutdist < 49385) edgebug = true; - else edgebug = false; - RasterTiming += std::clamp(timeoutdist, 0, 12); - - //set next scanline timeout - if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen + (ScanlineReadInc*evenread);//(ScanlineReadSpeed+RastDelay); - else ScanlineTimeout += TimeoutIncrement; - - // schedule next scanline pair + the final pass of the latest pair - rasterevents[RenderFinal] = RasterTiming; - if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?) - else rasterevents[RenderStart] = FrameLength; - break; - } - - // final rendering pass (edge marking, anti-aliasing, fog) (fixed length of 496 (maybe 500?) cycles) - case RenderFinal: - { - // schedule a scanline push event - rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay; - - // if the first scanline pair was just finished only render one scanline - if (scanlinesfin > 0) - { - ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug); - scanlineswaitingforpush++; - scanlinesfin++; - } - - // if the last scanline pair was just finished only render one scanline - if (scanlinesfin < 191) - { - ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug); - scanlineswaitingforpush++; - scanlinesfin++; - } - // unschedule final pass event - if (scanlinesfin != 191) - rasterevents[RenderFinal] = FrameLength; - else // schedule next final pass event to immediately after the current one - rasterevents[RenderFinal] += FinalPassLen; - break; - } - - // push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) 256? - case PushScanline: - { - // reschedule events if buffer is full - if (scanlineswaitingforread >= 48) - { - rasterevents[PushScanline] = rasterevents[ScanlineRead]; - - // dont reschedule these events if they're done. - if (scanlinesinit < 192) - rasterevents[RenderStart] = rasterevents[ScanlineRead] + RastDelay; - if (scanlinesfin < 192) - rasterevents[RenderFinal] = rasterevents[ScanlineRead]; - - break; - } - - // if a scanline push might intersect a read determine the point at which it intersects - s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]); - leftovers = BeginPushScanline(scanlinespushed, pixelstopush); - - scanlineswaitingforpush--; - scanlinespushed++; - - // schedule the finish push event if needed - if (leftovers != 0) rasterevents[PushScanlineP2] = rasterevents[ScanlineRead]; - else - { - scanlineswaitingforread++; - scanlinespushed2++; - } - - if (scanlineswaitingforpush <= 0) - rasterevents[PushScanline] = FrameLength; // unsched event if no scanlines are waiting to be finished - - break; - } - - // 2d engine reading scanlines from the intermediary "framebuffer" - case ScanlineRead: - { - // read scanline from buffer - ReadScanline(scanlinesread); - - // avoid breaking seperate thread. - if constexpr (threaded) - Platform::Semaphore_Post(Sema_ScanlineCount); - - scanlinesread++; - scanlineswaitingforread--; - evenread = !evenread; - - // reschedule event for one scanline later unless all scanlines have been read - if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc; - else rasterevents[ScanlineRead] = FrameLength; - break; - } - - // finish pushing a scanline to the buffer if it got interrupted by the read process. - case PushScanlineP2: - { - FinishPushScanline(scanlinespushed2, leftovers); - scanlineswaitingforread++; - scanlinespushed2++; - - // unschedule event if all partially pushed scanlines have been pushed - if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = FrameLength; - break; - } - } + slread[i] = time; } + + ScanlineTimeout = FrameLength; // CHECKME + + s32 rastertimingeven; // always init to 0 at the start of a scanline render + s32 rastertimingodd; + + s32 scanlineswaiting = 0; + s32 nextread = 0; + + u32 timespent; + u32 prevtimespent; + // scanlines are rendered in pairs of two + RenderScanline(gpu, 0, j, &rastertimingeven); + RenderScanline(gpu, 1, j, &rastertimingodd); + + RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + // if first pair was not delayed past the first read, then later scanlines cannot either + // this allows us to implement a fast path + //if (slread[0] - timespent + ScanlinePushDelay >= 256) + { + ScanlineTimeout = slread[1] - FinalPassLen; + + RenderScanline(gpu, 2, j, &rastertimingeven); + RenderScanline(gpu, 3, j, &rastertimingodd); + + prevtimespent = timespent; + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + ScanlineFinalPass(gpu.GPU3D, 0, true, true); + scanlineswaiting++; + for (int y = 4; y < 192; y+=2) + { + ScanlineTimeout = slread[y-1] - FinalPassLen; + + RenderScanline(gpu, y, j, &rastertimingeven); + RenderScanline(gpu, y+1, j, &rastertimingodd); + + prevtimespent = timespent; + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); + + scanlineswaiting+=2; + + while (scanlineswaiting >= 47) + { + if (RasterTiming < slread[nextread]) RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565? + scanlineswaiting--; + nextread++; + } + + ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502); + } + + ScanlineFinalPass(gpu.GPU3D, 189, timespent >= 502, timespent >= 502); + ScanlineFinalPass(gpu.GPU3D, 190, timespent >= 502, true); + + ScanlineFinalPass(gpu.GPU3D, 191, true, true); + } + /*else + { + ScanlineFinalPass(gpu, 0, false, false); + + s32 pixelstopush = slread[0] - (timespent + ScanlinePushDelay); + if (pixelstopush > 256) pixelstopush = 256; + //timespent + ScanlinePushDelay + ScanlineReadSpeed > slread[0] + + rastertimingeven = 0; + rastertimingodd = 0; + + RenderScanline(gpu, 2, j, &rastertimingeven); + RenderScanline(gpu, 3, j, &rastertimingodd); + }*/ } void SoftRenderer::VCount144(GPU& gpu) diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 6f81fae6..3814762d 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -472,6 +472,7 @@ private: void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); + template void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ClearBuffers(const GPU& gpu); u16 BeginPushScanline(s32 y, s32 pixelstodraw);