rework 4: now with proper edge marking bug emulation!

This commit is contained in:
Jaklyy 2024-02-24 14:18:45 -05:00
parent 3256e054fa
commit 249687a2ce
3 changed files with 106 additions and 198 deletions

View File

@ -341,13 +341,13 @@ public:
static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed;
static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair. //static constexpr int GPU2DSpeedFirstInPair = 810 * TimingFrac; // 810 | the delay between finishing reading a pair and beginning reading a new pair.
static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline //static constexpr int GPU2DSpeedSecondInPair = 296 * TimingFrac; // 296 | 295??? | the delay between finishing reading the first scanline
// and beginning reading the second scanline of a scanline pair. // and beginning reading the second scanline of a scanline pair.
static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline. //static constexpr int GPU2DReadScanline = 256 * TimingFrac; // 256 | the time it takes to read a scanline.
static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment. //static constexpr int GPU2DReadSLPair = 1618 * TimingFrac; // 1618 | notably the same as the scanline increment.
static constexpr int InitGPU2DTimeout = 52128 * TimingFrac; // 51618? 51874? | when it finishes reading the first scanline. static constexpr int InitGPU2DTimeout = 51875 * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline.
static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines. //static constexpr int GPU2D48Scanlines = GPU2DReadSLPair * 24; // time to read 48 scanlines.
static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this?
// GPU 3D Rasterization Timings: For Emulating Scanline Timeout // GPU 3D Rasterization Timings: For Emulating Scanline Timeout
@ -358,22 +358,20 @@ public:
//static constexpr int ScanlineBreak2 = 40 * TimingFrac; //static constexpr int ScanlineBreak2 = 40 * TimingFrac;
//static constexpr int FakeTiming = 2 * TimingFrac; //static constexpr int FakeTiming = 2 * TimingFrac;
//static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one. //static constexpr int FraudulentTiming = 1120 * TimingFrac; // bad theory. todo: find a better one.
static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair //static constexpr int InitialTiming = 48688 * TimingFrac; // 48688 | add 1618*2 to get the timeout of the second scanline pair
static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is, //static constexpr int Post50Max = 51116 * TimingFrac; // 51116 | for some reason it doesn't care about how full it actually is,
// it just cares about if it's the first 50 scanlines to speedrun rendering? // it just cares about if it's the first 50 scanlines to speedrun rendering?
static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress
// (can be interpreted as the minimum amount of cycles for the next scanline // (can be interpreted as the minimum amount of cycles for the next scanline
// pair to start after the previous pair began) (related to final pass?) // pair to start after the previous pair began) (related to final pass?)
static constexpr int ScanlinePushDelay = 242 * TimingFrac; static constexpr int ScanlinePushDelay = 242 * TimingFrac;
static constexpr int TimeoutIncrement = 2130 * TimingFrac; //static constexpr int TimeoutIncrement = 2130 * TimingFrac;
static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair //static constexpr int ScanlineIncrementold = 1618 * TimingFrac; // 1618 | how much to regain per scanline pair
static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains" //static constexpr int ScanlineIncrement = 2114 * TimingFrac; // 2114 | how much time a scanline pair "gains"
static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126) //static constexpr int AbortIncrement = 12 * TimingFrac; // 12 | how much extra to regain after an aborted scanline (total 2126)
// (why does the next pair get more time if the previous scanline is aborted?) // (why does the next pair get more time if the previous scanline is aborted?)
static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set static constexpr int UnderflowFlag = 14 * TimingFrac; // 14 | How many cycles need to be left for the 3ddispcnt rdlines underflow flag to be set
static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) //static constexpr int FinishScanline = 512 * TimingFrac;
// (Amount of time before the end of the cycle a scanline must abort?)
static constexpr int FinishScanline = 512 * TimingFrac;
// GPU 3D Rasterization Timings II: For Tracking Timing Behaviors // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors
@ -392,6 +390,8 @@ public:
static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1. static constexpr int FirstPerSlope = 1 * TimingFrac; // 1 | for each "slope" the first polygon has in this scanline increment it by 1.
// (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info) // (see DoTimingsSlopes() in GPU3D_Soft.cpp for more info)
static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?) static constexpr int FirstNull = 1 * TimingFrac; // 1 | if the first polygon is "null" (probably wrong?)
static constexpr int RastDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?)
// (Amount of time before the end of the cycle a scanline must abort?)
// static constexpr int RasterTimingCap = 51116 * TimingFrac; // static constexpr int RasterTimingCap = 51116 * TimingFrac;
// static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED // static constexpr int PerScanlineTiming = 1064 * TimingFrac; // approximate currently, used to calc RDLines. TEMPORARY UNTIL ACCURATE "FRAMEBUFFER" CAN BE IMPLEMENTED

View File

@ -19,6 +19,7 @@
#include "GPU3D_Soft.h" #include "GPU3D_Soft.h"
#include <algorithm> #include <algorithm>
#include <initializer_list>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include "NDS.h" #include "NDS.h"
@ -175,6 +176,8 @@ u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter)
bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter) bool SoftRenderer::DoTimingsSlopes(RendererPolygon* rp, s32 y, s32* timingcounter)
{ {
DoTimings(RastDelay, timingcounter);
// determine the timing impact of the first polygon's slopes. // determine the timing impact of the first polygon's slopes.
Polygon* polygon = rp->PolyData; Polygon* polygon = rp->PolyData;
@ -1457,6 +1460,7 @@ bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3
void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter) void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter)
{ {
*timingcounter = 0;
bool abort = false; bool abort = false;
bool first = true; bool first = true;
for (int i = 0; i < npolys; i++) for (int i = 0; i < npolys; i++)
@ -1466,7 +1470,7 @@ void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timing
if (y == polygon->YBottom && y != polygon->YTop) if (y == polygon->YBottom && y != polygon->YTop)
{ {
if (!abort) abort = (first && DoTimings(FirstNull, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter); if (!abort) abort = (first && DoTimings(FirstNull+RastDelay, timingcounter)) || DoTimings(EmptyPolyScanline, timingcounter);
first = false; first = false;
} }
@ -1555,6 +1559,7 @@ bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z, u32 pixel
} }
} }
template <bool push>
void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext) void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext)
{ {
// to consider: // to consider:
@ -1728,6 +1733,11 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev,
ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24); ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24);
} }
} }
if constexpr (push)
{
memcpy(&FinalBuffer[y*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], ScanlineWidth*4);
Platform::Semaphore_Post(Sema_ScanlineCount);
}
} }
void SoftRenderer::ClearBuffers(const GPU& gpu) void SoftRenderer::ClearBuffers(const GPU& gpu)
@ -1846,190 +1856,87 @@ void SoftRenderer::RenderPolygons(GPU& gpu, Polygon** polygons, int npolys)
//init internal buffer //init internal buffer
ClearBuffers(gpu); ClearBuffers(gpu);
// init all this junk i need to keep track of u32 slread[192]; // scanline read times
s32 rasterevents[RasterEvents_MAX]; for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) // CHECKME: is this computed at compile time?
rasterevents[RenderStart] = 0;
rasterevents[RenderFinal] = FrameLength;
rasterevents[PushScanline] = FrameLength;
rasterevents[PushScanlineP2] = FrameLength;
rasterevents[ScanlineRead] = InitGPU2DTimeout;
ScanlineTimeout = FrameLength;
RasterTiming = 0;
s32 rastertimingeven = 0;
s32 rastertimingodd = 0;
u8 scanlinesread = 0;
u8 scanlinesinit = 0;
u8 scanlinesfin = 0;
u8 scanlinespushed = 0;
u8 scanlinespushed2 = 0;
s16 scanlineswaitingforpush = 0;
s16 scanlineswaitingforread = 0;
u8 nextevent;
u16 leftovers;
bool evenread = false;
s32 timespent = 0;
s32 prevtimespent = 0;
bool edgebug = false;
bool prevedgebug = false;
// until all scanlines have been pushed and read continue looping... CHECKME: unless it's time for the next 3d frame to begin
while ((scanlinesread < 192 || scanlinespushed2 < 192) && (RasterTiming < (FrameLength-RastDelay)))
{ {
// check all events to find the earliest scheduled one slread[i] = time;
nextevent = 0;
for (u8 i = 1; i < RasterEvents_MAX; i++)
{
if (rasterevents[i] < rasterevents[nextevent])
nextevent = i;
}
// if all events are scheduled for after the next frame begins, ABORT
if (rasterevents[nextevent] >= FrameLength) break;
switch (nextevent)
{
// initial rendering pass (polygons, texturing, etc.) (variable cycle length)
case RenderStart:
{
// set current raster time to the start of the event
RasterTiming = rasterevents[RenderStart];
s32 rastertimingeven = 0;
s32 rastertimingodd = 0;
// scanlines are rendered in pairs of two
RenderScanline(gpu, scanlinesinit, j, &rastertimingeven);
RenderScanline(gpu, scanlinesinit+1, j, &rastertimingodd);
scanlinesinit += 2;
// a new scanline pair cannot begin until both scanlines are finished.
prevtimespent = timespent;
timespent = std::max(rastertimingeven, rastertimingodd);
// a new scanline pair cannot begin until the finishing pass + push is done.
if ((RasterTiming + timespent) < (RasterTiming+FinalPassLen))
RasterTiming += FinalPassLen;
else
RasterTiming += timespent;
// 12 cycles at the end of the scanline are always used, unless the scanline got within 12 cycles of timing out. Don't ask why, it just does.
s32 timeoutdist = ScanlineTimeout - RasterTiming;
prevedgebug = edgebug;
if (timeoutdist < 49385) edgebug = true;
else edgebug = false;
RasterTiming += std::clamp(timeoutdist, 0, 12);
//set next scanline timeout
if (ScanlineTimeout == FrameLength) ScanlineTimeout = rasterevents[ScanlineRead] - FinalPassLen + (ScanlineReadInc*evenread);//(ScanlineReadSpeed+RastDelay);
else ScanlineTimeout += TimeoutIncrement;
// schedule next scanline pair + the final pass of the latest pair
rasterevents[RenderFinal] = RasterTiming;
if (scanlinesinit < 192) rasterevents[RenderStart] = RasterTiming+RastDelay; // scheduled 4 cycles late (presumably due to initial polygon timing shenanigans?)
else rasterevents[RenderStart] = FrameLength;
break;
}
// final rendering pass (edge marking, anti-aliasing, fog) (fixed length of 496 (maybe 500?) cycles)
case RenderFinal:
{
// schedule a scanline push event
rasterevents[PushScanline] = rasterevents[RenderFinal] + ScanlinePushDelay;
// if the first scanline pair was just finished only render one scanline
if (scanlinesfin > 0)
{
ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug);
scanlineswaitingforpush++;
scanlinesfin++;
}
// if the last scanline pair was just finished only render one scanline
if (scanlinesfin < 191)
{
ScanlineFinalPass(gpu.GPU3D, scanlinesfin, timespent+4 < 501 || edgebug, prevtimespent+4 < 501 || prevedgebug);
scanlineswaitingforpush++;
scanlinesfin++;
}
// unschedule final pass event
if (scanlinesfin != 191)
rasterevents[RenderFinal] = FrameLength;
else // schedule next final pass event to immediately after the current one
rasterevents[RenderFinal] += FinalPassLen;
break;
}
// push scanlines to the intermediary "frame buffer" for the 2d engine to read them. (fixed length of ??? cycles) 256?
case PushScanline:
{
// reschedule events if buffer is full
if (scanlineswaitingforread >= 48)
{
rasterevents[PushScanline] = rasterevents[ScanlineRead];
// dont reschedule these events if they're done.
if (scanlinesinit < 192)
rasterevents[RenderStart] = rasterevents[ScanlineRead] + RastDelay;
if (scanlinesfin < 192)
rasterevents[RenderFinal] = rasterevents[ScanlineRead];
break;
}
// if a scanline push might intersect a read determine the point at which it intersects
s32 pixelstopush = (scanlinespushed > scanlinesread ? 256 : (rasterevents[ScanlineRead] + (ScanlineReadInc*scanlineswaitingforread)) - rasterevents[PushScanline]);
leftovers = BeginPushScanline(scanlinespushed, pixelstopush);
scanlineswaitingforpush--;
scanlinespushed++;
// schedule the finish push event if needed
if (leftovers != 0) rasterevents[PushScanlineP2] = rasterevents[ScanlineRead];
else
{
scanlineswaitingforread++;
scanlinespushed2++;
}
if (scanlineswaitingforpush <= 0)
rasterevents[PushScanline] = FrameLength; // unsched event if no scanlines are waiting to be finished
break;
}
// 2d engine reading scanlines from the intermediary "framebuffer"
case ScanlineRead:
{
// read scanline from buffer
ReadScanline(scanlinesread);
// avoid breaking separate thread.
if constexpr (threaded)
Platform::Semaphore_Post(Sema_ScanlineCount);
scanlinesread++;
scanlineswaitingforread--;
evenread = !evenread;
// reschedule event for one scanline later unless all scanlines have been read
if (scanlinesread < 192) rasterevents[ScanlineRead] += ScanlineReadInc;
else rasterevents[ScanlineRead] = FrameLength;
break;
}
// finish pushing a scanline to the buffer if it got interrupted by the read process.
case PushScanlineP2:
{
FinishPushScanline(scanlinespushed2, leftovers);
scanlineswaitingforread++;
scanlinespushed2++;
// unschedule event if all partially pushed scanlines have been pushed
if (scanlinespushed2 >= scanlinespushed) rasterevents[PushScanlineP2] = FrameLength;
break;
}
}
} }
ScanlineTimeout = FrameLength; // CHECKME
s32 rastertimingeven; // always init to 0 at the start of a scanline render
s32 rastertimingodd;
s32 scanlineswaiting = 0;
s32 nextread = 0;
u32 timespent;
u32 prevtimespent;
// scanlines are rendered in pairs of two
RenderScanline(gpu, 0, j, &rastertimingeven);
RenderScanline(gpu, 1, j, &rastertimingodd);
RasterTiming = timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
// if first pair was not delayed past the first read, then later scanlines cannot either
// this allows us to implement a fast path
//if (slread[0] - timespent + ScanlinePushDelay >= 256)
{
ScanlineTimeout = slread[1] - FinalPassLen;
RenderScanline(gpu, 2, j, &rastertimingeven);
RenderScanline(gpu, 3, j, &rastertimingodd);
prevtimespent = timespent;
RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
ScanlineFinalPass<true>(gpu.GPU3D, 0, true, true);
scanlineswaiting++;
for (int y = 4; y < 192; y+=2)
{
ScanlineTimeout = slread[y-1] - FinalPassLen;
RenderScanline(gpu, y, j, &rastertimingeven);
RenderScanline(gpu, y+1, j, &rastertimingodd);
prevtimespent = timespent;
RasterTiming += timespent = std::max(std::initializer_list<s32> {rastertimingeven, rastertimingodd, FinalPassLen});
RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);
scanlineswaiting+=2;
while (scanlineswaiting >= 47)
{
if (RasterTiming < slread[nextread]) RasterTiming += timespent = (slread[nextread] + 565) - RasterTiming; // why + 565?
scanlineswaiting--;
nextread++;
}
ScanlineFinalPass<true>(gpu.GPU3D, y-3, prevtimespent >= 502 || y-3 == 1, timespent >= 502);
ScanlineFinalPass<true>(gpu.GPU3D, y-2, prevtimespent >= 502, timespent >= 502);
}
ScanlineFinalPass<true>(gpu.GPU3D, 189, timespent >= 502, timespent >= 502);
ScanlineFinalPass<true>(gpu.GPU3D, 190, timespent >= 502, true);
ScanlineFinalPass<true>(gpu.GPU3D, 191, true, true);
}
/*else
{
ScanlineFinalPass(gpu, 0, false, false);
s32 pixelstopush = slread[0] - (timespent + ScanlinePushDelay);
if (pixelstopush > 256) pixelstopush = 256;
//timespent + ScanlinePushDelay + ScanlineReadSpeed > slread[0]
rastertimingeven = 0;
rastertimingodd = 0;
RenderScanline(gpu, 2, j, &rastertimingeven);
RenderScanline(gpu, 3, j, &rastertimingodd);
}*/
} }
void SoftRenderer::VCount144(GPU& gpu) void SoftRenderer::VCount144(GPU& gpu)

View File

@ -472,6 +472,7 @@ private:
void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter); void RenderScanline(const GPU& gpu, s32 y, int npolys, s32* timingcounter);
u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const;
bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr);
template <bool push>
void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext);
void ClearBuffers(const GPU& gpu); void ClearBuffers(const GPU& gpu);
u16 BeginPushScanline(s32 y, s32 pixelstodraw); u16 BeginPushScanline(s32 y, s32 pixelstodraw);