diff --git a/src/GPU.cpp b/src/GPU.cpp index f24d8ab5..c736fe60 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -30,7 +30,7 @@ namespace melonDS using Platform::Log; using Platform::LogLevel; -#define LINE_CYCLES (355*6) +#define LINE_CYCLES (355*6) #define HBLANK_CYCLES (48+(256*6)) #define FRAME_CYCLES (LINE_CYCLES * 263) @@ -878,6 +878,11 @@ void GPU::StartHBlank(u32 line) noexcept { DispStat[0] |= (1<<1); DispStat[1] |= (1<<1); + + // TODO: not quite the correct update time, but... close enough i guess? + int scanline = (VCount == 262 ? 0 : (line+1)); + if (!(scanline & 1)) GPU3D.ScanlineSync(scanline); + if (GPU3D.UnderflowFlagVCount == scanline) GPU3D.DispCnt |= (1<<12); if (VCount < 192) { @@ -1013,11 +1018,11 @@ void GPU::StartScanline(u32 line) noexcept { if (VCount == 192) { - // in reality rendering already finishes at line 144 + // in reality rendering already finishes at line 144 (can take up to ~191 depending on load) // and games might already start to modify texture memory. // That doesn't matter for us because we cache the entire // texture memory anyway and only update it before the start - //of the next frame. + // of the next frame. 
// So we can give the rasteriser a bit more headroom GPU3D.VCount144(*this); diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 4a1426aa..207d0076 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -183,6 +183,8 @@ void GPU3D::ResetRenderingState() noexcept RenderClearAttr1 = 0x3F000000; RenderClearAttr2 = 0x00007FFF; + + RenderFrameIdentical = false; } void GPU3D::Reset() noexcept @@ -236,7 +238,7 @@ void GPU3D::Reset() noexcept TotalParams = 0; GeometryEnabled = false; - RenderingEnabled = false; + RenderingEnabled = 0; DispCnt = 0; AlphaRefVal = 0; @@ -254,6 +256,9 @@ void GPU3D::Reset() noexcept ClearAttr2 = 0x00007FFF; ResetRenderingState(); + + UnderflowFlagVCount = -1; + RDLines = 63; AbortFrame = false; @@ -549,12 +554,16 @@ void GPU3D::DoSavestate(Savestate* file) noexcept file->Bool32(&AbortFrame); file->Bool32(&GeometryEnabled); - file->Bool32(&RenderingEnabled); + file->Var8(&RenderingEnabled); file->Var32(&PolygonMode); file->Var32(&PolygonAttr); file->Var32(&CurPolygonAttr); file->Var32(&TexParam); file->Var32(&TexPalette); + + file->Var8(&RDLines); + file->Var8(&RDLinesTemp); + RenderFrameIdentical = false; if (softRenderer && softRenderer->IsThreaded()) { @@ -567,9 +576,19 @@ void GPU3D::DoSavestate(Savestate* file) noexcept void GPU3D::SetEnabled(bool geometry, bool rendering) noexcept { GeometryEnabled = geometry; - RenderingEnabled = rendering; - - if (!rendering) ResetRenderingState(); + if (rendering) + { + if (RenderingEnabled == 0) + { + RenderingEnabled = 1; + RDLinesTemp = 63; // CHECKME + } + } + else + { + ResetRenderingState(); + RenderingEnabled = 0; + } } @@ -2458,12 +2477,16 @@ bool YSort(Polygon* a, Polygon* b) void GPU3D::VBlank() noexcept { + if (RenderingEnabled) + RDLines = RDLinesTemp; + if (GeometryEnabled) { - if (RenderingEnabled) + if (RenderingEnabled >= 3) { if (FlushRequest) { + swap: if (NumPolygons) { // separate translucent polygons from opaque ones @@ -2517,6 +2540,15 @@ void GPU3D::VBlank() noexcept RenderClearAttr1 = 
ClearAttr1; RenderClearAttr2 = ClearAttr2; } + else if (RenderingEnabled != 0) + { + if (FlushRequest) + { + RenderingEnabled++; + if (RenderingEnabled >= 3) + goto swap; + } + } if (FlushRequest) { @@ -2545,6 +2577,10 @@ void GPU3D::SetRenderXPos(u16 xpos) noexcept RenderXPos = xpos & 0x01FF; } +void GPU3D::ScanlineSync(int line) noexcept +{ + CurrentRenderer->ScanlineSync(line); +} u32* GPU3D::GetLine(int line) noexcept { @@ -2672,7 +2708,7 @@ u16 GPU3D::Read16(u32 addr) noexcept return DispCnt; case 0x04000320: - return 46; // TODO, eventually + return RDLines; case 0x04000600: { @@ -2716,7 +2752,7 @@ u32 GPU3D::Read32(u32 addr) noexcept return DispCnt; case 0x04000320: - return 46; // TODO, eventually + return RDLines; case 0x04000600: { diff --git a/src/GPU3D.h b/src/GPU3D.h index d10df55f..39be1f3f 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -25,10 +25,12 @@ #include "Savestate.h" #include "FIFO.h" + namespace melonDS { class GPU; + struct Vertex { s32 Position[4]; @@ -112,6 +114,7 @@ public: void SetRenderXPos(u16 xpos) noexcept; [[nodiscard]] u16 GetRenderXPos() const noexcept { return RenderXPos; } + void ScanlineSync(int line) noexcept; u32* GetLine(int line) noexcept; void WriteToGXFIFO(u32 val) noexcept; @@ -241,9 +244,18 @@ public: u32 TotalParams = 0; bool GeometryEnabled = false; - bool RenderingEnabled = false; + // 0 = powered off + // 1 = powered on, inactive + // 2 = one swap buffers, inactive + // 3 = two swap buffers, active; + u8 RenderingEnabled = 0; u32 DispCnt = 0; + + u16 UnderflowFlagVCount = 0; + u8 RDLines = 0; + u8 RDLinesTemp = 0; + u8 AlphaRefVal = 0; u8 AlphaRef = 0; @@ -329,6 +341,69 @@ public: u32 ScrolledLine[256]; // not part of the hardware state, don't serialize }; + // Rasterization Timing Constants + + static constexpr int TimingFrac = 1; // add a fractional component if pixels is not enough precision + + // GPU 2D Read Timings: For Emulating Buffer Read/Write Race Conditions + static constexpr int DelayBetweenReads = 809 
* TimingFrac; + static constexpr int ScanlineReadSpeed = 256 * TimingFrac; + static constexpr int ScanlineReadInc = DelayBetweenReads + ScanlineReadSpeed; + static constexpr int InitGPU2DTimeout = (51875+565) * TimingFrac; // 51618? 51874? 52128? | when it finishes reading the first scanline. + static constexpr int FrameLength = ScanlineReadInc * 263; // how long the entire frame is. TODO: Verify if we actually need this? + + // compile-time list of scanline read times + // these *should* always occur at the same point in each frame, so it shouldn't matter if we make them fixed + static constexpr std::array SLRead = []() constexpr { + std::array readtime {}; + + for (int i = 0, time = InitGPU2DTimeout; i < 192; i++, time += ScanlineReadInc) + { + readtime[i] = time; + } + return readtime; + }(); + + static constexpr int PreReadCutoff = 565; // time before a read that a scanline is cutoff. + + // the point at which rdlines decrements. not sure why it's different...? + static constexpr std::array RDDecrement = []() constexpr { + std::array dec {}; + + for (int i = 0; i < 192; i++) + { + dec[i] = SLRead[i] - 39 - (!(i % 2)); + } + return dec; + }(); + + // GPU 3D Rasterization Timings: For Emulating Scanline Timeout + + static constexpr int FinalPassLen = 500 * TimingFrac; // 496 (might technically be 500?) | the next scanline cannot begin while a scanline's final pass is in progress + // (can be interpreted as the minimum amount of cycles for the next scanline + // pair to start after the previous pair began) (related to final pass?) + static constexpr int ScanlinePushDelay = 242 * TimingFrac; + static constexpr int EMGlitchThreshhold = 502 * TimingFrac; // The threshold for the edge marking glitch behavior to change. + static constexpr int EMFixNum = 571 * TimingFrac; // Arbitrary value added to fix edge marking glitch, not sure why it's needed? 
+ + // GPU 3D Rasterization Timings II: For Tracking Timing Behaviors (per-polygon and per-pixel costs charged against the scanline timeout) + + //static constexpr int FirstPolyScanline = 0 * TimingFrac; + static constexpr int PerPolyScanline = 12 * TimingFrac; // 12 | The basic timing cost for polygons. Applies per polygon per scanline. + static constexpr int PerPixelTiming = 1 * TimingFrac; // 1 | Cost of one rendered pixel (1 pixel = 1 timing unit). NOTE(review): DoTimingsPixels adds the raw pixel count and does not appear to use this constant - confirm. + static constexpr int NumFreePixels = 4; // 4 | The first 4 pixels of a polygon scanline are free (for some reason); spans of 4 pixels or fewer cost nothing. + static constexpr int MinToStartPoly = 2 * TimingFrac; // 2 | If there aren't 2 (why two?) cycles remaining after the polygon timing penalty, + // do not bother rendering the polygon (CHECKME: I don't think this should decrement timings by anything?) + static constexpr int EmptyPolyScanline = 4 * TimingFrac; // 4 | The ignored "empty" bottom-most scanline of a polygon, + // which isn't rendered (for some reason), still has a timing cost. + + // GPU 3D Rasterization Timings III: For First Polygon "Pre-Calc" Timings + // Should be added before other timings, as these are "async" pre-calcs of polygon attributes + + static constexpr int FirstPolyDelay = 4 * TimingFrac; // 4 | Min amount of cycles to begin a scanline? (minimum time it takes to init the first polygon?) + // (Amount of time before the end of the cycle a scanline must abort?) 
+ + class Renderer3D { public: @@ -349,6 +424,7 @@ public: virtual void RenderFrame(GPU& gpu) = 0; virtual void RestartFrame(GPU& gpu) {}; virtual u32* GetLine(int line) = 0; + virtual void ScanlineSync(int line) {}; virtual void Blit(const GPU& gpu) {}; virtual void SetupAccelFrame() {} diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index a9d0bd64..8a9fbfc7 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -19,6 +19,7 @@ #include "GPU3D_Soft.h" #include +#include #include #include #include "NDS.h" @@ -138,6 +139,108 @@ void SoftRenderer::SetThreaded(bool threaded, GPU& gpu) noexcept } } +bool SoftRenderer::DoTimings(s32 cycles, s32* timingcounter) +{ + // add timings to a counter and return false if underflowed. + *timingcounter += cycles; + if (RasterTiming + *timingcounter <= ScanlineTimeout) return true; + else return false; +} + +bool SoftRenderer::CheckTimings(s32 cycles, s32* timingcounter) +{ + // check if there are 'cycles' amount of cycles remaining. + if (RasterTiming + *timingcounter <= ScanlineTimeout - cycles) return true; + else return false; +} + +u32 SoftRenderer::DoTimingsPixels(s32 pixels, s32* timingcounter) +{ + // calculate and return the difference between the old span and the new span, while adding timings to the timings counter + + // pixels dont count towards timings if they're the first 4 pixels in a polygon scanline (for some reason?) 
+ if (pixels <= NumFreePixels) return 0; + + pixels -= NumFreePixels; + + *timingcounter += pixels; + pixels = -(ScanlineTimeout - (RasterTiming + *timingcounter)); + + if (pixels > 0) + { + *timingcounter -= pixels; + return pixels; + } + else return 0; +} + +void SoftRenderer::FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, int* firstpolyodd, s32* timingcountereven, s32*timingcounterodd) +{ + // TODO: actually figure this out + + // The First Polygon in each scanline pair has some additional timing penalties (presumably due to pipelining of the rasterizer) + + bool fixeddelay = false; + bool perslope = false; + bool etc = false; + + for (*firstpolyeven = 0; *firstpolyeven < npolys; (*firstpolyeven)++) + { + RendererPolygon* rp = &PolygonList[*firstpolyeven]; + Polygon* polygon = rp->PolyData; + + if (y >= polygon->YTop && y <= polygon->YBottom) + { + fixeddelay = true; + break; + /*if (y == polygon->YBottom) break; + if (y == polygon->YTop) {perslope = true; break;} + else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || + (y == polygon->Vertices[rp->NextVR]->FinalPosition[1] || y == polygon->Vertices[rp->CurVR]->FinalPosition[1])) + { + perslope = true; + } + else etc = true; + break;*/ + } + } + + y++; + for (*firstpolyodd = 0; *firstpolyodd < npolys; (*firstpolyodd)++) + { + RendererPolygon* rp = &PolygonList[*firstpolyodd]; + Polygon* polygon = rp->PolyData; + + if (y >= polygon->YTop && y <= polygon->YBottom) + { + fixeddelay = true; + break; + /*if (y == polygon->YBottom) break; + if (y == polygon->YTop) {perslope = true; break;} + else if ((y == polygon->Vertices[rp->NextVL]->FinalPosition[1] || y == polygon->Vertices[rp->CurVL]->FinalPosition[1]) || + (y == polygon->Vertices[rp->NextVR]->FinalPosition[1] || y == polygon->Vertices[rp->CurVR]->FinalPosition[1])) + { + perslope = true; + } + else etc = true; + break;*/ + } + } + + *timingcountereven = fixeddelay ? 
FirstPolyDelay : 0;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd = fixeddelay ? FirstPolyDelay : 0;// + perslope*FirstPerSlope + etc*2; + /*if (!perslope) + { + *timingcountereven += etc*2;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd += etc*2;// + perslope*FirstPerSlope + etc*2; + } + else + { + *timingcountereven += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; + *timingcounterodd += perslope*FirstPerSlope;// + perslope*FirstPerSlope + etc*2; + }*/ +} + void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const { u32 vramaddr = (texparam & 0xFFFF) << 3; @@ -705,7 +808,31 @@ void SoftRenderer::SetupPolygon(SoftRenderer::RendererPolygon* rp, Polygon* poly } } -void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y) +void SoftRenderer::Step(RendererPolygon* rp) +{ + rp->XL = rp->SlopeL.Step(); + rp->XR = rp->SlopeR.Step(); +} + +void SoftRenderer::CheckSlope(RendererPolygon* rp, s32 y) +{ + Polygon* polygon = rp->PolyData; + + if (polygon->YTop != polygon->YBottom) + { + if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) + { + SetupPolygonLeftEdge(rp, y); + } + + if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) + { + SetupPolygonRightEdge(rp, y); + } + } +} + +bool SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; @@ -727,19 +854,8 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* memset(&StencilBuffer[256 * (y&0x1)], 0, 256); PrevIsShadowMask = true; - - if (polygon->YTop != polygon->YBottom) - { - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) - { - SetupPolygonLeftEdge(rp, y); - } - - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) - { - 
SetupPolygonRightEdge(rp, y); - } - } + + CheckSlope(rp, y); Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; @@ -748,6 +864,7 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; + bool abortscanline; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -831,7 +948,7 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* // similarly, we can perform alpha test early (checkme) if (wireframe) polyalpha = 31; - if (polyalpha <= gpu3d.RenderAlphaRef) return; + if (polyalpha <= gpu3d.RenderAlphaRef) return false; // TODO: check how this impacts timings? // in wireframe mode, there are special rules for equal Z (TODO) @@ -841,10 +958,28 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* int edge; s32 x = xstart; - Interpolator<0> interpX(xstart, xend+1, wl, wr); + xend += 1; + Interpolator<0> interpX(xstart, xend, wl, wr); if (x < 0) x = 0; s32 xlimit; + + // determine if the span can be rendered within the time allotted to the scanline + s32 diff = DoTimingsPixels(xend-x, timingcounter); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } + else abortscanline = false; + + // we cap it to 256 *after* counting the cycles, because yes, it tries to render oob pixels. + if (xend > 256) + { + r_edgelen += 256 - xend; + xend = 256; + } // for shadow masks: set stencil bits where the depth test fails. // draw nothing. 
@@ -852,14 +987,13 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* // part 1: left edge edge = yedge | 0x1; xlimit = xstart+l_edgelen; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + if (xlimit > xend) xlimit = xend; if (!l_filledge) x = xlimit; else for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; interpX.SetX(x); @@ -879,13 +1013,12 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* // part 2: polygon inside edge = yedge; - xlimit = xend-r_edgelen+1; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend-r_edgelen; + if (xlimit > xend) xlimit = xend; if (wireframe && !edge) x = std::max(x, xlimit); else for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; interpX.SetX(x); @@ -905,13 +1038,12 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* // part 3: right edge edge = yedge | 0x2; - xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend; if (r_filledge) for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; interpX.SetX(x); @@ -929,14 +1061,13 @@ void SoftRenderer::RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* } } - rp->XL = rp->SlopeL.Step(); - rp->XR = rp->SlopeR.Step(); + Step(rp); + return abortscanline; } -void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y) +bool SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter) { Polygon* polygon = rp->PolyData; - u32 polyattr = (polygon->Attr & 0x3F008000); if (!polygon->FacingView) polyattr |= (1<<4); @@ -953,19 +1084,8 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 PrevIsShadowMask = false; - 
if (polygon->YTop != polygon->YBottom) - { - if (y >= polygon->Vertices[rp->NextVL]->FinalPosition[1] && rp->CurVL != polygon->VBottom) - { - SetupPolygonLeftEdge(rp, y); - } - - if (y >= polygon->Vertices[rp->NextVR]->FinalPosition[1] && rp->CurVR != polygon->VBottom) - { - SetupPolygonRightEdge(rp, y); - } - } - + CheckSlope(rp, y); + Vertex *vlcur, *vlnext, *vrcur, *vrnext; s32 xstart, xend; bool l_filledge, r_filledge; @@ -973,6 +1093,7 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 s32 l_edgecov, r_edgecov; Interpolator<1>* interp_start; Interpolator<1>* interp_end; + bool abortscanline; // to abort the rest of the scanline after finishing this polygon xstart = rp->XL; xend = rp->XR; @@ -1091,18 +1212,35 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 int edge; s32 x = xstart; - Interpolator<0> interpX(xstart, xend+1, wl, wr); + xend += 1; + Interpolator<0> interpX(xstart, xend, wl, wr); if (x < 0) x = 0; s32 xlimit; s32 xcov = 0; + // determine if the span can be rendered within the time allotted to the scanline + s32 diff = DoTimingsPixels(xend-x, timingcounter); + if (diff != 0) + { + xend -= diff; + r_edgelen -= diff; + abortscanline = true; + } + else abortscanline = false; + + // we cap it to 256 *after* counting the cycles, because yes, it tries to render oob pixels. 
+ if (xend > 256) + { + r_edgelen += 256 - xend; + xend = 256; + } + // part 1: left edge edge = yedge | 0x1; xlimit = xstart+l_edgelen; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + if (xlimit > xend) xlimit = xend; if (l_edgecov & (1<<31)) { xcov = (l_edgecov >> 12) & 0x3FF; @@ -1110,10 +1248,9 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 } if (!l_filledge) x = xlimit; - else - for (; x < xlimit; x++) + else for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1201,15 +1338,13 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 // part 2: polygon inside edge = yedge; - xlimit = xend-r_edgelen+1; - if (xlimit > xend+1) xlimit = xend+1; - if (xlimit > 256) xlimit = 256; - + xlimit = xend-r_edgelen; + if (xlimit > xend) xlimit = xend; + if (wireframe && !edge) x = std::max(x, xlimit); - else - for (; x < xlimit; x++) + else for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1290,8 +1425,7 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 // part 3: right edge edge = yedge | 0x2; - xlimit = xend+1; - if (xlimit > 256) xlimit = 256; + xlimit = xend; if (r_edgecov & (1<<31)) { xcov = (r_edgecov >> 12) & 0x3FF; @@ -1301,7 +1435,7 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 if (r_filledge) for (; x < xlimit; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 dstattr = AttrBuffer[pixeladdr]; // check stencil buffer for shadows @@ -1386,24 +1520,36 @@ void SoftRenderer::RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s3 
PlotTranslucentPixel(gpu.GPU3D, pixeladdr+BufferSize, color, z, polyattr, polygon->IsShadow); } } - - rp->XL = rp->SlopeL.Step(); - rp->XR = rp->SlopeR.Step(); + Step(rp); + return abortscanline; } -void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int npolys) +void SoftRenderer::RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter) { - for (int i = 0; i < npolys; i++) + bool abort = false; + for (; firstpoly < npolys; firstpoly++) { - RendererPolygon* rp = &PolygonList[i]; + RendererPolygon* rp = &PolygonList[firstpoly]; Polygon* polygon = rp->PolyData; - if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) + if (y == polygon->YBottom && y != polygon->YTop) { - if (polygon->IsShadowMask) - RenderShadowMaskScanline(gpu.GPU3D, rp, y); + if (!abort) abort = !DoTimings(EmptyPolyScanline, timingcounter); + } + else if (y >= polygon->YTop && (y < polygon->YBottom || (y == polygon->YTop && polygon->YBottom == polygon->YTop))) + { + if (!abort) abort = (!DoTimings(PerPolyScanline, timingcounter) + || !CheckTimings(MinToStartPoly, timingcounter)); + + if (abort) + { + CheckSlope(rp, y); + Step(rp); + } + else if (polygon->IsShadowMask) + abort = RenderShadowMaskScanline(gpu.GPU3D, rp, y, timingcounter); else - RenderPolygonScanline(gpu, rp, y); + abort = RenderPolygonScanline(gpu, rp, y, timingcounter); } } } @@ -1447,7 +1593,27 @@ u32 SoftRenderer::CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const return density; } -void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) +bool SoftRenderer::CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr) +{ + if ((polyid != AttrBuffer[pixeladdr] >> 24) && (z < DepthBuffer[pixeladdr])) return true; + else return false; +} + +bool SoftRenderer::CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z) +{ + // for some reason it never checks against the bitmap clear plane? 
+ if (polyid != gpu3d.RenderClearAttr1>>24) + { + u32 clearz = ((gpu3d.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; + + if (z < clearz) return true; + else return false; + } + else return false; +} + +template +void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext) { // to consider: // clearing all polygon fog flags if the master flag isn't set? @@ -1460,7 +1626,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1468,11 +1634,45 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) u32 polyid = attr >> 24; // opaque polygon IDs are used for edgemarking u32 z = DepthBuffer[pixeladdr]; - if (((polyid != (AttrBuffer[pixeladdr-1] >> 24)) && (z < DepthBuffer[pixeladdr-1])) || - ((polyid != (AttrBuffer[pixeladdr+1] >> 24)) && (z < DepthBuffer[pixeladdr+1])) || - ((polyid != (AttrBuffer[pixeladdr-ScanlineWidth] >> 24)) && (z < DepthBuffer[pixeladdr-ScanlineWidth])) || - ((polyid != (AttrBuffer[pixeladdr+ScanlineWidth] >> 24)) && (z < DepthBuffer[pixeladdr+ScanlineWidth]))) + // check the pixel to the left + if (x == 0) { + // edge marking bug emulation + if (checkprev ? CheckEdgeMarkingClearPlane(gpu3d, polyid, z) : // check against the clear plane + CheckEdgeMarkingPixel(polyid, z, pixeladdr-1 - ScanlineWidth)) // checks the right edge of the scanline 2 scanlines ago + goto pass; + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-1)) goto pass; // normal check + + // check the pixel to the right + if (x == 255) + { + // edge marking bug emulation + if (checknext ? 
CheckEdgeMarkingClearPlane(gpu3d, polyid, z) : // check against the clear plane + CheckEdgeMarkingPixel(polyid, z, pixeladdr+1 + ScanlineWidth)) // checks the left edge of the scanline 2 scanlines ahead + goto pass; + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+1)) goto pass; // normal check + + // check the pixel above + if (y == 0) + { + // edge marking bug emulation + if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr-ScanlineWidth)) goto pass; // normal check + + // check the pixel below + if (y == 191) + { + // edge marking bug emulation + if (CheckEdgeMarkingClearPlane(gpu3d, polyid, z)) goto pass; // check against the clear plane + } + else if (CheckEdgeMarkingPixel(polyid, z, pixeladdr+ScanlineWidth)) goto pass; // normal check + + if (false) + { + pass: u16 edgecolor = gpu3d.RenderEdgeTable[polyid >> 3]; u32 edgeR = (edgecolor << 1) & 0x3E; if (edgeR) edgeR++; u32 edgeG = (edgecolor >> 4) & 0x3E; if (edgeG) edgeG++; @@ -1508,7 +1708,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 density, srccolor, srcR, srcG, srcB, srcA; u32 attr = AttrBuffer[pixeladdr]; @@ -1573,7 +1773,7 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) for (int x = 0; x < 256; x++) { - u32 pixeladdr = FirstPixelOffset + (y*ScanlineWidth) + x; + u32 pixeladdr = (y*ScanlineWidth) + x; u32 attr = AttrBuffer[pixeladdr]; if (!(attr & 0xF)) continue; @@ -1615,39 +1815,17 @@ void SoftRenderer::ScanlineFinalPass(const GPU3D& gpu3d, s32 y) ColorBuffer[pixeladdr] = topR | (topG << 8) | (topB << 16) | (topA << 24); } } + if constexpr (push) + { + memcpy(&FinalBuffer[y*ScanlineWidth], &ColorBuffer[y*ScanlineWidth], ScanlineWidth*4); + Platform::Semaphore_Post(Sema_ScanlineCount); + } } void 
SoftRenderer::ClearBuffers(const GPU& gpu) { - u32 clearz = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; u32 polyid = gpu.GPU3D.RenderClearAttr1 & 0x3F000000; // this sets the opaque polygonID - // fill screen borders for edge marking - - for (int x = 0; x < ScanlineWidth; x++) - { - ColorBuffer[x] = 0; - DepthBuffer[x] = clearz; - AttrBuffer[x] = polyid; - } - - for (int x = ScanlineWidth; x < ScanlineWidth*193; x+=ScanlineWidth) - { - ColorBuffer[x] = 0; - DepthBuffer[x] = clearz; - AttrBuffer[x] = polyid; - ColorBuffer[x+257] = 0; - DepthBuffer[x+257] = clearz; - AttrBuffer[x+257] = polyid; - } - - for (int x = ScanlineWidth*193; x < ScanlineWidth*194; x++) - { - ColorBuffer[x] = 0; - DepthBuffer[x] = clearz; - AttrBuffer[x] = polyid; - } - // clear the screen if (gpu.GPU3D.RenderDispCnt & (1<<14)) @@ -1655,7 +1833,7 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) u8 xoff = (gpu.GPU3D.RenderClearAttr2 >> 16) & 0xFF; u8 yoff = (gpu.GPU3D.RenderClearAttr2 >> 24) & 0xFF; - for (int y = 0; y < ScanlineWidth*192; y+=ScanlineWidth) + for (int y = 0; y < 192; y++) { for (int x = 0; x < 256; x++) { @@ -1671,7 +1849,7 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) u32 z = ((val3 & 0x7FFF) * 0x200) + 0x1FF; - u32 pixeladdr = FirstPixelOffset + y + x; + u32 pixeladdr = (y*ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = z; AttrBuffer[pixeladdr] = polyid | (val3 & 0x8000); @@ -1684,6 +1862,8 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } else { + u32 clearz = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; + // TODO: confirm color conversion u32 r = (gpu.GPU3D.RenderClearAttr1 << 1) & 0x3E; if (r) r++; u32 g = (gpu.GPU3D.RenderClearAttr1 >> 4) & 0x3E; if (g) g++; @@ -1693,11 +1873,11 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) polyid |= (gpu.GPU3D.RenderClearAttr1 & 0x8000); - for (int y = 0; y < ScanlineWidth*192; y+=ScanlineWidth) + for (int y = 0; y < 192; y++) { for (int x = 0; x < 256; x++) { - u32 
pixeladdr = FirstPixelOffset + y + x; + u32 pixeladdr = (y*ScanlineWidth) + x; ColorBuffer[pixeladdr] = color; DepthBuffer[pixeladdr] = clearz; AttrBuffer[pixeladdr] = polyid; @@ -1706,7 +1886,46 @@ void SoftRenderer::ClearBuffers(const GPU& gpu) } } -void SoftRenderer::RenderPolygons(const GPU& gpu, bool threaded, Polygon** polygons, int npolys) +#define RDLINES_COUNT_INCREMENT\ + /* feels wrong, needs improvement */\ + while (RasterTiming >= RDDecrement[nextreadrd])\ + {\ + slwaitingrd--;\ + nextreadrd++;\ + /* update rdlines_count register */\ + if (gpu.GPU3D.RDLinesTemp > slwaitingrd) gpu.GPU3D.RDLinesTemp = slwaitingrd;\ + } + +#define SCANLINE_BUFFER_SIM\ + /* simulate the process of scanlines being read from the 48 scanline buffer */\ + while (scanlineswaiting >= 47 || RasterTiming >= SLRead[nextread])\ + {\ + if (RasterTiming < SLRead[nextread])\ + {\ + timespent = SLRead[nextread] - RasterTiming;\ + timespent += EMFixNum; /* fixes edge marking bug emulation. not sure why this is needed? */\ + RasterTiming = SLRead[nextread];\ + }\ + scanlineswaiting--;\ + nextread++;\ + } + +#define RENDER_SCANLINES(y)\ + /* update sl timeout */\ + ScanlineTimeout = SLRead[y-1] - (PreReadCutoff+FinalPassLen);\ + \ + FindFirstPolyDoTimings(j, y, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd);\ + RenderScanline(gpu, y, firstpolyeven, j, &rastertimingeven);\ + RenderScanline(gpu, y+1, firstpolyodd, j, &rastertimingodd);\ + \ + prevtimespent = timespent;\ + RasterTiming += timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen});\ + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12);\ + \ + /* set the underflow flag if one of the scanlines came within 14 cycles of visible underflow */\ + if ((ScanlineTimeout <= RasterTiming) && (gpu.GPU3D.UnderflowFlagVCount == (u16)-1)) gpu.GPU3D.UnderflowFlagVCount = y - (y&1 ? 
0 : 1); + +void SoftRenderer::RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys) { int j = 0; for (int i = 0; i < npolys; i++) @@ -1715,25 +1934,83 @@ void SoftRenderer::RenderPolygons(const GPU& gpu, bool threaded, Polygon** polyg SetupPolygon(&PolygonList[j++], polygons[i]); } - RenderScanline(gpu, 0, j); + // reset scanline trackers + gpu.GPU3D.UnderflowFlagVCount = -1; + gpu.GPU3D.RDLinesTemp = 63; + RasterTiming = 0; + ScanlineTimeout = SLRead[2] - (PreReadCutoff+FinalPassLen+4); // TEMP: should be infinity, but i dont want it to break due to not being set up to handle this properly. //0x7FFFFFFF; // CHECKME: first scanline pair timeout. + s32 rastertimingeven, rastertimingodd; // always init to 0 at the start of a scanline render + s32 scanlineswaiting = 0, slwaitingrd = 0; + s32 nextread = 0, nextreadrd = 0; + u32 timespent, prevtimespent; + int firstpolyeven, firstpolyodd; - for (s32 y = 1; y < 192; y++) + FindFirstPolyDoTimings(j, 0, &firstpolyeven, &firstpolyodd, &rastertimingeven, &rastertimingodd); + // scanlines are rendered in pairs of two + RenderScanline(gpu, 0, firstpolyeven, j, &rastertimingeven); + RenderScanline(gpu, 1, firstpolyodd, j, &rastertimingodd); + + // it can't proceed to the next scanline unless all others steps are done (both scanlines in the pair, and final pass) + RasterTiming = timespent = std::max(std::initializer_list {rastertimingeven, rastertimingodd, FinalPassLen}); + // 12 cycles at the end of a "timeout" are always used for w/e reason + RasterTiming += std::clamp(ScanlineTimeout - RasterTiming, 0, 12); // should probably just be += 12 tbh but i'll leave it for now + + // if first pair was not delayed past the first read, then later scanlines cannot either + // this allows us to implement a fast path + //if (SLRead[0] - timespent + ScanlinePushDelay >= 256) { - RenderScanline(gpu, y, j); - ScanlineFinalPass(gpu.GPU3D, y-1); + RENDER_SCANLINES(2) - if (threaded) - // Notify the main thread that we're done with a 
scanline. - Platform::Semaphore_Post(Sema_ScanlineCount); + scanlineswaiting++; + slwaitingrd++; + + SCANLINE_BUFFER_SIM + + RDLINES_COUNT_INCREMENT + + // final pass pairs are the previous scanline pair offset -1 scanline, thus we start with only building one + ScanlineFinalPass(gpu.GPU3D, 0, true, timespent >= EMGlitchThreshhold); + + // main loop + for (int y = 4; y < 192; y+=2) + { + RENDER_SCANLINES(y) + + scanlineswaiting += 2; + slwaitingrd += 2; + + SCANLINE_BUFFER_SIM + + RDLINES_COUNT_INCREMENT + + ScanlineFinalPass(gpu.GPU3D, y-3, prevtimespent >= EMGlitchThreshhold || y-3 == 1, timespent >= EMGlitchThreshhold); + ScanlineFinalPass(gpu.GPU3D, y-2, prevtimespent >= EMGlitchThreshhold, timespent >= EMGlitchThreshhold); + } + + scanlineswaiting += 2; + slwaitingrd += 2; + prevtimespent = timespent; + + // emulate read timings one last time, since it shouldn't matter after this + // additionally don't bother tracking rdlines anymore since it shouldn't be able to decrement anymore (CHECKME) + SCANLINE_BUFFER_SIM + + // finish the last 3 scanlines + ScanlineFinalPass(gpu.GPU3D, 189, prevtimespent >= EMGlitchThreshhold, timespent >= EMGlitchThreshhold); + ScanlineFinalPass(gpu.GPU3D, 190, prevtimespent >= EMGlitchThreshhold, true); + + ScanlineFinalPass(gpu.GPU3D, 191, timespent >= EMGlitchThreshhold, true); } - - ScanlineFinalPass(gpu.GPU3D, 191); - - if (threaded) - // If this renderer is threaded, notify the main thread that we're done with the frame.
- Platform::Semaphore_Post(Sema_ScanlineCount); + /*else + { + Coming soon^tm to a melonDS near you + }*/ } +#undef RENDER_SCANLINES +#undef SCANLINE_BUFFER_SIM +#undef RDLINES_COUNT_INCREMENT + void SoftRenderer::VCount144(GPU& gpu) { if (RenderThreadRunning.load(std::memory_order_relaxed) && !gpu.GPU3D.AbortFrame) @@ -1757,8 +2034,14 @@ void SoftRenderer::RenderFrame(GPU& gpu) } else if (!FrameIdentical) { + //init internal buffer ClearBuffers(gpu); - RenderPolygons(gpu, false, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + + if (gpu.GPU3D.RenderingEnabled >= 3) + { + RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + } + else memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); } } @@ -1789,8 +2072,18 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) } else { + //init internal buffer ClearBuffers(gpu); - RenderPolygons(gpu, true, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + + if (gpu.GPU3D.RenderingEnabled >= 3) + { + RenderPolygonsTiming(gpu, &gpu.GPU3D.RenderPolygonRAM[0], gpu.GPU3D.RenderNumPolygons); + } + else + { + memcpy(FinalBuffer, ColorBuffer, sizeof(FinalBuffer)); + Platform::Semaphore_Post(Sema_ScanlineCount, 192); + } } // Tell the main thread that we're done rendering @@ -1800,19 +2093,23 @@ void SoftRenderer::RenderThreadFunc(GPU& gpu) RenderThreadRendering = false; } } - -u32* SoftRenderer::GetLine(int line) +void SoftRenderer::ScanlineSync(int line) { + // only used in accurate mode (timings must be emulated) if (RenderThreadRunning.load(std::memory_order_relaxed)) { if (line < 192) - // We need a scanline, so let's wait for the render thread to finish it. - // (both threads process scanlines from top-to-bottom, - // so we don't need to wait for a specific row) + { + // wait for two scanlines here, since scanlines render in pairs. 
Platform::Semaphore_Wait(Sema_ScanlineCount); + Platform::Semaphore_Wait(Sema_ScanlineCount); + } } +} - return &ColorBuffer[(line * ScanlineWidth) + FirstPixelOffset]; +u32* SoftRenderer::GetLine(int line) +{ + return &FinalBuffer[line*ScanlineWidth]; } } diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 73d02e4f..0e595da7 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -40,6 +40,7 @@ public: void RenderFrame(GPU& gpu) override; void RestartFrame(GPU& gpu) override; u32* GetLine(int line) override; + void ScanlineSync(int line) override; void SetupRenderThread(GPU& gpu); void EnableRenderThread(); @@ -445,21 +446,34 @@ private: }; RendererPolygon PolygonList[2048]; + bool DoTimings(s32 cycles, s32* timingcounter); + bool CheckTimings(s32 cycles, s32* timingcounter); + u32 DoTimingsPixels(s32 pixels, s32* timingcounter); + void FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, int* firstpolyodd, s32* timingcountereven, s32*timingcounterodd); void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const; u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const; void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow); void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y) const; void SetupPolygonRightEdge(RendererPolygon* rp, s32 y) const; void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; - void RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y); - void RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y); - void RenderScanline(const GPU& gpu, s32 y, int npolys); + void Step(RendererPolygon* rp); + void CheckSlope(RendererPolygon* rp, s32 y); + bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter); + bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter); + void RenderScanline(const GPU& 
gpu, s32 y, int firstpoly, int npolys, s32* timingcounter); u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const; - void ScanlineFinalPass(const GPU3D& gpu3d, s32 y); + bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr); + bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z); + template void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext); void ClearBuffers(const GPU& gpu); - void RenderPolygons(const GPU& gpu, bool threaded, Polygon** polygons, int npolys); + void RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys); + void RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys); void RenderThreadFunc(GPU& gpu); + + // counters for scanline rasterization timings + s32 ScanlineTimeout; + s32 RasterTiming; // buffer dimensions are 258x194 to add a offscreen 1px border // which simplifies edge marking tests @@ -467,14 +481,19 @@ private: // TODO: check if the hardware can accidentally plot pixels // offscreen in that border - static constexpr int ScanlineWidth = 258; - static constexpr int NumScanlines = 194; - static constexpr int BufferSize = ScanlineWidth * NumScanlines; - static constexpr int FirstPixelOffset = ScanlineWidth + 1; + static constexpr int ScanlineWidth = 256; + static constexpr int NumScanlinesIntBuf = 192; + //static constexpr int NumScanlinesRD = 48; + static constexpr int NumScanlinesFinal = 192; + static constexpr int BufferSize = ScanlineWidth * NumScanlinesIntBuf; + //static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD; + static constexpr int FinalBufferSize = ScanlineWidth * NumScanlinesFinal; u32 ColorBuffer[BufferSize * 2]; u32 DepthBuffer[BufferSize * 2]; u32 AttrBuffer[BufferSize * 2]; + //u32 RDBuffer[RDBufferSize]; // is this buffer ever initialized by hw before writing to it? what is its initial value? can you transfer 3d framebuffer data between games? 
+ u32 FinalBuffer[FinalBufferSize]; // attribute buffer: // bit0-3: edge flags (left/right/top/bottom)