Compute shader renderer (#2041)

* nothing works yet

* don't double buffer 3D framebuffers for the GL Renderer
the double buffering looks like a leftover from when 3D+2D composition was done in the frontend

* oops

* it works!

* implement display capture for the compute shader renderer
it's actually just all stolen from the regular OpenGL renderer

* fix bad indirect call

* handle cleanup properly

* add hires rendering to the compute shader renderer

* fix UB
also misc changes to use unsigned multiplication in more places
also fix framebuffer resizing
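(For reference: signed integer overflow is undefined behaviour in C++, while unsigned arithmetic wraps modulo 2^32, which is presumably the motivation for the switch to unsigned multiplication. A generic illustration, not the PR's actual code:)

#include <cstdint>

// hypothetical helper: scaling a coordinate by the upscaling factor
uint32_t ScaleCoord(int32_t coord, int32_t scale)
{
    // coord * scale as int32_t would be UB if it overflowed;
    // doing the multiply on unsigned values makes the wraparound well defined
    return (uint32_t)coord * (uint32_t)scale;
}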

* correct edge filling behaviour when AA is disabled

* fix full color textures

* fix edge marking (polygon IDs are 6-bit, not 5)
also make the code a bit nicer
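(DS polygon IDs occupy six bits — bits 24..29 of POLYGON_ATTR, per GBATEK — so they range from 0 to 63; a five-bit mask aliases IDs 32..63 onto 0..31 and makes edge marking compare the wrong polygons. A minimal sketch with hypothetical names:)

#include <cstdint>

// polygon ID is the 6-bit field in bits 24..29 of the polygon attributes
uint32_t PolygonID(uint32_t polygonAttr)
{
    return (polygonAttr >> 24) & 0x3F; // masking with 0x1F would truncate IDs >= 32
}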

* take all edge cases into account for XMin/XMax calculation

* use hires coordinate again

* stop using fixed-size buffers based on the scale factor in shaders
this makes shader compile times tolerable on Wintel
- beginnings of the shader cache
- increase the size of the tile index in the work descriptor to 20 bits (sketched below)
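(A hedged sketch of that kind of bit packing; the PR's actual work-descriptor layout may differ, and the 12-bit payload field here is made up for illustration:)

#include <cstdint>

// pack a 20-bit tile index (needed once large scale factors multiply the
// tile count) plus a hypothetical 12-bit payload into a 32-bit work descriptor
uint32_t PackWorkDesc(uint32_t tileIdx, uint32_t payload)
{
    return (tileIdx & 0xFFFFF) | (payload << 20);
}

uint32_t UnpackTileIdx(uint32_t desc)
{
    return desc & 0xFFFFF;
}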

* apparently & is not defined on bvec4
why does this even compile on Intel and Nvidia?

* put the texture cache into its own file

* add compute shader renderer properly to the GUI
also add option to toggle using high resolution vertex coordinates

* unbind sampler object in compute shader renderer
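(Background: a sampler object bound to a texture unit overrides that texture's own sampling parameters until it is unbound, so leaving one bound can break later draws; binding sampler 0 restores the texture state. A tiny sketch, assuming a GL 3.3+ context — the loader header is an assumption:)

#include <glad/glad.h> // any GL loader exposing glBindSampler works

// unbind any sampler object from the given texture unit so the
// texture's own sampling state applies again
void UnbindSampler(GLuint unit)
{
    glBindSampler(unit, 0);
}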

* fix GetRangedBitMask for a 64-bit-aligned range of 64 bits
pretty embarrassing
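(The failing case was a range that exactly fills one 64-bit word: building the low-bits mask as (1ULL << bitsCount) - 1 is undefined when bitsCount is 64, because shifting a 64-bit value by 64 or more is UB in C++. The fixed function in the diff below special-cases it; a standalone sketch:)

#include <cstdint>

uint64_t LowBitsMask(uint32_t bitsCount) // bitsCount in [0, 64]
{
    return bitsCount == 64
        ? 0xFFFFFFFFFFFFFFFF     // 1ULL << 64 would be undefined behaviour
        : (1ULL << bitsCount) - 1;
}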

* convert NonStupidBitfield.h back to LF-only newlines

* actually adapt to latest changes

* fix stupid merge

* actually make compute shader renderer work with newest changes

* show progress on shader compilation

* remove merge leftover
Author: RSDuck
Date: 2024-05-13 17:17:39 +02:00
Committed by: GitHub
Parent: c85a2103bb
Commit: 043244a56d
35 changed files with 4389 additions and 382 deletions

NonStupidBitfield.h

@@ -26,11 +26,38 @@
#include <initializer_list>
#include <algorithm>
namespace melonDS
{
// Returns the contribution of 64-bit word `idx` to a mask covering the bit
// range [startBit, startBit + bitsCount); `idx` is expected to lie within
// the words the range touches.
inline u64 GetRangedBitMask(u32 idx, u32 startBit, u32 bitsCount)
{
    u32 startEntry = startBit >> 6;
    u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;

    if (entriesCount > 1)
    {
        // range spans several words: partial mask in the first word,
        // partial mask in the last word (if the end isn't aligned),
        // all ones in between
        if (idx == startEntry)
            return 0xFFFFFFFFFFFFFFFF << (startBit & 0x3F);
        if (((startBit + bitsCount) & 0x3F) && idx == startEntry + entriesCount - 1)
            return ~(0xFFFFFFFFFFFFFFFF << ((startBit + bitsCount) & 0x3F));
        return 0xFFFFFFFFFFFFFFFF;
    }
    else if (idx == startEntry)
    {
        // single word; bitsCount == 64 is special-cased because
        // 1ULL << 64 would be undefined behaviour
        return bitsCount == 64
            ? 0xFFFFFFFFFFFFFFFF
            : ((1ULL << bitsCount) - 1) << (startBit & 0x3F);
    }
    else
    {
        return 0;
    }
}
// like std::bitset but less stupid and optimised for
// our use case (keeping track of memory invalidations)
template <u32 Size>
struct NonStupidBitField
{
@@ -166,6 +193,11 @@ struct NonStupidBitField
        return Ref{*this, idx};
    }

    // read-only bit test
    bool operator[](u32 idx) const
    {
        return Data[idx >> 6] & (1ULL << (idx & 0x3F));
    }

    // set all bits in [startBit, startBit + bitsCount)
    void SetRange(u32 startBit, u32 bitsCount)
    {
        u32 startEntry = startBit >> 6;
@@ -187,6 +219,26 @@ struct NonStupidBitField
        }
    }

    // index of the lowest set bit, or -1 if no bit is set
    int Min() const
    {
        for (int i = 0; i < DataLength; i++)
        {
            if (Data[i])
                return i * 64 + __builtin_ctzll(Data[i]);
        }
        return -1;
    }

    // index of the highest set bit, or -1 if no bit is set
    int Max() const
    {
        for (int i = DataLength - 1; i >= 0; i--)
        {
            if (Data[i])
                return i * 64 + (63 - __builtin_clzll(Data[i]));
        }
        return -1;
    }

    NonStupidBitField& operator|=(const NonStupidBitField<Size>& other)
    {
        for (u32 i = 0; i < DataLength; i++)
@@ -195,6 +247,7 @@ struct NonStupidBitField
        }
        return *this;
    }

    NonStupidBitField& operator&=(const NonStupidBitField<Size>& other)
    {
        for (u32 i = 0; i < DataLength; i++)
@@ -203,6 +256,20 @@ struct NonStupidBitField
        }
        return *this;
    }

    // true if any bit within [0, Size) is set; the last word is masked so
    // stray bits in the padding beyond Size don't count
    operator bool() const
    {
        for (int i = 0; i < DataLength - 1; i++)
        {
            if (Data[i])
                return true;
        }
        if (Data[DataLength-1] & ((Size&0x3F) ? ~(0xFFFFFFFFFFFFFFFF << (Size&0x3F)) : 0xFFFFFFFFFFFFFFFF))
        {
            return true;
        }
        return false;
    }
};
}
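For illustration, a small standalone usage sketch of the header above (the values are arbitrary; this assumes melonDS's types.h is on the include path and that the default constructor zero-initialises the storage, as the full header does):

#include <cassert>
#include "NonStupidBitfield.h"

int main()
{
    using namespace melonDS;

    // a range spanning two 64-bit words splits into a top-bits mask and a
    // bottom-bits mask; a fully aligned 64-bit range yields all ones
    assert(GetRangedBitMask(0, 60, 8) == 0xF000000000000000);
    assert(GetRangedBitMask(1, 60, 8) == 0x000000000000000F);
    assert(GetRangedBitMask(0, 0, 64) == 0xFFFFFFFFFFFFFFFF);

    NonStupidBitField<128> dirty;
    dirty.SetRange(70, 4);        // mark bits 70..73
    assert(dirty.Min() == 70);    // lowest set bit
    assert(dirty.Max() == 73);    // highest set bit
    assert(dirty);                // operator bool: some bit is set
    return 0;
}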