make things faster!

also fix alpha for A3I5 textures
This commit is contained in:
RSDuck 2020-03-21 19:37:18 +01:00
parent 36075dae5a
commit ed61867dec
5 changed files with 145 additions and 85 deletions

View File

@ -52,6 +52,8 @@ u8 VRAM_I[ 16*1024];
u8* VRAM[9] = {VRAM_A, VRAM_B, VRAM_C, VRAM_D, VRAM_E, VRAM_F, VRAM_G, VRAM_H, VRAM_I};
u32 VRAMMask[9] = {0x1FFFF, 0x1FFFF, 0x1FFFF, 0x1FFFF, 0xFFFF, 0x3FFF, 0x3FFF, 0x7FFF, 0x3FFF};
u64 LCDCDirty[9][2];
u8 VRAMCNT[9];
u8 VRAMSTAT;
@ -380,16 +382,17 @@ void MapVRAM_AB(u32 bank, u8 cnt)
case 1: // ABG
UNMAP_RANGE_PTR(ABG, oldofs<<3, 8);
LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF;
break;
case 2: // AOBJ
oldofs &= 0x1;
UNMAP_RANGE_PTR(AOBJ, oldofs<<3, 8);
LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF;
break;
case 3: // texture
VRAMMap_Texture[oldofs] &= ~bankmask;
GPU3D::TexCache::InvalidateTexSlot(oldofs);
break;
}
}
@ -413,7 +416,6 @@ void MapVRAM_AB(u32 bank, u8 cnt)
case 3: // texture
VRAMMap_Texture[ofs] |= bankmask;
GPU3D::TexCache::InvalidateTexSlot(ofs);
break;
}
}
@ -442,16 +444,17 @@ void MapVRAM_CD(u32 bank, u8 cnt)
case 1: // ABG
UNMAP_RANGE_PTR(ABG, oldofs<<3, 8);
LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF;
break;
case 2: // ARM7 VRAM
oldofs &= 0x1;
VRAMMap_ARM7[oldofs] &= ~bankmask;
LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF;
break;
case 3: // texture
VRAMMap_Texture[oldofs] &= ~bankmask;
GPU3D::TexCache::InvalidateTexSlot(oldofs);
break;
case 4: // BBG/BOBJ
@ -463,6 +466,7 @@ void MapVRAM_CD(u32 bank, u8 cnt)
{
UNMAP_RANGE_PTR(BOBJ, 0, 8);
}
LCDCDirty[bank][0] = LCDCDirty[bank][1] = 0xFFFFFFFFFFFFFFFF;
break;
}
}
@ -487,7 +491,6 @@ void MapVRAM_CD(u32 bank, u8 cnt)
case 3: // texture
VRAMMap_Texture[ofs] |= bankmask;
GPU3D::TexCache::InvalidateTexSlot(ofs);
break;
case 4: // BBG/BOBJ
@ -523,16 +526,16 @@ void MapVRAM_E(u32 bank, u8 cnt)
case 1: // ABG
UNMAP_RANGE_PTR(ABG, 0, 4);
LCDCDirty[bank][0] = 0xFFFFFFFFFFFFFFFF;
break;
case 2: // AOBJ
UNMAP_RANGE_PTR(AOBJ, 0, 4);
LCDCDirty[bank][0] = 0xFFFFFFFFFFFFFFFF;
break;
case 3: // texture palette
UNMAP_RANGE(TexPal, 0, 4);
for (int i = 0; i < 4; i++)
GPU3D::TexCache::InvalidatePalSlot(i);
break;
case 4: // ABG ext palette
@ -561,8 +564,6 @@ void MapVRAM_E(u32 bank, u8 cnt)
case 3: // texture palette
MAP_RANGE(TexPal, 0, 4);
for (int i = 0; i < 4; i++)
GPU3D::TexCache::InvalidatePalSlot(i);
break;
case 4: // ABG ext palette
@ -601,6 +602,7 @@ void MapVRAM_FG(u32 bank, u8 cnt)
VRAMPtr_ABG[base] = GetUniqueBankPtr(VRAMMap_ABG[base], base << 14);
VRAMPtr_ABG[base + 2] = GetUniqueBankPtr(VRAMMap_ABG[base + 2], (base + 2) << 14);
}
LCDCDirty[bank][0] = 0xFFFFFFFF;
break;
case 2: // AOBJ
@ -611,11 +613,11 @@ void MapVRAM_FG(u32 bank, u8 cnt)
VRAMPtr_AOBJ[base] = GetUniqueBankPtr(VRAMMap_AOBJ[base], base << 14);
VRAMPtr_AOBJ[base + 2] = GetUniqueBankPtr(VRAMMap_AOBJ[base + 2], (base + 2) << 14);
}
LCDCDirty[bank][0] = 0xFFFFFFFF;
break;
case 3: // texture palette
VRAMMap_TexPal[(oldofs & 0x1) + ((oldofs & 0x2) << 1)] &= ~bankmask;
GPU3D::TexCache::InvalidatePalSlot((oldofs & 0x1) + ((oldofs & 0x2) << 1));
break;
case 4: // ABG ext palette
@ -661,7 +663,6 @@ void MapVRAM_FG(u32 bank, u8 cnt)
case 3: // texture palette
VRAMMap_TexPal[(ofs & 0x1) + ((ofs & 0x2) << 1)] |= bankmask;
GPU3D::TexCache::InvalidatePalSlot((ofs & 0x1) + ((ofs & 0x2) << 1));
break;
case 4: // ABG ext palette

View File

@ -49,6 +49,8 @@ extern u8 VRAM_I[ 16*1024];
extern u8* VRAM[9];
extern u32 VRAMMask[9];
extern u64 LCDCDirty[9][2];
extern u32 VRAMMap_LCDC;
extern u32 VRAMMap_ABG[0x20];
extern u32 VRAMMap_AOBJ[0x10];
@ -219,7 +221,11 @@ void WriteVRAM_LCDC(u32 addr, T val)
default: return;
}
if (VRAMMap_LCDC & (1<<bank)) *(T*)&VRAM[bank][addr] = val;
if (VRAMMap_LCDC & (1<<bank))
{
*(T*)&VRAM[bank][addr] = val;
LCDCDirty[bank][addr >> 16] |= 1 << ((addr >> 10) & 0x3F);
}
}

View File

@ -155,9 +155,6 @@ void SaveTextures();
template <int format>
ExternalTexHandle GetTexture(u32 texParam, u32 palBase);
void InvalidateTexSlot(u32 base);
void InvalidatePalSlot(u32 base);
}
namespace SoftRenderer

View File

@ -80,7 +80,7 @@ struct TextureAllocator
// all sizes below 8*8 (log2(64)=6) can be ignored
TextureAllocator TextureMem[14];
TextureAllocator& GetTextureAllocator(u32 width, u32 height)
inline TextureAllocator& GetTextureAllocator(u32 width, u32 height)
{
return TextureMem[__builtin_ctz(width * height) - 6];
}
@ -1136,15 +1136,24 @@ void SetupPolygonRightEdge(RendererPolygon* rp, s32 y)
polygon->FinalW[rp->CurVR], polygon->FinalW[rp->NextVR], y);
}
void SetupPolygon(RendererPolygon* rp, Polygon* polygon)
void SetupPolygon(RendererPolygon* rp, Polygon* polygon, RendererPolygon* lastRp)
{
if (polygon->TexParam & 0x1C000000)
{
TexCache::ExternalTexHandle handle = TexCache::GetTexture<TexCache::outputFmt_RGB6A5>(polygon->TexParam, polygon->TexPalette);
if (lastRp && lastRp->PolyData->TexParam == polygon->TexParam
&& lastRp->PolyData->TexPalette == polygon->TexPalette)
{
rp->TextureData = lastRp->TextureData;
}
else
{
TexCache::ExternalTexHandle handle =
TexCache::GetTexture<TexCache::outputFmt_RGB6A5>(polygon->TexParam, polygon->TexPalette);
u32 width = 8 << ((polygon->TexParam >> 20) & 0x7);
u32 height = 8 << ((polygon->TexParam >> 23) & 0x7);
rp->TextureData = &GetTextureAllocator(width, height).Pixels[handle];
}
}
u32 nverts = polygon->NumVertices;
@ -2142,17 +2151,15 @@ void ClearBuffers()
void RenderPolygons(bool threaded, Polygon** polygons, int npolys)
{
u64 ticksStart = SDL_GetPerformanceCounter();
TexCache::UpdateTextures();
int j = 0;
for (int i = 0; i < npolys; i++)
{
if (polygons[i]->Degenerate) continue;
SetupPolygon(&PolygonList[j++], polygons[i]);
}
u64 tickesEnd = SDL_GetPerformanceCounter();
printf("time %fms\n", (tickesEnd-ticksStart)/(float)SDL_GetPerformanceFrequency()*1000.f);
SetupPolygon(&PolygonList[j], polygons[i], j > 0 ? &PolygonList[j - 1] : NULL);
j++;
}
TexCache::SaveTextures();
RenderScanline(0, j);

View File

@ -165,7 +165,7 @@ void ConvertDirectColorTexture(u32 width, u32 height, u32* output, u8* texData)
}
template <int outputFmt, int X, int Y>
void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent)
void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData)
{
for (int y = 0; y < height; y++)
{
@ -176,11 +176,9 @@ void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* pa
u32 idx = val & ((1 << Y) - 1);
u16 color = palData[idx];
u32 alpha = (val >> X) & ((1 << X) - 1);
u32 alpha = (val >> Y) & ((1 << X) - 1);
if (X != 5)
alpha = alpha * 4 + alpha / 2;
if (color0Transparent && idx == 0)
alpha = 0;
u32 res;
switch (outputFmt)
@ -236,11 +234,11 @@ struct Texture
ExternalTexHandle Handle;
};
u64 PaletteCacheStatus;
u64 PaletteCacheStatus[2];
u8 PaletteCache[128*1024];
u64 PaletteDirty[2];
u64 TexturesDirty[8];
u32 TextureMap[4];
u32 PaletteMap[8];
std::unordered_map<u64, Texture> TextureCache;
@ -260,11 +258,12 @@ void DeInit()
void Reset()
{
PaletteCacheStatus = 0;
memset(PaletteCacheStatus, 0, 2*8);
memset(PaletteDirty, 0, 8*2);
memset(TexturesDirty, 0, 8*8);
TextureCache.clear();
memset(TextureMap, 0, 2*4);
memset(PaletteMap, 0, 8*4);
}
u8* GetTexturePtr(u32 addr, u32 size, u8** unpackBuffer)
@ -289,29 +288,32 @@ u8* GetTexturePtr(u32 addr, u32 size, u8** unpackBuffer)
}
}
void EnsurePaletteCoherent(u64 mask)
void EnsurePaletteCoherent(u64* mask)
{
if ((PaletteCacheStatus & mask) != mask)
for (int i = 0; i < 2; i++)
{
u32 updateField = ~PaletteCacheStatus & mask;
PaletteCacheStatus |= mask;
if ((PaletteCacheStatus[i] & mask[i]) != mask[i])
{
u64 updateField = ~PaletteCacheStatus[i] & mask[i];
PaletteCacheStatus[i] |= mask[i];
while (updateField != 0)
{
updatePalette = true;
int idx = __builtin_ctz(updateField);
u32 map = GPU::VRAMMap_TexPal[idx >> 3];
int idx = __builtin_ctzll(updateField);
u32 map = GPU::VRAMMap_TexPal[idx >> 4 + i * 4];
if (map && (map & (map - 1)) == 0)
{
u32 bank = __builtin_ctz(map);
memcpy(
PaletteCache + idx * 0x800,
GPU::VRAM[bank] + ((idx * 0x800) & GPU::VRAMMask[bank]),
0x800);
PaletteCache + i * 0x10000 + idx * 0x400,
GPU::VRAM[bank] + ((idx * 0x400) & GPU::VRAMMask[bank]),
0x400);
}
else
for (int i = 0; i < 0x800; i += 8)
*(u64*)&PaletteCache[idx * 0x800 + i] = GPU::ReadVRAM_TexPal<u64>(idx * 0x800 + i);
updateField &= ~(1 << idx);
for (int j = 0; j < 0x400; j += 8)
*(u64*)&PaletteCache[i * 0x10000 + idx * 0x400 + j] = GPU::ReadVRAM_TexPal<u64>(i * 0x10000 + idx * 0x400 + j);
updateField &= ~(1ULL << idx);
}
}
}
}
@ -324,6 +326,62 @@ void UpdateTextures()
copyTexture = false;
textureUpdated = false;
u64 PaletteDirty[2] = {0};
u64 TexturesDirty[8] = {0};
for (int i = 0; i < 4; i++)
{
if (GPU::VRAMMap_Texture[i] != TextureMap[i])
{
TexturesDirty[(i << 1)] = 0xFFFFFFFFFFFFFFFF;
TexturesDirty[(i << 1) + 1] = 0xFFFFFFFFFFFFFFFF;
TextureMap[i] = GPU::VRAMMap_Texture[i];
}
else
{
for (int j = 0; j < 4; j++)
{
if (TextureMap[i] & (1<<j))
{
TexturesDirty[(i << 1)] |= GPU::LCDCDirty[j][0];
TexturesDirty[(i << 1) + 1] |= GPU::LCDCDirty[j][1];
GPU::LCDCDirty[j][0] = 0;
GPU::LCDCDirty[j][1] = 0;
}
}
}
}
for (int i = 0; i < 8; i++)
{
if (GPU::VRAMMap_TexPal[i] != PaletteMap[i])
{
PaletteDirty[i >> 2] |= 0xFFFF << (i & 0x3) * 16;
PaletteCacheStatus[i >> 2] &= ~(0xFFFF << (i & 0x3) * 16);
PaletteMap[i] = GPU::VRAMMap_TexPal[i];
}
else
{
// E
if (PaletteMap[i] & (1<<3))
{
PaletteDirty[i >> 2] |= GPU::LCDCDirty[3][0];
PaletteCacheStatus[i >> 2] &= ~GPU::LCDCDirty[3][0];
GPU::LCDCDirty[3][0] = 0;
}
// FG
for (int j = 0; j < 2; j++)
{
if (PaletteMap[i] & (1<<(4+j)))
{
PaletteDirty[i >> 2] |= GPU::LCDCDirty[4+j][0] << (i & 0x3) * 16;
PaletteCacheStatus[i >> 2] &= ~(GPU::LCDCDirty[4+j][0] << (i & 0x3) * 16);
GPU::LCDCDirty[4+j][0] = 0;
}
}
}
}
bool paletteDirty = PaletteDirty[0] | PaletteDirty[1];
bool textureDirty = false;
for (int i = 0; i < 8; i++)
@ -358,23 +416,25 @@ void UpdateTextures()
}
}
inline u64 MakePaletteMask(u32 addr, u32 size)
{
return ((1ULL << (((addr + size + 0x7FF & ~0x7FF) >> 11) - (addr >> 11))) - 1) << (addr >> 11);
}
inline void MakeDirtyMask(u64* out, u32 addr, u32 size)
{
u32 start = addr >> 10;
u32 count = (((addr + size + 0x3FF) & ~0x3FF) >> 10) - start;
u32 startBit = addr >> 10;
u32 bitsCount = ((addr + size + 0x3FF & ~0x3FF) >> 10) - startBit;
u32 firstIdx = start >> 6;
u32 indicesCount = (((count + 0x3F) & ~0x3F) >> 6) - firstIdx;
u32 startEntry = startBit >> 6;
u64 entriesCount = ((startBit + bitsCount + 0x3F & ~0x3F) >> 6) - startEntry;
out[firstIdx] = (1ULL << (63 - (start & 0x3F))) - 1 << (start & 0x3F);
out[firstIdx + indicesCount - 1] = (1ULL << (start & 0x3F)) - 1;
for (int i = firstIdx + 1; i < firstIdx + indicesCount - 1; i++)
out[i] |= 0xFFFFFFFFFFFFFFFF;
if (entriesCount > 1)
{
out[startEntry] |= 0xFFFFFFFFFFFFFFFF << (startBit & 0x3F);
out[startEntry + entriesCount - 1] |= (1ULL << (startBit & 0x3F)) - 1;
for (int i = startEntry + 1; i < startEntry + entriesCount - 1; i++)
out[i] = 0xFFFFFFFFFFFFFFFF;
}
else
{
out[startEntry] |= ((1ULL << bitsCount) - 1) << (startBit & 0x3F);
}
}
template <int format>
@ -427,8 +487,7 @@ ExternalTexHandle GetTexture(u32 texParam, u32 palBase)
MakeDirtyMask(texture.TextureMask, slot1addr, width*height/16*2);
MakeDirtyMask(texture.PaletteMask, palBase*16, 0x10000);
u64 paletteMask = MakePaletteMask(palBase*16, 0x10000);
EnsurePaletteCoherent(MakePaletteMask(palBase*16, 0x10000));
EnsurePaletteCoherent(texture.PaletteMask);
u16* palData = (u16*)(PaletteCache + palBase*16);
ConvertCompressedTexture<format>(width, height, data, texData, texAuxData, palData);
@ -445,19 +504,20 @@ ExternalTexHandle GetTexture(u32 texParam, u32 palBase)
case 4: texSize = width*height; palSize = 256; break;
}
u8* texData = GetTexturePtr(addr, texSize, &unpackBuffer);
EnsurePaletteCoherent(MakePaletteMask(palAddr, palSize*2));
u16* palData = (u16*)(PaletteCache + palAddr);
MakeDirtyMask(texture.TextureMask, addr, texSize);
MakeDirtyMask(texture.PaletteMask, palAddr, palSize);
EnsurePaletteCoherent(texture.PaletteMask);
u8* texData = GetTexturePtr(addr, texSize, &unpackBuffer);
u16* palData = (u16*)(PaletteCache + palAddr);
bool color0Transparent = texParam & (1 << 29);
switch (fmt)
{
case 1: ConvertAXIYTexture<format, 3, 5>(width, height, data, texData, palData, color0Transparent); break;
case 6: ConvertAXIYTexture<format, 5, 3>(width, height, data, texData, palData, color0Transparent); break;
case 1: ConvertAXIYTexture<format, 3, 5>(width, height, data, texData, palData); break;
case 6: ConvertAXIYTexture<format, 5, 3>(width, height, data, texData, palData); break;
case 2: ConvertNColorsTexture<format, 2>(width, height, data, texData, palData, color0Transparent); break;
case 3: ConvertNColorsTexture<format, 4>(width, height, data, texData, palData, color0Transparent); break;
case 4: ConvertNColorsTexture<format, 8>(width, height, data, texData, palData, color0Transparent); break;
@ -493,19 +553,8 @@ void SaveTextures()
//printf("%d %d textures converted %d pixels %d %d %d\n", converted, TextureCache.size(), pixelsConverted, updatePalette, copyTexture, textureUpdated);
}
void InvalidateTexSlot(u32 base)
{
TexturesDirty[(base << 1)] = 0xFFFFFFFFFFFFFFFF;
TexturesDirty[(base << 1) + 1] = 0xFFFFFFFFFFFFFFFF;
}
void InvalidatePalSlot(u32 base)
{
PaletteDirty[base >> 2] |= 0xFFFF << (base & 0x3) * 16;
PaletteCacheStatus &= ~(0xFF << base * 8);
}
}
}
template GPU3D::TexCache::ExternalTexHandle