#ifndef GPU3D_TEXCACHE #define GPU3D_TEXCACHE #include "types.h" #include "GPU.h" #include #include #include #define XXH_STATIC_LINKING_ONLY #include "xxhash/xxhash.h" namespace melonDS { inline u32 TextureWidth(u32 texparam) { return 8 << ((texparam >> 20) & 0x7); } inline u32 TextureHeight(u32 texparam) { return 8 << ((texparam >> 23) & 0x7); } enum { outputFmt_RGB6A5, outputFmt_RGBA8, outputFmt_BGRA8 }; template void ConvertBitmapTexture(u32 width, u32 height, u32* output, u32 addr, GPU& gpu); template void ConvertCompressedTexture(u32 width, u32 height, u32* output, u32 addr, u32 addrAux, u32 palAddr, GPU& gpu); template void ConvertAXIYTexture(u32 width, u32 height, u32* output, u32 addr, u32 palAddr, GPU& gpu); template void ConvertNColorsTexture(u32 width, u32 height, u32* output, u32 addr, u32 palAddr, bool color0Transparent, GPU& gpu); template class Texcache { public: Texcache(const TexLoaderT& texloader) : TexLoader(texloader) // probably better if this would be a move constructor??? {} u64 MaskedHash(u8* vram, u32 vramSize, u32 addr, u32 size) { u64 hash = 0; while (size > 0) { u32 pieceSize; if (addr + size > vramSize) // wraps around, only do the part inside pieceSize = vramSize - addr; else // fits completely inside pieceSize = size; hash = XXH64(&vram[addr], pieceSize, hash); addr += pieceSize; addr &= (vramSize - 1); assert(size >= pieceSize); size -= pieceSize; } return hash; } bool CheckInvalid(u32 start, u32 size, u64 oldHash, u64* dirty, u8* vram, u32 vramSize) { u32 startBit = start / VRAMDirtyGranularity; u32 bitsCount = ((start + size + VRAMDirtyGranularity - 1) / VRAMDirtyGranularity) - startBit; u32 startEntry = startBit >> 6; u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; for (u32 j = startEntry; j < startEntry + entriesCount; j++) { if (GetRangedBitMask(j, startBit, bitsCount) & dirty[j & ((vramSize / VRAMDirtyGranularity)-1)]) { if (MaskedHash(vram, vramSize, start, size) != oldHash) return true; } } return false; } bool Update(GPU& gpu) { auto textureDirty = gpu.VRAMDirty_Texture.DeriveState(gpu.VRAMMap_Texture, gpu); auto texPalDirty = gpu.VRAMDirty_TexPal.DeriveState(gpu.VRAMMap_TexPal, gpu); bool textureChanged = gpu.MakeVRAMFlat_TextureCoherent(textureDirty); bool texPalChanged = gpu.MakeVRAMFlat_TexPalCoherent(texPalDirty); if (textureChanged || texPalChanged) { //printf("check invalidation %d\n", TexCache.size()); for (auto it = Cache.begin(); it != Cache.end();) { TexCacheEntry& entry = it->second; if (textureChanged) { for (u32 i = 0; i < 2; i++) { if (CheckInvalid(entry.TextureRAMStart[i], entry.TextureRAMSize[i], entry.TextureHash[i], textureDirty.Data, gpu.VRAMFlat_Texture, sizeof(gpu.VRAMFlat_Texture))) goto invalidate; } } if (texPalChanged && entry.TexPalSize > 0) { if (CheckInvalid(entry.TexPalStart, entry.TexPalSize, entry.TexPalHash, texPalDirty.Data, gpu.VRAMFlat_TexPal, sizeof(gpu.VRAMFlat_TexPal))) goto invalidate; } it++; continue; invalidate: FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture); //printf("invalidating texture %d\n", entry.ImageDescriptor); it = Cache.erase(it); } return true; } return false; } void GetTexture(GPU& gpu, u32 texParam, u32 palBase, TexHandleT& textureHandle, u32& layer, u32*& helper) { // remove sampling and texcoord gen params texParam &= ~0xC00F0000; u32 fmt = (texParam >> 26) & 0x7; u64 key = texParam; if (fmt != 7) { key |= (u64)palBase << 32; if (fmt == 5) key &= ~((u64)1 << 29); } //printf("%" PRIx64 " %" PRIx32 " %" PRIx32 "\n", key, texParam, palBase); assert(fmt != 0 && "no texture is not a texture format!"); auto it = Cache.find(key); if (it != Cache.end()) { textureHandle = it->second.Texture.TextureID; layer = it->second.Texture.Layer; helper = &it->second.LastVariant; return; } u32 widthLog2 = (texParam >> 20) & 0x7; u32 heightLog2 = (texParam >> 23) & 0x7; u32 width = 8 << widthLog2; u32 height = 8 << heightLog2; u32 addr = (texParam & 0xFFFF) * 8; TexCacheEntry entry = {0}; entry.TextureRAMStart[0] = addr; entry.WidthLog2 = widthLog2; entry.HeightLog2 = heightLog2; // apparently a new texture if (fmt == 7) { entry.TextureRAMSize[0] = width*height*2; ConvertBitmapTexture(width, height, DecodingBuffer, addr, gpu); } else if (fmt == 5) { u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1); if (addr >= 0x40000) slot1addr += 0x10000; entry.TextureRAMSize[0] = width*height/16*4; entry.TextureRAMStart[1] = slot1addr; entry.TextureRAMSize[1] = width*height/16*2; entry.TexPalStart = palBase*16; entry.TexPalSize = 0x10000; ConvertCompressedTexture(width, height, DecodingBuffer, addr, slot1addr, entry.TexPalStart, gpu); } else { u32 texSize, palAddr = palBase*16, numPalEntries; switch (fmt) { case 1: texSize = width*height; numPalEntries = 32; break; case 6: texSize = width*height; numPalEntries = 8; break; case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break; case 3: texSize = width*height/2; numPalEntries = 16; break; case 4: texSize = width*height; numPalEntries = 256; break; } palAddr &= 0x1FFFF; /*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr); svcSleepThread(1000*1000);*/ entry.TextureRAMSize[0] = texSize; entry.TexPalStart = palAddr; entry.TexPalSize = numPalEntries*2; //assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024); bool color0Transparent = texParam & (1 << 29); switch (fmt) { case 1: ConvertAXIYTexture(width, height, DecodingBuffer, addr, palAddr, gpu); break; case 6: ConvertAXIYTexture(width, height, DecodingBuffer, addr, palAddr, gpu); break; case 2: ConvertNColorsTexture(width, height, DecodingBuffer, addr, palAddr, color0Transparent, gpu); break; case 3: ConvertNColorsTexture(width, height, DecodingBuffer, addr, palAddr, color0Transparent, gpu); break; case 4: ConvertNColorsTexture(width, height, DecodingBuffer, addr, palAddr, color0Transparent, gpu); break; } } for (int i = 0; i < 2; i++) { if (entry.TextureRAMSize[i]) entry.TextureHash[i] = MaskedHash(gpu.VRAMFlat_Texture, sizeof(gpu.VRAMFlat_Texture), entry.TextureRAMStart[i], entry.TextureRAMSize[i]); } if (entry.TexPalSize) entry.TexPalHash = MaskedHash(gpu.VRAMFlat_TexPal, sizeof(gpu.VRAMFlat_TexPal), entry.TexPalStart, entry.TexPalSize); auto& texArrays = TexArrays[widthLog2][heightLog2]; auto& freeTextures = FreeTextures[widthLog2][heightLog2]; if (freeTextures.size() == 0) { texArrays.resize(texArrays.size()+1); TexHandleT& array = texArrays[texArrays.size()-1]; u32 layers = std::min((8*1024*1024) / (width*height*4), 64); // allocate new array texture //printf("allocating new layer set for %d %d %d %d\n", width, height, texArrays.size()-1, array.ImageDescriptor); array = TexLoader.GenerateTexture(width, height, layers); for (u32 i = 0; i < layers; i++) { freeTextures.push_back(TexArrayEntry{array, i}); } } TexArrayEntry storagePlace = freeTextures[freeTextures.size()-1]; freeTextures.pop_back(); entry.Texture = storagePlace; TexLoader.UploadTexture(storagePlace.TextureID, width, height, storagePlace.Layer, DecodingBuffer); //printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor); textureHandle = storagePlace.TextureID; layer = storagePlace.Layer; helper = &Cache.emplace(std::make_pair(key, entry)).first->second.LastVariant; } void Reset() { for (u32 i = 0; i < 8; i++) { for (u32 j = 0; j < 8; j++) { for (u32 k = 0; k < TexArrays[i][j].size(); k++) TexLoader.DeleteTexture(TexArrays[i][j][k]); TexArrays[i][j].clear(); FreeTextures[i][j].clear(); } } Cache.clear(); } private: struct TexArrayEntry { TexHandleT TextureID; u32 Layer; }; struct TexCacheEntry { u32 LastVariant; // very cheap way to make variant lookup faster u32 TextureRAMStart[2], TextureRAMSize[2]; u32 TexPalStart, TexPalSize; u8 WidthLog2, HeightLog2; TexArrayEntry Texture; u64 TextureHash[2]; u64 TexPalHash; }; std::unordered_map Cache; TexLoaderT TexLoader; std::vector FreeTextures[8][8]; std::vector TexArrays[8][8]; u32 DecodingBuffer[1024*1024]; }; } #endif