diff --git a/Source/Core/VideoCommon/Src/DataReader.h b/Source/Core/VideoCommon/Src/DataReader.h index cd6371a452..915bd8d68e 100644 --- a/Source/Core/VideoCommon/Src/DataReader.h +++ b/Source/Core/VideoCommon/Src/DataReader.h @@ -20,17 +20,22 @@ extern u8* g_pVideoData; -inline u8 DataPeek8(u32 _uOffset) +inline void DataSkip(u32 skip) +{ + g_pVideoData += skip; +} + +inline u8 DataPeek8(int _uOffset) { return g_pVideoData[_uOffset]; } -inline u16 DataPeek16(u32 _uOffset) +inline u16 DataPeek16(int _uOffset) { return Common::swap16(*(u16*)&g_pVideoData[_uOffset]); } -inline u32 DataPeek32(u32 _uOffset) +inline u32 DataPeek32(int _uOffset) { return Common::swap32(*(u32*)&g_pVideoData[_uOffset]); } @@ -118,9 +123,4 @@ inline u8* DataGetPosition() return g_pVideoData; } -inline void DataSkip(u32 skip) -{ - g_pVideoData += skip; -} - #endif diff --git a/Source/Core/VideoCommon/Src/Fifo.cpp b/Source/Core/VideoCommon/Src/Fifo.cpp index 9be1bd2288..6dcc6181f1 100644 --- a/Source/Core/VideoCommon/Src/Fifo.cpp +++ b/Source/Core/VideoCommon/Src/Fifo.cpp @@ -97,7 +97,8 @@ void Fifo_ExitLoop() // May be executed from any thread, even the graphics thread. // Created to allow for self shutdown. -void Fifo_ExitLoopNonBlocking() { +void Fifo_ExitLoopNonBlocking() +{ fifoStateRun = false; } @@ -118,7 +119,7 @@ void Fifo_SendFifoData(u8* _uData, u32 len) // Copy new video instructions to videoBuffer for future use in rendering the new picture memcpy(videoBuffer + size, _uData, len); size += len; - OpcodeDecoder_Run(); + OpcodeDecoder_Run(g_bSkipCurrentFrame); } // Description: Main FIFO update loop @@ -146,7 +147,7 @@ void Fifo_EnterLoop(const SVideoInitialize &video_initialize) while (_fifo.bFF_GPReadEnable && _fifo.CPReadWriteDistance) { - if(!fifoStateRun) + if (!fifoStateRun) break; // Create pointer to video data and send it to the VideoPlugin diff --git a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp index 138cc93fd7..e631d9957f 100644 --- a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp @@ -19,10 +19,11 @@ // Ikaruga uses (nearly) NO display lists! // Zelda WW uses TONS of display lists // Zelda TP uses almost 100% display lists except menus (we like this!) +// Super Mario Galaxy has nearly all geometry and more than half of the state in DLs (great!) // Note that it IS NOT GENERALLY POSSIBLE to precompile display lists! You can compile them as they are -// and hope that the vertex format doesn't change, though, if you do it just when they are -// called. The reason is that the vertex format affects the sizes of the vertices. +// while interpreting them, and hope that the vertex format doesn't change, though, if you do it right +// when they are called. The reason is that the vertex format affects the sizes of the vertices. #include "Common.h" #include "VideoCommon.h" @@ -47,13 +48,12 @@ extern u8* FAKE_GetFifoEndPtr(); static void Decode(); -static void ExecuteDisplayList(u32 address, u32 size) +void InterpretDisplayList(u32 address, u32 size) { u8* old_pVideoData = g_pVideoData; - u8* startAddress = Memory_GetPtr(address); - //Avoid the crash if Memory_GetPtr failed .. + // Avoid the crash if Memory_GetPtr failed .. if (startAddress != 0) { g_pVideoData = startAddress; @@ -61,7 +61,8 @@ static void ExecuteDisplayList(u32 address, u32 size) // temporarily swap dl and non-dl (small "hack" for the stats) Statistics::SwapDL(); - while ((u32)(g_pVideoData - startAddress) < size) + u8 *end = g_pVideoData + size; + while (g_pVideoData < end) { Decode(); } @@ -76,48 +77,60 @@ static void ExecuteDisplayList(u32 address, u32 size) g_pVideoData = old_pVideoData; } +// Defer to plugin-specific DL cache. +extern bool HandleDisplayList(u32 address, u32 size); + +void ExecuteDisplayList(u32 address, u32 size) +{ + if (!HandleDisplayList(address, size)) + InterpretDisplayList(address, size); +} + bool FifoCommandRunnable() { - u32 iBufferSize = (u32)(FAKE_GetFifoEndPtr() - g_pVideoData); - if (iBufferSize == 0) + u32 buffer_size = (u32)(FAKE_GetFifoEndPtr() - g_pVideoData); + if (buffer_size == 0) return false; // can't peek - u8 Cmd = DataPeek8(0); - u32 iCommandSize = 0; + u8 cmd_byte = DataPeek8(0); + u32 command_size = 0; - switch (Cmd) + switch (cmd_byte) { case GX_NOP: // Hm, this means that we scan over nop streams pretty slowly... case GX_CMD_INVL_VC: // Invalidate Vertex Cache - no parameters - case 0x44: // zelda 4 swords calls it and checks the metrics registers after that - iCommandSize = 1; + case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that + command_size = 1; + break; + + case GX_LOAD_BP_REG: + command_size = 5; break; case GX_LOAD_CP_REG: - iCommandSize = 6; + command_size = 6; break; case GX_LOAD_INDX_A: case GX_LOAD_INDX_B: case GX_LOAD_INDX_C: case GX_LOAD_INDX_D: - case GX_LOAD_BP_REG: - iCommandSize = 5; + command_size = 5; break; case GX_CMD_CALL_DL: - iCommandSize = 9; + command_size = 9; break; case GX_LOAD_XF_REG: { // check if we can read the header - if (iBufferSize >= 5) + if (buffer_size >= 5) { - iCommandSize = 1 + 4; + command_size = 1 + 4; u32 Cmd2 = DataPeek32(1); - int dwTransferSize = ((Cmd2 >> 16) & 15) + 1; - iCommandSize += dwTransferSize * 4; + int transfer_size = ((Cmd2 >> 16) & 15) + 1; + command_size += transfer_size * 4; } else { @@ -127,14 +140,14 @@ bool FifoCommandRunnable() break; default: - if (Cmd & 0x80) + if (cmd_byte & 0x80) { // check if we can read the header - if (iBufferSize >= 3) + if (buffer_size >= 3) { - iCommandSize = 1 + 2; + command_size = 1 + 2; u16 numVertices = DataPeek16(1); - iCommandSize += numVertices * VertexLoaderManager::GetVertexSize(Cmd & GX_VAT_MASK); + command_size += numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK); } else { @@ -151,14 +164,14 @@ bool FifoCommandRunnable() "* Command stream corrupted by some spurious memory bug\n" "* This really is an unknown opcode (unlikely)\n" "* Some other sort of bug\n\n" - "Dolphin will now likely crash or hang. Enjoy." , Cmd); + "Dolphin will now likely crash or hang. Enjoy." , cmd_byte); g_VideoInitialize.pSysMessage(szTemp); g_VideoInitialize.pLog(szTemp, TRUE); { SCPFifoStruct &fifo = *g_VideoInitialize.pCPFifo; char szTmp[256]; - // sprintf(szTmp, "Illegal command %02x (at %08x)",Cmd,g_pDataReader->GetPtr()); + // sprintf(szTmp, "Illegal command %02x (at %08x)",cmd_byte,g_pDataReader->GetPtr()); sprintf(szTmp, "Illegal command %02x\n" "CPBase: 0x%08x\n" "CPEnd: 0x%08x\n" @@ -172,42 +185,39 @@ bool FifoCommandRunnable() "bFF_BPEnable: %s\n" "bFF_GPLinkEnable: %s\n" "bFF_Breakpoint: %s\n" - ,Cmd, fifo.CPBase, fifo.CPEnd, fifo.CPHiWatermark, fifo.CPLoWatermark, fifo.CPReadWriteDistance + ,cmd_byte, fifo.CPBase, fifo.CPEnd, fifo.CPHiWatermark, fifo.CPLoWatermark, fifo.CPReadWriteDistance ,fifo.CPWritePointer, fifo.CPReadPointer, fifo.CPBreakpoint, fifo.bFF_GPReadEnable ? "true" : "false" ,fifo.bFF_BPEnable ? "true" : "false" ,fifo.bFF_GPLinkEnable ? "true" : "false" ,fifo.bFF_Breakpoint ? "true" : "false"); g_VideoInitialize.pSysMessage(szTmp); g_VideoInitialize.pLog(szTmp, TRUE); - // _assert_msg_(0,szTmp,""); - } } break; } - if (iCommandSize > iBufferSize) + if (command_size > buffer_size) return false; - // INFO_LOG("OP detected: Cmd 0x%x size %i buffer %i",Cmd, iCommandSize, iBufferSize); + // INFO_LOG("OP detected: cmd_byte 0x%x size %i buffer %i",cmd_byte, command_size, buffer_size); return true; } static void Decode() { - int Cmd = DataReadU8(); - - switch(Cmd) + int cmd_byte = DataReadU8(); + switch (cmd_byte) { case GX_NOP: break; case GX_LOAD_CP_REG: //0x08 { - u32 SubCmd = DataReadU8(); - u32 Value = DataReadU32(); - LoadCPReg(SubCmd, Value); + u8 sub_cmd = DataReadU8(); + u32 value = DataReadU32(); + LoadCPReg(sub_cmd, value); INCSTAT(stats.thisFrame.numCPLoads); } break; @@ -215,13 +225,13 @@ static void Decode() case GX_LOAD_XF_REG: { u32 Cmd2 = DataReadU32(); - int dwTransferSize = ((Cmd2 >> 16) & 15) + 1; - u32 dwAddress = Cmd2 & 0xFFFF; + int transfer_size = ((Cmd2 >> 16) & 15) + 1; + u32 address = Cmd2 & 0xFFFF; // TODO - speed this up. pshufb? - static u32 pData[16]; - for (int i = 0; i < dwTransferSize; i++) - pData[i] = DataReadU32(); - LoadXFReg(dwTransferSize, dwAddress, pData); + u32 data_buffer[16]; + for (int i = 0; i < transfer_size; i++) + data_buffer[i] = DataReadU32(); + LoadXFReg(transfer_size, address, data_buffer); INCSTAT(stats.thisFrame.numXFLoads); } break; @@ -241,13 +251,13 @@ static void Decode() case GX_CMD_CALL_DL: { - u32 dwAddr = DataReadU32(); - u32 dwCount = DataReadU32(); - ExecuteDisplayList(dwAddr, dwCount); + u32 address = DataReadU32(); + u32 count = DataReadU32(); + ExecuteDisplayList(address, count); } break; - case 0x44: // zelda 4 swords calls it and checks the metrics registers after that + case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that DEBUG_LOG(VIDEO, "GX 0x44: %08x", Cmd); break; @@ -257,31 +267,107 @@ static void Decode() case GX_LOAD_BP_REG: //0x61 { - u32 cmd = DataReadU32(); - LoadBPReg(cmd); + u32 bp_cmd = DataReadU32(); + LoadBPReg(bp_cmd); INCSTAT(stats.thisFrame.numBPLoads); } break; // draw primitives default: - if (Cmd & 0x80) + if (cmd_byte & 0x80) { // load vertices (use computed vertex size from FifoCommandRunnable above) u16 numVertices = DataReadU16(); VertexLoaderManager::RunVertices( - Cmd & GX_VAT_MASK, // Vertex loader index (0 - 7) - (Cmd & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, numVertices); } else { - // char szTmp[256]; - //sprintf(szTmp, "Illegal command %02x (at %08x)",Cmd,g_pDataReader->GetPtr()); - //g_VideoInitialize.pLog(szTmp); - //MessageBox(0,szTmp,"GFX ERROR",0); - // _assert_msg_(0,szTmp,""); + ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte); + break; + } + break; + } +} + +static void DecodeSemiNop() +{ + int cmd_byte = DataReadU8(); + switch (cmd_byte) + { + case GX_CMD_UNKNOWN_METRICS: // zelda 4 swords calls it and checks the metrics registers after that + case GX_CMD_INVL_VC: // Invalidate Vertex Cache + case GX_NOP: + break; + + case GX_LOAD_CP_REG: //0x08 + // We have to let CP writes through because they determine the size of vertices. + { + u8 sub_cmd = DataReadU8(); + u32 value = DataReadU32(); + LoadCPReg(sub_cmd, value); + INCSTAT(stats.thisFrame.numCPLoads); + } + break; + + case GX_LOAD_XF_REG: + { + u32 Cmd2 = DataReadU32(); + int transfer_size = ((Cmd2 >> 16) & 15) + 1; + u32 address = Cmd2 & 0xFFFF; + // TODO - speed this up. pshufb? + u32 data_buffer[16]; + for (int i = 0; i < transfer_size; i++) + data_buffer[i] = DataReadU32(); + LoadXFReg(transfer_size, address, data_buffer); + INCSTAT(stats.thisFrame.numXFLoads); + } + break; + + case GX_LOAD_INDX_A: //used for position matrices + LoadIndexedXF(DataReadU32(), 0xC); + break; + case GX_LOAD_INDX_B: //used for normal matrices + LoadIndexedXF(DataReadU32(), 0xD); + break; + case GX_LOAD_INDX_C: //used for postmatrices + LoadIndexedXF(DataReadU32(), 0xE); + break; + case GX_LOAD_INDX_D: //used for lights + LoadIndexedXF(DataReadU32(), 0xF); + break; + + case GX_CMD_CALL_DL: + // Hm, wonder if any games put tokens in display lists - in that case, + // we'll have to parse them too. + DataSkip(8); + break; + + case GX_LOAD_BP_REG: //0x61 + // We have to let BP writes through because they set tokens and stuff. + // TODO: Call a much simplified LoadBPReg instead. + { + u32 bp_cmd = DataReadU32(); + LoadBPReg(bp_cmd); + INCSTAT(stats.thisFrame.numBPLoads); + } + break; + + // draw primitives + default: + if (cmd_byte & 0x80) + { + // load vertices (use computed vertex size from FifoCommandRunnable above) + u16 numVertices = DataReadU16(); + DataSkip(numVertices * VertexLoaderManager::GetVertexSize(cmd_byte & GX_VAT_MASK)); + } + else + { + ERROR_LOG(VIDEO, "OpcodeDecoding::Decode: Illegal command %02x", cmd_byte); break; } break; @@ -298,13 +384,17 @@ void OpcodeDecoder_Shutdown() { } -void OpcodeDecoder_Run() +void OpcodeDecoder_Run(bool skipped_frame) { - DVSTARTPROFILE(); - while (FifoCommandRunnable()) - { - //TODO?: if really needed, do something like this: "InterlockedExchange((LONG*)&_fifo.CPCmdIdle, 0);" - Decode(); - } - //TODO?: if really needed, do something like this: "InterlockedExchange((LONG*)&_fifo.CPCmdIdle, 1);" -} + DVSTARTPROFILE(); + if (!skipped_frame) + { + while (FifoCommandRunnable()) + Decode(); + } + else + { + while (FifoCommandRunnable()) + DecodeSemiNop(); + } +} \ No newline at end of file diff --git a/Source/Core/VideoCommon/Src/OpcodeDecoding.h b/Source/Core/VideoCommon/Src/OpcodeDecoding.h index cb724dbc06..b4472927ba 100644 --- a/Source/Core/VideoCommon/Src/OpcodeDecoding.h +++ b/Source/Core/VideoCommon/Src/OpcodeDecoding.h @@ -29,6 +29,7 @@ #define GX_LOAD_INDX_D 0x38 #define GX_CMD_CALL_DL 0x40 +#define GX_CMD_UNKNOWN_METRICS 0x44 #define GX_CMD_INVL_VC 0x48 #define GX_PRIMITIVE_MASK 0x78 @@ -46,6 +47,6 @@ void OpcodeDecoder_Init(); void OpcodeDecoder_Shutdown(); -void OpcodeDecoder_Run(); +void OpcodeDecoder_Run(bool skipped_frame); #endif // _OPCODE_DECODING_H diff --git a/Source/Core/VideoCommon/Src/PixelShaderManager.cpp b/Source/Core/VideoCommon/Src/PixelShaderManager.cpp index 7bdaa36cbc..c0d8a7e4c6 100644 --- a/Source/Core/VideoCommon/Src/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/Src/PixelShaderManager.cpp @@ -234,6 +234,7 @@ void PixelShaderManager::SetPSTextureDims(int texid) SetPSConstant4fv(C_TEXDIMS + texid, fdims); } +// This one is high in profiles (0.5%) void PixelShaderManager::SetColorChanged(int type, int num) { int r = bpmem.tevregs[num].low.a; @@ -241,10 +242,10 @@ void PixelShaderManager::SetColorChanged(int type, int num) int b = bpmem.tevregs[num].high.a; int g = bpmem.tevregs[num].high.b; float *pf = &lastRGBAfull[type][num][0]; - pf[0] = (float)r / 255.0f; - pf[1] = (float)g / 255.0f; - pf[2] = (float)b / 255.0f; - pf[3] = (float)a / 255.0f; + pf[0] = (float)r * (1.0f / 255.0f); + pf[1] = (float)g * (1.0f / 255.0f); + pf[2] = (float)b * (1.0f / 255.0f); + pf[3] = (float)a * (1.0f / 255.0f); s_nColorsChanged[type] |= 1 << num; PRIM_LOG("pixel %scolor%d: %f %f %f %f\n", type?"k":"", num, pf[0], pf[1], pf[2], pf[3]); } diff --git a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp index 3d33501446..3739bb18c7 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp @@ -289,6 +289,7 @@ void LOADERDECL TexCoord_ReadIndex16_Short1() } void LOADERDECL TexCoord_ReadIndex16_Short2() { + // Heavy in ZWW u16 Index = DataReadU16(); const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); ((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex]; diff --git a/Source/Plugins/Plugin_DSP_HLE/Src/UCodes/UCode_Zelda_ADPCM.cpp b/Source/Plugins/Plugin_DSP_HLE/Src/UCodes/UCode_Zelda_ADPCM.cpp index f5b6eff99c..78c56fde3c 100644 --- a/Source/Plugins/Plugin_DSP_HLE/Src/UCodes/UCode_Zelda_ADPCM.cpp +++ b/Source/Plugins/Plugin_DSP_HLE/Src/UCodes/UCode_Zelda_ADPCM.cpp @@ -18,33 +18,26 @@ #include "Common.h" #include "UCode_Zelda.h" -void CUCode_Zelda::AFCdecodebuffer(const s16 *coef, const char *input, signed short *out, short *histp, short *hist2p, int type) +void CUCode_Zelda::AFCdecodebuffer(const s16 *coef, const char *src, signed short *out, short *histp, short *hist2p, int type) { - short nibbles[16]; - short hist = *histp; - short hist2 = *hist2p; - - const char *src = input; - char *dst = (char*)out; - // First 2 nibbles are ADPCM scale etc. short delta = 1 << (((*src) >> 4) & 0xf); short idx = (*src) & 0xf; src++; + short nibbles[16]; if (type == 9) { - for (int i = 0; i < 16; i = i + 2) { - int j = (*src & 255) >> 4; - nibbles[i] = j; - j = *src & 255 & 15; - nibbles[i+1] = j; + for (int i = 0; i < 16; i += 2) + { + nibbles[i + 0] = *src >> 4; + nibbles[i + 1] = *src & 15; src++; } - - for (int i = 0; i < 16; i = i + 1) { + for (int i = 0; i < 16; i++) { if (nibbles[i] >= 8) nibbles[i] = nibbles[i] - 16; + nibbles[i] <<= 11; } } else @@ -52,45 +45,33 @@ void CUCode_Zelda::AFCdecodebuffer(const s16 *coef, const char *input, signed sh // In Pikmin, Dolphin's engine sound is using AFC 5bits, even though such a sound is hard // to compare, it seems like to sound exactly like a real GC DEBUG_LOG(DSPHLE, "5 bits AFC sample"); - for (int i = 0; i < 16; i += 4) { - int j = (*src >> 0) & 0x02; - nibbles[i] = j; - - j = (*src >> 2) & 0x02; - nibbles[i+1] = j; - - j = (*src >> 4) & 0x02; - nibbles[i+2] = j; - - j = (*src >> 6) & 0x02; - nibbles[i+3] = j; - + nibbles[i + 0] = (*src >> 6) & 0x02; + nibbles[i + 1] = (*src >> 4) & 0x02; + nibbles[i + 2] = (*src >> 2) & 0x02; + nibbles[i + 3] = (*src >> 0) & 0x02; src++; } - for (int i = 0; i < 16; i++) { if (nibbles[i] >= 2) nibbles[i] = nibbles[i] - 4; + nibbles[i] <<= 13; } } + short hist = *histp; + short hist2 = *hist2p; for (int i = 0; i < 16; i++) { - int sample = (delta * nibbles[i]) << 11; - sample += ((long)hist * coef[idx * 2]) + ((long)hist2 * coef[idx * 2 + 1]); - sample = sample >> 11; - - if (sample > 32767) { + int sample = delta * nibbles[i] + ((long)hist * coef[idx * 2]) + ((long)hist2 * coef[idx * 2 + 1]); + sample >>= 11; + if (sample > 32767) sample = 32767; - } - if (sample < -32768) { + if (sample < -32768) sample = -32768; - } - *(short*)dst = (short)sample; - dst = dst + 2; + out[i] = sample; hist2 = hist; hist = (short)sample; } diff --git a/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcproj b/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcproj index 5bb422b98e..9394893e72 100644 --- a/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcproj +++ b/Source/Plugins/Plugin_VideoOGL/Plugin_VideoOGL.vcproj @@ -720,6 +720,14 @@ RelativePath=".\Src\BPFunctions.cpp" > + + + + diff --git a/Source/Plugins/Plugin_VideoOGL/Src/DLCache.cpp b/Source/Plugins/Plugin_VideoOGL/Src/DLCache.cpp new file mode 100644 index 0000000000..fefa2d064b --- /dev/null +++ b/Source/Plugins/Plugin_VideoOGL/Src/DLCache.cpp @@ -0,0 +1,564 @@ +// Copyright (C) 2003-2009 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// TODO: Handle cache-is-full condition :p + +#include + +#include "Common.h" +#include "VideoCommon.h" +#include "Hash.h" +#include "MemoryUtil.h" +#include "DataReader.h" +#include "Statistics.h" +#include "OpcodeDecoding.h" // For the GX_ constants. + +#include "XFMemory.h" +#include "CPMemory.h" +#include "BPMemory.h" + +#include "VertexManager.h" +#include "VertexLoaderManager.h" + +#include "x64Emitter.h" +#include "ABI.h" + +#include "DLCache.h" + +#define DL_CODE_CACHE_SIZE (1024*1024*16) +#define DL_STATIC_DATA_SIZE (1024*1024*4) +extern int frameCount; + +using namespace Gen; + +namespace DLCache +{ + +// Currently just recompiles the DLs themselves, doesn't bother with the vertex data. +// The speed boost is pretty small. The real big boost will come when we also store +// vertex arrays in the cached DLs. + +enum DisplayListPass { + DLPASS_ANALYZE, + DLPASS_COMPILE, + DLPASS_RUN, +}; + +struct VDataHashRegion +{ + u32 hash; + u32 start_address; + int size; +}; + +struct CachedDisplayList +{ + CachedDisplayList() + : uncachable(false), + pass(DLPASS_ANALYZE), + next_check(1) + { + frame_count = frameCount; + } + + int pass; + u32 dl_hash; + + int check; + int next_check; + + u32 vdata_hash; + + std::vector hash_regions; + + int frame_count; + + bool uncachable; // if set, this DL will always be interpreted. This gets set if hash ever changes. + + // ... Something containing cached vertex buffers here ... + + // Compile the commands themselves down to native code. + const u8 *compiled_code; +}; + +// We want to allow caching DLs that start at the same address but have different lengths, +// so the size has to be in the ID. +inline u64 CreateMapId(u32 address, u32 size) +{ + return ((u64)address << 32) | size; +} + +typedef std::map DLMap; + +static DLMap dl_map; +static u8 *dlcode_cache; +static u8 *static_data_buffer; +static u8 *static_data_ptr; + +static Gen::XEmitter emitter; + +// Everything gets free'd when the cache is cleared. +u8 *AllocStaticData(int size) +{ + u8 *cur_ptr = static_data_ptr; + static_data_ptr += (size + 3) & ~3; + return cur_ptr; +} + +// First pass - analyze +bool AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl) +{ + int num_xf_reg = 0; + int num_cp_reg = 0; + int num_bp_reg = 0; + int num_index_xf = 0; + int num_draw_call = 0; + + u8 *old_datareader = g_pVideoData; + g_pVideoData = Memory_GetPtr(address); + + u8 *end = g_pVideoData + size; + while (g_pVideoData < end) + { + // Yet another reimplementation of the DL reading... + int cmd_byte = DataReadU8(); + switch (cmd_byte) + { + case GX_NOP: + break; + + case GX_LOAD_CP_REG: //0x08 + { + // Execute + u8 sub_cmd = DataReadU8(); + u32 value = DataReadU32(); + LoadCPReg(sub_cmd, value); + INCSTAT(stats.thisFrame.numCPLoads); + + // Analyze + num_cp_reg++; + } + break; + + case GX_LOAD_XF_REG: + { + // Execute + u32 Cmd2 = DataReadU32(); + int transfer_size = ((Cmd2 >> 16) & 15) + 1; + u32 address = Cmd2 & 0xFFFF; + // TODO - speed this up. pshufb? + u32 data_buffer[16]; + for (int i = 0; i < transfer_size; i++) + data_buffer[i] = DataReadU32(); + LoadXFReg(transfer_size, address, data_buffer); + INCSTAT(stats.thisFrame.numXFLoads); + + // Analyze + num_xf_reg++; + } + break; + + case GX_LOAD_INDX_A: //used for position matrices + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xC); + // Analyze + num_index_xf++; + } + break; + case GX_LOAD_INDX_B: //used for normal matrices + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xD); + // Analyze + num_index_xf++; + } + break; + case GX_LOAD_INDX_C: //used for postmatrices + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xE); + // Analyze + num_index_xf++; + } + break; + case GX_LOAD_INDX_D: //used for lights + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xF); + // Analyze + num_index_xf++; + } + break; + + case GX_CMD_CALL_DL: + PanicAlert("Seeing DL call inside DL."); + break; + + case GX_CMD_UNKNOWN_METRICS: + // zelda 4 swords calls it and checks the metrics registers after that + break; + + case GX_CMD_INVL_VC:// Invalidate (vertex cache?) + DEBUG_LOG(VIDEO, "Invalidate (vertex cache?)"); + break; + + case GX_LOAD_BP_REG: //0x61 + { + u32 bp_cmd = DataReadU32(); + // Execute + LoadBPReg(bp_cmd); + INCSTAT(stats.thisFrame.numBPLoads); + + // Analyze + } + break; + + // draw primitives + default: + if (cmd_byte & 0x80) + { + // load vertices (use computed vertex size from FifoCommandRunnable above) + + // Execute + u16 numVertices = DataReadU16(); + + VertexLoaderManager::RunVertices( + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + numVertices); + + // Analyze + } + else + { + ERROR_LOG(VIDEO, "DLCache::CompileAndRun: Illegal command %02x", cmd_byte); + break; + } + break; + } + } + + g_pVideoData = old_datareader; + return true; +} + +// The only sensible way to detect changes to vertex data is to convert several times +// and hash the output. + +// Second pass - compile +// Since some commands can affect the size of other commands, we really have no choice +// but to compile as we go, interpreting the list. We can't compile and then execute, we must +// compile AND execute at the same time. The second time the display list gets called, we already +// have the compiled code so we don't have to interpret anymore, we just run it. +bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl) +{ + VertexManager::Flush(); + + u8 *old_datareader = g_pVideoData; + g_pVideoData = Memory_GetPtr(address); + + u8 *end = g_pVideoData + size; + + emitter.AlignCode4(); + dl->compiled_code = emitter.GetCodePtr(); + emitter.ABI_EmitPrologue(4); + + while (g_pVideoData < end) + { + // Yet another reimplementation of the DL reading... + int cmd_byte = DataReadU8(); + switch (cmd_byte) + { + case GX_NOP: + // Execute + // Compile + break; + + case GX_LOAD_CP_REG: //0x08 + { + // Execute + u8 sub_cmd = DataReadU8(); + u32 value = DataReadU32(); + LoadCPReg(sub_cmd, value); + INCSTAT(stats.thisFrame.numCPLoads); + + // Compile + emitter.ABI_CallFunctionCC(&LoadCPReg, sub_cmd, value); + } + break; + + case GX_LOAD_XF_REG: + { + // Execute + u32 Cmd2 = DataReadU32(); + int transfer_size = ((Cmd2 >> 16) & 15) + 1; + u32 address = Cmd2 & 0xFFFF; + // TODO - speed this up. pshufb? + u8 *real_data_buffer = AllocStaticData(4 * transfer_size); + u32 *data_buffer = (u32 *)real_data_buffer; + for (int i = 0; i < transfer_size; i++) + data_buffer[i] = DataReadU32(); + LoadXFReg(transfer_size, address, data_buffer); + INCSTAT(stats.thisFrame.numXFLoads); + + // Compile + emitter.ABI_CallFunctionCCP(&LoadXFReg, transfer_size, address, data_buffer); + } + break; + + case GX_LOAD_INDX_A: //used for position matrices + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xC); + // Compile + emitter.ABI_CallFunctionCC(&LoadIndexedXF, value, 0xC); + } + break; + case GX_LOAD_INDX_B: //used for normal matrices + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xD); + // Compile + emitter.ABI_CallFunctionCC(&LoadIndexedXF, value, 0xD); + } + break; + case GX_LOAD_INDX_C: //used for postmatrices + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xE); + // Compile + emitter.ABI_CallFunctionCC(&LoadIndexedXF, value, 0xE); + } + break; + case GX_LOAD_INDX_D: //used for lights + { + u32 value = DataReadU32(); + // Execute + LoadIndexedXF(value, 0xF); + // Compile + emitter.ABI_CallFunctionCC(&LoadIndexedXF, value, 0xF); + } + break; + + case GX_CMD_CALL_DL: + PanicAlert("Seeing DL call inside DL."); + break; + + case GX_CMD_UNKNOWN_METRICS: + // zelda 4 swords calls it and checks the metrics registers after that + break; + + case GX_CMD_INVL_VC:// Invalidate (vertex cache?) + DEBUG_LOG(VIDEO, "Invalidate (vertex cache?)"); + break; + + case GX_LOAD_BP_REG: //0x61 + { + u32 bp_cmd = DataReadU32(); + // Execute + LoadBPReg(bp_cmd); + INCSTAT(stats.thisFrame.numBPLoads); + // Compile + emitter.ABI_CallFunctionC(&LoadBPReg, bp_cmd); + } + break; + + // draw primitives + default: + if (cmd_byte & 0x80) + { + // load vertices (use computed vertex size from FifoCommandRunnable above) + + // Execute + u16 numVertices = DataReadU16(); + + u64 pre_draw_video_data = (u64)g_pVideoData; + + VertexLoaderManager::RunVertices( + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + numVertices); + + // Compile +#ifdef _M_X64 + emitter.MOV(64, R(RAX), Imm64(pre_draw_video_data)); + emitter.MOV(64, M(&g_pVideoData), R(RAX)); +#else + emitter.MOV(32, R(EAX), Imm32(pre_draw_video_data)); + emitter.MOV(32, M(&g_pVideoData), R(EAX)); +#endif + emitter.ABI_CallFunctionCCC( + &VertexLoaderManager::RunVertices, + cmd_byte & GX_VAT_MASK, // Vertex loader index (0 - 7) + (cmd_byte & GX_PRIMITIVE_MASK) >> GX_PRIMITIVE_SHIFT, + numVertices); + } + else + { + ERROR_LOG(VIDEO, "DLCache::CompileAndRun: Illegal command %02x", cmd_byte); + break; + } + break; + } + } + + emitter.ABI_EmitEpilogue(4); + + g_pVideoData = old_datareader; + return true; +} + +// This one's pretty expensive. We should check if we can get away with only +// hashing the entire DL the first 3 frames or something. +u32 ComputeDLHash(u32 address, u32 size) +{ + u8 *ptr = Memory_GetPtr(address); + return HashFletcher(ptr, size & ~1); +} + +void Init() +{ + dlcode_cache = (u8 *)AllocateExecutableMemory(DL_CODE_CACHE_SIZE, false); // Don't need low memory. + static_data_buffer = (u8 *)AllocateMemoryPages(DL_STATIC_DATA_SIZE); + static_data_ptr = static_data_buffer; + emitter.SetCodePtr(dlcode_cache); +} + +void Shutdown() +{ + Clear(); + FreeMemoryPages(dlcode_cache, DL_CODE_CACHE_SIZE); + FreeMemoryPages(static_data_buffer, DL_STATIC_DATA_SIZE); + dlcode_cache = NULL; +} + +void Clear() +{ + dl_map.clear(); + + // Reset the cache pointers. + emitter.SetCodePtr(dlcode_cache); + static_data_ptr = static_data_buffer; +} + +void ProgressiveCleanup() +{ + DLMap::iterator iter = dl_map.begin(); + while (iter != dl_map.end()) { + CachedDisplayList &entry = iter->second; + int limit = iter->second.uncachable ? 1200 : 400; + if (entry.frame_count < frameCount - limit) { + // entry.Destroy(); +#ifdef _WIN32 + iter = dl_map.erase(iter); +#else + dl_map.erase(iter++); // (this is gcc standard!) +#endif + } + else + iter++; + } +} + +} // namespace + +// NOTE - outside the namespace on purpose. +bool HandleDisplayList(u32 address, u32 size) +{ + // Disable display list caching since the benefit isn't much to write home about + // right now... + return false; + + u64 dl_id = DLCache::CreateMapId(address, size); + DLCache::DLMap::iterator iter = DLCache::dl_map.find(dl_id); + + stats.numDListsAlive = DLCache::dl_map.size(); + if (iter != DLCache::dl_map.end()) + { + DLCache::CachedDisplayList &dl = iter->second; + if (dl.uncachable) + { + // We haven't compiled it - let's return false so it gets + // interpreted. + return false; + } + + // Got one! And it's been compiled too, so let's run the compiled code! + switch (dl.pass) + { + case DLCache::DLPASS_ANALYZE: + PanicAlert("DLPASS_ANALYZE - should have been done the first pass"); + break; + case DLCache::DLPASS_COMPILE: + // First, check that the hash is the same as the last time. + if (dl.dl_hash != HashAdler32(Memory_GetPtr(address), size)) + { + // PanicAlert("uncachable %08x", address); + dl.uncachable = true; + return false; + } + DLCache::CompileAndRunDisplayList(address, size, &dl); + dl.pass = DLCache::DLPASS_RUN; + break; + case DLCache::DLPASS_RUN: + { + // Every N draws, check hash + dl.check--; + if (dl.check <= 0) + { + if (dl.dl_hash != HashAdler32(Memory_GetPtr(address), size)) + { + dl.uncachable = true; + return false; + } + dl.check = dl.next_check; + dl.next_check *= 2; + if (dl.next_check > 1024) + dl.next_check = 1024; + } + u8 *old_datareader = g_pVideoData; + ((void (*)())(void*)(dl.compiled_code))(); + g_pVideoData = old_datareader; + break; + } + } + return true; + } + + DLCache::CachedDisplayList dl; + + if (DLCache::AnalyzeAndRunDisplayList(address, size, &dl)) { + dl.dl_hash = HashAdler32(Memory_GetPtr(address), size); + dl.pass = DLCache::DLPASS_COMPILE; + dl.check = 1; + dl.next_check = 1; + DLCache::dl_map[dl_id] = dl; + return true; + } else { + dl.uncachable = true; + DLCache::dl_map[dl_id] = dl; + return true; // don't also interpret the list. + } +} diff --git a/Source/Plugins/Plugin_VideoOGL/Src/DLCache.h b/Source/Plugins/Plugin_VideoOGL/Src/DLCache.h new file mode 100644 index 0000000000..8acca4a064 --- /dev/null +++ b/Source/Plugins/Plugin_VideoOGL/Src/DLCache.h @@ -0,0 +1,32 @@ +// Copyright (C) 2003-2009 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#ifndef _DLCACHE_H +#define _DLCACHE_H + +bool HandleDisplayList(u32 address, u32 size); + +namespace DLCache { + +void Init(); +void Shutdown(); +void ProgressiveCleanup(); +void Clear(); + +} // namespace + +#endif // _DLCACHE_H diff --git a/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp b/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp index 8f01808c67..6a8d7fd36e 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/Render.cpp @@ -43,6 +43,7 @@ #include "TextureMngr.h" #include "rasterfont.h" #include "VertexShaderGen.h" +#include "DLCache.h" #include "PixelShaderCache.h" #include "PixelShaderManager.h" #include "VertexShaderCache.h" @@ -1102,6 +1103,7 @@ void Renderer::SwapBuffers() GL_REPORT_ERRORD(); // Clean out old stuff from caches + DLCache::ProgressiveCleanup(); VertexShaderCache::ProgressiveCleanup(); PixelShaderCache::ProgressiveCleanup(); TextureMngr::ProgressiveCleanup(); @@ -1186,6 +1188,7 @@ void Renderer::DrawDebugText() p+=sprintf(p,"vshaders alive: %i\n",stats.numVertexShadersAlive); p+=sprintf(p,"dlists called: %i\n",stats.numDListsCalled); p+=sprintf(p,"dlists called(f): %i\n",stats.thisFrame.numDListsCalled); + p+=sprintf(p,"dlists alive: %i\n",stats.numDListsAlive); // not used. //p+=sprintf(p,"dlists created: %i\n",stats.numDListsCreated); //p+=sprintf(p,"dlists alive: %i\n",stats.numDListsAlive); diff --git a/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp b/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp index 821180eb75..0038a715d3 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/VertexManager.cpp @@ -187,10 +187,6 @@ void Flush() GL_REPORT_ERRORD(); - if(g_bSkipCurrentFrame) { - ResetBuffer(); - return; - } glBindBuffer(GL_ARRAY_BUFFER, s_vboBuffers[s_nCurVBOIndex]); glBufferData(GL_ARRAY_BUFFER, s_pCurBufferPointer - s_pBaseBufferPointer, s_pBaseBufferPointer, GL_STREAM_DRAW); @@ -226,7 +222,7 @@ void Flush() tex.texImage0[i&3].width + 1, tex.texImage0[i&3].height + 1, tex.texImage0[i&3].format, tex.texTlut[i&3].tmem_offset<<9, tex.texTlut[i&3].tlut_format); - if (tentry != NULL) + if (tentry) { // texture loaded fine, set dims for pixel shader if (tentry->isRectangle) diff --git a/Source/Plugins/Plugin_VideoOGL/Src/main.cpp b/Source/Plugins/Plugin_VideoOGL/Src/main.cpp index 29984b1a73..73fbac43a0 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/main.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/main.cpp @@ -91,6 +91,7 @@ GFXDebuggerOGL *m_DebuggerFrame = NULL; #include "PostProcessing.h" #include "OnScreenDisplay.h" #include "Setup.h" +#include "DLCache.h" #include "VideoState.h" @@ -385,7 +386,7 @@ void Video_Prepare(void) GL_REPORT_ERRORD(); VertexLoaderManager::Init(); TextureConverter::Init(); - + DLCache::Init(); s_swapRequested = FALSE; s_efbAccessRequested = FALSE; @@ -400,6 +401,7 @@ void Shutdown(void) s_efbAccessRequested = FALSE; s_swapRequested = FALSE; + DLCache::Shutdown(); Fifo_Shutdown(); PostProcessing::Shutdown(); @@ -418,7 +420,6 @@ void Shutdown(void) OpenGL_Shutdown(); } - void Video_SendFifoData(u8* _uData, u32 len) { Fifo_SendFifoData(_uData, len); @@ -435,8 +436,6 @@ void Video_ExitLoop() Fifo_ExitLoop(); } - - // Screenshot and screen message void Video_Screenshot(const char *_szFilename)