diff --git a/Source/Core/DSPCore/Src/DSPHWInterface.cpp b/Source/Core/DSPCore/Src/DSPHWInterface.cpp index 8185f0071a..76e44867e5 100644 --- a/Source/Core/DSPCore/Src/DSPHWInterface.cpp +++ b/Source/Core/DSPCore/Src/DSPHWInterface.cpp @@ -33,6 +33,11 @@ #include "DSPAccelerator.h" #include "DSPInterpreter.h" #include "DSPHWInterface.h" +#include "CPUDetect.h" + +#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) +#include <tmmintrin.h> +#endif void gdsp_do_dma(); @@ -264,17 +269,31 @@ void gdsp_idma_out(u16 dsp_addr, u32 addr, u32 size) ERROR_LOG(DSPLLE, "*** idma_out IRAM_DSP (0x%04x) -> RAM (0x%08x) : size (0x%08x)", dsp_addr / 2, addr, size); } +#if _M_SSE >= 0x301 +static const __m128i s_mask = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); /* _mm_shuffle_epi8 control: swaps the two bytes of every 16-bit lane */ +#endif // TODO: These should eat clock cycles. void gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size) { u8* dst = ((u8*)g_dsp.dram); - for (u32 i = 0; i < size; i += 2) +#if _M_SSE >= 0x301 + if (cpu_info.bSSSE3 && !(size % 16)) /* vector path only for whole 16-byte chunks; other sizes use the scalar loop */ { - *(u16*)&dst[dsp_addr + i] = Common::swap16(*(const u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]); + for (u32 i = 0; i < size; i += 16) /* unaligned load/store: only size is checked, addr and dsp_addr may not be 16-byte aligned */ + { + _mm_storeu_si128((__m128i *)&dst[dsp_addr + i], _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]), s_mask)); + } + } + else +#endif + { + for (u32 i = 0; i < size; i += 2) + { + *(u16*)&dst[dsp_addr + i] = Common::swap16(*(const u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]); + } } - INFO_LOG(DSPLLE, "*** ddma_in RAM (0x%08x) -> DRAM_DSP (0x%04x) : size (0x%08x)", addr, dsp_addr / 2, size); } @@ -283,9 +302,21 @@ void gdsp_ddma_out(u16 dsp_addr, u32 addr, u32 size) { const u8* src = ((const u8*)g_dsp.dram); - for (u32 i = 0; i < size; i += 2) +#if _M_SSE >= 0x301 + if (cpu_info.bSSSE3 && !(size % 16)) /* vector path only for whole 16-byte chunks; other sizes use the scalar loop */ { - *(u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF] = Common::swap16(*(const u16*)&src[dsp_addr + i]); + for (u32 i = 0; i < size; i += 16) /* unaligned load/store: only size is checked, addr and dsp_addr may not be 16-byte aligned */ + { + _mm_storeu_si128((__m128i *)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF], 
_mm_shuffle_epi8(_mm_loadu_si128((__m128i *)&src[dsp_addr + i]), s_mask)); + } + } + else +#endif + { + for (u32 i = 0; i < size; i += 2) + { + *(u16*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF] = Common::swap16(*(const u16*)&src[dsp_addr + i]); + } } INFO_LOG(DSPLLE, "*** ddma_out DRAM_DSP (0x%04x) -> RAM (0x%08x) : size (0x%08x)", dsp_addr / 2, addr, size);