mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2024-11-15 05:47:56 -07:00
Merge pull request #1621 from FioraAeterna/pscleanup
JIT: cleanups/optimizations for ps loadstore
This commit is contained in:
commit
09a10622dd
@ -212,8 +212,6 @@ static const float GC_ALIGNED16(m_dequantizeTableS[]) =
|
||||
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
||||
};
|
||||
|
||||
static float GC_ALIGNED16(psTemp[4]);
|
||||
|
||||
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
||||
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
||||
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
||||
@ -230,36 +228,26 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
|
||||
// I don't know whether the overflow actually happens in any games
|
||||
// but it potentially can cause problems, so we need some clamping
|
||||
|
||||
static void WriteDual32(u32 address)
|
||||
{
|
||||
Memory::Write_U64(*(u64 *) psTemp, address);
|
||||
}
|
||||
|
||||
// See comment in header for in/outs.
|
||||
void CommonAsmRoutines::GenQuantizedStores()
|
||||
{
|
||||
const u8* storePairedIllegal = AlignCode4();
|
||||
UD2();
|
||||
const u8* storePairedFloat = AlignCode4();
|
||||
|
||||
FixupBranch skip_complex, too_complex;
|
||||
SHUFPS(XMM0, R(XMM0), 1);
|
||||
MOVQ_xmm(M(&psTemp[0]), XMM0);
|
||||
if (!jit->js.memcheck)
|
||||
const u8* storePairedFloat = AlignCode4();
|
||||
if (cpu_info.bSSSE3)
|
||||
{
|
||||
TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
|
||||
too_complex = J_CC(CC_NZ, true);
|
||||
MOV(64, R(RSCRATCH), M(&psTemp[0]));
|
||||
SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
|
||||
skip_complex = J(true);
|
||||
SetJumpTarget(too_complex);
|
||||
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||
}
|
||||
// RSP alignment here is 8 due to the call.
|
||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||
ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
|
||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||
if (!jit->js.memcheck)
|
||||
SetJumpTarget(skip_complex);
|
||||
else
|
||||
{
|
||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||
ROL(64, R(RSCRATCH), Imm8(32));
|
||||
BSWAP(64, RSCRATCH);
|
||||
}
|
||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||
|
||||
RET();
|
||||
|
||||
const u8* storePairedU8 = AlignCode4();
|
||||
@ -316,12 +304,8 @@ void CommonAsmRoutines::GenQuantizedStores()
|
||||
MINPS(XMM0, M(m_65535));
|
||||
|
||||
CVTTPS2DQ(XMM0, R(XMM0));
|
||||
MOVQ_xmm(M(psTemp), XMM0);
|
||||
// place ps[0] into the higher word, ps[1] into the lower
|
||||
// so no need in ROL after BSWAP
|
||||
MOVZX(32, 16, RSCRATCH, M(&psTemp[0]));
|
||||
SHL(32, R(RSCRATCH), Imm8(16));
|
||||
MOV(16, R(RSCRATCH), M(&psTemp[1]));
|
||||
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
|
||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||
BSWAP(32, RSCRATCH);
|
||||
}
|
||||
|
||||
@ -369,21 +353,6 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||
RET();
|
||||
/*
|
||||
if (cpu_info.bSSSE3)
|
||||
{
|
||||
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
||||
// TODO: SafeWriteFloat
|
||||
MOVSS(M(&psTemp[0]), XMM0);
|
||||
MOV(32, R(RSCRATCH), M(&psTemp[0]));
|
||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVSS(M(&psTemp[0]), XMM0);
|
||||
MOV(32, R(RSCRATCH), M(&psTemp[0]));
|
||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||
}*/
|
||||
|
||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
@ -441,6 +410,12 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||
const u8* loadPairedIllegal = AlignCode4();
|
||||
UD2();
|
||||
|
||||
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
|
||||
// don't need hardware access handling. This will definitely crash if paired loads occur
|
||||
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
|
||||
// for a good reason, or merely because no game does this.
|
||||
// If we find something that actually does do this, maybe this should be changed. How
|
||||
// much of a performance hit would it be?
|
||||
const u8* loadPairedFloatTwo = AlignCode4();
|
||||
if (jit->js.memcheck)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user