Merge pull request #1621 from FioraAeterna/pscleanup

JIT: cleanups/optimizations for ps loadstore
This commit is contained in:
Markus Wick 2015-01-06 20:12:50 +01:00
commit 09a10622dd

View File

@ -212,8 +212,6 @@ static const float GC_ALIGNED16(m_dequantizeTableS[]) =
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
};
static float GC_ALIGNED16(psTemp[4]);
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
@ -230,36 +228,26 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
// I don't know whether the overflow actually happens in any games,
// but it can potentially cause problems, so we need some clamping.
// Helper called from the generated paired-float store code: reads the first
// eight bytes of psTemp as a single u64 and hands them to Memory::Write_U64
// together with the destination address.
static void WriteDual32(u32 address)
{
	const u64 pair_bits = *(u64 *) psTemp;
	Memory::Write_U64(pair_bits, address);
}
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const u8* storePairedIllegal = AlignCode4();
UD2();
const u8* storePairedFloat = AlignCode4();
FixupBranch skip_complex, too_complex;
SHUFPS(XMM0, R(XMM0), 1);
MOVQ_xmm(M(&psTemp[0]), XMM0);
if (!jit->js.memcheck)
const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
too_complex = J_CC(CC_NZ, true);
MOV(64, R(RSCRATCH), M(&psTemp[0]));
SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
skip_complex = J(true);
SetJumpTarget(too_complex);
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
// RSP alignment here is 8 due to the call.
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
if (!jit->js.memcheck)
SetJumpTarget(skip_complex);
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU8 = AlignCode4();
@ -316,12 +304,8 @@ void CommonAsmRoutines::GenQuantizedStores()
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
MOVQ_xmm(M(psTemp), XMM0);
// Place ps[0] into the high word and ps[1] into the low word,
// so no ROL is needed after the BSWAP.
MOVZX(32, 16, RSCRATCH, M(&psTemp[0]));
SHL(32, R(RSCRATCH), Imm8(16));
MOV(16, R(RSCRATCH), M(&psTemp[1]));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}
@ -369,21 +353,6 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
/*
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M(pbswapShuffle2x4));
// TODO: SafeWriteFloat
MOVSS(M(&psTemp[0]), XMM0);
MOV(32, R(RSCRATCH), M(&psTemp[0]));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
}
else
{
MOVSS(M(&psTemp[0]), XMM0);
MOV(32, R(RSCRATCH), M(&psTemp[0]));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
}*/
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
@ -441,6 +410,12 @@ void CommonAsmRoutines::GenQuantizedLoads()
const u8* loadPairedIllegal = AlignCode4();
UD2();
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->js.memcheck)
{