Merge pull request #1958 from Sonicadvance1/Rearchitect_asmcommon

Rearchitect a bit of our AsmCommon routines.
Ryan Houdek 2015-01-25 18:26:17 -06:00
commit b1fc18cbaa
9 changed files with 636 additions and 616 deletions

Source/Core/Core/CMakeLists.txt

@@ -199,6 +199,7 @@ if(_M_X86)
PowerPC/Jit64/Jit_Paired.cpp
PowerPC/Jit64/JitRegCache.cpp
PowerPC/Jit64/Jit_SystemRegisters.cpp
PowerPC/Jit64Common/Jit64AsmCommon.cpp
PowerPC/JitCommon/JitBackpatch.cpp
PowerPC/JitCommon/Jit_Util.cpp
PowerPC/JitCommon/TrampolineCache.cpp)

Source/Core/Core/Core.vcxproj

@@ -235,6 +235,7 @@
<ClCompile Include="PowerPC\Jit64\Jit_LoadStorePaired.cpp" />
<ClCompile Include="PowerPC\Jit64\Jit_Paired.cpp" />
<ClCompile Include="PowerPC\Jit64\Jit_SystemRegisters.cpp" />
<ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp" />
<ClCompile Include="PowerPC\JitCommon\JitAsmCommon.cpp" />
<ClCompile Include="PowerPC\JitCommon\JitBackpatch.cpp" />
<ClCompile Include="PowerPC\JitCommon\JitBase.cpp" />
@@ -417,6 +418,7 @@
<ClInclude Include="PowerPC\Jit64\JitRegCache.h" />
<ClInclude Include="PowerPC\JitILCommon\IR.h" />
<ClInclude Include="PowerPC\JitILCommon\JitILBase.h" />
<ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h" />
<ClInclude Include="PowerPC\JitCommon\JitAsmCommon.h" />
<ClInclude Include="PowerPC\JitCommon\JitBase.h" />
<ClInclude Include="PowerPC\JitCommon\JitCache.h" />

Source/Core/Core/Core.vcxproj.filters

@@ -631,6 +631,9 @@
<ClCompile Include="PowerPC\SignatureDB.cpp">
<Filter>PowerPC</Filter>
</ClCompile>
<ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp">
<Filter>PowerPC\Jit64Common</Filter>
</ClCompile>
<ClCompile Include="PowerPC\JitCommon\Jit_Util.cpp">
<Filter>PowerPC\JitCommon</Filter>
</ClCompile>
@@ -1184,6 +1187,9 @@
<ClInclude Include="PowerPC\SignatureDB.h">
<Filter>PowerPC</Filter>
</ClInclude>
<ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h">
<Filter>PowerPC\Jit64Common</Filter>
</ClInclude>
<ClInclude Include="PowerPC\JitCommon\Jit_Util.h">
<Filter>PowerPC\JitCommon</Filter>
</ClInclude>

Source/Core/Core/PowerPC/Jit64/JitAsm.h

@@ -4,7 +4,7 @@
#pragma once
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write

Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp

@@ -0,0 +1,601 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#include "Common/MathUtil.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#define QUANTIZED_REGS_TO_SAVE \
(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
})
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
using namespace Gen;
void CommonAsmRoutines::GenFifoWrite(int size)
{
// Assume value in RSCRATCH
u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
ADD(32, R(RSCRATCH2), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
RET();
}
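For reference, a standalone C++ sketch of what GenFifoWrite's emitted code does at runtime. The names gather_pipe, pipe_count, and FifoWrite are illustrative stand-ins for GPFifo::m_gatherPipe, GPFifo::m_gatherPipeCount, and the generated routine, not Dolphin APIs:

#include <cstdint>

static uint8_t gather_pipe[64];   // stands in for GPFifo::m_gatherPipe
static uint32_t pipe_count = 0;   // stands in for GPFifo::m_gatherPipeCount

template <typename T>
void FifoWrite(T value)  // T is the 8/16/32/64-bit value arriving in RSCRATCH
{
    // SwapAndStore: store the value big-endian at m_gatherPipe + count.
    uint8_t* dst = gather_pipe + pipe_count;
    for (unsigned i = 0; i < sizeof(T); i++)
        dst[i] = (uint8_t)(value >> (8 * (sizeof(T) - 1 - i)));
    // ADD(32, ..., Imm8(size >> 3)): "size" is in bits, so advance by bytes.
    pipe_count += sizeof(T);
}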
void CommonAsmRoutines::GenFrsqrte()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Negative and zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z, true);
FixupBranch negative = J_CC(CC_S, true);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
// Zero and max exponents (non-normal floats) take the complex path.
FixupBranch complex1 = J_CC(CC_Z, true);
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch complex2 = J_CC(CC_E, true);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
MOV(32, R(RSCRATCH2), Imm32(0x3FF));
SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);
SHR(64, R(RSCRATCH), Imm8(37));
AND(32, R(RSCRATCH), Imm32(0x7FF));
IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
FixupBranch complex3 = J();
// Exception flags for negative input.
SetJumpTarget(negative);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(skip_set_fx2);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
SetJumpTarget(complex3);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
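The table-driven fast path above mirrors the PowerPC frsqrte estimate. A scalar C++ sketch of the same computation for normal, positive inputs (FrsqrteFastPath is illustrative; the extern declarations refer to Dolphin's MathUtil tables, whose contents are omitted here):

#include <cstdint>
#include <cstring>

extern const uint32_t frsqrte_expected_base[32];  // MathUtil::frsqrte_expected_base
extern const uint32_t frsqrte_expected_dec[32];   // MathUtil::frsqrte_expected_dec

double FrsqrteFastPath(double val)  // assumes a positive, normal input
{
    uint64_t vali;
    std::memcpy(&vali, &val, sizeof(vali));
    // New exponent: 0x3FF - (exponent - 0x3FD) / 2, with SAR's arithmetic shift.
    int e = (int)(vali >> 52);  // biased exponent (sign bit is zero here)
    uint64_t result = (uint64_t)(0x3FF - ((e - 0x3FD) >> 1)) << 52;
    // Table index: top 4 mantissa bits, with the exponent's parity (the
    // "odd_exponent" in the comment above) selecting the table half.
    int index = (int)((vali >> 48) & 0x1F) ^ 0x10;
    uint32_t i = (uint32_t)((vali >> 37) & 0x7FF);  // "i % 2048"
    result |= (uint64_t)(frsqrte_expected_base[index] -
                         frsqrte_expected_dec[index] * i) << 26;
    std::memcpy(&val, &result, sizeof(val));
    return val;
}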
void CommonAsmRoutines::GenFres()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
// Take the complex path for very large/small exponents.
FixupBranch complex1 = J_CC(CC_L);
CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
FixupBranch complex2 = J_CC(CC_GE);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
NEG(32, R(RSCRATCH_EXTRA));
OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(37));
SHR(64, R(RSCRATCH2), Imm8(47));
AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024
IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
ADD(32, R(RSCRATCH), Imm8(1));
SHR(32, R(RSCRATCH), Imm8(1));
MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
SUB(32, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(29));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
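Likewise, a scalar sketch of the fres fast path, for inputs whose biased exponent lies in [895, 1149); the table declarations again refer to Dolphin's MathUtil:

#include <cstdint>
#include <cstring>

extern const uint32_t fres_expected_base[32];  // MathUtil::fres_expected_base
extern const uint32_t fres_expected_dec[32];   // MathUtil::fres_expected_dec

double FresFastPath(double val)  // assumes 895 <= biased exponent < 1149
{
    uint64_t vali;
    std::memcpy(&vali, &val, sizeof(vali));
    uint32_t top = (uint32_t)(vali >> 52);
    uint32_t sign = top & 0x800;
    uint32_t exp = top & 0x7FF;
    // Reciprocal exponent, keeping the sign: vali = sign | exponent.
    uint64_t result = (uint64_t)(sign | (0x7FD - exp)) << 52;
    uint32_t idx = (uint32_t)((vali >> 47) & 0x1F);   // i / 1024
    uint32_t im  = (uint32_t)((vali >> 37) & 0x3FF);  // i % 1024
    result |= (uint64_t)(fres_expected_base[idx] -
                         (fres_expected_dec[idx] * im + 1) / 2) << 29;
    std::memcpy(&val, &result, sizeof(val));
    return val;
}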
void CommonAsmRoutines::GenMfcr()
{
// Input: none
// Output: RSCRATCH
// This function clobbers all three RSCRATCH.
X64Reg dst = RSCRATCH;
X64Reg tmp = RSCRATCH2;
X64Reg cr_val = RSCRATCH_EXTRA;
XOR(32, R(dst), R(dst));
// we only need to zero the high bits of tmp once
XOR(32, R(tmp), R(tmp));
for (int i = 0; i < 8; i++)
{
static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
if (i != 0)
SHL(32, R(dst), Imm8(4));
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
// EQ: Bits 31-0 == 0; set flag bit 1
TEST(32, R(cr_val), R(cr_val));
// FIXME: is there a better way to do this without the partial register merging?
SETcc(CC_Z, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
// GT: Value > 0; set flag bit 2
TEST(64, R(cr_val), R(cr_val));
SETcc(CC_G, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));
// SO: Bit 61 set; set flag bit 0
// LT: Bit 62 set; set flag bit 3
SHR(64, R(cr_val), Imm8(61));
OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
}
RET();
}
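The loop above packs Dolphin's eight 64-bit internal CR values back into the architectural 32-bit CR. A C++ sketch of the per-field logic (Mfcr is illustrative; cr_val models PPCSTATE's cr_val array):

#include <cstdint>

uint32_t Mfcr(const uint64_t cr_val[8])
{
    static const uint32_t flag_table[8] = {0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9};
    uint32_t dst = 0;
    for (int i = 0; i < 8; i++)
    {
        dst <<= 4;                           // make room for the next 4-bit field
        uint64_t v = cr_val[i];
        if ((uint32_t)v == 0) dst += 2;      // EQ: low 32 bits zero -> flag bit 1
        if ((int64_t)v > 0)   dst += 4;      // GT: positive as s64  -> flag bit 2
        dst |= flag_table[v >> 61];          // SO (bit 61) and LT (bit 62)
    }
    return dst;
}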
// Safe + Fast Quantizers, originally from JITIL by magumagu
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
#define QUANTIZE_OVERFLOW_SAFE
// According to the Intel docs, CVTPS2DQ writes 0x80000000 if the source
// floating-point value is out of int32 range. While that's OK for large
// negatives, it isn't for positives. I don't know whether the overflow
// actually happens in any games, but it can potentially cause problems,
// so we need some clamping.
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const u8* storePairedIllegal = AlignCode4();
UD2();
const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
if (cpu_info.bSSE4_1)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
}
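As a concrete instance of the clamping discussed above, one lane of the storePairedU16 path reduces to the following (QuantizeU16 is illustrative; scale is the m_quantizeTableS entry selected via RSCRATCH2):

#include <algorithm>
#include <cstdint>

uint16_t QuantizeU16(float value, float scale)
{
    float v = value * scale;      // MULPS with the m_quantizeTableS entry
    v = std::min(v, 65535.0f);    // MINPS(XMM0, M(m_65535)): avoid CVTTPS2DQ
                                  // producing 0x80000000 for large positives
    v = std::max(v, 0.0f);        // PACKUSDW saturates negatives to zero
    return (uint16_t)(int32_t)v;  // CVTTPS2DQ truncates toward zero
}

The pairedStoreQuantized table then maps the GQR store type to these routines: 0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16, with types 1-3 trapping via UD2.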
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
const u8* storeSingleIllegal = AlignCode4();
UD2();
// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
}
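The single-store variants clamp explicitly with MAXSS/MINSS rather than relying on packed saturation; storeSingleS16, for example, reduces to this sketch (QuantizeS16Single is an illustrative name):

#include <algorithm>
#include <cstdint>

int16_t QuantizeS16Single(float value, float scale)
{
    float v = value * scale;
    v = std::max(v, -32768.0f);   // MAXSS(XMM0, M(&m_m32768))
    v = std::min(v, 32767.0f);    // MINSS(XMM0, M(&m_32767))
    return (int16_t)(int32_t)v;   // CVTTSS2SI truncates toward zero
}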
void CommonAsmRoutines::GenQuantizedLoads()
{
const u8* loadPairedIllegal = AlignCode4();
UD2();
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();
const u8* loadPairedFloatOne = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();
const u8* loadPairedU8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS16Two = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
}
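On the load side, each routine byte-swaps, widens, converts, and scales by the selected m_dequantizeTableS entry; loadPairedU8One, for instance, reduces to the sketch below (DequantizeU8One and dequant_scale are illustrative, and the second slot is filled with 1.0f, matching UNPCKLPS(XMM0, M(m_one))). The 16-entry pairedLoadQuantized table is indexed by the GQR load type, with entries 8-15 selecting the one-element variants:

#include <cstdint>

struct PairedSingles { float ps0, ps1; };

PairedSingles DequantizeU8One(uint8_t mem_value, float dequant_scale)
{
    // CVTSI2SS + MULSS with the m_dequantizeTableS entry; ps1 becomes 1.0f.
    return { (float)mem_value * dequant_scale, 1.0f };
}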

Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h

@@ -0,0 +1,22 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#pragma once
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();
};

Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp

@@ -2,194 +2,7 @@
// Licensed under GPLv2
// Refer to the license.txt file included.
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Common/MemoryUtil.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#define QUANTIZED_REGS_TO_SAVE \
(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
})
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
using namespace Gen;
void CommonAsmRoutines::GenFifoWrite(int size)
{
// Assume value in RSCRATCH
u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
ADD(32, R(RSCRATCH2), Imm8(size >> 3));
MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
RET();
}
void CommonAsmRoutines::GenFrsqrte()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Negative and zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z, true);
FixupBranch negative = J_CC(CC_S, true);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
// Zero and max exponents (non-normal floats) take the complex path.
FixupBranch complex1 = J_CC(CC_Z, true);
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch complex2 = J_CC(CC_E, true);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
MOV(32, R(RSCRATCH2), Imm32(0x3FF));
SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);
SHR(64, R(RSCRATCH), Imm8(37));
AND(32, R(RSCRATCH), Imm32(0x7FF));
IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
FixupBranch complex3 = J();
// Exception flags for negative input.
SetJumpTarget(negative);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(skip_set_fx2);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
SetJumpTarget(complex3);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
void CommonAsmRoutines::GenFres()
{
// Assume input in XMM0.
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z);
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
// Take the complex path for very large/small exponents.
FixupBranch complex1 = J_CC(CC_L);
CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
FixupBranch complex2 = J_CC(CC_GE);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
NEG(32, R(RSCRATCH_EXTRA));
OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(37));
SHR(64, R(RSCRATCH2), Imm8(47));
AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024
IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
ADD(32, R(RSCRATCH), Imm8(1));
SHR(32, R(RSCRATCH), Imm8(1));
MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
SUB(32, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(29));
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET();
}
void CommonAsmRoutines::GenMfcr()
{
// Input: none
// Output: RSCRATCH
// This function clobbers all three RSCRATCH.
X64Reg dst = RSCRATCH;
X64Reg tmp = RSCRATCH2;
X64Reg cr_val = RSCRATCH_EXTRA;
XOR(32, R(dst), R(dst));
// we only need to zero the high bits of tmp once
XOR(32, R(tmp), R(tmp));
for (int i = 0; i < 8; i++)
{
static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
if (i != 0)
SHL(32, R(dst), Imm8(4));
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
// EQ: Bits 31-0 == 0; set flag bit 1
TEST(32, R(cr_val), R(cr_val));
// FIXME: is there a better way to do this without the partial register merging?
SETcc(CC_Z, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
// GT: Value > 0; set flag bit 2
TEST(64, R(cr_val), R(cr_val));
SETcc(CC_G, R(tmp));
LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));
// SO: Bit 61 set; set flag bit 0
// LT: Bit 62 set; set flag bit 3
SHR(64, R(cr_val), Imm8(61));
OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
}
RET();
}
// Safe + Fast Quantizers, originally from JITIL by magumagu
const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
@@ -250,414 +63,4 @@ const float GC_ALIGNED16(m_dequantizeTableS[]) =
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
};
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
#define QUANTIZE_OVERFLOW_SAFE
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
const u8* storePairedIllegal = AlignCode4();
UD2();
const u8* storePairedFloat = AlignCode4();
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
{
MOVQ_xmm(R(RSCRATCH), XMM0);
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
if (cpu_info.bSSE4_1)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storePairedS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
}
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
const u8* storeSingleIllegal = AlignCode4();
UD2();
// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));
singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
}
void CommonAsmRoutines::GenQuantizedLoads()
{
const u8* loadPairedIllegal = AlignCode4();
UD2();
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle2x4));
}
else
{
LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();
const u8* loadPairedFloatOne = AlignCode4();
if (jit->js.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();
const u8* loadPairedU8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS8Two = AlignCode4();
if (jit->js.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
const u8* loadPairedS16Two = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16One = AlignCode4();
if (jit->js.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;
pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
}

Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h

@@ -4,7 +4,7 @@
#pragma once
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Common/CommonTypes.h"
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
@@ -15,7 +15,6 @@ extern const float GC_ALIGNED16(m_dequantizeTableS[]);
class CommonAsmRoutinesBase
{
public:
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
@@ -51,19 +50,5 @@ public:
// In: ECX: Address to write to.
// In: XMM0: Bottom 32-bit slot holds the float to be written.
const u8 **singleStoreQuantized;
};
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();
};
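For context, a sketch of the dispatch these tables enable (a C-level analogy; the real callers are emitted x86 that jumps to the selected routine with the address in ECX and the value in XMM0, per the in/out comments above):

#include <cstdint>

const uint8_t* PickSingleStoreRoutine(const uint8_t* const* singleStoreQuantized,
                                      unsigned store_type)
{
    // 0 = float, 4 = u8, 5 = u16, 6 = s8, 7 = s16; types 1-3 trap via UD2.
    return singleStoreQuantized[store_type & 7];
}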

Source/Core/Core/PowerPC/JitCommon/JitBase.h

@@ -23,8 +23,8 @@
#include "Core/PowerPC/PowerPC.h"
#include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PPCTables.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitCache.h"
#include "Core/PowerPC/JitCommon/TrampolineCache.h"