Mirror of https://github.com/dolphin-emu/dolphin.git
Merge pull request #1958 from Sonicadvance1/Rearchitect_asmcommon
Rearchitect a bit of our AsmCommon routines.
Commit b1fc18cbaa
@ -199,6 +199,7 @@ if(_M_X86)
  PowerPC/Jit64/Jit_Paired.cpp
  PowerPC/Jit64/JitRegCache.cpp
  PowerPC/Jit64/Jit_SystemRegisters.cpp
  PowerPC/Jit64Common/Jit64AsmCommon.cpp
  PowerPC/JitCommon/JitBackpatch.cpp
  PowerPC/JitCommon/Jit_Util.cpp
  PowerPC/JitCommon/TrampolineCache.cpp)
@ -235,6 +235,7 @@
    <ClCompile Include="PowerPC\Jit64\Jit_LoadStorePaired.cpp" />
    <ClCompile Include="PowerPC\Jit64\Jit_Paired.cpp" />
    <ClCompile Include="PowerPC\Jit64\Jit_SystemRegisters.cpp" />
    <ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp" />
    <ClCompile Include="PowerPC\JitCommon\JitAsmCommon.cpp" />
    <ClCompile Include="PowerPC\JitCommon\JitBackpatch.cpp" />
    <ClCompile Include="PowerPC\JitCommon\JitBase.cpp" />
@ -417,6 +418,7 @@
    <ClInclude Include="PowerPC\Jit64\JitRegCache.h" />
    <ClInclude Include="PowerPC\JitILCommon\IR.h" />
    <ClInclude Include="PowerPC\JitILCommon\JitILBase.h" />
    <ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h" />
    <ClInclude Include="PowerPC\JitCommon\JitAsmCommon.h" />
    <ClInclude Include="PowerPC\JitCommon\JitBase.h" />
    <ClInclude Include="PowerPC\JitCommon\JitCache.h" />
@ -631,6 +631,9 @@
    <ClCompile Include="PowerPC\SignatureDB.cpp">
      <Filter>PowerPC</Filter>
    </ClCompile>
    <ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp">
      <Filter>PowerPC\Jit64Common</Filter>
    </ClCompile>
    <ClCompile Include="PowerPC\JitCommon\Jit_Util.cpp">
      <Filter>PowerPC\JitCommon</Filter>
    </ClCompile>
@ -1184,6 +1187,9 @@
    <ClInclude Include="PowerPC\SignatureDB.h">
      <Filter>PowerPC</Filter>
    </ClInclude>
    <ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h">
      <Filter>PowerPC\Jit64Common</Filter>
    </ClInclude>
    <ClInclude Include="PowerPC\JitCommon\Jit_Util.h">
      <Filter>PowerPC\JitCommon</Filter>
    </ClInclude>
@ -4,7 +4,7 @@

#pragma once

#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"

// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write
Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp (new file, 601 lines)
@ -0,0 +1,601 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.

#include "Common/MathUtil.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"

#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/JitCommon/JitBase.h"

#define QUANTIZED_REGS_TO_SAVE \
    (ABI_ALL_CALLER_SAVED & ~BitSet32 { \
        RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
    })

#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })

using namespace Gen;

void CommonAsmRoutines::GenFifoWrite(int size)
{
    // Assume value in RSCRATCH
    u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
    _assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
    MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
    SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
    ADD(32, R(RSCRATCH2), Imm8(size >> 3));
    MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
    RET();
}

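Not part of the diff: a rough C++ equivalent of what the stub generated above does for a 32-bit write. This is only a sketch; GPFifo::m_gatherPipe and GPFifo::m_gatherPipeCount are the real globals the emitted code touches, and Common::swap32 is Dolphin's byteswap helper.

#include <cstring>

// Sketch: what GenFifoWrite(32) emits, written as ordinary C++.
// SwapAndStore byteswaps the value and stores it at the current gather pipe offset.
static void FifoWrite32Sketch(u32 value)
{
    const u32 swapped = Common::swap32(value);   // store big-endian, as the GP expects
    std::memcpy(GPFifo::m_gatherPipe + GPFifo::m_gatherPipeCount, &swapped, sizeof(swapped));
    GPFifo::m_gatherPipeCount += sizeof(swapped); // advance by size >> 3 bytes
}
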
void CommonAsmRoutines::GenFrsqrte()
{
    // Assume input in XMM0.
    // This function clobbers all three RSCRATCH.
    MOVQ_xmm(R(RSCRATCH), XMM0);

    // Negative and zero inputs set an exception and take the complex path.
    TEST(64, R(RSCRATCH), R(RSCRATCH));
    FixupBranch zero = J_CC(CC_Z, true);
    FixupBranch negative = J_CC(CC_S, true);
    MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
    SHR(64, R(RSCRATCH_EXTRA), Imm8(52));

    // Zero and max exponents (non-normal floats) take the complex path.
    FixupBranch complex1 = J_CC(CC_Z, true);
    CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
    FixupBranch complex2 = J_CC(CC_E, true);

    SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
    SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
    MOV(32, R(RSCRATCH2), Imm32(0x3FF));
    SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
    SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);

    MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
    SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
    AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
    XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);

    SHR(64, R(RSCRATCH), Imm8(37));
    AND(32, R(RSCRATCH), Imm32(0x7FF));
    IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
    MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
    SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
    SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
    OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
    MOVQ_xmm(XMM0, R(RSCRATCH2));
    RET();

    // Exception flags for zero input.
    SetJumpTarget(zero);
    TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
    FixupBranch skip_set_fx1 = J_CC(CC_NZ);
    OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
    FixupBranch complex3 = J();

    // Exception flags for negative input.
    SetJumpTarget(negative);
    TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
    FixupBranch skip_set_fx2 = J_CC(CC_NZ);
    OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));

    SetJumpTarget(skip_set_fx1);
    SetJumpTarget(skip_set_fx2);
    SetJumpTarget(complex1);
    SetJumpTarget(complex2);
    SetJumpTarget(complex3);
    ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
    ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
    ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
    RET();
}

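Not part of the diff: the fast path above is easier to follow as scalar C++. The sketch below only covers normal, positive inputs (everything else falls through to MathUtil::ApproximateReciprocalSquareRoot); the table element types are assumed.

#include <cstring>

// Sketch of the GenFrsqrte fast path: halve the exponent, then look up the mantissa.
static double FrsqrteFastPathSketch(double val)
{
    u64 vali;
    std::memcpy(&vali, &val, sizeof(vali));

    // Halve and rebias the exponent: 0x3FF - ((exp - 0x3FD) >> 1), as emitted above.
    const s64 exp = (s64)(vali >> 52);
    const u64 new_exp = (u64)(0x3FF - ((exp - 0x3FD) >> 1)) << 52;

    // Table index: top mantissa bits, plus 16 for odd exponents (the XOR with 0x10).
    const int index = (int)((vali >> 48) & 0x1F) ^ 0x10;
    const u32 i = (u32)(vali >> 37) & 0x7FF;

    const u32 mantissa = (u32)(MathUtil::frsqrte_expected_base[index] -
                               MathUtil::frsqrte_expected_dec[index] * i);
    const u64 result = new_exp | ((u64)mantissa << 26);

    double out;
    std::memcpy(&out, &result, sizeof(out));
    return out;
}
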
void CommonAsmRoutines::GenFres()
{
    // Assume input in XMM0.
    // This function clobbers all three RSCRATCH.
    MOVQ_xmm(R(RSCRATCH), XMM0);

    // Zero inputs set an exception and take the complex path.
    TEST(64, R(RSCRATCH), R(RSCRATCH));
    FixupBranch zero = J_CC(CC_Z);

    MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
    SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
    MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
    AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
    AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
    CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
    // Take the complex path for very large/small exponents.
    FixupBranch complex1 = J_CC(CC_L);
    CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
    FixupBranch complex2 = J_CC(CC_GE);

    SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
    NEG(32, R(RSCRATCH_EXTRA));
    OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
    SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent

    MOV(64, R(RSCRATCH2), R(RSCRATCH));
    SHR(64, R(RSCRATCH), Imm8(37));
    SHR(64, R(RSCRATCH2), Imm8(47));
    AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
    AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024

    IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
    ADD(32, R(RSCRATCH), Imm8(1));
    SHR(32, R(RSCRATCH), Imm8(1));

    MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
    SUB(32, R(RSCRATCH2), R(RSCRATCH));
    SHL(64, R(RSCRATCH2), Imm8(29));
    OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
    MOVQ_xmm(XMM0, R(RSCRATCH2));
    RET();

    // Exception flags for zero input.
    SetJumpTarget(zero);
    TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
    FixupBranch skip_set_fx1 = J_CC(CC_NZ);
    OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
    SetJumpTarget(skip_set_fx1);

    SetJumpTarget(complex1);
    SetJumpTarget(complex2);
    ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
    ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
    ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
    RET();
}

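Not part of the diff: the matching scalar sketch for the GenFres fast path (exponents in [895, 1149) only; everything else falls through to MathUtil::ApproximateReciprocal). Table element types are assumed.

#include <cstring>

// Sketch of the GenFres fast path: negate the exponent around 0x7FD, look up the mantissa.
static double FresFastPathSketch(double val)
{
    u64 vali;
    std::memcpy(&vali, &val, sizeof(vali));

    const u32 top = (u32)(vali >> 52);
    const u32 exp = top & 0x7FF;
    const u32 sign = top & 0x800;

    const u64 head = (u64)(sign | (0x7FD - exp)) << 52;  // vali = sign | exponent

    const u32 i = (u32)(vali >> 37) & 0x3FF;   // i % 1024
    const u32 idx = (u32)(vali >> 47) & 0x1F;  // i / 1024

    const u32 mantissa = (u32)(MathUtil::fres_expected_base[idx] -
                               (MathUtil::fres_expected_dec[idx] * i + 1) / 2);
    const u64 result = head | ((u64)mantissa << 29);

    double out;
    std::memcpy(&out, &result, sizeof(out));
    return out;
}
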
void CommonAsmRoutines::GenMfcr()
{
    // Input: none
    // Output: RSCRATCH
    // This function clobbers all three RSCRATCH.
    X64Reg dst = RSCRATCH;
    X64Reg tmp = RSCRATCH2;
    X64Reg cr_val = RSCRATCH_EXTRA;
    XOR(32, R(dst), R(dst));
    // we only need to zero the high bits of tmp once
    XOR(32, R(tmp), R(tmp));
    for (int i = 0; i < 8; i++)
    {
        static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
        if (i != 0)
            SHL(32, R(dst), Imm8(4));

        MOV(64, R(cr_val), PPCSTATE(cr_val[i]));

        // EQ: Bits 31-0 == 0; set flag bit 1
        TEST(32, R(cr_val), R(cr_val));
        // FIXME: is there a better way to do this without the partial register merging?
        SETcc(CC_Z, R(tmp));
        LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));

        // GT: Value > 0; set flag bit 2
        TEST(64, R(cr_val), R(cr_val));
        SETcc(CC_G, R(tmp));
        LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));

        // SO: Bit 61 set; set flag bit 0
        // LT: Bit 62 set; set flag bit 3
        SHR(64, R(cr_val), Imm8(61));
        OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
    }
    RET();
}

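Not part of the diff: the loop above, written as plain C++. A sketch only; ppcState.cr_val is Dolphin's per-field 64-bit condition register representation.

// Sketch: fold the eight 64-bit cr_val fields back into the architectural 32-bit CR.
static u32 MfcrSketch()
{
    static const u32 flag_table[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
    u32 cr = 0;
    for (int i = 0; i < 8; i++)
    {
        const u64 v = PowerPC::ppcState.cr_val[i];
        u32 field = 0;
        if ((u32)v == 0) field |= 2;      // EQ: low 32 bits are zero
        if ((s64)v > 0)  field |= 4;      // GT: whole value is positive
        field |= flag_table[v >> 61];     // SO (bit 61) and LT (bit 62)
        cr = (cr << 4) | field;           // CR0 ends up in the top nibble
    }
    return cr;
}
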
// Safe + Fast Quantizers, originally from JITIL by magumagu
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;

#define QUANTIZE_OVERFLOW_SAFE

// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping

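Not part of the diff: as a scalar model of the clamping described above, this is roughly what the storePairedU8 path below computes per component. The helper name and the scale plumbing are assumed for illustration; the real code quantizes both paired singles at once with SSE and stores through SafeWriteRegToReg.

#include <algorithm>

// Sketch: quantize one paired-single component to an unsigned byte.
static u8 QuantizeToU8Sketch(float value, float scale)
{
    float v = value * scale;                   // MULPS with the m_quantizeTableS entry
    v = std::min(v, 65535.0f);                 // the QUANTIZE_OVERFLOW_SAFE clamp (MINPS)
    s32 i = (s32)v;                            // CVTTPS2DQ (truncation)
    i = std::max(-32768, std::min(i, 32767));  // PACKSSDW saturation
    return (u8)std::max(0, std::min(i, 255));  // PACKUSWB saturation
}
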
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
    const u8* storePairedIllegal = AlignCode4();
    UD2();

    const u8* storePairedFloat = AlignCode4();
    if (cpu_info.bSSSE3)
    {
        PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
        MOVQ_xmm(R(RSCRATCH), XMM0);
    }
    else
    {
        MOVQ_xmm(R(RSCRATCH), XMM0);
        ROL(64, R(RSCRATCH), Imm8(32));
        BSWAP(64, RSCRATCH);
    }
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

    RET();

    const u8* storePairedU8 = AlignCode4();
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
    MINPS(XMM0, M(m_65535));
#endif
    CVTTPS2DQ(XMM0, R(XMM0));
    PACKSSDW(XMM0, R(XMM0));
    PACKUSWB(XMM0, R(XMM0));
    MOVD_xmm(R(RSCRATCH), XMM0);
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

    RET();

    const u8* storePairedS8 = AlignCode4();
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
    MINPS(XMM0, M(m_65535));
#endif
    CVTTPS2DQ(XMM0, R(XMM0));
    PACKSSDW(XMM0, R(XMM0));
    PACKSSWB(XMM0, R(XMM0));
    MOVD_xmm(R(RSCRATCH), XMM0);

    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

    RET();

    const u8* storePairedU16 = AlignCode4();
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    MULPS(XMM0, R(XMM1));

    if (cpu_info.bSSE4_1)
    {
#ifdef QUANTIZE_OVERFLOW_SAFE
        MINPS(XMM0, M(m_65535));
#endif
        CVTTPS2DQ(XMM0, R(XMM0));
        PACKUSDW(XMM0, R(XMM0));
        MOVD_xmm(R(RSCRATCH), XMM0);
        BSWAP(32, RSCRATCH);
        ROL(32, R(RSCRATCH), Imm8(16));
    }
    else
    {
        XORPS(XMM1, R(XMM1));
        MAXPS(XMM0, R(XMM1));
        MINPS(XMM0, M(m_65535));

        CVTTPS2DQ(XMM0, R(XMM0));
        PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
        MOVD_xmm(R(RSCRATCH), XMM0);
        BSWAP(32, RSCRATCH);
    }

    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

    RET();

    const u8* storePairedS16 = AlignCode4();
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
    MINPS(XMM0, M(m_65535));
#endif
    CVTTPS2DQ(XMM0, R(XMM0));
    PACKSSDW(XMM0, R(XMM0));
    MOVD_xmm(R(RSCRATCH), XMM0);
    BSWAP(32, RSCRATCH);
    ROL(32, R(RSCRATCH), Imm8(16));
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

    RET();

    pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
    ReserveCodeSpace(8 * sizeof(u8*));

    pairedStoreQuantized[0] = storePairedFloat;
    pairedStoreQuantized[1] = storePairedIllegal;
    pairedStoreQuantized[2] = storePairedIllegal;
    pairedStoreQuantized[3] = storePairedIllegal;
    pairedStoreQuantized[4] = storePairedU8;
    pairedStoreQuantized[5] = storePairedU16;
    pairedStoreQuantized[6] = storePairedS8;
    pairedStoreQuantized[7] = storePairedS16;
}

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
    const u8* storeSingleIllegal = AlignCode4();
    UD2();

    // Easy!
    const u8* storeSingleFloat = AlignCode4();
    MOVD_xmm(R(RSCRATCH), XMM0);
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
    RET();

    const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    XORPS(XMM1, R(XMM1));
    MAXSS(XMM0, R(XMM1));
    MINSS(XMM0, M(&m_255));
    CVTTSS2SI(RSCRATCH, R(XMM0));
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
    RET();

    const u8* storeSingleS8 = AlignCode4();
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    MAXSS(XMM0, M(&m_m128));
    MINSS(XMM0, M(&m_127));
    CVTTSS2SI(RSCRATCH, R(XMM0));
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
    RET();

    const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    XORPS(XMM1, R(XMM1));
    MAXSS(XMM0, R(XMM1));
    MINSS(XMM0, M(m_65535));
    CVTTSS2SI(RSCRATCH, R(XMM0));
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
    RET();

    const u8* storeSingleS16 = AlignCode4();
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
    MAXSS(XMM0, M(&m_m32768));
    MINSS(XMM0, M(&m_32767));
    CVTTSS2SI(RSCRATCH, R(XMM0));
    SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
    RET();

    singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
    ReserveCodeSpace(8 * sizeof(u8*));

    singleStoreQuantized[0] = storeSingleFloat;
    singleStoreQuantized[1] = storeSingleIllegal;
    singleStoreQuantized[2] = storeSingleIllegal;
    singleStoreQuantized[3] = storeSingleIllegal;
    singleStoreQuantized[4] = storeSingleU8;
    singleStoreQuantized[5] = storeSingleU16;
    singleStoreQuantized[6] = storeSingleS8;
    singleStoreQuantized[7] = storeSingleS16;
}

void CommonAsmRoutines::GenQuantizedLoads()
{
    const u8* loadPairedIllegal = AlignCode4();
    UD2();

    // FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
    // don't need hardware access handling. This will definitely crash if paired loads occur
    // from non-RAM areas, but as far as I know, this never happens. I don't know if this is
    // for a good reason, or merely because no game does this.
    // If we find something that actually does do this, maybe this should be changed. How
    // much of a performance hit would it be?
    const u8* loadPairedFloatTwo = AlignCode4();
    if (jit->js.memcheck)
    {
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
        ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
        MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
    else if (cpu_info.bSSSE3)
    {
        MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
        PSHUFB(XMM0, M(pbswapShuffle2x4));
    }
    else
    {
        LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
        ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
        MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
    }
    RET();

    const u8* loadPairedFloatOne = AlignCode4();
    if (jit->js.memcheck)
    {
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
        MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
        UNPCKLPS(XMM0, M(m_one));
    }
    else if (cpu_info.bSSSE3)
    {
        MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
        PSHUFB(XMM0, M(pbswapShuffle1x4));
        UNPCKLPS(XMM0, M(m_one));
    }
    else
    {
        LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
        MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
        UNPCKLPS(XMM0, M(m_one));
    }
    RET();

    const u8* loadPairedU8Two = AlignCode4();
    if (jit->js.memcheck)
    {
        // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
        ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
    }
    else
    {
        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
    }
    MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    if (cpu_info.bSSE4_1)
    {
        PMOVZXBD(XMM0, R(XMM0));
    }
    else
    {
        PXOR(XMM1, R(XMM1));
        PUNPCKLBW(XMM0, R(XMM1));
        PUNPCKLWD(XMM0, R(XMM1));
    }
    CVTDQ2PS(XMM0, R(XMM0));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    MULPS(XMM0, R(XMM1));
    RET();

    const u8* loadPairedU8One = AlignCode4();
    if (jit->js.memcheck)
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
    else
        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
    CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    UNPCKLPS(XMM0, M(m_one));
    RET();

    const u8* loadPairedS8Two = AlignCode4();
    if (jit->js.memcheck)
    {
        // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
        ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
    }
    else
    {
        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
    }
    MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    if (cpu_info.bSSE4_1)
    {
        PMOVSXBD(XMM0, R(XMM0));
    }
    else
    {
        PUNPCKLBW(XMM0, R(XMM0));
        PUNPCKLWD(XMM0, R(XMM0));
        PSRAD(XMM0, 24);
    }
    CVTDQ2PS(XMM0, R(XMM0));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    MULPS(XMM0, R(XMM1));
    RET();

    const u8* loadPairedS8One = AlignCode4();
    if (jit->js.memcheck)
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
    else
        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
    CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    UNPCKLPS(XMM0, M(m_one));
    RET();

    const u8* loadPairedU16Two = AlignCode4();
    // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
    if (jit->js.memcheck)
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
    else
        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
    ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
    MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    if (cpu_info.bSSE4_1)
    {
        PMOVZXWD(XMM0, R(XMM0));
    }
    else
    {
        PXOR(XMM1, R(XMM1));
        PUNPCKLWD(XMM0, R(XMM1));
    }
    CVTDQ2PS(XMM0, R(XMM0));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    MULPS(XMM0, R(XMM1));
    RET();

    const u8* loadPairedU16One = AlignCode4();
    if (jit->js.memcheck)
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
    else
        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
    CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    UNPCKLPS(XMM0, M(m_one));
    RET();

    const u8* loadPairedS16Two = AlignCode4();
    if (jit->js.memcheck)
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
    else
        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
    ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
    MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
    if (cpu_info.bSSE4_1)
    {
        PMOVSXWD(XMM0, R(XMM0));
    }
    else
    {
        PUNPCKLWD(XMM0, R(XMM0));
        PSRAD(XMM0, 16);
    }
    CVTDQ2PS(XMM0, R(XMM0));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    MULPS(XMM0, R(XMM1));
    RET();

    const u8* loadPairedS16One = AlignCode4();
    if (jit->js.memcheck)
        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
    else
        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
    CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
    SHR(32, R(RSCRATCH2), Imm8(5));
    MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
    UNPCKLPS(XMM0, M(m_one));
    RET();

    pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
    ReserveCodeSpace(16 * sizeof(u8*));

    pairedLoadQuantized[0] = loadPairedFloatTwo;
    pairedLoadQuantized[1] = loadPairedIllegal;
    pairedLoadQuantized[2] = loadPairedIllegal;
    pairedLoadQuantized[3] = loadPairedIllegal;
    pairedLoadQuantized[4] = loadPairedU8Two;
    pairedLoadQuantized[5] = loadPairedU16Two;
    pairedLoadQuantized[6] = loadPairedS8Two;
    pairedLoadQuantized[7] = loadPairedS16Two;

    pairedLoadQuantized[8] = loadPairedFloatOne;
    pairedLoadQuantized[9] = loadPairedIllegal;
    pairedLoadQuantized[10] = loadPairedIllegal;
    pairedLoadQuantized[11] = loadPairedIllegal;
    pairedLoadQuantized[12] = loadPairedU8One;
    pairedLoadQuantized[13] = loadPairedU16One;
    pairedLoadQuantized[14] = loadPairedS8One;
    pairedLoadQuantized[15] = loadPairedS16One;
}

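Not part of the diff: a hypothetical illustration of how a caller could index the 16-slot load table built above. The GQR load type selects the low three bits, and the "load one float vs. two" (W) flag of psq_l selects the upper half; the actual call sites live in Jit64's Jit_LoadStorePaired.cpp and compute this with emitted code rather than a helper like this.

// Illustration only (not the real call-site code): selecting a load routine.
static const u8* SelectPairedLoadRoutine(const CommonAsmRoutinesBase& routines,
                                         u32 gqr_load_type, bool single)
{
    // Slots 0-7 load two values, slots 8-15 load one.
    return routines.pairedLoadQuantized[(gqr_load_type & 7) + (single ? 8 : 0)];
}
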
Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.h (new file, 22 lines)
@ -0,0 +1,22 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.

#pragma once

#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"

class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
    void GenQuantizedLoads();
    void GenQuantizedStores();
    void GenQuantizedSingleStores();

public:
    void GenFifoWrite(int size);
    void GenFrsqrte();
    void GenFres();
    void GenMfcr();
};
@ -2,194 +2,7 @@
// Licensed under GPLv2
// Refer to the license.txt file included.

(removed from JitCommon/JitAsmCommon.cpp: the old includes, the QUANTIZED_REGS_TO_SAVE macros, and the GenFifoWrite, GenFrsqrte, GenFres and GenMfcr routines; the macros and routines reappear verbatim in the new Jit64Common/Jit64AsmCommon.cpp shown above)

// Safe + Fast Quantizers, originally from JITIL by magumagu

const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
@ -250,414 +63,4 @@ const float GC_ALIGNED16(m_dequantizeTableS[]) =
    (1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
};

const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };

(removed from JitCommon/JitAsmCommon.cpp: the m_65535/m_32767/m_m32768/m_255/m_127/m_m128 constants, the QUANTIZE_OVERFLOW_SAFE definition with its CVTPS2DQ clamping comment, and the GenQuantizedStores, GenQuantizedSingleStores and GenQuantizedLoads routines, all of which reappear verbatim in the new Jit64Common/Jit64AsmCommon.cpp shown above)
@ -4,7 +4,7 @@

#pragma once

#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Common/CommonTypes.h"

extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
@ -15,7 +15,6 @@ extern const float GC_ALIGNED16(m_dequantizeTableS[]);
class CommonAsmRoutinesBase
{
public:

    const u8 *fifoDirectWrite8;
    const u8 *fifoDirectWrite16;
    const u8 *fifoDirectWrite32;
@ -51,19 +50,5 @@ public:
    // In: ECX: Address to write to.
    // In: XMM0: Bottom 32-bit slot holds the float to be written.
    const u8 **singleStoreQuantized;

};

class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
{
protected:
    void GenQuantizedLoads();
    void GenQuantizedStores();
    void GenQuantizedSingleStores();

public:
    void GenFifoWrite(int size);
    void GenFrsqrte();
    void GenFres();
    void GenMfcr();
};
@ -23,8 +23,8 @@
#include "Core/PowerPC/PowerPC.h"
#include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PPCTables.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitCache.h"
#include "Core/PowerPC/JitCommon/TrampolineCache.h"