@@ -10,6 +10,7 @@
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitCommon/JitBase.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
#include "Core/PowerPC/PowerPC.h"
@@ -219,26 +220,206 @@ alignas(16) static const float m_255 = 255.0f;
alignas(16) static const float m_127 = 127.0f;
alignas(16) static const float m_m128 = -128.0f;

#define QUANTIZE_OVERFLOW_SAFE
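// (Presumably: when this is defined, the #ifdef QUANTIZE_OVERFLOW_SAFE blocks below add an extra
// MINPS clamp against m_65535 before CVTTPS2DQ so out-of-range positives cannot overflow.)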
// Sizes of the various quantized store types
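// (Indexed by EQuantizeType: FLOAT = 0, invalid types 1-3, U8 = 4, U16 = 5, S8 = 6, S16 = 7,
// matching the dispatch tables filled in below; values are bits per element.)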
constexpr std::array<u8, 8> sizes{{32, 0, 0, 0, 8, 16, 8, 16}};

// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of
// int32 range
// while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping
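// (Illustration: a scaled value such as 3.0e9f is above INT32_MAX, so the conversion would return
// 0x80000000 instead of saturating upwards; clamping against m_65535 first avoids that.)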

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores()
{
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));

for (int type = 0; type < 8; type++)
pairedStoreQuantized[type] = GenQuantizedStoreRuntime(false, static_cast<EQuantizeType>(type));
}

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
{
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));

for (int type = 0; type < 8; type++)
singleStoreQuantized[type] = GenQuantizedStoreRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedStoreRuntime(bool single, EQuantizeType type)
{
const void* start = GetCodePtr();
const u8* load = AlignCode4();
GenQuantizedStore(single, type, -1);
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore_%i_%i", type, single);

const u8* storePairedIllegal = AlignCode4();
return load;
}

void CommonAsmRoutines::GenQuantizedLoads()
{
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));

for (int type = 0; type < 8; type++)
pairedLoadQuantized[type] = GenQuantizedLoadRuntime(false, static_cast<EQuantizeType>(type));
for (int type = 0; type < 8; type++)
pairedLoadQuantized[type + 8] = GenQuantizedLoadRuntime(true, static_cast<EQuantizeType>(type));
}

const u8* CommonAsmRoutines::GenQuantizedLoadRuntime(bool single, EQuantizeType type)
{
const void* start = GetCodePtr();
const u8* load = AlignCode4();
GenQuantizedLoad(single, type, -1);
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad_%i_%i", type, single);

return load;
}

void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type, int quantize)
{
// In: one or two single floats in XMM0, if quantize is -1, a quantization factor in RSCRATCH2

int size = sizes[type] * (single ? 1 : 2);
bool isInline = quantize != -1;
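// quantize == -1 means this is the ahead-of-time runtime routine and the scale arrives in
// RSCRATCH2 (see the In: comment above); any other value is baked into inline code.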

// illegal
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
{
UD2();
return;
}

const u8* storePairedFloat = AlignCode4();
if (type == QUANTIZE_FLOAT)
{
GenQuantizedStoreFloat(single, isInline);
}
else if (single)
{
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
}
else if (quantize > 0)
{
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
}

switch (type)
{
case QUANTIZE_U8:
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
break;
case QUANTIZE_S8:
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
break;
case QUANTIZE_U16:
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
break;
case QUANTIZE_S16:
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
break;
default:
break;
}

CVTTSS2SI(RSCRATCH, R(XMM0));
}
else
{
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
}
else if (quantize > 0)
{
MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2]));
MULPS(XMM0, R(XMM1));
}

bool hasPACKUSDW = cpu_info.bSSE4_1;
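// PACKUSDW (pack signed dwords to words with unsigned saturation) is only available with SSE4.1.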

// Special case: if we don't have PACKUSDW we need to clamp to zero as well so the shuffle
// below can work
if (type == QUANTIZE_U16 && !hasPACKUSDW)
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
}

// According to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value
// is out of int32 range while it's OK for large negatives, it isn't for positives
// I don't know whether the overflow actually happens in any games but it potentially can
// cause problems, so we need some clamping
MINPS(XMM0, M(m_65535));
CVTTPS2DQ(XMM0, R(XMM0));

switch (type)
{
case QUANTIZE_U8:
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
break;
case QUANTIZE_S8:
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
break;
case QUANTIZE_U16:
if (hasPACKUSDW)
{
PACKUSDW(XMM0, R(XMM0)); // AAAABBBB CCCCDDDD ... -> AABBCCDD ...
MOVD_xmm(R(RSCRATCH), XMM0); // AABBCCDD ... -> AABBCCDD
BSWAP(32, RSCRATCH); // AABBCCDD -> DDCCBBAA
ROL(32, R(RSCRATCH), Imm8(16)); // DDCCBBAA -> BBAADDCC
}
else
{
// We don't have PACKUSDW so we'll shuffle instead (assumes 32-bit values >= 0 and < 65536)
PSHUFLW(XMM0, R(XMM0), 2); // AABB0000 CCDD0000 ... -> CCDDAABB ...
MOVD_xmm(R(RSCRATCH), XMM0); // CCDDAABB ... -> CCDDAABB
BSWAP(32, RSCRATCH); // CCDDAABB -> BBAADDCC
}
break;
case QUANTIZE_S16:
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
break;
default:
break;
}
}

int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
if (!single)
flags |= SAFE_LOADSTORE_NO_SWAP;

SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, size, 0, QUANTIZED_REGS_TO_SAVE, flags);
}

void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
{
if (single)
{
// Easy!
MOVD_xmm(R(RSCRATCH), XMM0);
}
else
{
if (cpu_info.bSSSE3)
{
PSHUFB(XMM0, M((void*)pbswapShuffle2x4));
PSHUFB(XMM0, M(pbswapShuffle2x4));
MOVQ_xmm(R(RSCRATCH), XMM0);
}
else
@@ -247,199 +428,195 @@ void CommonAsmRoutines::GenQuantizedStores()
ROL(64, R(RSCRATCH), Imm8(32));
BSWAP(64, RSCRATCH);
}
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
}
}

RET();
void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, int quantize)
{
// Note that this method assumes that inline methods know the value of quantize ahead of
// time. The methods generated AOT assume that the quantize flag is placed in RSCRATCH in
// the second lowest byte, ie: 0x0000xx00
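// (Worked example, assuming that layout: a scale field of 3 arrives as 0x00000300; SHR by 5
// gives 3 * 8, the byte offset of the paired entry m_quantizeTableS[3 * 2] used by the
// MDisp() table lookups below.)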

const u8* storePairedU8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
int size = sizes[type] * (single ? 1 : 2);
bool isInline = quantize != -1;

RET();

const u8* storePairedS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);

SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);

RET();

const u8* storePairedU16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULPS(XMM0, R(XMM1));

if (cpu_info.bSSE4_1)
// illegal
if (type == QUANTIZE_INVALID1 || type == QUANTIZE_INVALID2 || type == QUANTIZE_INVALID3)
{
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKUSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
UD2();
return;
}

// Floats don't use quantization and can generate more optimal code
if (type == QUANTIZE_FLOAT)
{
GenQuantizedLoadFloat(single, isInline);
return;
}

bool extend = single && (type == QUANTIZE_S8 || type == QUANTIZE_S16);
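// Only the single path converts the loaded value with CVTSI2SS from a GPR, so signed types must
// be sign-extended there; the paired path widens inside XMM registers (PMOVSX*/PSRAD) instead.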

if (jit->jo.memcheck)
{
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE_LOAD;
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
if (!single && (type == QUANTIZE_U8 || type == QUANTIZE_S8))
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
}
else
{
XORPS(XMM1, R(XMM1));
MAXPS(XMM0, R(XMM1));
MINPS(XMM0, M(m_65535));

CVTTPS2DQ(XMM0, R(XMM0));
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
switch (type)
{
case QUANTIZE_U8:
case QUANTIZE_S8:
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
break;
case QUANTIZE_U16:
case QUANTIZE_S16:
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, size, 0, extend);
break;
default:
break;
}
}

SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
if (single)
{
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

RET();

const u8* storePairedS16 = AlignCode4();
if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
}
else if (quantize > 0)
{
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
}
UNPCKLPS(XMM0, M(m_one));
}
else
{
switch (type)
{
case QUANTIZE_U8:
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
break;
case QUANTIZE_S8:
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
break;
case QUANTIZE_U16:
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
break;
case QUANTIZE_S16:
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
break;
default:
break;
}
CVTDQ2PS(XMM0, R(XMM0));

if (quantize == -1)
{
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
#ifdef QUANTIZE_OVERFLOW_SAFE
MINPS(XMM0, M(m_65535));
#endif
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(RSCRATCH), XMM0);
BSWAP(32, RSCRATCH);
ROL(32, R(RSCRATCH), Imm8(16));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
}
else if (quantize > 0)
{
MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2]));
MULPS(XMM0, R(XMM1));
}
}

RET();

JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");

pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));

pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
return;
}

// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedSingleStores()
void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
{
const void* start = GetCodePtr();
int size = single ? 32 : 64;
bool extend = false;

const u8* storeSingleIllegal = AlignCode4();
UD2();
if (jit->jo.memcheck)
{
BitSet32 regsToSave = QUANTIZED_REGS_TO_SAVE;
int flags = isInline ? 0 : SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG;
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), size, 0, regsToSave, extend, flags);
}

// Easy!
const u8* storeSingleFloat = AlignCode4();
MOVD_xmm(R(RSCRATCH), XMM0);
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(&m_255));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleS8 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m128));
MINSS(XMM0, M(&m_127));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
XORPS(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M(m_65535));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

const u8* storeSingleS16 = AlignCode4();
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
MAXSS(XMM0, M(&m_m32768));
MINSS(XMM0, M(&m_32767));
CVTTSS2SI(RSCRATCH, R(XMM0));
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE,
SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
RET();

JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedSingleStore");

singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(8 * sizeof(u8*));

singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
}

void CommonAsmRoutines::GenQuantizedLoads()
{
const void* start = GetCodePtr();

const u8* loadPairedIllegal = AlignCode4();
UD2();
if (single)
{
if (jit->jo.memcheck)
{
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle1x4));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
}

UNPCKLPS(XMM0, M(m_one));
}
else
{
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
// don't need hardware access handling. This will definitely crash if paired loads occur
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
// for a good reason, or merely because no game does this.
// If we find something that actually does do this, maybe this should be changed. How
// much of a performance hit would it be?
const u8* loadPairedFloatTwo = AlignCode4();
if (jit->jo.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
@@ -454,203 +631,5 @@ void CommonAsmRoutines::GenQuantizedLoads()
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
}
RET();

const u8* loadPairedFloatOne = AlignCode4();
if (jit->jo.memcheck)
{
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
else if (cpu_info.bSSSE3)
{
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
PSHUFB(XMM0, M(pbswapShuffle1x4));
UNPCKLPS(XMM0, M(m_one));
}
else
{
LoadAndSwap(32, RSCRATCH_EXTRA, MRegSum(RMEM, RSCRATCH_EXTRA));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
UNPCKLPS(XMM0, M(m_one));
}
RET();

const u8* loadPairedU8Two = AlignCode4();
if (jit->jo.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXBD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedU8One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

const u8* loadPairedS8Two = AlignCode4();
if (jit->jo.memcheck)
{
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
}
else
{
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
}
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXBD(XMM0, R(XMM0));
}
else
{
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedS8One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

const u8* loadPairedU16Two = AlignCode4();
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVZXWD(XMM0, R(XMM0));
}
else
{
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedU16One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

const u8* loadPairedS16Two = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
if (cpu_info.bSSE4_1)
{
PMOVSXWD(XMM0, R(XMM0));
}
else
{
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
}
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(RSCRATCH2), Imm8(5));
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
MULPS(XMM0, R(XMM1));
RET();

const u8* loadPairedS16One = AlignCode4();
if (jit->jo.memcheck)
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true,
SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
else
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
SHR(32, R(RSCRATCH2), Imm8(5));
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
UNPCKLPS(XMM0, M(m_one));
RET();

JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedLoad");

pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));

pairedLoadQuantized[0] = loadPairedFloatTwo;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8Two;
pairedLoadQuantized[5] = loadPairedU16Two;
pairedLoadQuantized[6] = loadPairedS8Two;
pairedLoadQuantized[7] = loadPairedS16Two;

pairedLoadQuantized[8] = loadPairedFloatOne;
pairedLoadQuantized[9] = loadPairedIllegal;
pairedLoadQuantized[10] = loadPairedIllegal;
pairedLoadQuantized[11] = loadPairedIllegal;
pairedLoadQuantized[12] = loadPairedU8One;
pairedLoadQuantized[13] = loadPairedU16One;
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
}