JIT: more optimizing of float ops based on known input characteristics

If the inputs are both float singles, and the top half is known to be identical
to the bottom half, we can use packed arithmetic instead of scalar to skip
the movddup.

Packed arithmetic is slower than scalar on a few rather old CPUs, as well as
on Atom and Silvermont, so detect Atom and disable the optimization in that case.

Also avoid PPC_FP on stores if we know that the output came from a float op.
This commit is contained in:
Fiora 2014-10-11 14:22:44 -07:00
parent 4e0591cdf1
commit 72c96c20d3
9 changed files with 154 additions and 39 deletions

View File

@ -50,10 +50,10 @@ struct CPUInfo
bool bMOVBE;
// This flag indicates that the hardware supports some mode
// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
// TODO: ARM
bool bFlushToZero;
bool bLAHFSAHF64;
bool bLongMode;
bool bAtom;
// ARM specific CPUInfo
bool bSwp;

View File

@ -129,6 +129,12 @@ void CPUInfo::Detect()
if (max_std_fn >= 1)
{
__cpuid(cpu_id, 0x00000001);
int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
// Detect people unfortunate enough to be running Dolphin on an Atom
if (family == 6 && (model == 0x1C || model == 0x26 ||model == 0x27 || model == 0x35 || model == 0x36 ||
model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
bAtom = true;
logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
ht = (cpu_id[3] >> 28) & 1;

View File

@ -151,7 +151,7 @@ public:
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false);
void FloatCompare(UGeckoInstruction inst, bool upper = false);
// OPCODES

View File

@ -11,11 +11,12 @@
using namespace Gen;
static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS)
{
fpr.Lock(d, a, b);
fpr.BindToRegister(d, d == a || d == b || !single);
@ -34,13 +35,20 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
}
else
{
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
}
if (single)
{
ForceSinglePrecisionS(fpr.RX(d));
if (packed)
{
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
}
else
{
ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
MOVDDUP(fpr.RX(d), fpr.R(d));
}
}
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -63,14 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst)
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
int arg2 = inst.SUBOP5 == 25 ? c : b;
bool single = inst.OPCD == 59;
bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
// using packed arithmetic instead.
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
if (inst.SUBOP5 == 18 || cpu_info.bAtom)
packed = false;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break;
case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break;
case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break;
case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break;
default:
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
}
@ -88,6 +114,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
int d = inst.FD;
bool single = inst.OPCD == 59;
bool round_input = single && !jit->js.op->fprIsSingle[c];
bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
if (cpu_info.bAtom)
packed = false;
fpr.Lock(a, b, c, d);
@ -109,9 +138,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
switch (inst.SUBOP5)
{
case 28: //msub
if (packed)
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
else
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
break;
case 29: //madd
if (packed)
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
else
VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
break;
// PowerPC and x86 define NMADD/NMSUB differently
@ -119,9 +154,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// PPC: D = -(A*C (+/-) B)
// so we have to swap them; the ADD/SUB here isn't a typo.
case 30: //nmsub
if (packed)
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
else
VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
break;
case 31: //nmadd
if (packed)
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
else
VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
break;
}
@ -133,32 +174,57 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
else
MOVAPD(XMM1, fpr.R(c));
MULSD(XMM1, fpr.R(a));
MOVAPD(XMM0, fpr.R(b));
if (packed)
{
MULPD(XMM1, fpr.R(a));
SUBPD(XMM0, R(XMM1));
}
else
{
MULSD(XMM1, fpr.R(a));
SUBSD(XMM0, R(XMM1));
}
}
else
{
if (single && round_input)
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
else
MOVAPD(XMM0, fpr.R(c));
MULSD(XMM0, fpr.R(a));
if (packed)
{
MULPD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28) //msub
SUBSD(XMM0, fpr.R(b));
SUBPD(XMM0, fpr.R(b));
else //(n)madd
ADDPD(XMM0, fpr.R(b));
}
else
{
MULSD(XMM0, fpr.R(a));
if (inst.SUBOP5 == 28)
SUBSD(XMM0, fpr.R(b));
else
ADDSD(XMM0, fpr.R(b));
}
if (inst.SUBOP5 == 31) //nmadd
PXOR(XMM0, M((void*)&psSignBits));
PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits)));
}
fpr.BindToRegister(d, !single);
//YES it is necessary to dupe the result :(
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
if (single)
{
ForceSinglePrecisionS(XMM0);
MOVDDUP(fpr.RX(d), R(XMM0));
if (packed)
{
ForceSinglePrecisionP(fpr.RX(d), XMM0);
}
else
{
ForceSinglePrecisionS(fpr.RX(d), XMM0);
MOVDDUP(fpr.RX(d), fpr.R(d));
}
}
else
{
@ -427,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst)
fpr.BindToRegister(d, d == b);
if (b != d)
MOVAPD(fpr.RX(d), fpr.R(b));
ForceSinglePrecisionS(fpr.RX(d));
ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
MOVDDUP(fpr.RX(d), fpr.R(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();

View File

@ -107,9 +107,16 @@ void Jit64::stfXXX(UGeckoInstruction inst)
FALLBACK_IF(update && js.memcheck && a == b);
if (single)
{
if (jit->js.op->fprIsStoreSafe[s])
{
CVTSD2SS(XMM0, fpr.R(s));
}
else
{
fpr.BindToRegister(s, true, false);
ConvertDoubleToSingle(XMM0, fpr.RX(s));
}
MOVD_xmm(R(RSCRATCH), XMM0);
}
else

View File

@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address
MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
}
void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input)
{
// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
if (jit->jo.accurateSinglePrecision)
{
CVTSD2SS(xmm, R(xmm));
CVTSS2SD(xmm, R(xmm));
CVTSD2SS(input, R(input));
CVTSS2SD(output, R(input));
}
else if (output != input)
{
MOVAPD(output, R(input));
}
}

View File

@ -130,7 +130,7 @@ public:
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input);
void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);

View File

@ -830,18 +830,45 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
fprInUse[code[i].fregOut] = true;
}
// Forward scan, for flags that need the other direction for calculation
BitSet32 fprIsSingle;
// Forward scan, for flags that need the other direction for calculation.
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
for (u32 i = 0; i < block->m_num_instructions; i++)
{
code[i].fprIsSingle = fprIsSingle;
code[i].fprIsDuplicated = fprIsDuplicated;
code[i].fprIsStoreSafe = fprIsStoreSafe;
if (code[i].fregOut >= 0)
{
// This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds
if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3))
fprIsSingle[code[i].fregOut] = true;
else
fprIsSingle[code[i].fregOut] = false;
fprIsDuplicated[code[i].fregOut] = false;
fprIsStoreSafe[code[i].fregOut] = false;
// Single, duplicated, and doesn't need PPC_FP.
if (code[i].opinfo->type == OPTYPE_SINGLEFP)
{
fprIsSingle[code[i].fregOut] = true;
fprIsDuplicated[code[i].fregOut] = true;
fprIsStoreSafe[code[i].fregOut] = true;
}
// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
// TODO: if we go directly from a load to store, skip conversion entirely?
// TODO: if we go directly from a load to a float instruction, and the value isn't used
// for anything else, we can skip PPC_FP on a load too.
if (!strncmp(code[i].opinfo->opname, "lfs", 3))
{
fprIsSingle[code[i].fregOut] = true;
fprIsDuplicated[code[i].fregOut] = true;
}
// Paired are still floats, but the top/bottom halves may differ.
if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS)
{
fprIsSingle[code[i].fregOut] = true;
fprIsStoreSafe[code[i].fregOut] = true;
}
// Careful: changing the float mode in a block breaks this optimization, since
// a previous float op might have had FTZ off while the later store has FTZ
// on. So, discard all information we have.
if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
fprIsStoreSafe = BitSet32(0);
}
}
return address;

View File

@ -53,6 +53,11 @@ struct CodeOp //16B
BitSet32 fprInXmm;
// whether an fpr is known to be an actual single-precision value at this point in the block.
BitSet32 fprIsSingle;
// whether an fpr is known to have identical top and bottom halves (e.g. due to a single-precision instruction)
BitSet32 fprIsDuplicated;
// whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can safely
// skip PPC_FP.
BitSet32 fprIsStoreSafe;
};
struct BlockStats