JitArm64: Use one instruction for making NaNs quiet

Instead of materializing the quiet bit in a register and ORing the NaN
with it, we can perform an arithmetic operation on the NaN (here, an FADD
of the NaN with itself), which makes it quiet in a single instruction.
This is a cycle or two slower on some CPUs in cases where generating the
quiet bit pipelined well, but this is far code that rarely runs, so
instruction fetch latency is the bigger concern. And for non-SIMD cases,
we also save a register.
This commit is contained in:
JosJuice 2023-08-09 20:53:10 +02:00
parent 5d9838548b
commit 4ecdb9e57e
3 changed files with 16 additions and 55 deletions

View File

@ -180,10 +180,6 @@ public:
void FloatCompare(UGeckoInstruction inst, bool upper = false);
// temp_gpr can be INVALID_REG if single is true
void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single,
Arm64Gen::ARM64Reg temp_gpr);
bool IsFPRStoreSafe(size_t guest_reg) const;
protected:

View File

@ -137,12 +137,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG;
if (m_accurate_nans)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg();
}
switch (op5)
{
case 18:
@ -202,10 +196,6 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
SwitchToFarCode();
SetJumpTarget(nan);
const ARM64Reg quiet_bit_reg = reg_encoder(V0Q);
EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr);
Common::SmallVector<ARM64Reg, 3> inputs;
inputs.push_back(VA);
if (use_b && VA != VB)
@ -213,7 +203,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
if (use_c && VA != VC && (!use_b || VB != VC))
inputs.push_back(VC);
// If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit
// If any inputs are NaNs, pick the first NaN of them and set its quiet bit
for (size_t i = 0; i < inputs.size(); ++i)
{
// Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at
@ -228,8 +218,9 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
skip = B(CCFlags::CC_VC);
}
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input),
EncodeRegToDouble(quiet_bit_reg));
// Make the NaN quiet
m_float_emit.FADD(VD, input, input);
nan_fixups.push_back(B());
if (check_input)
@ -886,29 +877,6 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
}
}
// Emits code that loads a "quiet bit only" constant into dest_reg, intended to be
// ORed with a NaN by the caller in order to make that NaN quiet.
//
// dest_reg: floating-point/vector register that receives the constant.
// single:   true to emit the 32-bit (single precision) form of the constant,
//           false to emit the 64-bit (double precision) form.
// temp_gpr: scratch general-purpose register used to build the 64-bit constant.
//           Only used when single is false, in which case it must not be
//           INVALID_REG (this is asserted below).
void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr)
{
// dest_reg = QNaN & ~SNaN
//
// (Alternatively, dest_reg = QNaN would also work, but that would take
// two instructions to emit even for singles)
if (single)
{
// 0x40 shifted left by 16 is 0x00400000, which has only bit 22 set.
// NOTE(review): presumably MOVI writes this to every 32-bit lane of dest_reg - confirm
// against the emitter.
m_float_emit.MOVI(32, dest_reg, 0x40, 16);
}
else
{
// The 64-bit constant is built in a GPR first, so a valid temp register is required.
ASSERT(temp_gpr != ARM64Reg::INVALID_REG);
// 0x0008'0000'0000'0000 has only bit 51 set.
MOVI2R(EncodeRegTo64(temp_gpr), 0x0008'0000'0000'0000);
// Quad destinations get the constant via DUP (presumably replicated into both 64-bit
// lanes - confirm); other destinations get a plain GPR-to-FPR FMOV.
if (IsQuad(dest_reg))
m_float_emit.DUP(64, dest_reg, EncodeRegTo64(temp_gpr));
else
m_float_emit.FMOV(dest_reg, EncodeRegTo64(temp_gpr));
}
}
bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
{
return js.fpr_is_store_safe[guest_reg];

View File

@ -272,16 +272,15 @@ void JitArm64::ps_arith(UGeckoInstruction inst)
// Make the NaNs quiet
const ARM64Reg quiet_bit_reg = VD == result_reg ? reg_encoder(V2Q) : VD;
EmitQuietNaNBitConstant(quiet_bit_reg, singles, temp_gpr);
const ARM64Reg quiet_nan_reg = VD == result_reg ? reg_encoder(V2Q) : VD;
m_float_emit.FADD(size, quiet_nan_reg, result_reg, result_reg);
m_float_emit.FCMEQ(size, nan_temp_reg_paired, result_reg, result_reg);
m_float_emit.ORR(quiet_bit_reg, quiet_bit_reg, result_reg);
if (negate_result)
m_float_emit.FNEG(size, result_reg, result_reg);
if (VD == result_reg)
m_float_emit.BIF(VD, quiet_bit_reg, nan_temp_reg_paired);
else // quiet_bit_reg == VD
m_float_emit.BIF(VD, quiet_nan_reg, nan_temp_reg_paired);
else // quiet_nan_reg == VD
m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired);
nan_fixup = B();
@ -381,7 +380,6 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
const ARM64Reg VC = fpr.R(c, type);
const ARM64Reg VD = fpr.RW(d, type);
const ARM64Reg V0 = fpr.GetReg();
const ARM64Reg V1 = m_accurate_nans ? fpr.GetReg() : ARM64Reg::INVALID_REG;
const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG;
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1);
@ -398,22 +396,23 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
SwitchToFarCode();
SetJumpTarget(nan);
EmitQuietNaNBitConstant(scalar_reg_encoder(V1), singles, temp_gpr);
if (upper)
{
m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V1));
m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
scalar_reg_encoder(input));
m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0));
}
else if (d != c)
{
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(V1), EncodeRegToDouble(input));
m_float_emit.FADD(scalar_reg_encoder(VD), scalar_reg_encoder(input),
scalar_reg_encoder(input));
m_float_emit.INS(size, VD, 1, VC, 1);
}
else
{
m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input));
m_float_emit.INS(size, VD, 0, V1, 0);
m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(input),
scalar_reg_encoder(input));
m_float_emit.INS(size, VD, 0, V0, 0);
}
FixupBranch nan_done = B();
@ -449,8 +448,6 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
}
fpr.Unlock(V0);
if (m_accurate_nans)
fpr.Unlock(V1);
if (temp_gpr != ARM64Reg::INVALID_REG)
gpr.Unlock(temp_gpr);