Merge pull request #11181 from JosJuice/jitarm64-25-bit-urshr

JitArm64: Reimplement Force25BitPrecision
This commit is contained in:
Markus Wick
2022-10-22 10:26:07 +02:00
committed by GitHub
5 changed files with 57 additions and 32 deletions

View File

@ -333,8 +333,7 @@ protected:
bool Rc = false);
void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input,
Arm64Gen::ARM64Reg temp);
void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input);
// <Fastmem fault location, slowmem handler location>
std::map<const u8*, FastmemArea> m_fault_to_handler;

View File

@ -45,24 +45,18 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
// rounding to nearest as it does.
void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp)
void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input)
{
ASSERT(output != input && output != temp && input != temp);
// temp = 0x0000'0000'0800'0000ULL
// output = 0xFFFF'FFFF'F800'0000ULL
m_float_emit.MOVI(32, temp, 0x08, 24);
m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL);
m_float_emit.BIC(temp, temp, output);
m_float_emit.ORR(32, output, 0xF8, 24);
// output = (input & ~0xFFFFFFF) + ((input & (1ULL << 27)) << 1)
m_float_emit.AND(temp, input, temp);
m_float_emit.AND(output, input, output);
if (IsQuad(input))
m_float_emit.ADD(64, output, output, temp);
{
m_float_emit.URSHR(64, output, input, 28);
m_float_emit.SHL(64, output, output, 28);
}
else
m_float_emit.ADD(output, output, temp);
{
m_float_emit.URSHR(output, input, 28);
m_float_emit.SHL(output, output, 28);
}
}
void JitArm64::fp_arith(UGeckoInstruction inst)
@ -113,9 +107,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg();
V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
Force25BitPrecision(reg_encoder(V0Q), VC);
VC = reg_encoder(V0Q);
}
@ -160,18 +153,16 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
{
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg();
V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V1Q), VC, reg_encoder(V0Q));
Force25BitPrecision(reg_encoder(V1Q), VC);
VC = reg_encoder(V1Q);
}
ARM64Reg inaccurate_fma_temp_reg = VD;
if (inaccurate_fma && d == b)
{
if (V0Q == ARM64Reg::INVALID_REG)
V0Q = fpr.GetReg();
V0Q = fpr.GetReg();
inaccurate_fma_temp_reg = reg_encoder(V0Q);
}

View File

@ -103,12 +103,9 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
V0Q = fpr.GetReg();
const ARM64Reg V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q));
Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC));
VC = reg_encoder(V0Q);
fpr.Unlock(V1Q);
}
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
@ -165,10 +162,9 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
{
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
allocate_v0_if_needed();
V1Q = fpr.GetReg();
Force25BitPrecision(reg_encoder(V1Q), VC, V0);
Force25BitPrecision(reg_encoder(V1Q), VC);
VC = reg_encoder(V1Q);
}