From 89a0b0de32101ad88ae4b56fdd60f72b98a6c7ae Mon Sep 17 00:00:00 2001 From: MerryMage Date: Sun, 23 Sep 2018 20:11:09 +0100 Subject: [PATCH] Jit64AsmCommon: Reduce branches in fast-path and inline most behavior The only behavior requires a function call is denormal behaviour. We also fix fpscr exception raising. --- .../PowerPC/Jit64Common/Jit64AsmCommon.cpp | 60 +++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index 65f72805c8..17442add83 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -32,17 +32,14 @@ void CommonAsmRoutines::GenFrsqrte() // This function clobbers all three RSCRATCH. MOVQ_xmm(R(RSCRATCH), XMM0); - // Negative and zero inputs set an exception and take the complex path. - TEST(64, R(RSCRATCH), R(RSCRATCH)); - FixupBranch zero = J_CC(CC_Z, true); - FixupBranch negative = J_CC(CC_S, true); + // Extract exponent MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH)); SHR(64, R(RSCRATCH_EXTRA), Imm8(52)); - // Zero and max exponents (non-normal floats) take the complex path. - FixupBranch complex1 = J_CC(CC_Z, true); - CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); - FixupBranch complex2 = J_CC(CC_E, true); + // Negatives, zeros, denormals, infinities and NaNs take the complex path. + LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1)); + CMP(32, R(RSCRATCH2), Imm32(0x7FE)); + FixupBranch complex = J_CC(CC_AE, true); SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD)); SAR(32, R(RSCRATCH_EXTRA), Imm8(1)); @@ -75,24 +72,53 @@ void CommonAsmRoutines::GenFrsqrte() MOVQ_xmm(XMM0, R(RSCRATCH2)); RET(); - // Exception flags for zero input. - SetJumpTarget(zero); + SetJumpTarget(complex); + AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); + CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); + FixupBranch nan_or_inf = J_CC(CC_E); + + MOV(64, R(RSCRATCH2), R(RSCRATCH)); + SHL(64, R(RSCRATCH2), Imm8(1)); + FixupBranch nonzero = J_CC(CC_NZ); + + // +0.0 or -0.0 TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX)); FixupBranch skip_set_fx1 = J_CC(CC_NZ); OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX)); - FixupBranch complex3 = J(); + SetJumpTarget(skip_set_fx1); + MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000)); + OR(64, R(RSCRATCH2), R(RSCRATCH)); + MOVQ_xmm(XMM0, R(RSCRATCH2)); + RET(); - // Exception flags for negative input. + // SNaN or QNaN or +Inf or -Inf + SetJumpTarget(nan_or_inf); + MOV(64, R(RSCRATCH2), R(RSCRATCH)); + SHL(64, R(RSCRATCH2), Imm8(12)); + FixupBranch inf = J_CC(CC_Z); + BTS(64, R(RSCRATCH), Imm8(51)); + MOVQ_xmm(XMM0, R(RSCRATCH)); + RET(); + SetJumpTarget(inf); + BT(64, R(RSCRATCH), Imm8(63)); + FixupBranch negative = J_CC(CC_C); + XORPD(XMM0, R(XMM0)); + RET(); + + SetJumpTarget(nonzero); + FixupBranch denormal = J_CC(CC_NC); + + // Negative sign SetJumpTarget(negative); TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT)); FixupBranch skip_set_fx2 = J_CC(CC_NZ); OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT)); - - SetJumpTarget(skip_set_fx1); SetJumpTarget(skip_set_fx2); - SetJumpTarget(complex1); - SetJumpTarget(complex2); - SetJumpTarget(complex3); + MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000)); + MOVQ_xmm(XMM0, R(RSCRATCH2)); + RET(); + + SetJumpTarget(denormal); ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); ABI_CallFunction(Common::ApproximateReciprocalSquareRoot); ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);