mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-07-25 15:19:42 -06:00
Merge pull request #7428 from MerryMage/rm-j-GenFrsqrte
GenFrsqrte: Reduce branches in fast-path and inline most behavior
This commit is contained in:
@ -32,17 +32,14 @@ void CommonAsmRoutines::GenFrsqrte()
|
|||||||
// This function clobbers all three RSCRATCH.
|
// This function clobbers all three RSCRATCH.
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
|
||||||
// Negative and zero inputs set an exception and take the complex path.
|
// Extract exponent
|
||||||
TEST(64, R(RSCRATCH), R(RSCRATCH));
|
|
||||||
FixupBranch zero = J_CC(CC_Z, true);
|
|
||||||
FixupBranch negative = J_CC(CC_S, true);
|
|
||||||
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
||||||
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
||||||
|
|
||||||
// Zero and max exponents (non-normal floats) take the complex path.
|
// Negatives, zeros, denormals, infinities and NaNs take the complex path.
|
||||||
FixupBranch complex1 = J_CC(CC_Z, true);
|
LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1));
|
||||||
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
CMP(32, R(RSCRATCH2), Imm32(0x7FE));
|
||||||
FixupBranch complex2 = J_CC(CC_E, true);
|
FixupBranch complex = J_CC(CC_AE, true);
|
||||||
|
|
||||||
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
|
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
|
||||||
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
|
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
|
||||||
@ -75,24 +72,53 @@ void CommonAsmRoutines::GenFrsqrte()
|
|||||||
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
// Exception flags for zero input.
|
SetJumpTarget(complex);
|
||||||
SetJumpTarget(zero);
|
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
||||||
|
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
||||||
|
FixupBranch nan_or_inf = J_CC(CC_E);
|
||||||
|
|
||||||
|
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
||||||
|
SHL(64, R(RSCRATCH2), Imm8(1));
|
||||||
|
FixupBranch nonzero = J_CC(CC_NZ);
|
||||||
|
|
||||||
|
// +0.0 or -0.0
|
||||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
||||||
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
||||||
FixupBranch complex3 = J();
|
SetJumpTarget(skip_set_fx1);
|
||||||
|
MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000));
|
||||||
|
OR(64, R(RSCRATCH2), R(RSCRATCH));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||||
|
RET();
|
||||||
|
|
||||||
// Exception flags for negative input.
|
// SNaN or QNaN or +Inf or -Inf
|
||||||
|
SetJumpTarget(nan_or_inf);
|
||||||
|
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
||||||
|
SHL(64, R(RSCRATCH2), Imm8(12));
|
||||||
|
FixupBranch inf = J_CC(CC_Z);
|
||||||
|
BTS(64, R(RSCRATCH), Imm8(51));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH));
|
||||||
|
RET();
|
||||||
|
SetJumpTarget(inf);
|
||||||
|
BT(64, R(RSCRATCH), Imm8(63));
|
||||||
|
FixupBranch negative = J_CC(CC_C);
|
||||||
|
XORPD(XMM0, R(XMM0));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
SetJumpTarget(nonzero);
|
||||||
|
FixupBranch denormal = J_CC(CC_NC);
|
||||||
|
|
||||||
|
// Negative sign
|
||||||
SetJumpTarget(negative);
|
SetJumpTarget(negative);
|
||||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
|
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
|
||||||
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
|
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
|
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
|
||||||
|
|
||||||
SetJumpTarget(skip_set_fx1);
|
|
||||||
SetJumpTarget(skip_set_fx2);
|
SetJumpTarget(skip_set_fx2);
|
||||||
SetJumpTarget(complex1);
|
MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000));
|
||||||
SetJumpTarget(complex2);
|
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||||
SetJumpTarget(complex3);
|
RET();
|
||||||
|
|
||||||
|
SetJumpTarget(denormal);
|
||||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
|
ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
|
||||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
|
@ -13,3 +13,7 @@ add_dolphin_test(DSPAssemblyTest
|
|||||||
add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp)
|
add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp)
|
||||||
|
|
||||||
add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp)
|
add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp)
|
||||||
|
|
||||||
|
if(_M_X86)
|
||||||
|
add_dolphin_test(PowerPCTest PowerPC/Jit64Common/Frsqrte.cpp)
|
||||||
|
endif()
|
||||||
|
101
Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
Normal file
101
Source/UnitTests/Core/PowerPC/Jit64Common/Frsqrte.cpp
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
// Copyright 2018 Dolphin Emulator Project
|
||||||
|
// Licensed under GPLv2+
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "Common/BitUtils.h"
|
||||||
|
#include "Common/CommonTypes.h"
|
||||||
|
#include "Common/FloatUtils.h"
|
||||||
|
#include "Common/x64ABI.h"
|
||||||
|
#include "Core/PowerPC/Gekko.h"
|
||||||
|
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
|
||||||
|
#include "Core/PowerPC/Jit64Common/Jit64Base.h"
|
||||||
|
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
class TestCommonAsmRoutines : public CommonAsmRoutines
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
TestCommonAsmRoutines()
|
||||||
|
{
|
||||||
|
using namespace Gen;
|
||||||
|
|
||||||
|
AllocCodeSpace(4096);
|
||||||
|
m_const_pool.Init(AllocChildCodeSpace(1024), 1024);
|
||||||
|
|
||||||
|
const auto raw_frsqrte = reinterpret_cast<double (*)(double)>(AlignCode4());
|
||||||
|
GenFrsqrte();
|
||||||
|
|
||||||
|
wrapped_frsqrte = reinterpret_cast<u64 (*)(u64, UReg_FPSCR&)>(AlignCode4());
|
||||||
|
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
|
||||||
|
|
||||||
|
// We know the frsqrte implementation only accesses the fpscr. We manufacture a
|
||||||
|
// PPCSTATE pointer so we read/write to our provided fpscr argument instead.
|
||||||
|
XOR(32, R(RPPCSTATE), R(RPPCSTATE));
|
||||||
|
LEA(64, RSCRATCH, PPCSTATE(fpscr));
|
||||||
|
SUB(64, R(ABI_PARAM2), R(RSCRATCH));
|
||||||
|
MOV(64, R(RPPCSTATE), R(ABI_PARAM2));
|
||||||
|
|
||||||
|
// Call
|
||||||
|
MOVQ_xmm(XMM0, R(ABI_PARAM1));
|
||||||
|
ABI_CallFunction(raw_frsqrte);
|
||||||
|
MOVQ_xmm(R(ABI_RETURN), XMM0);
|
||||||
|
|
||||||
|
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
|
||||||
|
RET();
|
||||||
|
}
|
||||||
|
|
||||||
|
u64 (*wrapped_frsqrte)(u64, UReg_FPSCR&);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Verifies that the JIT-generated frsqrte routine returns the same bit pattern as the reference
// implementation Common::ApproximateReciprocalSquareRoot for a list of boundary inputs
// (zeros, denormals, normals, infinities, SNaNs and QNaNs of both signs).
//
// Only the returned value is compared. The same FPSCR object is passed to every call, so any
// flags one input sets stay set for the inputs that follow.
TEST(Jit64, Frsqrte)
{
  TestCommonAsmRoutines routines;

  const std::vector<u64> special_values{
      0x0000'0000'0000'0000,  // positive zero
      0x0000'0000'0000'0001,  // smallest positive denormal
      0x0000'0000'0100'0000,
      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
      0x0010'0000'0000'0000,  // smallest positive normal
      0x0010'0000'0000'0002,
      0x3FF0'0000'0000'0000,  // 1.0
      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
      0x7FF0'0000'0000'0000,  // positive infinity
      0x7FF0'0000'0000'0001,  // first positive SNaN
      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
      0x7FF8'0000'0000'0000,  // first positive QNaN
      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
      0x8000'0000'0000'0000,  // negative zero
      0x8000'0000'0000'0001,  // smallest negative denormal
      0x8000'0000'0100'0000,
      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
      0x8010'0000'0000'0000,  // smallest negative normal
      0x8010'0000'0000'0002,
      0xBFF0'0000'0000'0000,  // -1.0
      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
      0xFFF0'0000'0000'0000,  // negative infinity
      0xFFF0'0000'0000'0001,  // first negative SNaN
      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
      0xFFF8'0000'0000'0000,  // first negative QNaN
      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
  };

  UReg_FPSCR fpscr;

  for (const u64 ivalue : special_values)
  {
    const double dvalue = Common::BitCast<double>(ivalue);

    // Reference result from the C++ implementation, compared as raw bits so that NaN payloads
    // and signed zeros/infinities are checked exactly.
    const u64 expected = Common::BitCast<u64>(Common::ApproximateReciprocalSquareRoot(dvalue));

    const u64 actual = routines.wrapped_frsqrte(ivalue, fpscr);

    // u64 is not necessarily `unsigned long long` (it is `unsigned long` on LP64 targets), so
    // convert explicitly to the type that %llx expects.
    printf("%016llx -> %016llx == %016llx\n", static_cast<unsigned long long>(ivalue),
           static_cast<unsigned long long>(actual), static_cast<unsigned long long>(expected));

    EXPECT_EQ(expected, actual);
  }
}
|
Reference in New Issue
Block a user