diff --git a/Source/Core/Common/Src/CPUDetect.h b/Source/Core/Common/Src/CPUDetect.h index e93a902d63..967be0949b 100644 --- a/Source/Core/Common/Src/CPUDetect.h +++ b/Source/Core/Common/Src/CPUDetect.h @@ -43,6 +43,12 @@ struct CPUInfo bool bAVX; bool bFMA; bool bAES; + // FXSAVE/FXRSTOR + bool bFXSR; + // This flag indicates that the hardware supports some mode + // in which denormal inputs _and_ outputs are automatically set to (signed) zero. + // TODO: ARM + bool bFlushToZero; bool bLAHFSAHF64; bool bLongMode; diff --git a/Source/Core/Common/Src/FPURoundMode.h b/Source/Core/Common/Src/FPURoundMode.h index fad4d5d6aa..c552ad7ff0 100644 --- a/Source/Core/Common/Src/FPURoundMode.h +++ b/Source/Core/Common/Src/FPURoundMode.h @@ -36,7 +36,7 @@ namespace FPURoundMode void SetPrecisionMode(u32 mode); - void SetSIMDMode(u32 mode); + void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode); /* * There are two different flavors of float to int conversion: diff --git a/Source/Core/Common/Src/GenericFPURoundMode.cpp b/Source/Core/Common/Src/GenericFPURoundMode.cpp index cc878291a1..c8e70a4990 100644 --- a/Source/Core/Common/Src/GenericFPURoundMode.cpp +++ b/Source/Core/Common/Src/GenericFPURoundMode.cpp @@ -26,7 +26,7 @@ namespace FPURoundMode void SetPrecisionMode(u32 mode) { } - void SetSIMDMode(u32 mode) + void SetSIMDMode(u32 mode, u32 nonIEEEMode) { } void SaveSIMDState() diff --git a/Source/Core/Common/Src/MathUtil.h b/Source/Core/Common/Src/MathUtil.h index 31772c3c60..f085c6ed2b 100644 --- a/Source/Core/Common/Src/MathUtil.h +++ b/Source/Core/Common/Src/MathUtil.h @@ -64,10 +64,10 @@ inline float FlushToZero(float f) return x.f; } -inline double FlushToZeroAsFloat(double d) +inline double FlushToZero(double d) { IntDouble x; x.d = d; - if ((x.i & DOUBLE_EXP) < 0x3800000000000000ULL) + if ((x.i & DOUBLE_EXP) == 0) x.i &= DOUBLE_SIGN; // turn into signed zero return x.d; } diff --git a/Source/Core/Common/Src/x64CPUDetect.cpp b/Source/Core/Common/Src/x64CPUDetect.cpp index 2b434ad2b6..2fa25e8074 100644 --- a/Source/Core/Common/Src/x64CPUDetect.cpp +++ b/Source/Core/Common/Src/x64CPUDetect.cpp @@ -162,6 +162,34 @@ void CPUInfo::Detect() if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true; if ((cpu_id[2] >> 25) & 1) bAES = true; + // To check DAZ support, we first need to check FXSAVE support. + if ((cpu_id[3] >> 24) & 1) + { + // We can use FXSAVE. + bFXSR = true; + + GC_ALIGNED16(u8 fx_state[512]); + memset(fx_state, 0, sizeof(fx_state)); +#ifdef _WIN32 +#ifdef _M_IX86 + _fxsave(fx_state); +#elif defined (_M_X64) + _fxsave64(fx_state); +#endif +#else + __asm__("fxsave %0" : "=m" (fx_state)); +#endif + + // lowest byte of MXCSR_MASK + if ((fx_state[0x1C] >> 6) & 1) + { + // On x86, the FTZ field (supported since SSE1) only flushes denormal _outputs_ to zero, + // now that we checked DAZ support (flushing denormal _inputs_ to zero), + // we can set our generic flag. + bFlushToZero = true; + } + } + // AVX support requires 3 separate checks: // - Is the AVX bit set in CPUID? // - Is the XSAVE bit set in CPUID? @@ -222,7 +250,12 @@ std::string CPUInfo::Summarize() { std::string sum(cpu_string); if (bSSE) sum += ", SSE"; - if (bSSE2) sum += ", SSE2"; + if (bSSE2) + { + sum += ", SSE2"; + if (!bFlushToZero) + sum += " (but not DAZ!)"; + } if (bSSE3) sum += ", SSE3"; if (bSSSE3) sum += ", SSSE3"; if (bSSE4_1) sum += ", SSE4.1"; diff --git a/Source/Core/Common/Src/x64FPURoundMode.cpp b/Source/Core/Common/Src/x64FPURoundMode.cpp index 2c950ade96..f46c6000eb 100644 --- a/Source/Core/Common/Src/x64FPURoundMode.cpp +++ b/Source/Core/Common/Src/x64FPURoundMode.cpp @@ -4,6 +4,7 @@ #include "Common.h" #include "FPURoundMode.h" +#include "CPUDetect.h" #ifndef _WIN32 static const unsigned short FPU_ROUND_NEAR = 0 << 10; @@ -14,8 +15,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10; #include #endif -const u32 MASKS = 0x1F80; // mask away the interrupts. +// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register) +const u32 EXCEPTION_MASK = 0x1F80; +// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0) const u32 DAZ = 0x40; +// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0) const u32 FTZ = 0x8000; namespace FPURoundMode @@ -79,16 +83,28 @@ namespace FPURoundMode //but still - set any useful sse options here #endif } - void SetSIMDMode(u32 mode) + + void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode) { - static const u32 ssetable[4] = + // lookup table for FPSCR.RN-to-MXCSR.RC translation + static const u32 roundingModeLUT[4] = { - (0 << 13) | MASKS, - (3 << 13) | MASKS, - (2 << 13) | MASKS, - (1 << 13) | MASKS, + (0 << 13) | EXCEPTION_MASK, // nearest + (3 << 13) | EXCEPTION_MASK, // -inf + (2 << 13) | EXCEPTION_MASK, // +inf + (1 << 13) | EXCEPTION_MASK, // zero }; - u32 csr = ssetable[mode]; + u32 csr = roundingModeLUT[roundingMode]; + + static const u32 denormalLUT[2] = + { + FTZ, // flush-to-zero only + FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported) + }; + if (nonIEEEMode) + { + csr |= denormalLUT[cpu_info.bFlushToZero]; + } _mm_setcsr(csr); } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h index d379bf7049..9190a18ed7 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -5,6 +5,7 @@ #ifndef _INTERPRETER_FPUTILS_H #define _INTERPRETER_FPUTILS_H +#include "CPUDetect.h" #include "Interpreter.h" #include "MathUtil.h" @@ -69,28 +70,22 @@ inline void UpdateFPSCR() inline double ForceSingle(double _x) { - //if (FPSCR.RN != 0) - // PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC); - if (FPSCR.NI) - _x = FlushToZeroAsFloat(_x); - - double x = static_cast(_x); - + // convert to float... + float x = _x; + if (!cpu_info.bFlushToZero && FPSCR.NI) + { + x = FlushToZero(x); + } + // ...and back to double: return x; } inline double ForceDouble(double d) { - //if (FPSCR.RN != 0) - // PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC); - - //if (FPSCR.NI) - //{ - // IntDouble x; x.d = d; - //if ((x.i & DOUBLE_EXP) == 0) - // x.i &= DOUBLE_SIGN; // turn into signed zero - // return x.d; - //} + if (!cpu_info.bFlushToZero && FPSCR.NI) + { + d = FlushToZero(d); + } return d; } diff --git a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp index 688d166608..475f7591ce 100644 --- a/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp +++ b/Source/Core/Core/Src/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp @@ -48,15 +48,8 @@ static void FPSCRtoFPUSettings(UReg_FPSCR fp) // Pokemon Colosseum does this. Gah. } - // Also corresponding SSE rounding mode setting - if (FPSCR.NI) - { - // Either one of these two breaks Beyond Good & Evil. - // if (cpu_info.bSSSE3) - // csr |= DAZ; - // csr |= FTZ; - } - FPURoundMode::SetSIMDMode(FPSCR.RN); + // Set SSE rounding mode and denormal handling + FPURoundMode::SetSIMDMode(FPSCR.RN, FPSCR.NI); } void Interpreter::mtfsb0x(UGeckoInstruction _inst) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index 71a556cb5f..139414a103 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -182,7 +182,7 @@ public: void ps_sum(UGeckoInstruction inst); void ps_muls(UGeckoInstruction inst); - void fp_arith_s(UGeckoInstruction inst); + void fp_arith(UGeckoInstruction inst); void frsqrtex(UGeckoInstruction inst); void fcmpx(UGeckoInstruction inst); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp index 0ff2bf5d7f..dc81015573 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit64_Tables.cpp @@ -320,12 +320,12 @@ static GekkoOPTemplate table31_2[] = static GekkoOPTemplate table59[] = { - {18, &Jit64::Default}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}}, - {20, &Jit64::fp_arith_s}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, - {21, &Jit64::fp_arith_s}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {18, &Jit64::fp_arith}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}}, + {20, &Jit64::fp_arith}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {21, &Jit64::fp_arith}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}}, // {22, &Jit64::Default}, //"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko {24, &Jit64::Default}, //"fresx", OPTYPE_FPU, FL_RC_BIT_F}}, - {25, &Jit64::fp_arith_s}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}}, + {25, &Jit64::fp_arith}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}}, {28, &Jit64::fmaddXX}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, {29, &Jit64::fmaddXX}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}}, {30, &Jit64::fmaddXX}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}}, @@ -354,12 +354,12 @@ static GekkoOPTemplate table63[] = static GekkoOPTemplate table63_2[] = { - {18, &Jit64::Default}, //"fdivx", OPTYPE_FPU, FL_RC_BIT_F, 30}}, - {20, &Jit64::Default}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}}, - {21, &Jit64::Default}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}}, + {18, &Jit64::fp_arith}, //"fdivx", OPTYPE_FPU, FL_RC_BIT_F, 30}}, + {20, &Jit64::fp_arith}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}}, + {21, &Jit64::fp_arith}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}}, {22, &Jit64::Default}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}}, {23, &Jit64::Default}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}}, - {25, &Jit64::fp_arith_s}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}}, + {25, &Jit64::fp_arith}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}}, {26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}}, {28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}}, {29, &Jit64::fmaddXX}, //"fmaddx", OPTYPE_FPU, FL_RC_BIT_F}}, diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index c4699ecc62..b0d0ab4853 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -85,7 +85,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, fpr.UnlockAll(); } -void Jit64::fp_arith_s(UGeckoInstruction inst) +void Jit64::fp_arith(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITFloatingPointOff) @@ -106,7 +106,7 @@ void Jit64::fp_arith_s(UGeckoInstruction inst) case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, &XEmitter::VADDSD); break; //add case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, &XEmitter::VMULSD); break; //mul default: - _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); + _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } }