diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 09f9097d7f..12985e8a0c 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1437,7 +1437,19 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
 	Write8(0x0f);
 	Write8(0x38);
 	Write8(0x00);
-	arg.WriteRest(this, 0);
+	arg.WriteRest(this);
+}
+
+void XEmitter::PTEST(X64Reg dest, OpArg arg) {
+	if (!cpu_info.bSSE4_1) {
+		PanicAlert("Trying to use PTEST on a system that doesn't support it. Nobody hears your screams.");
+	}
+	Write8(0x66);
+	Write8(0x0f);
+	Write8(0x38);
+	Write8(0x17);
+	arg.operandReg = dest;
+	arg.WriteRest(this);
 }
 
 void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
@@ -1497,6 +1509,8 @@ void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64,
 void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
 void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
 void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);}
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);}
 
 // Prefixes
 
@@ -1526,6 +1540,7 @@ void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg)
 void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);}
 void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);}
 void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);}
+void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); }
 
 void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); }
 
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index fee21a24db..19edf3b822 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -433,9 +433,27 @@ public:
 	void REPNE();
 
 	// x87
+	enum x87StatusWordBits {
+		x87_InvalidOperation = 0x1,
+		x87_DenormalizedOperand = 0x2,
+		x87_DivisionByZero = 0x4,
+		x87_Overflow = 0x8,
+		x87_Underflow = 0x10,
+		x87_Precision = 0x20,
+		x87_StackFault = 0x40,
+		x87_ErrorSummary = 0x80,
+		x87_C0 = 0x100,
+		x87_C1 = 0x200,
+		x87_C2 = 0x400,
+		x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
+		x87_C3 = 0x4000,
+		x87_FPUBusy = 0x8000,
+	};
+
 	void FLD(int bits, OpArg src);
 	void FST(int bits, OpArg dest);
 	void FSTP(int bits, OpArg dest);
+	void FNSTSW_AX();
 	void FWAIT();
 
 	// SSE/SSE2: Floating point arithmetic
@@ -562,6 +580,7 @@ public:
 	void PUNPCKLWD(X64Reg dest, const OpArg &arg);
 	void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
 
+	void PTEST(X64Reg dest, OpArg arg);
 	void PAND(X64Reg dest, OpArg arg);
 	void PANDN(X64Reg dest, OpArg arg);
 	void PXOR(X64Reg dest, OpArg arg);
@@ -631,6 +650,8 @@ public:
 	void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 
 	void RTDSC();
 
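The new PTEST op drives the NaN/Inf check in the conversion helpers added to Jit_Util.cpp further down: PTEST sets CF when every bit of the source mask is also set in the destination register, so testing a value against the exponent mask leaves CF set exactly for NaNs and infinities. A minimal host-side sketch of the same predicate, using the SSE4.1 intrinsic that compiles to PTEST (the function name is illustrative, not Dolphin API):

#include <smmintrin.h>  // SSE4.1; build with -msse4.1 or equivalent
#include <cstdint>

// CF of PTEST(value, mask) is 1 iff (~value & mask) == 0, i.e. iff every
// mask bit is also set in value.
static bool IsNanOrInfSingle(uint32_t bits)
{
	const __m128i exponent = _mm_set_epi32(0, 0, 0, 0x7f800000);
	const __m128i value = _mm_set_epi32(0, 0, 0, (int)bits);
	// All exponent bits set <=> NaN or infinity; this mirrors the
	// PTEST + CC_NC pairing in ConvertSingleToDouble below.
	return _mm_testc_si128(value, exponent) != 0;
}
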
diff --git a/Source/Core/Common/x64FPURoundMode.cpp b/Source/Core/Common/x64FPURoundMode.cpp
index 34438d12b2..a336859143 100644
--- a/Source/Core/Common/x64FPURoundMode.cpp
+++ b/Source/Core/Common/x64FPURoundMode.cpp
@@ -16,11 +16,11 @@ static const unsigned short
 	FPU_ROUND_MASK = 3 << 10;
 #endif
 
 // OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
-const u32 EXCEPTION_MASK = 0x1F80;
+static const u32 EXCEPTION_MASK = 0x1F80;
 // Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0)
-const u32 DAZ = 0x40;
+static const u32 DAZ = 0x40;
 // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
-const u32 FTZ = 0x8000;
+static const u32 FTZ = 0x8000;
 namespace FPURoundMode
 {
diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
index 1be04a0d5d..c4b063e1a6 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
@@ -378,7 +378,7 @@ void RegCache::Flush(FlushMode mode)
 	{
 		if (locks[i])
 		{
-			PanicAlert("Someone forgot to unlock PPC reg %i.", i);
+			PanicAlert("Someone forgot to unlock PPC reg %i (X64 reg %i).", i, RX(i));
 		}
 		if (regs[i].away)
 		{
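The Jit_LoadStoreFloating.cpp changes below route lfs/stfs and friends through the new ConvertSingleToDouble/ConvertDoubleToSingle helpers instead of raw CVTSS2SD/CVTSD2SS. For reference, the bit pattern the widening must produce can be written out in plain C++; this is a hand-rolled sketch for illustration, not code from this patch:

#include <cstdint>

// Bit-exact float -> double widening: preserves SNaN payloads (mantissa
// bit 22 maps to bit 51, the same distance as the PSLLQ-by-29 in the
// fixup below) and denormals, which CVTSS2SD under DAZ/FTZ would flush.
static uint64_t ConvertToDoubleBits(uint32_t s)
{
	uint64_t sign = (uint64_t)(s & 0x80000000u) << 32;
	uint32_t exp = (s >> 23) & 0xff;
	uint64_t frac = s & 0x007fffff;
	if (exp == 0xff)  // NaN or infinity: top exponent, payload shifted up
		return sign | 0x7ff0000000000000ull | (frac << 29);
	if (exp == 0)
	{
		if (frac == 0)
			return sign;  // +/- zero
		// denormal: normalize into the double's wider exponent range
		int shift = 0;
		while (!(frac & 0x00800000))
		{
			frac <<= 1;
			shift++;
		}
		frac &= 0x007fffff;
		return sign | ((uint64_t)(1023 - 126 - shift) << 52) | (frac << 29);
	}
	return sign | ((uint64_t)(exp - 127 + 1023) << 52) | (frac << 29);
}
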
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index bc056e6bd1..0aac678151 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -12,6 +12,8 @@
 #include "JitAsm.h"
 #include "JitRegCache.h"
 
+namespace {
+
 // pshufb todo: MOVQ
 const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
@@ -19,11 +21,10 @@ const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10,
 const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
 const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
 
-namespace {
-
 u64 GC_ALIGNED16(temp64);
-u32 GC_ALIGNED16(temp32);
+
 }
+
 // TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
 // and pshufb could help a lot.
 // Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
@@ -46,11 +47,9 @@ void Jit64::lfs(UGeckoInstruction inst)
 
 	MEMCHECK_START
 
-	MOV(32, M(&temp32), R(EAX));
 	fpr.Lock(d);
 	fpr.BindToRegister(d, false);
-	CVTSS2SD(fpr.RX(d), M(&temp32));
-	MOVDDUP(fpr.RX(d), fpr.R(d));
+	ConvertSingleToDouble(fpr.RX(d), EAX, true);
 
 	MEMCHECK_END
 
@@ -235,13 +234,15 @@ void Jit64::stfs(UGeckoInstruction inst)
 		return;
 	}
 
+	fpr.BindToRegister(s, true, false);
+	ConvertDoubleToSingle(XMM0, fpr.RX(s));
+
 	if (gpr.R(a).IsImm())
 	{
 		u32 addr = (u32)(gpr.R(a).offset + offset);
 		if (Memory::IsRAMAddress(addr))
 		{
 			if (cpu_info.bSSSE3) {
-				CVTSD2SS(XMM0, fpr.R(s));
 				PSHUFB(XMM0, M((void *)bswapShuffle1x4));
 				WriteFloatToConstRamAddress(XMM0, addr);
 				return;
@@ -250,7 +251,6 @@
 		else if (addr == 0xCC008000)
 		{
 			// Float directly to write gather pipe! Fun!
-			CVTSD2SS(XMM0, fpr.R(s));
 			CALL((void*)asm_routines.fifoDirectWriteFloat);
 			// TODO
 			js.fifoBytesThisBlock += 4;
@@ -260,7 +260,6 @@
 
 	gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
 	gpr.Lock(a);
-	fpr.Lock(s);
 	MOV(32, R(ABI_PARAM2), gpr.R(a));
 	ADD(32, R(ABI_PARAM2), Imm32(offset));
 	if (update && offset)
@@ -275,7 +274,6 @@
 		MEMCHECK_END
 	}
 
-	CVTSD2SS(XMM0, fpr.R(s));
 	SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
@@ -290,11 +288,14 @@ void Jit64::stfsx(UGeckoInstruction inst)
 
 	// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
 	gpr.FlushLockX(ABI_PARAM1);
-	fpr.Lock(inst.RS);
 	MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
 	if (inst.RA)
 		ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
-	CVTSD2SS(XMM0, fpr.R(inst.RS));
+
+	int s = inst.RS;
+	fpr.Lock(s);
+	fpr.BindToRegister(s, true, false);
+	ConvertDoubleToSingle(XMM0, fpr.RX(s));
 	MOVD_xmm(R(EAX), XMM0);
 	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
 
@@ -313,21 +314,20 @@
 	{
 		ADD(32, R(EAX), gpr.R(inst.RA));
 	}
+	fpr.Lock(inst.RS);
+	fpr.BindToRegister(inst.RS, false);
+	X64Reg s = fpr.RX(inst.RS);
 	if (cpu_info.bSSSE3 && !js.memcheck) {
-		fpr.Lock(inst.RS);
-		fpr.BindToRegister(inst.RS, false, true);
-		X64Reg r = fpr.R(inst.RS).GetSimpleReg();
 #ifdef _M_IX86
 		AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
-		MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
+		MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base));
 #else
-		MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
+		MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0));
 #endif
 		MEMCHECK_START
 
-		PSHUFB(r, M((void *)bswapShuffle1x4));
-		CVTSS2SD(r, R(r));
-		MOVDDUP(r, R(r));
+		PSHUFB(XMM0, M((void *)bswapShuffle1x4));
+		ConvertSingleToDouble(s, XMM0);
 
 		MEMCHECK_END
 	} else {
@@ -335,11 +335,7 @@ void Jit64::lfsx(UGeckoInstruction inst)
 
 		MEMCHECK_START
 
-		MOV(32, M(&temp32), R(EAX));
-		CVTSS2SD(XMM0, M(&temp32));
-		fpr.Lock(inst.RS);
-		fpr.BindToRegister(inst.RS, false, true);
-		MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
+		ConvertSingleToDouble(s, EAX, true);
 
 		MEMCHECK_END
 	}
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index 5c9d2075a8..be87a77890 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1288,10 +1288,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) {
 		}
 		case DupSingleToMReg: {
 			if (!thisUsed) break;
-			X64Reg reg = fregURegWithoutMov(RI, I);
-			Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
-			Jit->MOVDDUP(reg, R(reg));
-			RI.fregs[reg] = I;
+
+			X64Reg input = fregEnsureInReg(RI, getOp1(I));
+			X64Reg output = fregURegWithoutMov(RI, I);
+			Jit->ConvertSingleToDouble(output, input);
+
+			RI.fregs[output] = I;
 			fregNormalRegClear(RI, I);
 			break;
 		}
@@ -1412,9 +1414,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) {
 		}
 		case DoubleToSingle: {
 			if (!thisUsed) break;
-			X64Reg reg = fregURegWithoutMov(RI, I);
-			Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
-			RI.fregs[reg] = I;
+
+			X64Reg input = fregEnsureInReg(RI, getOp1(I));
+			X64Reg output = fregURegWithoutMov(RI, I);
+			Jit->ConvertDoubleToSingle(output, input);
+
+			RI.fregs[output] = I;
 			fregNormalRegClear(RI, I);
 			break;
 		}
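The IL lowering above switches to the same helpers for the reason the comment in Jit_Util.cpp below gives: the SSE scalar conversion instructions quiet signalling NaNs. A small standalone demo of that behaviour (illustrative only; the volatile keeps the compiler from folding the conversion at build time):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
	const uint32_t snan = 0x7f800001;  // single-precision signalling NaN
	volatile float f = 0.0f;
	std::memcpy(const_cast<float *>(&f), &snan, sizeof(snan));
	double d = f;  // CVTSS2SD on x86-64
	uint64_t bits;
	std::memcpy(&bits, &d, sizeof(bits));
	// Bit-exact widening gives 0x7ff0000020000000; the hardware conversion
	// additionally sets the quiet bit (bit 51): 0x7ff8000020000000.
	std::printf("%016llx\n", (unsigned long long)bits);
	return 0;
}
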
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 49a83e1831..814dbd3cf1 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -416,6 +416,101 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) {
 	}
 }
 
+static u32 GC_ALIGNED16(temp32);
+static u64 GC_ALIGNED16(temp64);
+#ifdef _WIN32
+#include <emmintrin.h>
+#ifdef _M_X64
+static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi64x(0, 0x0000000000400000);
+static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi64x(0, 0x000000007f800000);
+static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi64x(0, 0x0008000000000000);
+static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000);
+#else
+static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi32(0, 0, 0x00000000, 0x00400000);
+static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi32(0, 0, 0x00000000, 0x7f800000);
+static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi32(0, 0, 0x00080000, 0x00000000);
+static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi32(0, 0, 0x7ff00000, 0x00000000);
+#endif
+#else
+static const __uint128_t GC_ALIGNED16(single_qnan_bit) = 0x0000000000400000;
+static const __uint128_t GC_ALIGNED16(single_exponent) = 0x000000007f800000;
+static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000;
+static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000;
+#endif
+
+// Since the following two functions are used in non-arithmetic PPC float instructions,
+// they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs.
+// This means we can't use CVTSS2SD/CVTSD2SS :(
+// The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals.
+// If the number is a NaN, make sure to set the QNaN bit back to its original value.
+
+void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
+{
+	if (src_is_gpr) {
+		MOV(32, M(&temp32), R(src));
+		MOVD_xmm(XMM1, R(src));
+	} else {
+		MOVSS(M(&temp32), src);
+		MOVSS(R(XMM1), src);
+	}
+	FLD(32, M(&temp32));
+	CCFlags cond;
+	if (cpu_info.bSSE4_1) {
+		PTEST(XMM1, M((void *)&single_exponent));
+		cond = CC_NC;
+	} else {
+		FNSTSW_AX();
+		TEST(16, R(AX), Imm16(x87_InvalidOperation));
+		cond = CC_Z;
+	}
+	FSTP(64, M(&temp64));
+	MOVSD(dst, M(&temp64));
+	FixupBranch dont_reset_qnan_bit = J_CC(cond);
+
+	PANDN(XMM1, M((void *)&single_qnan_bit));
+	PSLLQ(XMM1, 29);
+	if (cpu_info.bAVX) {
+		VPANDN(dst, XMM1, R(dst));
+	} else {
+		PANDN(XMM1, R(dst));
+		MOVSD(dst, R(XMM1));
+	}
+
+	SetJumpTarget(dont_reset_qnan_bit);
+	MOVDDUP(dst, R(dst));
+}
+
+void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
+{
+	MOVSD(M(&temp64), src);
+	MOVSD(XMM1, R(src));
+	FLD(64, M(&temp64));
+	CCFlags cond;
+	if (cpu_info.bSSE4_1) {
+		PTEST(XMM1, M((void *)&double_exponent));
+		cond = CC_NC;
+	} else {
+		FNSTSW_AX();
+		TEST(16, R(AX), Imm16(x87_InvalidOperation));
+		cond = CC_Z;
+	}
+	FSTP(32, M(&temp32));
+	MOVSS(XMM0, M(&temp32));
+	FixupBranch dont_reset_qnan_bit = J_CC(cond);
+
+	PANDN(XMM1, M((void *)&double_qnan_bit));
+	PSRLQ(XMM1, 29);
+	if (cpu_info.bAVX) {
+		VPANDN(XMM0, XMM1, R(XMM0));
+	} else {
+		PANDN(XMM1, R(XMM0));
+		MOVSS(XMM0, R(XMM1));
+	}
+
+	SetJumpTarget(dont_reset_qnan_bit);
+	MOVDDUP(dst, R(XMM0));
+}
+
 void EmuCodeBlock::JitClearCA()
 {
 	AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
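The PANDN/PSLLQ fixup above restores the quiet bit to its pre-FLD state whenever the branch guarding it fires (NaN or infinity). Restated in scalar form for readability, with a hypothetical helper name:

#include <cstdint>

// What the SSE path of ConvertSingleToDouble emits, in scalar form: if the
// input was a NaN whose quiet bit was clear (an SNaN), clear the quiet bit
// that the x87 FLD/FSTP round trip set in the result.
static uint64_t RestoreQNaNBit(uint32_t src_bits, uint64_t fld_result)
{
	// PANDN against single_qnan_bit: nonzero iff bit 22 was clear in the source
	uint64_t mask = ~(uint64_t)src_bits & 0x0000000000400000ull;
	// PSLLQ by 29: single mantissa bit 22 lines up with double mantissa bit 51
	mask <<= 29;
	// second PANDN (or VPANDN): clear that bit in the converted double
	return fld_result & ~mask;
}

For an infinity the mask clears a mantissa bit that is already zero, so running the fixup there is harmless; only SNaN inputs are actually changed.
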
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 278b9d7352..bd7af7e19d 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -47,6 +47,10 @@ public:
 	void ForceSinglePrecisionS(Gen::X64Reg xmm);
 	void ForceSinglePrecisionP(Gen::X64Reg xmm);
+
+	// AX might get trashed
+	void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
+	void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
 
 protected:
 	std::unordered_map<u8 *, u32> registersInUseAtLoc;
 };
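Finally, a hedged sketch of a call site, mirroring the stfsx hunk above (an emitter-DSL fragment, not standalone code): the helpers leave the result in the destination register but use XMM1 (and XMM0 in ConvertDoubleToSingle) as scratch, and the non-SSE4.1 fallback clobbers AX via FNSTSW, hence the header comment.

// Fragment in the emitter DSL; same calls as the stfsx change above.
fpr.Lock(s);
fpr.BindToRegister(s, true, false);      // PPC double must live in an xmm reg
ConvertDoubleToSingle(XMM0, fpr.RX(s));  // XMM1 and (on the x87 path) AX are scratch
MOVD_xmm(R(EAX), XMM0);                  // single's bits to a GPR for the store
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());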