diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h
index 752d26afb2..c63076ff7b 100644
--- a/Source/Core/Common/CPUDetect.h
+++ b/Source/Core/Common/CPUDetect.h
@@ -50,10 +50,10 @@ struct CPUInfo
 	bool bMOVBE;
 	// This flag indicates that the hardware supports some mode
 	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
-	// TODO: ARM
 	bool bFlushToZero;
 	bool bLAHFSAHF64;
 	bool bLongMode;
+	bool bAtom;
 
 	// ARM specific CPUInfo
 	bool bSwp;
diff --git a/Source/Core/Common/x64CPUDetect.cpp b/Source/Core/Common/x64CPUDetect.cpp
index 31409685e8..8ad8046c8b 100644
--- a/Source/Core/Common/x64CPUDetect.cpp
+++ b/Source/Core/Common/x64CPUDetect.cpp
@@ -129,6 +129,12 @@ void CPUInfo::Detect()
 	if (max_std_fn >= 1)
 	{
 		__cpuid(cpu_id, 0x00000001);
+		int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff);
+		int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0);
+		// Detect people unfortunate enough to be running Dolphin on an Atom
+		if (family == 6 && (model == 0x1C || model == 0x26 ||model == 0x27 || model == 0x35 || model == 0x36 ||
+		                    model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D))
+			bAtom = true;
 		logical_cpu_count = (cpu_id[1] >> 16) & 0xFF;
 		ht = (cpu_id[3] >> 28) & 1;
 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 5793d744c8..a2ec9f2a66 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -151,7 +151,7 @@ public:
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
 		          bool Rc = false, bool carry = false);
 	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
-	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
+	               void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false);
 	void FloatCompare(UGeckoInstruction inst, bool upper = false);
 
 	// OPCODES
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 84e1ce4969..f404ccd88b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -11,11 +11,12 @@
 using namespace Gen;
 
 static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
+static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
 static const u64 GC_ALIGNED16(psAbsMask[2])  = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
 
 void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
-                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
+                      void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS)
 {
 	fpr.Lock(d, a, b);
 	fpr.BindToRegister(d, d == a || d == b || !single);
@@ -34,12 +35,19 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
 	}
 	else
 	{
-		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
+		avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible);
 	}
 	if (single)
 	{
-		ForceSinglePrecisionS(fpr.RX(d));
-		MOVDDUP(fpr.RX(d), fpr.R(d));
+		if (packed)
+		{
+			ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
+		}
+		else
+		{
+			ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
+			MOVDDUP(fpr.RX(d), fpr.R(d));
+		}
 	}
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
@@ -63,14 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst)
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);
 
+	int a = inst.FA;
+	int b = inst.FB;
+	int c = inst.FC;
+	int d = inst.FD;
+	int arg2 = inst.SUBOP5 == 25 ? c : b;
+
 	bool single = inst.OPCD == 59;
 	bool round_input = single && !jit->js.op->fprIsSingle[inst.FC];
+	// If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by
+	// using packed arithmetic instead.
+	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2];
+	// Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case.
+	// Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones.
+	if (inst.SUBOP5 == 18 || cpu_info.bAtom)
+		packed = false;
+
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
-	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
-	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul
+	case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD,
+	                   packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break;
+	case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD,
+	                   packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break;
+	case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD,
+	                   packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break;
+	case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD,
+	                   packed ? &XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@@ -88,6 +114,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	int d = inst.FD;
 	bool single = inst.OPCD == 59;
 	bool round_input = single && !jit->js.op->fprIsSingle[c];
+	bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c];
+	if (cpu_info.bAtom)
+		packed = false;
 
 	fpr.Lock(a, b, c, d);
 
@@ -109,20 +138,32 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 		switch (inst.SUBOP5)
 		{
 		case 28: //msub
-			VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		case 29: //madd
-			VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 			// PowerPC and x86 define NMADD/NMSUB differently
 			// x86: D = -A*C (+/-) B
 			// PPC: D = -(A*C (+/-) B)
 			// so we have to swap them; the ADD/SUB here isn't a typo.
 		case 30: //nmsub
-			VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		case 31: //nmadd
-			VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
+			if (packed)
+				VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
+			else
+				VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
 			break;
 		}
 	}
@@ -133,9 +174,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 			Force25BitPrecision(XMM1, fpr.R(c), XMM0);
 		else
 			MOVAPD(XMM1, fpr.R(c));
-		MULSD(XMM1, fpr.R(a));
 		MOVAPD(XMM0, fpr.R(b));
-		SUBSD(XMM0, R(XMM1));
+		if (packed)
+		{
+			MULPD(XMM1, fpr.R(a));
+			SUBPD(XMM0, R(XMM1));
+		}
+		else
+		{
+			MULSD(XMM1, fpr.R(a));
+			SUBSD(XMM0, R(XMM1));
+		}
 	}
 	else
 	{
@@ -143,22 +192,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 			Force25BitPrecision(XMM0, fpr.R(c), XMM1);
 		else
 			MOVAPD(XMM0, fpr.R(c));
-		MULSD(XMM0, fpr.R(a));
-		if (inst.SUBOP5 == 28) //msub
-			SUBSD(XMM0, fpr.R(b));
-		else                   //(n)madd
-			ADDSD(XMM0, fpr.R(b));
+		if (packed)
+		{
+			MULPD(XMM0, fpr.R(a));
+			if (inst.SUBOP5 == 28) //msub
+				SUBPD(XMM0, fpr.R(b));
+			else                   //(n)madd
+				ADDPD(XMM0, fpr.R(b));
+		}
+		else
+		{
+			MULSD(XMM0, fpr.R(a));
+			if (inst.SUBOP5 == 28)
+				SUBSD(XMM0, fpr.R(b));
+			else
+				ADDSD(XMM0, fpr.R(b));
+		}
 		if (inst.SUBOP5 == 31) //nmadd
-			PXOR(XMM0, M((void*)&psSignBits));
+			PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits)));
 	}
 
 	fpr.BindToRegister(d, !single);
-	//YES it is necessary to dupe the result :(
-	//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
+
 	if (single)
 	{
-		ForceSinglePrecisionS(XMM0);
-		MOVDDUP(fpr.RX(d), R(XMM0));
+		if (packed)
+		{
+			ForceSinglePrecisionP(fpr.RX(d), XMM0);
+		}
+		else
+		{
+			ForceSinglePrecisionS(fpr.RX(d), XMM0);
+			MOVDDUP(fpr.RX(d), fpr.R(d));
+		}
 	}
 	else
 	{
@@ -427,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst)
 	fpr.BindToRegister(d, d == b);
 	if (b != d)
 		MOVAPD(fpr.RX(d), fpr.R(b));
-	ForceSinglePrecisionS(fpr.RX(d));
+	ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d));
 	MOVDDUP(fpr.RX(d), fpr.R(d));
 	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index a859a53ff9..2a246b3a0b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -108,8 +108,15 @@ void Jit64::stfXXX(UGeckoInstruction inst)
 
 	if (single)
 	{
-		fpr.BindToRegister(s, true, false);
-		ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		if (jit->js.op->fprIsStoreSafe[s])
+		{
+			CVTSD2SS(XMM0, fpr.R(s));
+		}
+		else
+		{
+			fpr.BindToRegister(s, true, false);
+			ConvertDoubleToSingle(XMM0, fpr.RX(s));
+		}
 		MOVD_xmm(R(RSCRATCH), XMM0);
 	}
 	else
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 30573246c0..a9808f7d07 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address
 		MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg));
 }
 
-void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
+void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input)
 {
 	// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
 	if (jit->jo.accurateSinglePrecision)
 	{
-		CVTSD2SS(xmm, R(xmm));
-		CVTSS2SD(xmm, R(xmm));
+		CVTSD2SS(input, R(input)); // narrows in place: `input` is overwritten even when output != input
+		CVTSS2SD(output, R(input)); // widen the rounded value back to double, into `output`
+	}
+	else if (output != input)
+	{
+		MOVAPD(output, R(input)); // no rounding requested: plain copy so `output` still gets the value
 	}
 }
 
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 3487fb374f..67a01249f2 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -130,7 +130,7 @@ public:
 	void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
 	            Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
 
-	void ForceSinglePrecisionS(Gen::X64Reg xmm);
+	void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input);
 	void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
 	void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
 
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 0d72e8a5a4..b5a5c22716 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -830,18 +830,45 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 			fprInUse[code[i].fregOut] = true;
 	}
 
-	// Forward scan, for flags that need the other direction for calculation
-	BitSet32 fprIsSingle;
+	// Forward scan, for flags that need the other direction for calculation.
+	BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
 	for (u32 i = 0; i < block->m_num_instructions; i++)
 	{
 		code[i].fprIsSingle = fprIsSingle;
+		code[i].fprIsDuplicated = fprIsDuplicated;
+		code[i].fprIsStoreSafe = fprIsStoreSafe;
 		if (code[i].fregOut >= 0)
 		{
-			// This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds
-			if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3))
+			fprIsSingle[code[i].fregOut] = false;
+			fprIsDuplicated[code[i].fregOut] = false;
+			fprIsStoreSafe[code[i].fregOut] = false;
+			// Single, duplicated, and doesn't need PPC_FP.
+			if (code[i].opinfo->type == OPTYPE_SINGLEFP)
+			{
 				fprIsSingle[code[i].fregOut] = true;
-			else
-				fprIsSingle[code[i].fregOut] = false;
+				fprIsDuplicated[code[i].fregOut] = true;
+				fprIsStoreSafe[code[i].fregOut] = true;
+			}
+			// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
+			// TODO: if we go directly from a load to store, skip conversion entirely?
+			// TODO: if we go directly from a load to a float instruction, and the value isn't used
+			// for anything else, we can skip PPC_FP on a load too.
+			if (!strncmp(code[i].opinfo->opname, "lfs", 3))
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsDuplicated[code[i].fregOut] = true;
+			}
+			// Paired are still floats, but the top/bottom halves may differ.
+			if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS)
+			{
+				fprIsSingle[code[i].fregOut] = true;
+				fprIsStoreSafe[code[i].fregOut] = true;
+			}
 		}
+		// Careful: changing the float mode breaks the store-safe optimization (an earlier op may
+		// have run with FTZ off while the later store has FTZ on), so discard all of it. Checked
+		// outside the fregOut test because mtfs* instructions do not write an FPR.
+		if (code[i].opinfo && !strncmp(code[i].opinfo->opname, "mtfs", 4))
+			fprIsStoreSafe = BitSet32(0);
 	}
 	return address;
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index e68be7a5ee..59c637e5b2 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -53,6 +53,11 @@ struct CodeOp //16B
 	BitSet32 fprInXmm;
 	// whether an fpr is known to be an actual single-precision value at this point in the block.
 	BitSet32 fprIsSingle;
+	// whether an fpr is known to have identical top and bottom halves (e.g. it was written by a single-precision instruction)
+	BitSet32 fprIsDuplicated;
+	// whether an fpr is the output of a single-precision or paired-single instruction, i.e. whether a
+	// store of it can safely skip PPC_FP.
+	BitSet32 fprIsStoreSafe;
 };
 
 struct BlockStats