diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
index 6869edd8fa..f9565d7f6f 100644
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@@ -16,7 +16,6 @@
 // http://code.google.com/p/dolphin-emu/
 
 #include <cmath>
-#include <nmmintrin.h>
 #include "Common.h"
 //#include "VideoCommon.h" // to get debug logs
 
@@ -397,11 +396,7 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
     }
 }
 
-template<bool SSSE3>
-inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr);
-
-template<>
-inline void decodebytesC8_To_Raw16<false>(u16* dst, const u8* src, int tlutaddr)
+inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
 	for (int x = 0; x < 8; x++)
@@ -411,29 +406,6 @@ inline void decodebytesC8_To_Raw16<false>(u16* dst, const u8* src, int tlutaddr)
 	}
 }
 
-static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
-template<>
-inline void decodebytesC8_To_Raw16<true>(u16* dst, const u8* src, int tlutaddr)
-{
-	u16* tlut = (u16*)(texMem + tlutaddr);
-
-	// Make 8 16-bits unsigned integer values
-	__m128i a;
-	a = _mm_insert_epi16(a, tlut[src[0]], 0);
-	a = _mm_insert_epi16(a, tlut[src[1]], 1);
-	a = _mm_insert_epi16(a, tlut[src[2]], 2);
-	a = _mm_insert_epi16(a, tlut[src[3]], 3);
-	a = _mm_insert_epi16(a, tlut[src[4]], 4);
-	a = _mm_insert_epi16(a, tlut[src[5]], 5);
-	a = _mm_insert_epi16(a, tlut[src[6]], 6);
-	a = _mm_insert_epi16(a, tlut[src[7]], 7);
-
-	// Apply Common::swap16() to 16-bits unsigned integers at once
-	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16);
-
-	// Store values to dst without polluting the caches
-	_mm_stream_si128((__m128i*)dst, b);
-}
 
 inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr)
 {
@@ -940,7 +912,6 @@ PC_TexFormat TexDecoder_DirectDecode_real(u8 *dst, const u8 *src, int width, int
 //TODO: to save memory, don't blindly convert everything to argb8888
 //also ARGB order needs to be swapped later, to accommodate modern hardware better
 //need to add DXT support too
-static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L);
 PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
     switch (texformat)
@@ -994,18 +965,10 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
         }
 		else
 		{
-			if (cpu_info.bSSSE3) {
-				for (int y = 0; y < height; y += 4)
-					for (int x = 0; x < width; x += 8)
-						for (int iy = 0; iy < 4; iy++, src += 8)
-							decodebytesC8_To_Raw16<true>((u16*)dst + (y + iy) * width + x, src, tlutaddr);
-
-			} else {
-				for (int y = 0; y < height; y += 4)
-					for (int x = 0; x < width; x += 8)
-						for (int iy = 0; iy < 4; iy++, src += 8)
-							decodebytesC8_To_Raw16<false>((u16*)dst + (y + iy) * width + x, src, tlutaddr);
-			}
+            for (int y = 0; y < height; y += 4)
+                for (int x = 0; x < width; x += 8)
+                    for (int iy = 0; iy < 4; iy++, src += 8)
+                        decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
 		}
         return GetPCFormatFromTLUTFormat(tlutfmt);
     case GX_TF_IA4:
@@ -1071,76 +1034,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
         return PC_TEX_FMT_BGRA32;
     case GX_TF_RGBA8:  // speed critical
         {
-			if (cpu_info.bSSE4_1) {
-				for (int y = 0; y < height; y += 4) {
-					__m128i* p = (__m128i*)(src + y * width * 4);
-					for (int x = 0; x < width; x += 4) {
-						// Load 64-bytes at once.
-						const __m128i a0 = _mm_stream_load_si128(p++);
-						const __m128i a1 = _mm_stream_load_si128(p++);
-						const __m128i a2 = _mm_stream_load_si128(p++);
-						const __m128i a3 = _mm_stream_load_si128(p++);
-
-						// Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
-						// apply Common::swap32() by _mm_shuffle_epi8() and
-						// store them by _mm_stream_si128().
-						// See decodebytesARGB8_4() about the idea.
-						const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
-						const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
-
-						const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
-						const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
-
-						const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
-						const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
-
-						const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
-						const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
-					}
-				}
-
-			} else if (cpu_info.bSSSE3) {
-				// SSSE3 can not use _mm_stream_load_si128().
-				// Use _mm_load_si128() instead of _mm_load_si128().
-				for (int y = 0; y < height; y += 4) {
-					__m128i* p = (__m128i*)(src + y * width * 4);
-					for (int x = 0; x < width; x += 4) {
-						const __m128i a0 = _mm_load_si128(p++);
-						const __m128i a1 = _mm_load_si128(p++);
-						const __m128i a2 = _mm_load_si128(p++);
-						const __m128i a3 = _mm_load_si128(p++);
-
-						const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
-						const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
-
-						const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
-						const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
-
-						const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
-						const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
-
-						const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
-						const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
-						_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
-					}
-				}
-
-			} else {
-				for (int y = 0; y < height; y += 4)
-					for (int x = 0; x < width; x += 4)
-					{
-						for (int iy = 0; iy < 4; iy++)
-							decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
-						src += 64;
-					}
-			}
+			for (int y = 0; y < height; y += 4)
+                for (int x = 0; x < width; x += 4)
+                {
+					for (int iy = 0; iy < 4; iy++)
+                        decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
+					src += 64;
+                }
         }
         return PC_TEX_FMT_BGRA32;
     case GX_TF_CMPR:  // speed critical
diff --git a/Source/Core/VideoCommon/Src/VertexLoader.cpp b/Source/Core/VideoCommon/Src/VertexLoader.cpp
index 97c6e5703f..35abfa8b3c 100644
--- a/Source/Core/VideoCommon/Src/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader.cpp
@@ -16,10 +16,8 @@
 // http://code.google.com/p/dolphin-emu/
 
 #include <assert.h>
-#include <nmmintrin.h>
 
 #include "Common.h"
-#include "CPUDetect.h"
 #include "VideoCommon.h"
 #include "VideoConfig.h"
 #include "Profiler.h"
@@ -72,7 +70,7 @@ int colIndex;
 TVtxAttr* pVtxAttr;
 int colElements[2];
 float posScale;
-__declspec(align(16)) float tcScale[8];
+float tcScale[8]; // one scale per texcoord slot; VertexLoader::RunVertices stores 1.0f / float(1 << Frac) here
 
 using namespace Gen;
 
@@ -635,31 +633,9 @@ void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
 
 	pVtxAttr = &m_VtxAttr;
 	posScale = 1.0f / float(1 << m_VtxAttr.PosFrac);
-	if (m_NativeFmt->m_components & VB_HAS_UVALL) {
-		if (cpu_info.bSSE4_1) {
-			__m128i a0;
-			a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[0].Frac, 0);
-			a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[1].Frac, 1);
-			a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[2].Frac, 2);
-			a0 = _mm_insert_epi32(a0, 1 << m_VtxAttr.texCoord[3].Frac, 3);
-			const __m128 b0 = _mm_cvtepi32_ps(a0);
-			const __m128 c0 = _mm_rcp_ps(b0);
-			_mm_stream_ps(&tcScale[0], c0);
-
-			__m128i a1;
-			a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[4].Frac, 0);
-			a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[5].Frac, 1);
-			a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[6].Frac, 2);
-			a1 = _mm_insert_epi32(a1, 1 << m_VtxAttr.texCoord[7].Frac, 3);
-			const __m128 b1 = _mm_cvtepi32_ps(a1);
-			const __m128 c1 = _mm_rcp_ps(b1);
-			_mm_stream_ps(&tcScale[4], c1);
-		} else {
-			for (int i = 0; i < 8; i++) {
-				tcScale[i] = 1.0f / float(1 << m_VtxAttr.texCoord[i].Frac);
-			}
-		}
-	}
+	if (m_NativeFmt->m_components & VB_HAS_UVALL)
+		for (int i = 0; i < 8; i++)
+			tcScale[i] = 1.0f / float(1 << m_VtxAttr.texCoord[i].Frac);
 	for (int i = 0; i < 2; i++)
 		colElements[i] = m_VtxAttr.color[i].Elements;
 
diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
index c1920a99a9..907a84cca9 100644
--- a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
@@ -18,9 +18,7 @@
 #ifndef VERTEXLOADER_POSITION_H
 #define VERTEXLOADER_POSITION_H
 
-#include <nmmintrin.h>
 #include "Common.h"
-#include "CPUDetect.h"
 #include "VideoCommon.h"
 #include "VertexLoader.h"
 #include "VertexLoader_Position.h"
@@ -151,34 +149,18 @@ inline void Pos_ReadIndex_Short(int Index)
 	VertexManager::s_pCurBufferPointer += 12;
 }
 
-static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
 template<bool three>
 inline void Pos_ReadIndex_Float(int Index)
 {
 	const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
-
-	if (cpu_info.bSSE4_1) {
-		const __m128i a = _mm_loadu_si128((__m128i*)pData);
-		__m128i b = _mm_shuffle_epi8(a, kMaskSwap32);
-		if (!three) {
-			b = _mm_insert_epi32(b, 0, 2);
-		}
-		u8* p = VertexManager::s_pCurBufferPointer;
-		_mm_storeu_si128((__m128i*)p, b);
-		LOG_VTX();
-		p += 12;
-		VertexManager::s_pCurBufferPointer = p;
-
-	} else {
-		((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
-		((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
-		if (three)
-			((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
-		else
-			((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
-		LOG_VTX();
-		VertexManager::s_pCurBufferPointer += 12;
-	}
+	((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); // 1st 32-bit value; swap32 is presumably an endianness swap - confirm
+	((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); // 2nd 32-bit value
+	if (three)
+		((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]); // 3rd value is read from the array only when `three`
+	else
+		((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f; // otherwise the third output slot is filled with 0.0f
+	LOG_VTX();
+	VertexManager::s_pCurBufferPointer += 12; // three 4-byte slots are written in both cases
 }
 
 // ==============================================================================
diff --git a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp
index 036d0e13cf..00f4a31667 100644
--- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp
@@ -18,9 +18,7 @@
 #ifndef VERTEXLOADER_TEXCOORD_H
 #define VERTEXLOADER_TEXCOORD_H
 
-#include <nmmintrin.h>
 #include "Common.h"
-#include "CPUDetect.h"
 #include "VideoCommon.h"
 #include "VertexLoader.h"
 #include "VertexLoader_Position.h"
@@ -310,30 +308,15 @@ void LOADERDECL TexCoord_ReadIndex16_Float1()
 	VertexManager::s_pCurBufferPointer += 4;
 	tcIndex++;
 }
-
-static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
-void LOADERDECL TexCoord_ReadIndex16_Float2()
+void LOADERDECL TexCoord_ReadIndex16_Float2()
 {
 	u16 Index = DataReadU16(); 
 	const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
-
-	if (cpu_info.bSSSE3) {
-		const __m128i a = _mm_loadl_epi64((__m128i*)pData);
-		const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32);
-		u8* p = VertexManager::s_pCurBufferPointer;
-		_mm_storel_epi64((__m128i*)p, b);
-		LOG_TEX2();
-		p += 8;
-		VertexManager::s_pCurBufferPointer = p;
-		tcIndex++;
-
-	} else {
-		((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
-		((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
-		LOG_TEX2();
-		VertexManager::s_pCurBufferPointer += 8;
-		tcIndex++;
-	}
+	((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); // first of the two 32-bit values at the indexed element
+	((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); // second 32-bit value
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8; // two 4-byte slots were written
+	tcIndex++; // the lookup above uses ARRAY_TEXCOORD0+tcIndex, so the next reader uses the following array
 }
 
 #endif