GL Vertex loader moved to VideoCommon.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1692 8ced0084-cf51-0410-be5f-012b33b47a6e
2025-07-24 14:49:42 -06:00 · 2008-12-26 13:09:16 +00:00
parent ab01e9e853
commit 95a341a4e9
15 changed files with 55 additions and 115 deletions
--- a/Source/Core/VideoCommon/Src/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader.cpp
@ -0,0 +1,682 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#include <assert.h>
+
+#include "Common.h"
+#include "VideoCommon.h"
+#include "Profiler.h"
+#include "MemoryUtil.h"
+#include "StringUtil.h"
+#include "x64Emitter.h"
+#include "ABI.h"
+
+#include "LookUpTables.h"
+#include "Statistics.h"
+#include "VertexLoaderManager.h"
+#include "VertexLoader.h"
+#include "BPMemory.h"
+#include "DataReader.h"
+#include "NativeVertexWriter.h"
+
+#include "VertexLoader_Position.h"
+#include "VertexLoader_Normal.h"
+#include "VertexLoader_Color.h"
+#include "VertexLoader_TextCoord.h"
+
+// When defined, CompileVertexTranslator emits machine code that calls every loader stage in turn;
+// otherwise the stages are collected in m_PipelineStages and invoked from a C++ loop in RunVertices.
+#define USE_JIT
+
+// Size argument passed to AllocCodeSpace for each loader's generated code.
+#define COMPILED_CODE_SIZE 4096
+
+// Native format of the loader that ran most recently. RunVertices flushes the vertex manager
+// when a loader with a different format is about to run.
+NativeVertexFormat *g_nativeVertexFmt;
+
+// On non-Windows builds the "inline" keyword is redefined to expand to nothing for the rest of this file.
+#ifndef _WIN32
+	#undef inline
+	#define inline
+#endif
+
+// Matrix components are first in GC format but later in PC format - we need to store it temporarily
+// when decoding each vertex.
+// s_curposmtx is filled by PosMtx_ReadDirect_UByte and emitted by PosMtx_Write.
+static u8 s_curposmtx;
+// s_curtexmtx is filled by TexMtx_ReadDirect_UByte (indexed by s_texmtxread) and consumed by the
+// TexMtx_Write_* stages (indexed by s_texmtxwrite). Both indices are reset to 0 for every vertex.
+static u8 s_curtexmtx[8];
+static int s_texmtxwrite = 0;
+static int s_texmtxread = 0;
+
+// Number of vertices left to decode; set by RunVertices and decremented by the generated loop.
+static int loop_counter;
+
+// Vertex loaders read these. Although the scale ones should be baked into the shader.
+// tcIndex and colIndex are reset to 0 at the start of every vertex; the remaining values are
+// refreshed by RunVertices before it decodes a batch.
+int tcIndex;
+int colIndex;
+TVtxAttr* pVtxAttr;
+int colElements[2];
+float posScale;
+float tcScale[8];
+
+using namespace Gen;
+
+// Loader stage: pulls the position matrix index byte from the vertex stream and keeps
+// its low six bits in s_curposmtx until PosMtx_Write stores it in the native vertex.
+void LOADERDECL PosMtx_ReadDirect_UByte()
+{
+	const u8 rawIndex = DataReadU8();
+	s_curposmtx = rawIndex & 0x3f;
+	PRIM_LOG("posmtx: %d, ", s_curposmtx);
+}
+
+// Loader stage: appends four bytes to the native vertex buffer - the saved position
+// matrix index followed by three zero bytes.
+void LOADERDECL PosMtx_Write()
+{
+	u8 *out = VertexManager::s_pCurBufferPointer;
+	out[0] = s_curposmtx;
+	out[1] = 0;
+	out[2] = 0;
+	out[3] = 0;
+	VertexManager::s_pCurBufferPointer = out + 4;
+}
+
+// Loader stage: reads one texture matrix index byte from the vertex stream, stores its
+// low six bits in the next free s_curtexmtx slot and advances the read cursor.
+void LOADERDECL TexMtx_ReadDirect_UByte()
+{
+	u8 &slot = s_curtexmtx[s_texmtxread];
+	slot = DataReadU8() & 0x3f;
+	PRIM_LOG("texmtx%d: %d, ", s_texmtxread, slot);
+	++s_texmtxread;
+}
+
+// Loader stage: appends the next saved texture matrix index to the native vertex
+// buffer as a single float (4 bytes) and advances the write cursor.
+void LOADERDECL TexMtx_Write_Float()
+{
+	float *out = (float*)VertexManager::s_pCurBufferPointer;
+	out[0] = (float)s_curtexmtx[s_texmtxwrite];
+	++s_texmtxwrite;
+	VertexManager::s_pCurBufferPointer += 4;
+}
+
+// Loader stage: appends two floats (8 bytes) to the native vertex buffer - a zero
+// followed by the next saved texture matrix index - and advances the write cursor.
+void LOADERDECL TexMtx_Write_Float2()
+{
+	float *out = (float*)VertexManager::s_pCurBufferPointer;
+	out[0] = 0;
+	out[1] = (float)s_curtexmtx[s_texmtxwrite];
+	++s_texmtxwrite;
+	VertexManager::s_pCurBufferPointer += 8;
+}
+
+// Loader stage: writes three 16-bit values (0, 0, next saved texture matrix index) and
+// then advances the buffer pointer by 8 bytes, so the last two bytes are left unwritten.
+void LOADERDECL TexMtx_Write_Short3()
+{
+	s16 *out = (s16*)VertexManager::s_pCurBufferPointer;
+	out[0] = 0;
+	out[1] = 0;
+	out[2] = s_curtexmtx[s_texmtxwrite];
+	++s_texmtxwrite;
+	VertexManager::s_pCurBufferPointer += 8;
+}
+
+// Builds a loader for one vertex descriptor / attribute table combination: creates the
+// native format object, decodes the VAT, then allocates, generates and write-protects
+// the translator code.
+VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
+	: m_VertexSize(0),
+	  m_numPipelineStages(0),
+	  m_compiledCode(NULL),
+	  m_numLoadedVertices(0)
+{
+	m_NativeFmt = NativeVertexFormat::Create();
+	loop_counter = 0;
+	VertexLoader_Normal::Init();
+
+	// The descriptor and the decoded VAT must be in place before the translator is compiled.
+	m_VtxDesc = vtx_desc;
+	SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex);
+
+	AllocCodeSpace(COMPILED_CODE_SIZE);
+	CompileVertexTranslator();
+	WriteProtect();
+}
+
+// Releases the code space allocated in the constructor and deletes the native vertex
+// format object obtained from NativeVertexFormat::Create().
+VertexLoader::~VertexLoader() 
+{
+	FreeCodeSpace();
+	delete m_NativeFmt;
+}
+
+// Builds the per-vertex translation pipeline for this loader's vertex descriptor and attributes.
+//
+// For every component enabled in m_VtxDesc this function:
+//   - registers the matching loader stage through WriteCall (emitted as a CALL when USE_JIT is defined),
+//   - adds the component's size in the source stream to m_VertexSize,
+//   - records the component's offset/type in a PortableVertexDeclaration and sets the matching
+//     VB_HAS_* bit in m_NativeFmt->m_components.
+// With USE_JIT the stages are wrapped in a loop that runs until loop_counter reaches zero.
+// On exit native_stride holds the size of one native vertex and m_NativeFmt has been initialized.
+void VertexLoader::CompileVertexTranslator()
+{
+	m_VertexSize = 0;
+	const TVtxAttr &vtx_attr = m_VtxAttr;
+
+#ifdef USE_JIT
+	if (m_compiledCode)
+		PanicAlert("trying to recompile a vtx translator");
+
+	m_compiledCode = GetCodePtr();
+	ABI_EmitPrologue(4);
+
+	// Start loop here
+	const u8 *loop_start = GetCodePtr();
+
+	// Reset component counters if present in vertex format only.
+	if (m_VtxDesc.Tex0Coord || m_VtxDesc.Tex1Coord || m_VtxDesc.Tex2Coord || m_VtxDesc.Tex3Coord ||
+		m_VtxDesc.Tex4Coord || m_VtxDesc.Tex5Coord || m_VtxDesc.Tex6Coord || m_VtxDesc.Tex7Coord) {
+		MOV(32, M(&tcIndex), Imm32(0));
+	}
+	if (m_VtxDesc.Color0 || m_VtxDesc.Color1) {
+		MOV(32, M(&colIndex), Imm32(0));
+	}
+	if (m_VtxDesc.Tex0MatIdx || m_VtxDesc.Tex1MatIdx || m_VtxDesc.Tex2MatIdx || m_VtxDesc.Tex3MatIdx ||
+		m_VtxDesc.Tex4MatIdx || m_VtxDesc.Tex5MatIdx || m_VtxDesc.Tex6MatIdx || m_VtxDesc.Tex7MatIdx) {
+		MOV(32, M(&s_texmtxwrite), Imm32(0));
+		MOV(32, M(&s_texmtxread), Imm32(0));
+	}
+#endif
+
+	// Colors
+	const int col[2] = {m_VtxDesc.Color0, m_VtxDesc.Color1};
+	// TextureCoord
+	// Since m_VtxDesc.Text7Coord is broken across a 32 bit word boundary, retrieve its value manually.
+	// If we didn't do this, the vertex format would be read as one bit offset from where it should be, making
+	// 01 become 00, and 10/11 become 01
+	const int tc[8] = {
+		m_VtxDesc.Tex0Coord, m_VtxDesc.Tex1Coord, m_VtxDesc.Tex2Coord, m_VtxDesc.Tex3Coord,
+		m_VtxDesc.Tex4Coord, m_VtxDesc.Tex5Coord, m_VtxDesc.Tex6Coord, (m_VtxDesc.Hex >> 31) & 3
+	};
+	
+	// Reset pipeline
+	m_numPipelineStages = 0;
+
+	// It's a bit ugly that we poke inside m_NativeFmt in this function. Planning to fix this.
+	m_NativeFmt->m_components = 0;
+
+	// Position in pc vertex format.
+	int nat_offset = 0;
+	PortableVertexDeclaration vtx_decl;
+	memset(&vtx_decl, 0, sizeof(vtx_decl));
+	for (int i = 0; i < 8; i++) {
+		vtx_decl.texcoord_offset[i] = -1;
+	}
+
+	// m_VBVertexStride for texmtx and posmtx is computed later when writing.
+	
+	// Position Matrix Index
+	if (m_VtxDesc.PosMatIdx) {
+		WriteCall(PosMtx_ReadDirect_UByte);
+		m_NativeFmt->m_components |= VB_HAS_POSMTXIDX;
+		m_VertexSize += 1;
+	}
+
+	// Texture matrix indices: one byte each in the source stream. They are only read here;
+	// the matching write stage is registered further down, next to the texture coordinate.
+	if (m_VtxDesc.Tex0MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX0; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex1MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX1; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex2MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX2; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex3MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX3; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex4MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX4; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex5MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX5; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex6MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX6; WriteCall(TexMtx_ReadDirect_UByte); }
+	if (m_VtxDesc.Tex7MatIdx) {m_VertexSize += 1; m_NativeFmt->m_components |= VB_HAS_TEXMTXIDX7; WriteCall(TexMtx_ReadDirect_UByte); }
+
+	// Position. Whatever the source format, 12 bytes are reserved for it in the native vertex.
+	switch (m_VtxDesc.Position) {
+	case NOT_PRESENT:	{_assert_msg_(0, "Vertex descriptor without position!", "WTF?");} break;
+	case DIRECT:
+		switch (m_VtxAttr.PosFormat) {
+		case FORMAT_UBYTE:  m_VertexSize += m_VtxAttr.PosElements?3:2; WriteCall(Pos_ReadDirect_UByte);  break;
+		case FORMAT_BYTE:   m_VertexSize += m_VtxAttr.PosElements?3:2; WriteCall(Pos_ReadDirect_Byte);   break;
+		case FORMAT_USHORT: m_VertexSize += m_VtxAttr.PosElements?6:4; WriteCall(Pos_ReadDirect_UShort); break;
+		case FORMAT_SHORT:  m_VertexSize += m_VtxAttr.PosElements?6:4; WriteCall(Pos_ReadDirect_Short);  break;
+		case FORMAT_FLOAT:  m_VertexSize += m_VtxAttr.PosElements?12:8; WriteCall(Pos_ReadDirect_Float);  break;
+		default: _assert_(0); break;
+		}
+		nat_offset += 12;
+		break;
+	case INDEX8:		
+		switch (m_VtxAttr.PosFormat) {
+		case FORMAT_UBYTE:	WriteCall(Pos_ReadIndex8_UByte);  break; //WTF?
+		case FORMAT_BYTE:	WriteCall(Pos_ReadIndex8_Byte);   break;
+		case FORMAT_USHORT:	WriteCall(Pos_ReadIndex8_UShort); break;
+		case FORMAT_SHORT:	WriteCall(Pos_ReadIndex8_Short);  break;
+		case FORMAT_FLOAT:	WriteCall(Pos_ReadIndex8_Float);  break;
+		default: _assert_(0); break;
+		}
+		m_VertexSize += 1;
+		nat_offset += 12;
+		break;
+	case INDEX16:
+		switch (m_VtxAttr.PosFormat) {
+		case FORMAT_UBYTE:	WriteCall(Pos_ReadIndex16_UByte);  break;
+		case FORMAT_BYTE:	WriteCall(Pos_ReadIndex16_Byte);   break;
+		case FORMAT_USHORT:	WriteCall(Pos_ReadIndex16_UShort); break;
+		case FORMAT_SHORT:	WriteCall(Pos_ReadIndex16_Short);  break;
+		case FORMAT_FLOAT:	WriteCall(Pos_ReadIndex16_Float);  break;
+		default: _assert_(0); break;
+		}
+		m_VertexSize += 2;
+		nat_offset += 12;
+		break;
+	}
+
+	// Normals
+	vtx_decl.num_normals = 0;
+	if (m_VtxDesc.Normal != NOT_PRESENT) {
+		m_VertexSize += VertexLoader_Normal::GetSize(m_VtxDesc.Normal, m_VtxAttr.NormalFormat, m_VtxAttr.NormalElements, m_VtxAttr.NormalIndex3);
+		TPipelineFunction pFunc = VertexLoader_Normal::GetFunction(m_VtxDesc.Normal, m_VtxAttr.NormalFormat, m_VtxAttr.NormalElements, m_VtxAttr.NormalIndex3);
+		if (pFunc == 0)
+		{
+			// Report the combination that has no loader. The text is fully formatted here so the
+			// offending values actually reach the user; it contains no '%' characters.
+			char temp[256];
+			sprintf(temp, "VertexLoader_Normal::GetFunction returned zero! (%i %i %i %i)",
+				m_VtxDesc.Normal, m_VtxAttr.NormalFormat, m_VtxAttr.NormalElements, m_VtxAttr.NormalIndex3);
+			g_VideoInitialize.pSysMessage(temp);
+		}
+		// NOTE(review): when pFunc is null the stage is still registered, exactly as before -
+		// confirm whether the loader should instead be rejected in that case.
+		WriteCall(pFunc);
+
+		vtx_decl.num_normals = vtx_attr.NormalElements ? 3 : 1;
+		switch (vtx_attr.NormalFormat) {
+		case FORMAT_UBYTE:	
+		case FORMAT_BYTE:
+			vtx_decl.normal_gl_type = VAR_BYTE;
+			vtx_decl.normal_gl_size = 4;
+			vtx_decl.normal_offset[0] = nat_offset;
+			nat_offset += 4;
+			if (vtx_attr.NormalElements) {
+				vtx_decl.normal_offset[1] = nat_offset;
+				nat_offset += 4;
+				vtx_decl.normal_offset[2] = nat_offset;
+				nat_offset += 4;
+			}
+			break;
+		case FORMAT_USHORT:
+		case FORMAT_SHORT:
+			vtx_decl.normal_gl_type = VAR_SHORT;
+			vtx_decl.normal_gl_size = 4;
+			vtx_decl.normal_offset[0] = nat_offset;
+			nat_offset += 8;
+			if (vtx_attr.NormalElements) {
+				vtx_decl.normal_offset[1] = nat_offset;
+				nat_offset += 8;
+				vtx_decl.normal_offset[2] = nat_offset;
+				nat_offset += 8;
+			}
+			break;
+		case FORMAT_FLOAT:
+			vtx_decl.normal_gl_type = VAR_FLOAT;
+			vtx_decl.normal_gl_size = 3;
+			vtx_decl.normal_offset[0] = nat_offset;
+			nat_offset += 12;
+			if (vtx_attr.NormalElements) {
+				vtx_decl.normal_offset[1] = nat_offset;
+				nat_offset += 12;
+				vtx_decl.normal_offset[2] = nat_offset;
+				nat_offset += 12;
+			}
+			break;
+		default: _assert_(0); break;
+		}
+
+		int numNormals = (m_VtxAttr.NormalElements == 1) ? NRM_THREE : NRM_ONE;
+		m_NativeFmt->m_components |= VB_HAS_NRM0;
+
+		if (numNormals == NRM_THREE)
+			m_NativeFmt->m_components |= VB_HAS_NRM1 | VB_HAS_NRM2;
+	}
+
+	// Colors: every present color takes 4 bytes in the native vertex, whatever its source format.
+	vtx_decl.color_gl_type = VAR_UNSIGNED_BYTE;
+	for (int i = 0; i < 2; i++) {
+		m_NativeFmt->m_components |= VB_HAS_COL0 << i;
+		switch (col[i])
+		{
+		case NOT_PRESENT: 
+			m_NativeFmt->m_components &= ~(VB_HAS_COL0 << i);
+			vtx_decl.color_offset[i] = -1;
+			break;
+		case DIRECT:
+			switch (m_VtxAttr.color[i].Comp)
+			{
+			case FORMAT_16B_565:	m_VertexSize += 2; WriteCall(Color_ReadDirect_16b_565); break;
+			case FORMAT_24B_888:	m_VertexSize += 3; WriteCall(Color_ReadDirect_24b_888); break;
+			case FORMAT_32B_888x:	m_VertexSize += 4; WriteCall(Color_ReadDirect_32b_888x); break;
+			case FORMAT_16B_4444:	m_VertexSize += 2; WriteCall(Color_ReadDirect_16b_4444); break;
+			case FORMAT_24B_6666:	m_VertexSize += 3; WriteCall(Color_ReadDirect_24b_6666); break;
+			case FORMAT_32B_8888:	m_VertexSize += 4; WriteCall(Color_ReadDirect_32b_8888); break;
+			default: _assert_(0); break;
+			}
+			break;
+		case INDEX8:	
+			m_VertexSize += 1;
+			switch (m_VtxAttr.color[i].Comp)
+			{
+			case FORMAT_16B_565:	WriteCall(Color_ReadIndex8_16b_565); break;
+			case FORMAT_24B_888:	WriteCall(Color_ReadIndex8_24b_888); break;
+			case FORMAT_32B_888x:	WriteCall(Color_ReadIndex8_32b_888x); break;
+			case FORMAT_16B_4444:	WriteCall(Color_ReadIndex8_16b_4444); break;
+			case FORMAT_24B_6666:	WriteCall(Color_ReadIndex8_24b_6666); break;
+			case FORMAT_32B_8888:	WriteCall(Color_ReadIndex8_32b_8888); break;
+			default: _assert_(0); break;
+			}
+			break;
+		case INDEX16:
+			m_VertexSize += 2;
+			switch (m_VtxAttr.color[i].Comp)
+			{
+			case FORMAT_16B_565:	WriteCall(Color_ReadIndex16_16b_565); break;
+			case FORMAT_24B_888:	WriteCall(Color_ReadIndex16_24b_888); break;
+			case FORMAT_32B_888x:	WriteCall(Color_ReadIndex16_32b_888x); break;
+			case FORMAT_16B_4444:	WriteCall(Color_ReadIndex16_16b_4444); break;
+			case FORMAT_24B_6666:	WriteCall(Color_ReadIndex16_24b_6666); break;
+			case FORMAT_32B_8888:	WriteCall(Color_ReadIndex16_32b_8888); break;
+			default: _assert_(0); break;
+			}
+			break;
+		}
+		// Common for the three bottom cases
+		if (col[i] != NOT_PRESENT) {
+			vtx_decl.color_offset[i] = nat_offset;
+			nat_offset += 4;
+		}
+	}
+
+	// Texture matrix indices (remove if corresponding texture coordinate isn't enabled)
+	for (int i = 0; i < 8; i++) {
+		m_NativeFmt->m_components |= VB_HAS_UV0 << i;
+		int elements = m_VtxAttr.texCoord[i].Elements;
+		switch (tc[i])
+		{
+		case NOT_PRESENT: 
+			m_NativeFmt->m_components &= ~(VB_HAS_UV0 << i);
+			break;
+		case DIRECT:
+			switch (m_VtxAttr.texCoord[i].Format)
+			{
+			case FORMAT_UBYTE:	m_VertexSize += elements?2:1; WriteCall(elements?TexCoord_ReadDirect_UByte2:TexCoord_ReadDirect_UByte1);  break;
+			case FORMAT_BYTE:	m_VertexSize += elements?2:1; WriteCall(elements?TexCoord_ReadDirect_Byte2:TexCoord_ReadDirect_Byte1);   break;
+			case FORMAT_USHORT:	m_VertexSize += elements?4:2; WriteCall(elements?TexCoord_ReadDirect_UShort2:TexCoord_ReadDirect_UShort1); break;
+			case FORMAT_SHORT:	m_VertexSize += elements?4:2; WriteCall(elements?TexCoord_ReadDirect_Short2:TexCoord_ReadDirect_Short1);  break;
+			case FORMAT_FLOAT:	m_VertexSize += elements?8:4; WriteCall(elements?TexCoord_ReadDirect_Float2:TexCoord_ReadDirect_Float1);  break;
+			default: _assert_(0); break;
+			}
+			break;
+		case INDEX8:	
+			m_VertexSize += 1;
+			switch (m_VtxAttr.texCoord[i].Format)
+			{
+			case FORMAT_UBYTE:	WriteCall(elements?TexCoord_ReadIndex8_UByte2:TexCoord_ReadIndex8_UByte1);  break;
+			case FORMAT_BYTE:	WriteCall(elements?TexCoord_ReadIndex8_Byte2:TexCoord_ReadIndex8_Byte1);   break;
+			case FORMAT_USHORT:	WriteCall(elements?TexCoord_ReadIndex8_UShort2:TexCoord_ReadIndex8_UShort1); break;
+			case FORMAT_SHORT:	WriteCall(elements?TexCoord_ReadIndex8_Short2:TexCoord_ReadIndex8_Short1);  break;
+			case FORMAT_FLOAT:	WriteCall(elements?TexCoord_ReadIndex8_Float2:TexCoord_ReadIndex8_Float1);  break;
+			default: _assert_(0); break;
+			}
+			break;
+		case INDEX16:
+			m_VertexSize += 2;
+			switch (m_VtxAttr.texCoord[i].Format)
+			{
+			case FORMAT_UBYTE:	WriteCall(elements?TexCoord_ReadIndex16_UByte2:TexCoord_ReadIndex16_UByte1);  break;
+			case FORMAT_BYTE:	WriteCall(elements?TexCoord_ReadIndex16_Byte2:TexCoord_ReadIndex16_Byte1);   break;
+			case FORMAT_USHORT:	WriteCall(elements?TexCoord_ReadIndex16_UShort2:TexCoord_ReadIndex16_UShort1); break;
+			case FORMAT_SHORT:	WriteCall(elements?TexCoord_ReadIndex16_Short2:TexCoord_ReadIndex16_Short1);  break;
+			case FORMAT_FLOAT:	WriteCall(elements?TexCoord_ReadIndex16_Float2:TexCoord_ReadIndex16_Float1);  break;
+			default: _assert_(0);
+			}
+			break;
+		}
+
+		if (m_NativeFmt->m_components & (VB_HAS_TEXMTXIDX0 << i)) {
+			if (tc[i] != NOT_PRESENT) {
+				// if texmtx is included, texcoord will always be 3 floats, z will be the texmtx index
+				vtx_decl.texcoord_offset[i] = nat_offset;
+				vtx_decl.texcoord_gl_type[i] = VAR_FLOAT;
+				vtx_decl.texcoord_size[i] = 3;
+				nat_offset += 12;
+				// Two source elements need one more float; a single element needs a zero pad plus the index.
+				WriteCall(m_VtxAttr.texCoord[i].Elements ? TexMtx_Write_Float : TexMtx_Write_Float2);
+			}
+			else {
+				m_NativeFmt->m_components |= VB_HAS_UV0 << i; // have to include since using now
+				vtx_decl.texcoord_offset[i] = nat_offset;
+				vtx_decl.texcoord_gl_type[i] = VAR_SHORT;
+				vtx_decl.texcoord_size[i] = 4;
+				nat_offset += 8; // still include the texture coordinate, but this time as 6 + 2 bytes
+				WriteCall(TexMtx_Write_Short3);
+			}
+		}
+		else {
+			if (tc[i] != NOT_PRESENT) {
+				vtx_decl.texcoord_offset[i] = nat_offset;
+				vtx_decl.texcoord_gl_type[i] = VAR_FLOAT;
+				vtx_decl.texcoord_size[i] = vtx_attr.texCoord[i].Elements ? 2 : 1;
+				nat_offset += 4 * (vtx_attr.texCoord[i].Elements ? 2 : 1);
+			} else {
+				vtx_decl.texcoord_offset[i] = -1;
+			}
+		}
+
+		if (tc[i] == NOT_PRESENT) {
+			// if there's more tex coords later, have to write a dummy call 
+			int j = i + 1;
+			for (; j < 8; ++j) {
+				if (tc[j] != NOT_PRESENT) {
+					WriteCall(TexCoord_Read_Dummy); // important to get indices right!
+					break;
+				}
+			}
+			// tricky!
+			if (j == 8 && !((m_NativeFmt->m_components & VB_HAS_TEXMTXIDXALL) & (VB_HAS_TEXMTXIDXALL << (i + 1)))) {
+				// no more tex coords and tex matrices, so exit loop
+				break;
+			}
+		}
+	}
+
+	// The position matrix index is written last, after every other component.
+	if (m_VtxDesc.PosMatIdx) {
+		WriteCall(PosMtx_Write);
+		vtx_decl.posmtx_offset = nat_offset;
+		nat_offset += 4;
+	} else {
+		vtx_decl.posmtx_offset = -1;
+	}
+
+	native_stride = nat_offset;
+	vtx_decl.stride = native_stride;
+
+#ifdef USE_JIT
+	// End loop here
+	SUB(32, M(&loop_counter), Imm8(1));
+	//SUB(32, R(EBX), Imm8(1));
+	J_CC(CC_NZ, loop_start, true);
+	ABI_EmitEpilogue(4);
+#endif
+	m_NativeFmt->Initialize(vtx_decl);
+}
+
+// Registers one loader stage. With USE_JIT a CALL to func is emitted into the code block;
+// otherwise func is appended to m_PipelineStages for the C++ loop in RunVertices.
+void VertexLoader::WriteCall(TPipelineFunction func)
+{
+#ifdef USE_JIT
+	CALL((void*)func);
+#else
+	m_PipelineStages[m_numPipelineStages++] = func;
+#endif
+}
+
+// Decodes "count" vertices of the given primitive type from the data stream into the
+// vertex manager's buffer, using the scale factors of attribute group vtx_attr_group.
+// When the buffer fills up mid-primitive, the batch decoded so far is submitted and flushed,
+// and for strips/fans the vertices needed to continue the primitive are copied into the new buffer.
+void VertexLoader::RunVertices(int vtx_attr_group, int primitive, int count)
+{
+	DVSTARTPROFILE();
+
+	m_numLoadedVertices += count;
+
+	// Flush if our vertex format is different from the currently set.
+	if (g_nativeVertexFmt != NULL && g_nativeVertexFmt != m_NativeFmt)
+	{
+		VertexManager::Flush();
+		// Also move the Set() here?
+	}
+	g_nativeVertexFmt = m_NativeFmt;
+
+	if (bpmem.genMode.cullmode == 3 && primitive < 5)
+	{
+		// cullmode 3 with a triangle/quad primitive (types below 5): nothing is decoded, the
+		// source data is just skipped. NOTE(review): 3 presumably means "cull all" (both faces),
+		// not "none" as this comment used to say - confirm against the GX cull mode values.
+		DataSkip(count * m_VertexSize);
+		return;
+	}
+
+	VertexManager::EnableComponents(m_NativeFmt->m_components);
+
+	// Load position and texcoord scale factors.
+	m_VtxAttr.PosFrac				= g_VtxAttr[vtx_attr_group].g0.PosFrac;
+	m_VtxAttr.texCoord[0].Frac		= g_VtxAttr[vtx_attr_group].g0.Tex0Frac;
+	m_VtxAttr.texCoord[1].Frac		= g_VtxAttr[vtx_attr_group].g1.Tex1Frac;
+	m_VtxAttr.texCoord[2].Frac		= g_VtxAttr[vtx_attr_group].g1.Tex2Frac;
+	m_VtxAttr.texCoord[3].Frac      = g_VtxAttr[vtx_attr_group].g1.Tex3Frac;
+	m_VtxAttr.texCoord[4].Frac		= g_VtxAttr[vtx_attr_group].g2.Tex4Frac;
+	m_VtxAttr.texCoord[5].Frac		= g_VtxAttr[vtx_attr_group].g2.Tex5Frac;
+	m_VtxAttr.texCoord[6].Frac		= g_VtxAttr[vtx_attr_group].g2.Tex6Frac;
+	m_VtxAttr.texCoord[7].Frac		= g_VtxAttr[vtx_attr_group].g2.Tex7Frac;
+
+	// Publish the per-batch state that the individual loader stages read through globals.
+	pVtxAttr = &m_VtxAttr;
+	posScale = shiftLookup[m_VtxAttr.PosFrac];
+	if (m_NativeFmt->m_components & VB_HAS_UVALL)
+		for (int i = 0; i < 8; i++)
+			tcScale[i] = shiftLookup[m_VtxAttr.texCoord[i].Frac];
+	for (int i = 0; i < 2; i++)
+		colElements[i] = m_VtxAttr.color[i].Elements;
+
+	// if strips or fans, make sure all vertices can fit in buffer, otherwise flush
+	// granularity is the number of vertices that must stay together in one buffer
+	// (one whole quad / triangle / line for list primitives, 1 otherwise).
+	int granularity = 1;
+	switch (primitive) {
+		case 3: // strip .. hm, weird
+		case 4: // fan
+			if (VertexManager::GetRemainingSize() < 3 * native_stride)
+				VertexManager::Flush();
+			break;
+		case 6: // line strip
+			if (VertexManager::GetRemainingSize() < 2 * native_stride)
+				VertexManager::Flush();
+			break;
+		case 0: granularity = 4; break; // quads
+		case 2: granularity = 3; break; // tris
+		case 5: granularity = 2; break; // lines
+	}
+
+	// startv: index of the first vertex decoded into the current buffer.
+	// extraverts: vertices copied over from the previous buffer to continue a strip/fan.
+	int startv = 0, extraverts = 0;
+	int v = 0;
+
+	while (v < count)
+	{
+		int remainingVerts = VertexManager::GetRemainingSize() / native_stride;
+		if (remainingVerts < granularity) {
+			INCSTAT(stats.thisFrame.numBufferSplits);
+			// This buffer full - break current primitive and flush, to switch to the next buffer.
+			u8* plastptr = VertexManager::s_pCurBufferPointer;
+			if (v - startv > 0)
+				VertexManager::AddVertices(primitive, v - startv + extraverts);
+			VertexManager::Flush();
+			// Why does this need to be so complicated?
+			switch (primitive) {
+				case 3: // triangle strip, copy last two vertices
+					// a little trick since we have to keep track of signs
+					if (v & 1) {
+						// odd vertex count so far: the second-to-last vertex is written twice,
+						// giving three carried-over vertices instead of two
+						memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-2*native_stride, native_stride);
+						memcpy_gc(VertexManager::s_pCurBufferPointer+native_stride, plastptr-native_stride*2, 2*native_stride);
+						VertexManager::s_pCurBufferPointer += native_stride*3;
+						extraverts = 3;
+					}
+					else {
+						memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*2, native_stride*2);
+						VertexManager::s_pCurBufferPointer += native_stride*2;
+						extraverts = 2;
+					}
+					break;
+				case 4: // tri fan, copy first and last vert
+					memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride*(v-startv+extraverts), native_stride);
+					VertexManager::s_pCurBufferPointer += native_stride;
+					memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
+					VertexManager::s_pCurBufferPointer += native_stride;
+					extraverts = 2;
+					break;
+				case 6: // line strip
+					memcpy_gc(VertexManager::s_pCurBufferPointer, plastptr-native_stride, native_stride);
+					VertexManager::s_pCurBufferPointer += native_stride;
+					extraverts = 1;
+					break;
+				default:
+					extraverts = 0;
+					break;
+			}
+			startv = v;
+		}
+		// remainingVerts still holds the pre-flush value here, so after a split this rounds to 0,
+		// nothing is decoded in this pass, and the next iteration re-reads the free space.
+		int remainingPrims = remainingVerts / granularity;
+		remainingVerts = remainingPrims * granularity;
+		if (count - v < remainingVerts)
+			remainingVerts = count - v;
+
+	#ifdef USE_JIT
+		if (remainingVerts > 0) {
+			// the generated code loops until loop_counter reaches zero
+			loop_counter = remainingVerts;
+			((void (*)())(void*)m_compiledCode)();
+		}
+	#else
+		for (int s = 0; s < remainingVerts; s++)
+		{
+			tcIndex = 0;
+			colIndex = 0;
+			s_texmtxwrite = s_texmtxread = 0;
+			for (int i = 0; i < m_numPipelineStages; i++)
+				m_PipelineStages[i]();
+			PRIM_LOG("\n");
+		}
+	#endif
+		v += remainingVerts;
+	}
+
+	// Submit whatever was decoded into the current buffer since the last split.
+	if (startv < count)
+		VertexManager::AddVertices(primitive, count - startv + extraverts);
+}
+
+// Unpacks the three raw vertex attribute table words into m_VtxAttr.
+// Each field is copied verbatim from the bitfield of the group that holds it; note that
+// texcoord 4 takes its element count and format from group 1 but its frac from group 2.
+void VertexLoader::SetVAT(u32 _group0, u32 _group1, u32 _group2) 
+{
+	VAT regs;
+	regs.g0.Hex = _group0;
+	regs.g1.Hex = _group1;
+	regs.g2.Hex = _group2;
+
+	// Position
+	m_VtxAttr.PosElements = regs.g0.PosElements;
+	m_VtxAttr.PosFormat   = regs.g0.PosFormat;
+	m_VtxAttr.PosFrac     = regs.g0.PosFrac;
+
+	// Normals
+	m_VtxAttr.NormalElements = regs.g0.NormalElements;
+	m_VtxAttr.NormalFormat   = regs.g0.NormalFormat;
+	m_VtxAttr.NormalIndex3   = regs.g0.NormalIndex3;
+
+	// Colors
+	m_VtxAttr.color[0].Elements = regs.g0.Color0Elements;
+	m_VtxAttr.color[0].Comp     = regs.g0.Color0Comp;
+	m_VtxAttr.color[1].Elements = regs.g0.Color1Elements;
+	m_VtxAttr.color[1].Comp     = regs.g0.Color1Comp;
+
+	// Miscellaneous
+	m_VtxAttr.ByteDequant = regs.g0.ByteDequant;
+
+	// Texture coordinate 0 (group 0)
+	m_VtxAttr.texCoord[0].Elements = regs.g0.Tex0CoordElements;
+	m_VtxAttr.texCoord[0].Format   = regs.g0.Tex0CoordFormat;
+	m_VtxAttr.texCoord[0].Frac     = regs.g0.Tex0Frac;
+
+	// Texture coordinates 1-3 (group 1)
+	m_VtxAttr.texCoord[1].Elements = regs.g1.Tex1CoordElements;
+	m_VtxAttr.texCoord[1].Format   = regs.g1.Tex1CoordFormat;
+	m_VtxAttr.texCoord[1].Frac     = regs.g1.Tex1Frac;
+	m_VtxAttr.texCoord[2].Elements = regs.g1.Tex2CoordElements;
+	m_VtxAttr.texCoord[2].Format   = regs.g1.Tex2CoordFormat;
+	m_VtxAttr.texCoord[2].Frac     = regs.g1.Tex2Frac;
+	m_VtxAttr.texCoord[3].Elements = regs.g1.Tex3CoordElements;
+	m_VtxAttr.texCoord[3].Format   = regs.g1.Tex3CoordFormat;
+	m_VtxAttr.texCoord[3].Frac     = regs.g1.Tex3Frac;
+
+	// Texture coordinate 4 (split across groups 1 and 2)
+	m_VtxAttr.texCoord[4].Elements = regs.g1.Tex4CoordElements;
+	m_VtxAttr.texCoord[4].Format   = regs.g1.Tex4CoordFormat;
+	m_VtxAttr.texCoord[4].Frac     = regs.g2.Tex4Frac;
+
+	// Texture coordinates 5-7 (group 2)
+	m_VtxAttr.texCoord[5].Elements = regs.g2.Tex5CoordElements;
+	m_VtxAttr.texCoord[5].Format   = regs.g2.Tex5CoordFormat;
+	m_VtxAttr.texCoord[5].Frac     = regs.g2.Tex5Frac;
+	m_VtxAttr.texCoord[6].Elements = regs.g2.Tex6CoordElements;
+	m_VtxAttr.texCoord[6].Format   = regs.g2.Tex6CoordFormat;
+	m_VtxAttr.texCoord[6].Frac     = regs.g2.Tex6Frac;
+	m_VtxAttr.texCoord[7].Elements = regs.g2.Tex7CoordElements;
+	m_VtxAttr.texCoord[7].Format   = regs.g2.Tex7CoordFormat;
+	m_VtxAttr.texCoord[7].Frac     = regs.g2.Tex7Frac;
+}
+
+// For debugging / profiling: appends a one-line summary of this loader (source vertex size,
+// position matrix flag, position and normal layout, number of vertices loaded so far) to *dest.
+void VertexLoader::AppendToString(std::string *dest) {
+	static const char *posMode[4] = {
+		"Invalid",
+		"Direct",
+		"Idx8",
+		"Idx16",
+	};
+	static const char *posFormats[5] = {
+		"u8", "s8", "u16", "s16", "flt",
+	};
+	static const unsigned numModes = sizeof(posMode) / sizeof(posMode[0]);
+	static const unsigned numFormats = sizeof(posFormats) / sizeof(posFormats[0]);
+
+	// The mode and format fields come straight from emulated registers. CompileVertexTranslator
+	// already allows for format values outside the five known ones (its switches have asserting
+	// default branches), so never index the name tables with an unchecked value.
+	const unsigned posModeIdx = m_VtxDesc.Position;
+	const unsigned nrmModeIdx = m_VtxDesc.Normal;
+	const unsigned posFmtIdx = m_VtxAttr.PosFormat;
+	const unsigned nrmFmtIdx = m_VtxAttr.NormalFormat;
+
+	const char *posModeName = posModeIdx < numModes ? posMode[posModeIdx] : "?";
+	const char *nrmModeName = nrmModeIdx < numModes ? posMode[nrmModeIdx] : "?";
+	const char *posFmtName = posFmtIdx < numFormats ? posFormats[posFmtIdx] : "?";
+	const char *nrmFmtName = nrmFmtIdx < numFormats ? posFormats[nrmFmtIdx] : "?";
+
+	dest->append(StringFromFormat("sz: %i skin: %i Pos: %i %s %s Nrm: %i %s %s - %i vtx\n",
+		m_VertexSize, m_VtxDesc.PosMatIdx, m_VtxAttr.PosElements ? 3 : 2, posModeName, posFmtName,
+		m_VtxAttr.NormalElements, nrmModeName, nrmFmtName, m_numLoadedVertices));
+}
--- a/Source/Core/VideoCommon/Src/VertexLoader.h
+++ b/Source/Core/VideoCommon/Src/VertexLoader.h
@ -0,0 +1,101 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#ifndef _VERTEXLOADER_H
+#define _VERTEXLOADER_H
+
+#include <string>
+
+#include "CPMemory.h"
+#include "DataReader.h"
+#include "NativeVertexFormat.h"
+
+#include "x64Emitter.h"
+
+// Key identifying a vertex loader configuration: the 64-bit vertex descriptor plus the three
+// VAT words of one attribute group with their frac bits masked out. It provides a strict
+// ordering through operator< so it can be used as a key in ordered containers.
+class VertexLoaderUID
+{
+	u32 vid[5];
+public:
+	// Starts out as all zeroes so that a UID compared before InitFromCurrentState()
+	// has a well-defined value instead of indeterminate memory.
+	VertexLoaderUID() {
+		for (int i = 0; i < 5; ++i)
+			vid[i] = 0;
+	}
+	// Captures the current global vertex descriptor and the VAT of the given attribute group.
+	void InitFromCurrentState(int vtx_attr_group) {
+		vid[0] = g_VtxDesc.Hex & 0xFFFFFFFF;
+		vid[1] = g_VtxDesc.Hex >> 32;
+		vid[2] = g_VtxAttr[vtx_attr_group].g0.Hex & ~VAT_0_FRACBITS;
+		vid[3] = g_VtxAttr[vtx_attr_group].g1.Hex & ~VAT_1_FRACBITS;
+		vid[4] = g_VtxAttr[vtx_attr_group].g2.Hex & ~VAT_2_FRACBITS;
+	}
+	// Lexicographic comparison of the five words, vid[0] being the most significant.
+	bool operator < (const VertexLoaderUID &other) const {
+		for (int i = 0; i < 5; ++i) {
+			if (vid[i] != other.vid[i])
+				return vid[i] < other.vid[i];
+		}
+		return false;
+	}
+};
+
+// Translates vertices of one GC vertex format (descriptor + attribute table) into the native
+// vertex format. The translation pipeline is built once in the constructor and then run by
+// RunVertices for every batch of vertices.
+class VertexLoader : public Gen::XCodeBlock
+{
+public:
+	VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
+	~VertexLoader();
+
+	// Size in bytes of one vertex in the source (GC) data stream.
+	int GetVertexSize() const {return m_VertexSize;}
+	// Decodes "count" vertices of the given primitive type using the scale factors of
+	// attribute group vtx_attr_group.
+	void RunVertices(int vtx_attr_group, int primitive, int count);
+
+	// For debugging / profiling
+	void AppendToString(std::string *dest);
+
+private:
+	// Not copyable: the destructor frees the code space and deletes m_NativeFmt, so a
+	// copy would release both twice. Declared but intentionally never defined.
+	VertexLoader(const VertexLoader &);
+	VertexLoader &operator=(const VertexLoader &);
+
+	enum
+	{
+		NRM_ZERO = 0,
+		NRM_ONE = 1,
+		NRM_THREE = 3,
+	};
+
+	int m_VertexSize;      // number of bytes of a raw GC vertex. Computed by CompileVertexTranslator.
+
+	// GC vertex format
+	TVtxAttr m_VtxAttr;  // VAT decoded into easy format
+	TVtxDesc m_VtxDesc;  // Not really used currently - or well it is, but could be easily avoided.
+
+	// PC vertex format
+	NativeVertexFormat *m_NativeFmt;  // created in the constructor, deleted in the destructor
+	int native_stride;                // size in bytes of one native vertex. Computed by CompileVertexTranslator.
+
+	// Pipeline stages. Only filled and executed when the loader is built without USE_JIT;
+	// with USE_JIT the stages are emitted as calls inside the generated code instead.
+	TPipelineFunction m_PipelineStages[64];  // TODO - figure out real max. it's lower.
+	int m_numPipelineStages;
+
+	// Entry point of the generated translator code (NULL until it has been compiled).
+	const u8 *m_compiledCode;
+
+	// Running total of vertices passed to RunVertices, reported by AppendToString.
+	int m_numLoadedVertices;
+
+	// Decodes the three raw VAT words into m_VtxAttr.
+	void SetVAT(u32 _group0, u32 _group1, u32 _group2);
+
+	// Builds the translation pipeline and the native vertex declaration.
+	void CompileVertexTranslator();
+
+	// Adds one stage to the pipeline.
+	void WriteCall(TPipelineFunction);
+};									  
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_Color.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Color.cpp
@ -0,0 +1,232 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#ifndef _VERTEXLOADERCOLOR_H
+#define _VERTEXLOADERCOLOR_H
+
+#include "Common.h"
+#include "VideoCommon.h"
+#include "LookUpTables.h"
+#include "VertexLoader.h"
+#include "VertexLoader_Color.h"
+#include "NativeVertexWriter.h"
+
+// Bit position of each 8-bit channel inside the packed 32-bit colour that _SetCol
+// writes to the output buffer.
+#define RSHIFT 0
+#define GSHIFT 8
+#define BSHIFT 16
+#define ASHIFT 24
+
+// Colour slot currently being decoded; incremented by every _SetCol call and used
+// to select the colour array (ARRAY_COLOR+colIndex). Defined elsewhere.
+extern int colIndex;
+// Per-slot flag read by Color_ReadDirect_32b_8888: when zero, alpha is forced to 0xFF.
+extern int colElements[2];
+
+// Stores one packed 32-bit colour at the current output position, advances the
+// output pointer by the four bytes written and moves on to the next colour slot.
+inline void _SetCol(u32 val)
+{
+	u32 *out = (u32*)VertexManager::s_pCurBufferPointer;
+	out[0] = val;
+	VertexManager::s_pCurBufferPointer += 4;
+	++colIndex;
+}
+
+// Expands a 16-bit colour made of four 4-bit fields to 8 bits per channel through
+// lut4to8 and emits it. Fields from the top nibble down: R, G, B, A.
+void _SetCol4444(u16 val)
+{
+	const int r4 = (val >> 12) & 0xF;
+	const int g4 = (val >> 8) & 0xF;
+	const int b4 = (val >> 4) & 0xF;
+	const int a4 = (val >> 0) & 0xF;
+
+	u32 col = lut4to8[a4] << ASHIFT;
+	col |= lut4to8[r4] << RSHIFT;
+	col |= lut4to8[g4] << GSHIFT;
+	col |= lut4to8[b4] << BSHIFT;
+	_SetCol(col);
+}
+
+// Expands a 24-bit colour made of four 6-bit fields to 8 bits per channel through
+// lut6to8 and emits it. Fields from bit 18 down: R, G, B, A.
+void _SetCol6666(u32 val)
+{
+	const u32 r6 = (val >> 18) & 0x3F;
+	const u32 g6 = (val >> 12) & 0x3F;
+	const u32 b6 = (val >> 6) & 0x3F;
+	const u32 a6 = (val >> 0) & 0x3F;
+
+	u32 col = lut6to8[r6] << RSHIFT;
+	col |= lut6to8[g6] << GSHIFT;
+	col |= lut6to8[b6] << BSHIFT;
+	col |= lut6to8[a6] << ASHIFT;
+	_SetCol(col);
+}
+
+// Expands a 16-bit 5:6:5 colour (R in the top five bits, then G, then B) to 8 bits
+// per channel through lut5to8/lut6to8 and emits it with alpha forced to 0xFF.
+void _SetCol565(u16 val)
+{
+	const int r5 = (val >> 11) & 0x1f;
+	const int g6 = (val >> 5) & 0x3f;
+	const int b5 = val & 0x1f;
+
+	u32 col = lut5to8[r5] << RSHIFT;
+	col |= lut6to8[g6] << GSHIFT;
+	col |= lut5to8[b5] << BSHIFT;
+	col |= 0xFF << ASHIFT;
+	_SetCol(col);
+}
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Builds a packed colour from the three bytes at iAddress (R, G, B in that order)
+// and sets alpha to 0xFF.
+// The bytes go through Memory_Read_U8; the original author notes a direct pointer
+// into main memory would be faster, since the reads are known to hit main memory.
+inline u32 _Read24(u32 iAddress)
+{
+	const u32 r = Memory_Read_U8(iAddress);
+	const u32 g = Memory_Read_U8(iAddress+1);
+	const u32 b = Memory_Read_U8(iAddress+2);
+	return (r << RSHIFT) | (g << GSHIFT) | (b << BSHIFT) | (0xFF << ASHIFT);
+}
+
+// Builds a packed colour from the four bytes at iAddress (R, G, B, A in that order).
+// The bytes go through Memory_Read_U8; the original author notes a direct pointer
+// into main memory would be faster, since the reads are known to hit main memory.
+inline u32 _Read32(u32 iAddress)
+{
+	const u32 r = Memory_Read_U8(iAddress);
+	const u32 g = Memory_Read_U8(iAddress+1);
+	const u32 b = Memory_Read_U8(iAddress+2);
+	const u32 a = Memory_Read_U8(iAddress+3);
+	return (r << RSHIFT) | (g << GSHIFT) | (b << BSHIFT) | (a << ASHIFT);
+}
+
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+
+// Direct colour, three bytes (R, G, B) taken with DataReadU8(); alpha is set to 0xFF.
+void LOADERDECL Color_ReadDirect_24b_888()
+{
+	const u32 r = DataReadU8();
+	const u32 g = DataReadU8();
+	const u32 b = DataReadU8();
+	_SetCol((r << RSHIFT) | (g << GSHIFT) | (b << BSHIFT) | (0xFF << ASHIFT));
+}
+
+// Direct colour, four bytes: R, G, B are used, alpha is set to 0xFF and the fourth
+// byte is read only to consume it.
+void LOADERDECL Color_ReadDirect_32b_888x()
+{
+	const u32 r = DataReadU8();
+	const u32 g = DataReadU8();
+	const u32 b = DataReadU8();
+	_SetCol((r << RSHIFT) | (g << GSHIFT) | (b << BSHIFT) | (0xFF << ASHIFT));
+	// Discard the unused 'x' byte.
+	DataReadU8();
+}
+// Direct colour, one 16-bit 5:6:5 value taken with DataReadU16().
+void LOADERDECL Color_ReadDirect_16b_565()
+{
+	const u16 packed = DataReadU16();
+	_SetCol565(packed);
+}
+// Direct colour, one 16-bit 4:4:4:4 value taken with DataReadU16().
+void LOADERDECL Color_ReadDirect_16b_4444()
+{
+	const u16 packed = DataReadU16();
+	_SetCol4444(packed);
+}
+// Direct colour, three bytes forming a 24-bit 6:6:6:6 value; the first byte read
+// becomes the most significant one.
+void LOADERDECL Color_ReadDirect_24b_6666()
+{
+	const u32 hi = DataReadU8();
+	const u32 mid = DataReadU8();
+	const u32 lo = DataReadU8();
+	_SetCol6666((hi << 16) | (mid << 8) | lo);
+}
+
+// F|RES: I am not 100 percent sure, but colElements seems to matter for rendering only.
+// At least this fixes Mario Party 4.
+//
+//	if (colElements[colIndex])	
+//	else
+//		col |= 0xFF<<ASHIFT;
+//
+// Direct colour, four bytes (R, G, B, A). If colElements[colIndex] is zero the
+// alpha channel is forced to 0xFF.
+void LOADERDECL Color_ReadDirect_32b_8888()
+{
+	// TODO (mb2): check this
+	const u32 r = DataReadU8();
+	const u32 g = DataReadU8();
+	const u32 b = DataReadU8();
+	const u32 a = DataReadU8();
+	u32 col = (r << RSHIFT) | (g << GSHIFT) | (b << BSHIFT) | (a << ASHIFT);
+
+	// "kill" the alpha
+	if (colElements[colIndex] == 0)
+		col |= 0xFF << ASHIFT;
+
+	_SetCol(col);
+}
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Indexed colour (8-bit index), 16-bit 5:6:5 source. The address is the base of
+// colour array `colIndex` plus index times that array's stride.
+void LOADERDECL Color_ReadIndex8_16b_565()
+{
+	const u8 idx = DataReadU8();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol565(Memory_Read_U16(src));
+}
+// Indexed colour (8-bit index), three-byte R, G, B source; alpha comes out as 0xFF.
+void LOADERDECL Color_ReadIndex8_24b_888()
+{
+	const u8 idx = DataReadU8();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol(_Read24(src));
+}
+// Indexed colour (8-bit index), four-byte R, G, B, x source. Only the first three
+// bytes are used (via _Read24), so alpha comes out as 0xFF.
+void LOADERDECL Color_ReadIndex8_32b_888x()
+{
+	u8 Index = DataReadU8();
+	// The stride must come from the same colour array as the base. The previous
+	// code indexed arraystrides[ARRAY_COLOR] and added colIndex to the byte offset,
+	// which is wrong for the second colour slot.
+	u32 iAddress = arraybases[ARRAY_COLOR+colIndex] + (Index * arraystrides[ARRAY_COLOR+colIndex]);
+	_SetCol(_Read24(iAddress));
+}
+// Indexed colour (8-bit index), 16-bit 4:4:4:4 source.
+void LOADERDECL Color_ReadIndex8_16b_4444()
+{
+	const u8 idx = DataReadU8();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol4444(Memory_Read_U16(src));
+}
+// Indexed colour (8-bit index), 24-bit 6:6:6:6 source. The three bytes are combined
+// with the byte at the lowest address as the most significant one.
+void LOADERDECL Color_ReadIndex8_24b_6666()
+{
+	const u8 idx = DataReadU8();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+
+	// Same read order as before: lowest byte of the value first.
+	u32 packed = Memory_Read_U8(src+2);
+	packed |= Memory_Read_U8(src+1) << 8;
+	packed |= Memory_Read_U8(src) << 16;
+	_SetCol6666(packed);
+}
+// Indexed colour (8-bit index), four-byte R, G, B, A source.
+void LOADERDECL Color_ReadIndex8_32b_8888()
+{
+	const u8 idx = DataReadU8();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol(_Read32(src));
+}
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////
+// Indexed colour (16-bit index), 16-bit 5:6:5 source. The address is the base of
+// colour array `colIndex` plus index times that array's stride.
+void LOADERDECL Color_ReadIndex16_16b_565()
+{
+	const u16 idx = DataReadU16();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol565(Memory_Read_U16(src));
+}
+// Indexed colour (16-bit index), three-byte R, G, B source; alpha comes out as 0xFF.
+void LOADERDECL Color_ReadIndex16_24b_888()
+{
+	const u16 idx = DataReadU16();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol(_Read24(src));
+}
+// Indexed colour (16-bit index), four-byte R, G, B, x source. Only the first three
+// bytes are used (via _Read24), so alpha comes out as 0xFF.
+void LOADERDECL Color_ReadIndex16_32b_888x()
+{
+	const u16 idx = DataReadU16();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol(_Read24(src));
+}
+// Indexed colour (16-bit index), 16-bit 4:4:4:4 source.
+void LOADERDECL Color_ReadIndex16_16b_4444()
+{
+	const u16 idx = DataReadU16();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol4444(Memory_Read_U16(src));
+}
+// Indexed colour (16-bit index), 24-bit 6:6:6:6 source. The three bytes are combined
+// with the byte at the lowest address as the most significant one.
+void LOADERDECL Color_ReadIndex16_24b_6666()
+{
+	const u16 idx = DataReadU16();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+
+	// Same read order as before: lowest byte of the value first.
+	u32 packed = Memory_Read_U8(src+2);
+	packed |= Memory_Read_U8(src+1) << 8;
+	packed |= Memory_Read_U8(src) << 16;
+	_SetCol6666(packed);
+}
+// Indexed colour (16-bit index), four-byte R, G, B, A source.
+void LOADERDECL Color_ReadIndex16_32b_8888()
+{
+	const u16 idx = DataReadU16();
+	const int slot = ARRAY_COLOR + colIndex;
+	const u32 src = arraybases[slot] + (idx * arraystrides[slot]);
+	_SetCol(_Read32(src));
+}
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_Color.h
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Color.h
@ -0,0 +1,42 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#ifndef _VERTEXLOADERCOLOR_H
+#define _VERTEXLOADERCOLOR_H
+
+// Colour readers. The name encodes how the colour is fetched (Direct, Index8 or
+// Index16), the source size in bits, and the channel layout.
+
+// Colour data read inline.
+void LOADERDECL Color_ReadDirect_24b_888();
+void LOADERDECL Color_ReadDirect_32b_888x();
+void LOADERDECL Color_ReadDirect_16b_565();
+void LOADERDECL Color_ReadDirect_16b_4444();
+void LOADERDECL Color_ReadDirect_24b_6666();
+void LOADERDECL Color_ReadDirect_32b_8888();
+
+// Colour fetched from an array through an 8-bit index.
+void LOADERDECL Color_ReadIndex8_16b_565();
+void LOADERDECL Color_ReadIndex8_24b_888();
+void LOADERDECL Color_ReadIndex8_32b_888x();
+void LOADERDECL Color_ReadIndex8_16b_4444();
+void LOADERDECL Color_ReadIndex8_24b_6666();
+void LOADERDECL Color_ReadIndex8_32b_8888();
+
+// Colour fetched from an array through a 16-bit index.
+void LOADERDECL Color_ReadIndex16_16b_565();
+void LOADERDECL Color_ReadIndex16_24b_888();
+void LOADERDECL Color_ReadIndex16_32b_888x();
+void LOADERDECL Color_ReadIndex16_16b_4444();
+void LOADERDECL Color_ReadIndex16_24b_6666();
+void LOADERDECL Color_ReadIndex16_32b_8888();
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp
@ -0,0 +1,424 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#include "Common.h"
+#include "VideoCommon.h"
+#include "VertexLoader.h"
+#include "VertexLoader_Normal.h"
+#include "NativeVertexWriter.h"
+
+// Debug logging of the normal that was just written, addressed relative to the
+// already-advanced output pointer.
+// Byte and short normals occupy four elements (three written plus one skipped), so
+// their components sit at [-4], [-3], [-2]; [-1] is the unwritten slot. Float
+// normals occupy exactly three elements, so [-3], [-2], [-1] is correct for them.
+#define LOG_NORM8() PRIM_LOG("norm: %f %f %f, ", ((s8*)VertexManager::s_pCurBufferPointer)[-4]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-3]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-2]/127.0f);
+#define LOG_NORM16() PRIM_LOG("norm: %f %f %f, ", ((s16*)VertexManager::s_pCurBufferPointer)[-4]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-3]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-2]/32767.0f);
+#define LOG_NORMF() PRIM_LOG("norm: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[-3], ((float*)VertexManager::s_pCurBufferPointer)[-2], ((float*)VertexManager::s_pCurBufferPointer)[-1]);
+
+// Lookup table of (stream size, reader function), indexed as
+// [type][index mode][element count][component format]. Filled in by Init().
+VertexLoader_Normal::Set VertexLoader_Normal::m_Table[NUM_NRM_TYPE][NUM_NRM_INDICES][NUM_NRM_ELEMENTS][NUM_NRM_FORMAT];
+
+// Fills m_Table for every present normal type. Each entry pairs the size passed as
+// the first Set argument (the figure GetSize() returns) with the function that reads
+// that encoding.
+void VertexLoader_Normal::Init(void)
+{
+    // Entries that do not depend on the index mode: all direct encodings, and the
+    // single-normal (NRM_NBT) indexed encodings.
+    for (int idx = NRM_INDICES1; idx < NUM_NRM_INDICES; idx++)
+    {
+        m_Table[NRM_DIRECT][idx][NRM_NBT][FORMAT_BYTE]    = Set(3,  Normal_DirectByte);
+        m_Table[NRM_DIRECT][idx][NRM_NBT][FORMAT_SHORT]   = Set(6,  Normal_DirectShort);
+        m_Table[NRM_DIRECT][idx][NRM_NBT][FORMAT_FLOAT]   = Set(12, Normal_DirectFloat);
+        m_Table[NRM_DIRECT][idx][NRM_NBT3][FORMAT_BYTE]   = Set(9,  Normal_DirectByte3);
+        m_Table[NRM_DIRECT][idx][NRM_NBT3][FORMAT_SHORT]  = Set(18, Normal_DirectShort3);
+        m_Table[NRM_DIRECT][idx][NRM_NBT3][FORMAT_FLOAT]  = Set(36, Normal_DirectFloat3);
+
+        m_Table[NRM_INDEX8][idx][NRM_NBT][FORMAT_BYTE]    = Set(1,  Normal_Index8_Byte);
+        m_Table[NRM_INDEX8][idx][NRM_NBT][FORMAT_SHORT]   = Set(1,  Normal_Index8_Short);
+        m_Table[NRM_INDEX8][idx][NRM_NBT][FORMAT_FLOAT]   = Set(1,  Normal_Index8_Float);
+
+        m_Table[NRM_INDEX16][idx][NRM_NBT][FORMAT_BYTE]   = Set(2,  Normal_Index16_Byte);
+        m_Table[NRM_INDEX16][idx][NRM_NBT][FORMAT_SHORT]  = Set(2,  Normal_Index16_Short);
+        m_Table[NRM_INDEX16][idx][NRM_NBT][FORMAT_FLOAT]  = Set(2,  Normal_Index16_Float);
+    }
+
+    // Three-normal (NRM_NBT3) indexed encodings: one shared index ...
+    m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE]   = Set(1, Normal_Index8_Byte3_Indices1);
+    m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT]  = Set(1, Normal_Index8_Short3_Indices1);
+    m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT]  = Set(1, Normal_Index8_Float3_Indices1);
+    m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE]  = Set(2, Normal_Index16_Byte3_Indices1);
+    m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Set(2, Normal_Index16_Short3_Indices1);
+    m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Set(2, Normal_Index16_Float3_Indices1);
+
+    // ... or one index per normal.
+    m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE]   = Set(3, Normal_Index8_Byte3_Indices3);
+    m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT]  = Set(3, Normal_Index8_Short3_Indices3);
+    m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT]  = Set(3, Normal_Index8_Float3_Indices3);
+    m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE]  = Set(6, Normal_Index16_Byte3_Indices3);
+    m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Set(6, Normal_Index16_Short3_Indices3);
+    m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Set(6, Normal_Index16_Float3_Indices3);
+
+    // HACK: the unsigned formats reuse the entries of their signed counterparts.
+    for (int type = NRM_DIRECT; type < NUM_NRM_TYPE; type++)
+    {
+        for (int idx = NRM_INDICES1; idx < NUM_NRM_INDICES; idx++)
+        {
+            for (int elem = NRM_NBT; elem < NUM_NRM_ELEMENTS; elem++)
+            {
+                m_Table[type][idx][elem][FORMAT_UBYTE]  = m_Table[type][idx][elem][FORMAT_BYTE];
+                m_Table[type][idx][elem][FORMAT_USHORT] = m_Table[type][idx][elem][FORMAT_SHORT];
+            }
+        }
+    }
+}
+
+// Returns the gc_size stored in m_Table for the given combination. The arguments
+// index the table directly and are not range-checked.
+unsigned int VertexLoader_Normal::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements, unsigned int _index3)
+{
+	const Set &entry = m_Table[_type][_index3][_elements][_format];
+	return entry.gc_size;
+}
+
+// Returns the reader function stored in m_Table for the given combination. The
+// arguments index the table directly and are not range-checked.
+TPipelineFunction VertexLoader_Normal::GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements, unsigned int _index3)
+{
+    return m_Table[_type][_index3][_elements][_format].function;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// --- Direct ---
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// Direct normal, three bytes taken with DataReadU8(). A fourth output byte is
+// skipped without being written, so the normal occupies four bytes in the output.
+void LOADERDECL VertexLoader_Normal::Normal_DirectByte()
+{
+    for (int comp = 0; comp < 3; comp++)
+        *VertexManager::s_pCurBufferPointer++ = DataReadU8();
+    VertexManager::s_pCurBufferPointer++;
+    LOG_NORM8();
+}
+
+// Direct normal, three 16-bit components taken with DataReadU16(). The output
+// pointer advances by eight bytes: six written plus two left untouched.
+void LOADERDECL VertexLoader_Normal::Normal_DirectShort()
+{
+    for (int comp = 0; comp < 3; comp++)
+        ((u16*)VertexManager::s_pCurBufferPointer)[comp] = DataReadU16();
+    VertexManager::s_pCurBufferPointer += 8;
+    LOG_NORM16();
+}
+
+// Direct normal, three 32-bit components taken with DataReadU32() and stored as raw
+// 32-bit words; the output pointer advances by twelve bytes.
+void LOADERDECL VertexLoader_Normal::Normal_DirectFloat()
+{
+    for (int comp = 0; comp < 3; comp++)
+        ((u32*)VertexManager::s_pCurBufferPointer)[comp] = DataReadU32();
+    VertexManager::s_pCurBufferPointer += 12;
+    LOG_NORMF();
+}
+
+// Direct, three consecutive byte normals. Each takes three bytes from DataReadU8()
+// and occupies four output bytes (the fourth is skipped, not written).
+void LOADERDECL VertexLoader_Normal::Normal_DirectByte3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        for (int comp = 0; comp < 3; comp++)
+            *VertexManager::s_pCurBufferPointer++ = DataReadU8();
+        VertexManager::s_pCurBufferPointer++;
+        LOG_NORM8();
+    }
+}
+
+// Direct, three consecutive 16-bit normals. Each takes three DataReadU16() values
+// and occupies eight output bytes (six written, two left untouched).
+void LOADERDECL VertexLoader_Normal::Normal_DirectShort3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        for (int comp = 0; comp < 3; comp++)
+            ((u16*)VertexManager::s_pCurBufferPointer)[comp] = DataReadU16();
+        VertexManager::s_pCurBufferPointer += 8;
+        LOG_NORM16();
+    }
+}
+
+// Direct, three consecutive 32-bit normals. Each takes three DataReadU32() values
+// stored as raw words and occupies twelve output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_DirectFloat3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        for (int comp = 0; comp < 3; comp++)
+            ((u32*)VertexManager::s_pCurBufferPointer)[comp] = DataReadU32();
+        VertexManager::s_pCurBufferPointer += 12;
+        LOG_NORMF();
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// --- Index8 ---
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// Indexed normal (8-bit index), byte components. The source address is the normal
+// array base plus index times stride; three bytes are copied and a fourth output
+// byte is skipped.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Byte()
+{
+    const u8 idx = DataReadU8();
+    const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int comp = 0; comp < 3; comp++)
+        *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(src + comp);
+    VertexManager::s_pCurBufferPointer++;
+    LOG_NORM8();
+}
+
+// Indexed normal (8-bit index), 16-bit components. Three values are copied from
+// the normal array; the output pointer advances by eight bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Short()
+{
+    const u8 idx = DataReadU8();
+    const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int comp = 0; comp < 3; comp++)
+        ((u16*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U16(src + 2*comp);
+    VertexManager::s_pCurBufferPointer += 8;
+    LOG_NORM16();
+}
+
+// Indexed normal (8-bit index), 32-bit components. Three raw words are copied from
+// the normal array; the output pointer advances by twelve bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Float()
+{
+    const u8 idx = DataReadU8();
+    const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int comp = 0; comp < 3; comp++)
+        ((u32*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U32(src + 4*comp);
+    VertexManager::s_pCurBufferPointer += 12;
+    LOG_NORMF();
+}
+
+// One 8-bit index selecting three consecutive byte normals (3 bytes apart in the
+// source). Each occupies four output bytes, the fourth being skipped.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Byte3_Indices1()
+{
+    const u8 idx = DataReadU8();
+    const u32 base = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u32 src = base + 1*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(src + comp);
+        VertexManager::s_pCurBufferPointer++;
+        LOG_NORM8();
+    }
+}
+
+// One 8-bit index selecting three consecutive 16-bit normals (6 bytes apart in the
+// source). Each occupies eight output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Short3_Indices1()
+{
+    const u8 idx = DataReadU8();
+    const u32 base = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u32 src = base + 2*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u16*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U16(src + 2*comp);
+        VertexManager::s_pCurBufferPointer += 8;
+        LOG_NORM16();
+    }
+}
+
+// One 8-bit index selecting three consecutive 32-bit normals (12 bytes apart in
+// the source). Each occupies twelve output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Float3_Indices1()
+{
+    const u8 idx = DataReadU8();
+    const u32 base = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u32 src = base + 4*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u32*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U32(src + 4*comp);
+        VertexManager::s_pCurBufferPointer += 12;
+        LOG_NORMF();
+    }
+}
+
+// Three 8-bit indices, one per byte normal. Normal i is read from its own indexed
+// entry at an extra offset of 3*i bytes, and occupies four output bytes (three
+// written, one skipped).
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Byte3_Indices3()
+{
+    for (int i = 0; i < 3; i++)
+	{
+        u8 Index = DataReadU8();
+        u32 iAddress = arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL]) + 1*3*i;
+        *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(iAddress);
+        *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(iAddress+1);
+        *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(iAddress+2);
+        // Skip the fourth byte. The stray dereference that used to be here had no
+        // effect and differed from every sibling reader.
+        VertexManager::s_pCurBufferPointer++;
+        LOG_NORM8();
+    }
+}
+
+// Three 8-bit indices, one per 16-bit normal. Normal `vec` is read from its own
+// indexed entry at an extra offset of 6*vec bytes and occupies eight output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Short3_Indices3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u8 idx = DataReadU8();
+        const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]) + 2*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u16*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U16(src + 2*comp);
+        VertexManager::s_pCurBufferPointer += 8;
+        LOG_NORM16();
+    }
+}
+
+// Three 8-bit indices, one per 32-bit normal. Normal `vec` is read from its own
+// indexed entry at an extra offset of 12*vec bytes and occupies twelve output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index8_Float3_Indices3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u8 idx = DataReadU8();
+        const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]) + 4*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u32*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U32(src + 4*comp);
+        VertexManager::s_pCurBufferPointer += 12;
+        LOG_NORMF();
+    }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// --- Index16 ---
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Indexed normal (16-bit index), byte components. The source address is the normal
+// array base plus index times stride; three bytes are copied and a fourth output
+// byte is skipped.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte()
+{
+    const u16 idx = DataReadU16();
+    const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int comp = 0; comp < 3; comp++)
+        *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(src + comp);
+    VertexManager::s_pCurBufferPointer++;
+    LOG_NORM8();
+}
+
+// Indexed normal (16-bit index), 16-bit components. Three values are copied from
+// the normal array; the output pointer advances by eight bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Short()
+{
+    const u16 idx = DataReadU16();
+    const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int comp = 0; comp < 3; comp++)
+        ((u16*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U16(src + 2*comp);
+    VertexManager::s_pCurBufferPointer += 8;
+    LOG_NORM16();
+}
+
+// Indexed normal (16-bit index), 32-bit components. Three raw words are copied
+// from the normal array; the output pointer advances by twelve bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Float()
+{
+    const u16 idx = DataReadU16();
+    const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int comp = 0; comp < 3; comp++)
+        ((u32*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U32(src + 4*comp);
+    VertexManager::s_pCurBufferPointer += 12;
+    LOG_NORMF();
+}
+
+// One 16-bit index selecting three consecutive byte normals (3 bytes apart in the
+// source). Each occupies four output bytes, the fourth being skipped.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte3_Indices1()
+{
+    const u16 idx = DataReadU16();
+    const u32 base = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u32 src = base + 1*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(src + comp);
+        VertexManager::s_pCurBufferPointer++;
+        LOG_NORM8();
+    }
+}
+
+// One 16-bit index selecting three consecutive 16-bit normals (6 bytes apart in
+// the source). Each occupies eight output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Short3_Indices1()
+{
+    const u16 idx = DataReadU16();
+    const u32 base = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u32 src = base + 2*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u16*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U16(src + 2*comp);
+        VertexManager::s_pCurBufferPointer += 8;
+        LOG_NORM16();
+    }
+}
+
+// One 16-bit index selecting three consecutive 32-bit normals (12 bytes apart in
+// the source). Each occupies twelve output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Float3_Indices1()
+{
+    const u16 idx = DataReadU16();
+    const u32 base = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]);
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u32 src = base + 4*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u32*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U32(src + 4*comp);
+        VertexManager::s_pCurBufferPointer += 12;
+        LOG_NORMF();
+    }
+}
+
+// Three 16-bit indices, one per byte normal. Normal `vec` is read from its own
+// indexed entry at an extra offset of 3*vec bytes and occupies four output bytes
+// (three written, one skipped).
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte3_Indices3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u16 idx = DataReadU16();
+        const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]) + 1*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            *VertexManager::s_pCurBufferPointer++ = Memory_Read_U8(src + comp);
+        VertexManager::s_pCurBufferPointer++;
+        LOG_NORM8();
+    }
+}
+
+// Three 16-bit indices, one per 16-bit normal. Normal `vec` is read from its own
+// indexed entry at an extra offset of 6*vec bytes and occupies eight output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Short3_Indices3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u16 idx = DataReadU16();
+        const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]) + 2*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u16*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U16(src + 2*comp);
+        VertexManager::s_pCurBufferPointer += 8;
+        LOG_NORM16();
+    }
+}
+
+// Three 16-bit indices, one per 32-bit normal. Normal `vec` is read from its own
+// indexed entry at an extra offset of 12*vec bytes and occupies twelve output bytes.
+void LOADERDECL VertexLoader_Normal::Normal_Index16_Float3_Indices3()
+{
+    for (int vec = 0; vec < 3; vec++)
+    {
+        const u16 idx = DataReadU16();
+        const u32 src = arraybases[ARRAY_NORMAL] + (idx * arraystrides[ARRAY_NORMAL]) + 4*3*vec;
+        for (int comp = 0; comp < 3; comp++)
+            ((u32*)VertexManager::s_pCurBufferPointer)[comp] = Memory_Read_U32(src + 4*comp);
+        VertexManager::s_pCurBufferPointer += 12;
+        LOG_NORMF();
+    }
+}
--- a/Source/Core/VideoCommon/Src/VertexLoader_Normal.h
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Normal.h
@ -0,0 +1,111 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#ifndef _VERTEXLOADER_NORMAL_H
+#define _VERTEXLOADER_NORMAL_H
+
+#include "CommonTypes.h"
+
+// Static-only helper for normal-vector loading. Judging by the signatures,
+// GetSize() and GetFunction() are keyed by the same four attributes (type,
+// format, elements, index3) and m_Table holds one entry per combination.
+// The loader functions themselves are private.
+class VertexLoader_Normal
+{
+public:
+
+    // Init
+    // NOTE(review): presumably fills m_Table; the definition is not in this header.
+    static void Init(void);
+
+    // GetSize
+    static unsigned int GetSize(unsigned int _type, unsigned int _format, unsigned int _elements, unsigned int _index3);
+
+    // GetFunction
+    static TPipelineFunction GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements, unsigned int _index3);
+
+private:
+    // How the normal is supplied; first dimension of m_Table.
+    enum ENormalType
+    {
+        NRM_NOT_PRESENT		= 0,
+        NRM_DIRECT			= 1,
+        NRM_INDEX8			= 2,
+        NRM_INDEX16			= 3,
+        NUM_NRM_TYPE
+    };
+
+    // Component format; last dimension of m_Table.
+    enum ENormalFormat
+    {
+        FORMAT_UBYTE		= 0,
+        FORMAT_BYTE			= 1,
+        FORMAT_USHORT		= 2,
+        FORMAT_SHORT		= 3,
+        FORMAT_FLOAT		= 4,
+        NUM_NRM_FORMAT
+    };
+
+    // Element selection; third dimension of m_Table.
+    // NOTE(review): presumably one vector vs. three vectors per vertex - confirm.
+    enum ENormalElements
+    {
+        NRM_NBT				= 0,
+        NRM_NBT3			= 1,
+        NUM_NRM_ELEMENTS
+    };
+
+	// Index mode; second dimension of m_Table. The *_Indices1 and *_Indices3
+	// loaders declared below correspond to these two values.
+	enum ENormalIndices
+	{
+		NRM_INDICES1		= 0,
+		NRM_INDICES3		= 1,
+		NUM_NRM_INDICES
+	};
+
+	// One table entry: a size value paired with a loader function.
+	struct Set {
+		// Zero both members so a default-constructed entry never holds
+		// indeterminate values.
+		Set() : gc_size(0), function(0) {}
+		Set(int gc_size_, TPipelineFunction function_) : gc_size(gc_size_), function(function_) {}
+		// NOTE(review): presumably the attribute size in the source vertex data - confirm.
+		int gc_size;
+		TPipelineFunction function;
+//		int pc_size;
+	};
+
+	static Set m_Table[NUM_NRM_TYPE][NUM_NRM_INDICES][NUM_NRM_ELEMENTS][NUM_NRM_FORMAT];
+
+    // direct
+    static void LOADERDECL Normal_DirectByte();
+    static void LOADERDECL Normal_DirectShort();
+    static void LOADERDECL Normal_DirectFloat();
+    static void LOADERDECL Normal_DirectByte3();
+    static void LOADERDECL Normal_DirectShort3();
+    static void LOADERDECL Normal_DirectFloat3();
+
+    // index8
+    static void LOADERDECL Normal_Index8_Byte();
+    static void LOADERDECL Normal_Index8_Short();
+    static void LOADERDECL Normal_Index8_Float();
+    static void LOADERDECL Normal_Index8_Byte3_Indices1();
+    static void LOADERDECL Normal_Index8_Short3_Indices1();
+    static void LOADERDECL Normal_Index8_Float3_Indices1();
+	static void LOADERDECL Normal_Index8_Byte3_Indices3();
+    static void LOADERDECL Normal_Index8_Short3_Indices3();
+    static void LOADERDECL Normal_Index8_Float3_Indices3();
+
+    // index16
+    static void LOADERDECL Normal_Index16_Byte();
+    static void LOADERDECL Normal_Index16_Short();
+    static void LOADERDECL Normal_Index16_Float();
+    static void LOADERDECL Normal_Index16_Byte3_Indices1();
+    static void LOADERDECL Normal_Index16_Short3_Indices1();
+    static void LOADERDECL Normal_Index16_Float3_Indices1();
+	static void LOADERDECL Normal_Index16_Byte3_Indices3();
+    static void LOADERDECL Normal_Index16_Short3_Indices3();
+    static void LOADERDECL Normal_Index16_Float3_Indices3();
+};
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
@ -0,0 +1,242 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+// Guard for this translation unit. It uses a name different from the guard in
+// VertexLoader_Position.h on purpose: defining the header's guard macro here
+// would make the #include of that header below expand to nothing.
+#ifndef VERTEXLOADER_POSITION_CPP
+#define VERTEXLOADER_POSITION_CPP
+
+#include "Common.h"
+#include "VideoCommon.h"
+#include "VertexLoader.h"
+#include "VertexLoader_Position.h"
+#include "NativeVertexWriter.h"
+
+extern float posScale;
+extern TVtxAttr *pVtxAttr;
+
+// Logs the three floats located at the current output position. Defined
+// without a trailing semicolon so that "LOG_VTX();" is exactly one statement.
+#define LOG_VTX() PRIM_LOG("vtx: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0], ((float*)VertexManager::s_pCurBufferPointer)[1], ((float*)VertexManager::s_pCurBufferPointer)[2])
+
+// Thoughts on the implementation of a vertex loader compiler.
+// s_pCurBufferPointer should definitely be in a register.
+// Could load the position scale factor in XMM7, for example.
+
+// The pointer inside DataReadU8 is another candidate for a register.
+// Let's check out Pos_ReadDirect_UByte(). For Byte, replace MOVZX with MOVSX.
+
+/*
+MOVZX(32, R(EAX), MOffset(ESI, 0));
+MOVZX(32, R(EBX), MOffset(ESI, 1));
+MOVZX(32, R(ECX), MOffset(ESI, 2));
+MOVD(XMM0, R(EAX));
+MOVD(XMM1, R(EBX));
+MOVD(XMM2, R(ECX));                   
+CVTDQ2PS(XMM0, XMM0);
+CVTDQ2PS(XMM1, XMM1);
+CVTDQ2PS(XMM2, XMM2);
+MULSS(XMM0, XMM7);
+MULSS(XMM1, XMM7);
+MULSS(XMM2, XMM7);
+MOVSS(MOffset(EDI, 0), XMM0);
+MOVSS(MOffset(EDI, 4), XMM1);
+MOVSS(MOffset(EDI, 8), XMM2);
+
+Alternatively, lookup table:
+MOVZX(32, R(EAX), MOffset(ESI, 0));
+MOVZX(32, R(EBX), MOffset(ESI, 1));
+MOVZX(32, R(ECX), MOffset(ESI, 2));
+MOV(32, R(EAX), MComplex(LUTREG, EAX, 4));
+MOV(32, R(EBX), MComplex(LUTREG, EBX, 4));
+MOV(32, R(ECX), MComplex(LUTREG, ECX, 4));
+MOV(MOffset(EDI, 0), XMM0);
+MOV(MOffset(EDI, 4), XMM1);
+MOV(MOffset(EDI, 8), XMM2);
+
+SSE4:
+PINSRB(XMM0, MOffset(ESI, 0), 0);
+PINSRB(XMM0, MOffset(ESI, 1), 4);
+PINSRB(XMM0, MOffset(ESI, 2), 8);
+CVTDQ2PS(XMM0, XMM0);
+<two unpacks here to sign extend>
+MULPS(XMM0, XMM7);
+MOVUPS(MOffset(EDI, 0), XMM0);
+
+									 */
+
+// ==============================================================================
+// Direct
+// ==============================================================================
+// Reads a position of unsigned 8-bit components straight from the vertex
+// stream and stores it as three floats scaled by posScale. When PosElements is
+// zero only two components are read and the third float is set to 1.0.
+void LOADERDECL Pos_ReadDirect_UByte()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)DataReadU8() * posScale;
+	dst[1] = (float)DataReadU8() * posScale;
+	dst[2] = pVtxAttr->PosElements ? (float)DataReadU8() * posScale : 1.0f;
+	LOG_VTX();
+	VertexManager::s_pCurBufferPointer += 12;
+}
+
+// Reads a position of signed 8-bit components straight from the vertex stream
+// and stores it as three floats scaled by posScale. When PosElements is zero
+// only two components are read and the third float is set to 1.0.
+void LOADERDECL Pos_ReadDirect_Byte()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)DataReadU8() * posScale;
+	dst[1] = (float)(s8)DataReadU8() * posScale;
+	dst[2] = pVtxAttr->PosElements ? (float)(s8)DataReadU8() * posScale : 1.0f;
+	LOG_VTX();
+	VertexManager::s_pCurBufferPointer += 12;
+}
+
+// Reads a position of unsigned 16-bit components straight from the vertex
+// stream and stores it as three floats scaled by posScale. When PosElements is
+// zero only two components are read and the third float is set to 1.0.
+void LOADERDECL Pos_ReadDirect_UShort()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)DataReadU16() * posScale;
+	dst[1] = (float)DataReadU16() * posScale;
+	dst[2] = pVtxAttr->PosElements ? (float)DataReadU16() * posScale : 1.0f;
+	LOG_VTX();
+	VertexManager::s_pCurBufferPointer += 12;
+}
+
+// Reads a position of signed 16-bit components straight from the vertex
+// stream and stores it as three floats scaled by posScale. When PosElements is
+// zero only two components are read and the third float is set to 1.0.
+void LOADERDECL Pos_ReadDirect_Short()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)DataReadU16() * posScale;
+	dst[1] = (float)(s16)DataReadU16() * posScale;
+	dst[2] = pVtxAttr->PosElements ? (float)(s16)DataReadU16() * posScale : 1.0f;
+	LOG_VTX();
+	VertexManager::s_pCurBufferPointer += 12;
+}
+
+// Reads a position of 32-bit components straight from the vertex stream. The
+// components are moved as raw 32-bit words, so no scaling or float conversion
+// takes place. When PosElements is zero the third slot is set to 1.0f.
+void LOADERDECL Pos_ReadDirect_Float()
+{
+	u32* const words = (u32*)VertexManager::s_pCurBufferPointer;
+	words[0] = DataReadU32();
+	words[1] = DataReadU32();
+	if (!pVtxAttr->PosElements)
+		((float*)words)[2] = 1.0f;
+	else
+		words[2] = DataReadU32();
+	LOG_VTX();
+	VertexManager::s_pCurBufferPointer += 12;
+}
+
+#define Pos_ReadIndex_Byte(T) { \
+	u32 iAddress = arraybases[ARRAY_POSITION] + ((u32)Index * arraystrides[ARRAY_POSITION]); \
+	((float*)VertexManager::s_pCurBufferPointer)[0] = ((float)(T)Memory_Read_U8(iAddress)) * posScale; \
+	((float*)VertexManager::s_pCurBufferPointer)[1] = ((float)(T)Memory_Read_U8(iAddress+1)) * posScale; \
+	if (pVtxAttr->PosElements) \
+		((float*)VertexManager::s_pCurBufferPointer)[2] = ((float)(T)Memory_Read_U8(iAddress+2)) * posScale; \
+	else \
+		((float*)VertexManager::s_pCurBufferPointer)[2] = 1.0f; \
+	LOG_VTX(); \
+	VertexManager::s_pCurBufferPointer += 12; \
+}
+
+#define Pos_ReadIndex_Short(T) { \
+	u32 iAddress = arraybases[ARRAY_POSITION] + ((u32)Index * arraystrides[ARRAY_POSITION]); \
+	((float*)VertexManager::s_pCurBufferPointer)[0] = ((float)(T)Memory_Read_U16(iAddress)) * posScale; \
+	((float*)VertexManager::s_pCurBufferPointer)[1] = ((float)(T)Memory_Read_U16(iAddress+2)) * posScale; \
+	if (pVtxAttr->PosElements) \
+		((float*)VertexManager::s_pCurBufferPointer)[2] = ((float)(T)Memory_Read_U16(iAddress+4)) * posScale; \
+	else \
+		((float*)VertexManager::s_pCurBufferPointer)[2] = 1.0f; \
+	LOG_VTX(); \
+	VertexManager::s_pCurBufferPointer += 12; \
+}
+
+#define Pos_ReadIndex_Float() { \
+	u32 iAddress = arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]); \
+	((u32*)VertexManager::s_pCurBufferPointer)[0] = Memory_Read_U32(iAddress); \
+	((u32*)VertexManager::s_pCurBufferPointer)[1] = Memory_Read_U32(iAddress+4); \
+	if (pVtxAttr->PosElements) \
+		((u32*)VertexManager::s_pCurBufferPointer)[2] = Memory_Read_U32(iAddress+8); \
+	else \
+		((float*)VertexManager::s_pCurBufferPointer)[2] = 1.0f; \
+	LOG_VTX(); \
+	VertexManager::s_pCurBufferPointer += 12; \
+}
+
+// ==============================================================================
+// Index 8
+// ==============================================================================
+// Position loader: 8-bit index from the stream, unsigned byte components.
+void LOADERDECL Pos_ReadIndex8_UByte()
+{
+	// Pos_ReadIndex_Byte picks this variable up by name.
+	const u8 Index = DataReadU8();
+	Pos_ReadIndex_Byte(u8);
+}
+
+// Position loader: 8-bit index from the stream, signed byte components.
+void LOADERDECL Pos_ReadIndex8_Byte()
+{
+	// Pos_ReadIndex_Byte picks this variable up by name.
+	const u8 Index = DataReadU8();
+	Pos_ReadIndex_Byte(s8);
+}
+
+// Position loader: 8-bit index from the stream, unsigned 16-bit components.
+void LOADERDECL Pos_ReadIndex8_UShort()
+{
+	// Pos_ReadIndex_Short picks this variable up by name.
+	const u8 Index = DataReadU8();
+	Pos_ReadIndex_Short(u16);
+}
+
+// Position loader: 8-bit index from the stream, signed 16-bit components.
+void LOADERDECL Pos_ReadIndex8_Short()
+{
+	// Pos_ReadIndex_Short picks this variable up by name.
+	const u8 Index = DataReadU8();
+	Pos_ReadIndex_Short(s16);
+}
+
+// Position loader: 8-bit index from the stream, 32-bit components copied raw.
+void LOADERDECL Pos_ReadIndex8_Float()
+{
+	// Pos_ReadIndex_Float picks this variable up by name.
+	const u8 Index = DataReadU8();
+	Pos_ReadIndex_Float();
+}
+
+// ==============================================================================
+// Index 16
+// ==============================================================================
+
+// Position loader: 16-bit index from the stream, unsigned byte components.
+void LOADERDECL Pos_ReadIndex16_UByte()
+{
+	// Pos_ReadIndex_Byte picks this variable up by name.
+	const u16 Index = DataReadU16();
+	Pos_ReadIndex_Byte(u8);
+}
+
+// Position loader: 16-bit index from the stream, signed byte components.
+void LOADERDECL Pos_ReadIndex16_Byte()
+{
+	// Pos_ReadIndex_Byte picks this variable up by name.
+	const u16 Index = DataReadU16();
+	Pos_ReadIndex_Byte(s8);
+}
+
+// Position loader: 16-bit index from the stream, unsigned 16-bit components.
+void LOADERDECL Pos_ReadIndex16_UShort()
+{
+	// Pos_ReadIndex_Short picks this variable up by name.
+	const u16 Index = DataReadU16();
+	Pos_ReadIndex_Short(u16);
+}
+
+// Position loader: 16-bit index from the stream, signed 16-bit components.
+void LOADERDECL Pos_ReadIndex16_Short()
+{
+	// Pos_ReadIndex_Short picks this variable up by name.
+	const u16 Index = DataReadU16();
+	Pos_ReadIndex_Short(s16);
+}
+
+// Position loader: 16-bit index from the stream, 32-bit components copied raw.
+void LOADERDECL Pos_ReadIndex16_Float()
+{
+	// Pos_ReadIndex_Float picks this variable up by name.
+	const u16 Index = DataReadU16();
+	Pos_ReadIndex_Float();
+}
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_Position.h
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.h
@ -0,0 +1,39 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#ifndef VERTEXLOADER_POSITION_H
+#define VERTEXLOADER_POSITION_H
+
+// Position loaders. Each one writes a three-component position to the vertex
+// output buffer and advances the output pointer by 12 bytes.
+
+// Components are read directly from the vertex stream.
+void LOADERDECL Pos_ReadDirect_UByte();
+void LOADERDECL Pos_ReadDirect_Byte();
+void LOADERDECL Pos_ReadDirect_UShort();
+void LOADERDECL Pos_ReadDirect_Short();
+void LOADERDECL Pos_ReadDirect_Float();
+
+// An 8-bit index is read from the stream; components come from the position array.
+void LOADERDECL Pos_ReadIndex8_UByte();
+void LOADERDECL Pos_ReadIndex8_Byte();
+void LOADERDECL Pos_ReadIndex8_UShort();
+void LOADERDECL Pos_ReadIndex8_Short();
+void LOADERDECL Pos_ReadIndex8_Float();
+
+// A 16-bit index is read from the stream; components come from the position array.
+void LOADERDECL Pos_ReadIndex16_UByte();
+void LOADERDECL Pos_ReadIndex16_Byte();
+void LOADERDECL Pos_ReadIndex16_UShort();
+void LOADERDECL Pos_ReadIndex16_Short();
+void LOADERDECL Pos_ReadIndex16_Float();
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp
@ -0,0 +1,338 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+// Guard for this translation unit. It uses a name different from the guard in
+// VertexLoader_TextCoord.h on purpose: defining the header's guard macro here
+// would make an #include of that header expand to nothing.
+#ifndef VERTEXLOADER_TEXCOORD_CPP
+#define VERTEXLOADER_TEXCOORD_CPP
+
+#include "Common.h"
+#include "VideoCommon.h"
+#include "VertexLoader.h"
+#include "VertexLoader_Position.h"
+// Own header, so the definitions below are checked against their declarations.
+#include "VertexLoader_TextCoord.h"
+#include "NativeVertexWriter.h"
+
+// Logs the float at the current output position. Defined without a trailing
+// semicolon so that "LOG_TEX1();" is exactly one statement.
+#define LOG_TEX1() PRIM_LOG("tex: %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0])
+// Logs the two floats at the current output position. Defined without a
+// trailing semicolon so that "LOG_TEX2();" is exactly one statement.
+#define LOG_TEX2() PRIM_LOG("tex: %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0], ((float*)VertexManager::s_pCurBufferPointer)[1])
+
+extern int tcIndex;
+extern float tcScale[8];
+
+// Reads and writes nothing; only steps tcIndex to the next texture coordinate.
+void LOADERDECL TexCoord_Read_Dummy()
+{
+	++tcIndex;
+}
+
+// Reads one unsigned byte from the vertex stream, stores it as a float scaled
+// by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_UByte1()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)DataReadU8() * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads two unsigned bytes from the vertex stream, stores them as floats
+// scaled by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_UByte2()
+{
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)DataReadU8() * scale;
+	dst[1] = (float)DataReadU8() * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads one signed byte from the vertex stream, stores it as a float scaled by
+// tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_Byte1()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)DataReadU8() * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads two signed bytes from the vertex stream, stores them as floats scaled
+// by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_Byte2()
+{
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)DataReadU8() * scale;
+	dst[1] = (float)(s8)DataReadU8() * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads one unsigned 16-bit value from the vertex stream, stores it as a float
+// scaled by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_UShort1()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)DataReadU16() * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads two unsigned 16-bit values from the vertex stream, stores them as
+// floats scaled by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_UShort2()
+{
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)DataReadU16() * scale;
+	dst[1] = (float)DataReadU16() * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads one signed 16-bit value from the vertex stream, stores it as a float
+// scaled by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_Short1()
+{
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)DataReadU16() * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads two signed 16-bit values from the vertex stream, stores them as floats
+// scaled by tcScale[tcIndex], then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_Short2()
+{
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)DataReadU16() * scale;
+	dst[1] = (float)(s16)DataReadU16() * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Copies one 32-bit word from the vertex stream to the output unchanged (no
+// scaling), then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_Float1()
+{
+	u32* const words = (u32*)VertexManager::s_pCurBufferPointer;
+	words[0] = DataReadU32();
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Copies two 32-bit words from the vertex stream to the output unchanged (no
+// scaling), then steps tcIndex.
+void LOADERDECL TexCoord_ReadDirect_Float2()
+{
+	u32* const words = (u32*)VertexManager::s_pCurBufferPointer;
+	words[0] = DataReadU32();
+	words[1] = DataReadU32();
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// ==================================================================================
+// Reads an 8-bit index from the stream, fetches one unsigned byte from the
+// current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_UByte1()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u8)Memory_Read_U8(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads an 8-bit index from the stream, fetches two unsigned bytes from the
+// current texcoord array and stores them as floats scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_UByte2()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u8)Memory_Read_U8(src) * scale;
+	dst[1] = (float)(u8)Memory_Read_U8(src + 1) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads an 8-bit index from the stream, fetches one signed byte from the
+// current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_Byte1()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)Memory_Read_U8(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads an 8-bit index from the stream, fetches two signed bytes from the
+// current texcoord array and stores them as floats scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_Byte2()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)Memory_Read_U8(src) * scale;
+	dst[1] = (float)(s8)Memory_Read_U8(src + 1) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads an 8-bit index from the stream, fetches one unsigned 16-bit value from
+// the current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_UShort1()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u16)Memory_Read_U16(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads an 8-bit index from the stream, fetches two unsigned 16-bit values
+// from the current texcoord array and stores them as floats scaled by
+// tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_UShort2()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u16)Memory_Read_U16(src) * scale;
+	dst[1] = (float)(u16)Memory_Read_U16(src + 2) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads an 8-bit index from the stream, fetches one signed 16-bit value from
+// the current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_Short1()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)Memory_Read_U16(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads an 8-bit index from the stream, fetches two signed 16-bit values from
+// the current texcoord array and stores them as floats scaled by
+// tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex8_Short2()
+{
+	const u8 index = DataReadU8();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)Memory_Read_U16(src) * scale;
+	dst[1] = (float)(s16)Memory_Read_U16(src + 2) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads an 8-bit index from the stream and copies one 32-bit word from the
+// current texcoord array to the output unchanged (no scaling), then steps
+// tcIndex.
+void LOADERDECL TexCoord_ReadIndex8_Float1()
+{
+	// The index is read as 8 bits, so it is held in a u8 like the other Index8 loaders.
+	u8 Index = DataReadU8();
+	u32 iAddress = arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]);
+	((u32*)VertexManager::s_pCurBufferPointer)[0] = Memory_Read_U32(iAddress);
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	tcIndex++;
+}
+// Reads an 8-bit index from the stream and copies two 32-bit words from the
+// current texcoord array to the output unchanged (no scaling), then steps
+// tcIndex.
+void LOADERDECL TexCoord_ReadIndex8_Float2()
+{
+	// The index is read as 8 bits, so it is held in a u8 like the other Index8 loaders.
+	u8 Index = DataReadU8();
+	u32 iAddress = arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]);
+	((u32*)VertexManager::s_pCurBufferPointer)[0] = Memory_Read_U32(iAddress);
+	((u32*)VertexManager::s_pCurBufferPointer)[1] = Memory_Read_U32(iAddress+4);
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	tcIndex++;
+}
+
+// ==================================================================================
+// Reads a 16-bit index from the stream, fetches one unsigned byte from the
+// current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_UByte1()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u8)Memory_Read_U8(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads a 16-bit index from the stream, fetches two unsigned bytes from the
+// current texcoord array and stores them as floats scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_UByte2()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u8)Memory_Read_U8(src) * scale;
+	dst[1] = (float)(u8)Memory_Read_U8(src + 1) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads a 16-bit index from the stream, fetches one signed byte from the
+// current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_Byte1()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)Memory_Read_U8(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads a 16-bit index from the stream, fetches two signed bytes from the
+// current texcoord array and stores them as floats scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_Byte2()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s8)Memory_Read_U8(src) * scale;
+	dst[1] = (float)(s8)Memory_Read_U8(src + 1) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads a 16-bit index from the stream, fetches one unsigned 16-bit value from
+// the current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_UShort1()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u16)Memory_Read_U16(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads a 16-bit index from the stream, fetches two unsigned 16-bit values
+// from the current texcoord array and stores them as floats scaled by
+// tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_UShort2()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(u16)Memory_Read_U16(src) * scale;
+	dst[1] = (float)(u16)Memory_Read_U16(src + 2) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads a 16-bit index from the stream, fetches one signed 16-bit value from
+// the current texcoord array and stores it as a float scaled by tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_Short1()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)Memory_Read_U16(src) * tcScale[tcIndex];
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads a 16-bit index from the stream, fetches two signed 16-bit values from
+// the current texcoord array and stores them as floats scaled by
+// tcScale[tcIndex].
+void LOADERDECL TexCoord_ReadIndex16_Short2()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	const float scale = tcScale[tcIndex];
+	float* const dst = (float*)VertexManager::s_pCurBufferPointer;
+	dst[0] = (float)(s16)Memory_Read_U16(src) * scale;
+	dst[1] = (float)(s16)Memory_Read_U16(src + 2) * scale;
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+// Reads a 16-bit index from the stream and copies one 32-bit word from the
+// current texcoord array to the output unchanged (no scaling), then steps
+// tcIndex.
+void LOADERDECL TexCoord_ReadIndex16_Float1()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	u32* const words = (u32*)VertexManager::s_pCurBufferPointer;
+	words[0] = Memory_Read_U32(src);
+	LOG_TEX1();
+	VertexManager::s_pCurBufferPointer += 4;
+	++tcIndex;
+}
+// Reads a 16-bit index from the stream and copies two 32-bit words from the
+// current texcoord array to the output unchanged (no scaling), then steps
+// tcIndex.
+void LOADERDECL TexCoord_ReadIndex16_Float2()
+{
+	const u16 index = DataReadU16();
+	const int slot = ARRAY_TEXCOORD0 + tcIndex;
+	const u32 src = arraybases[slot] + (index * arraystrides[slot]);
+	u32* const words = (u32*)VertexManager::s_pCurBufferPointer;
+	words[0] = Memory_Read_U32(src);
+	words[1] = Memory_Read_U32(src + 4);
+	LOG_TEX2();
+	VertexManager::s_pCurBufferPointer += 8;
+	++tcIndex;
+}
+
+#endif
--- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.h
+++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.h
@ -0,0 +1,53 @@
+// Copyright (C) 2003-2008 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+#ifndef VERTEXLOADER_TEXCOORD_H
+#define VERTEXLOADER_TEXCOORD_H
+
+// Texture coordinate loaders. The trailing 1 or 2 in a name is the number of
+// components the loader writes; every loader, including the dummy, advances
+// tcIndex by one.
+
+// Writes nothing; only advances tcIndex.
+void LOADERDECL TexCoord_Read_Dummy();
+// Components are read directly from the vertex stream.
+void LOADERDECL TexCoord_ReadDirect_UByte1();
+void LOADERDECL TexCoord_ReadDirect_UByte2();
+void LOADERDECL TexCoord_ReadDirect_Byte1();
+void LOADERDECL TexCoord_ReadDirect_Byte2();
+void LOADERDECL TexCoord_ReadDirect_UShort1();
+void LOADERDECL TexCoord_ReadDirect_UShort2();
+void LOADERDECL TexCoord_ReadDirect_Short1();
+void LOADERDECL TexCoord_ReadDirect_Short2();
+void LOADERDECL TexCoord_ReadDirect_Float1();
+void LOADERDECL TexCoord_ReadDirect_Float2();
+// An 8-bit index is read from the stream; components come from the texcoord array.
+void LOADERDECL TexCoord_ReadIndex8_UByte1();
+void LOADERDECL TexCoord_ReadIndex8_UByte2();
+void LOADERDECL TexCoord_ReadIndex8_Byte1();
+void LOADERDECL TexCoord_ReadIndex8_Byte2();
+void LOADERDECL TexCoord_ReadIndex8_UShort1();
+void LOADERDECL TexCoord_ReadIndex8_UShort2();
+void LOADERDECL TexCoord_ReadIndex8_Short1();
+void LOADERDECL TexCoord_ReadIndex8_Short2();
+void LOADERDECL TexCoord_ReadIndex8_Float1();
+void LOADERDECL TexCoord_ReadIndex8_Float2();
+// A 16-bit index is read from the stream; components come from the texcoord array.
+void LOADERDECL TexCoord_ReadIndex16_UByte1();
+void LOADERDECL TexCoord_ReadIndex16_UByte2();
+void LOADERDECL TexCoord_ReadIndex16_Byte1();
+void LOADERDECL TexCoord_ReadIndex16_Byte2();
+void LOADERDECL TexCoord_ReadIndex16_UShort1();
+void LOADERDECL TexCoord_ReadIndex16_UShort2();
+void LOADERDECL TexCoord_ReadIndex16_Short1();
+void LOADERDECL TexCoord_ReadIndex16_Short2();
+void LOADERDECL TexCoord_ReadIndex16_Float1();
+void LOADERDECL TexCoord_ReadIndex16_Float2();
+
+#endif
--- a/Source/Core/VideoCommon/Src/VideoCommon.h
+++ b/Source/Core/VideoCommon/Src/VideoCommon.h
@ -21,6 +21,16 @@
 #include "Common.h"
 #include "pluginspecs_video.h"

+// memcpy_gc / memcmp_gc select the memory copy and compare routines.
+// MSVC builds that define neither __x86_64__ nor _M_X64 route them to
+// memcpy_amd / memcmp_mmx; every other build uses the standard memcpy / memcmp.
+// NOTE(review): memcmp_mmx returns unsigned char while memcmp returns int, so
+// callers presumably only test the result against zero - confirm.
+#if defined(_MSC_VER) && !defined(__x86_64__) && !defined(_M_X64)
+void * memcpy_amd(void *dest, const void *src, size_t n);
+unsigned char memcmp_mmx(const void* src1, const void* src2, int cmpsize);
+#define memcpy_gc memcpy_amd
+#define memcmp_gc memcmp_mmx
+#else
+#define memcpy_gc memcpy
+#define memcmp_gc memcmp
+#endif
+
 enum {
 	EFB_WIDTH = 640,
 	EFB_HEIGHT = 528,
--- a/Source/Core/VideoCommon/Src/memcpy_amd.cpp
+++ b/Source/Core/VideoCommon/Src/memcpy_amd.cpp
@ -0,0 +1,473 @@
+/******************************************************************************
+
+ Copyright (c) 2001 Advanced Micro Devices, Inc.
+
+ LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
+ EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
+ NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
+ PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
+ DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
+ BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
+ INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
+ OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
+ OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
+ NOT APPLY TO YOU.
+
+ AMD does not assume any responsibility for any errors which may appear in the
+ Materials nor any responsibility to support or update the Materials.  AMD retains
+ the right to make changes to its test specifications at any time, without notice.
+
+ NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
+ further information, software, technical information, know-how, or show-how
+ available to you.
+
+ So that all may benefit from your experience, please report  any  problems
+ or  suggestions about this software to 3dsdk.support@amd.com
+
+ AMD Developer Technologies, M/S 585
+ Advanced Micro Devices, Inc.
+ 5900 E. Ben White Blvd.
+ Austin, TX 78741
+ 3dsdk.support@amd.com
+******************************************************************************/
+
+#include <assert.h>
+
+/*****************************************************************************
+MEMCPY_AMD.CPP
+******************************************************************************/
+
+// Very optimized memcpy() routine for AMD Athlon and Duron family.
+// This code uses any of FOUR different basic copy methods, depending
+// on the transfer size.
+// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
+// "Streaming Store"), and also uses the software prefetch instructions,
+// be sure you're running on Athlon/Duron or other recent CPU before calling!
+
+#define TINY_BLOCK_COPY 64       // upper limit (byte count) for movsd type copy; smaller requests skip the MMX paths in memcpy_amd
+// The smallest copy uses the X86 "movsd" instruction, in an optimized
+// form which is an "unrolled loop".
+
+#define IN_CACHE_COPY 2 * 1024  // upper limit for movq/movq copy w/SW prefetch; unparenthesized expression, only used below as IN_CACHE_COPY/64
+// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
+// also using the "unrolled loop" optimization.   This code uses
+// the software prefetch instruction to get the data into the cache.
+
+#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch; unparenthesized expression, only used below as UNCACHED_COPY/64
+// For larger blocks, which will spill beyond the cache, it's faster to
+// use the Streaming Store instruction MOVNTQ.   This write instruction
+// bypasses the cache and writes straight to main memory.  This code also
+// uses the software prefetch instruction to pre-read the data.
+// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
+
+#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch; documentation only, this macro is never referenced in this file
+#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch; assembler-style hex literal (128), so it is only meaningful inside the __asm block
+// For the largest size blocks, a special technique called Block Prefetch
+// can be used to accelerate the read operations.   Block Prefetch reads
+// one address per cache line, for a series of cache lines, in a short loop.
+// This is faster than using software prefetch.  The technique is great for
+// getting maximum read bandwidth, especially in DDR memory systems.
+
+//#include <stddef.h>
+
+// Inline assembly syntax for use with Visual C++
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#if defined(_MSC_VER) && !defined(__x86_64__) && !defined(_M_X64)
+
+void * memcpy_amd(void *dest, const void *src, size_t n)
+{	/*
+	 * Copies n bytes from src to dest using 32-bit x86 inline assembly and hands
+	 * dest back in eax.
+	 *
+	 * Route taken, by size:
+	 *   - n < TINY_BLOCK_COPY: jumps straight to $memcpy_ic_3 (dword table plus
+	 *     byte tail); no MMX register is touched.
+	 *   - otherwise the destination is first advanced to an 8-byte boundary with
+	 *     0-7 movsb's, except when 32K < n <= 64K, where that step is skipped.
+	 *   - the remaining count is split into 64-byte blocks (ebx keeps the byte
+	 *     count, ecx the block count):
+	 *       blocks < IN_CACHE_COPY/64  -> $memcpy_ic_1, movq loads + movq stores
+	 *       blocks < UNCACHED_COPY/64  -> $memcpy_uc_1, movq loads + movntq stores
+	 *       otherwise                  -> $memcpy_bp_1, which reads one dword from
+	 *                                     each of CACHEBLOCK source lines (back to
+	 *                                     front), then copies them with movq/movntq;
+	 *                                     once fewer than CACHEBLOCK blocks remain,
+	 *                                     $memcpy_uc_1 finishes them.
+	 *   - tail: up to 15 dwords through a computed jump into the movsd table, then
+	 *     0-3 bytes through rep movsb.
+	 * Every route ends at $memcpy_final, which executes emms and sfence.
+	 *
+	 * The computed jumps step back one byte per pending move, so they depend on
+	 * movsb and movsd each assembling to a single byte and on nothing being
+	 * placed between a table and its label.
+	 *
+	 * ebx, esi and edi are modified here without being saved.
+	 * NOTE(review): presumably relies on MSVC preserving those registers around
+	 * __asm blocks - confirm.
+	 */
+    __asm {
+	mov		ecx, [n]		; number of bytes to copy
+	mov		edi, [dest]		; destination
+	mov		esi, [src]		; source
+	mov		ebx, ecx		; keep a copy of count
+
+	cld // movsb/movsd below must advance esi/edi upwards
+	cmp		ecx, TINY_BLOCK_COPY
+	jb		$memcpy_ic_3	; tiny? skip mmx copy
+
+	cmp		ecx, 32*1024		; do not align between 32k-64k because
+	jbe		$memcpy_do_align	;  it appears to be slower
+	cmp		ecx, 64*1024
+	jbe		$memcpy_align_done
+$memcpy_do_align:
+	mov		ecx, 8			; a trick that is faster than rep movsb...
+	sub		ecx, edi		; align destination to qword
+	and		ecx, 111b		; get the low bits
+	sub		ebx, ecx		; update copy count
+	neg		ecx				; set up to jump into the array
+	add		ecx, offset $memcpy_align_done // target = label - ecx, i.e. exactly ecx of the movsb's below get executed
+	jmp		ecx				; jump to array of movsb''s
+
+align 4
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+
+$memcpy_align_done:			; destination is dword aligned // NOTE(review): the padding code above aligns to 8 bytes (qword), not just dword
+	mov		ecx, ebx		; number of bytes left to copy
+	shr		ecx, 6			; get 64-byte block count
+	jz		$memcpy_ic_2	; finish the last few bytes
+
+	cmp		ecx, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
+	jae		$memcpy_uc_test
+
+// This is small block copy that uses the MMX registers to copy 8 bytes
+// at a time.  It uses the "unrolled loop" optimization, and also uses
+// the software prefetch instruction to get the data into the cache.
+align 16
+$memcpy_ic_1:			; 64-byte block copies, in-cache copy
+
+	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
+
+	movq	mm0, [esi+0]	; read 64 bits
+	movq	mm1, [esi+8]
+	movq	[edi+0], mm0	; write 64 bits
+	movq	[edi+8], mm1	;    note:  the normal movq writes the
+	movq	mm2, [esi+16]	;    data to cache; a cache line will be
+	movq	mm3, [esi+24]	;    allocated as needed, to store the data
+	movq	[edi+16], mm2
+	movq	[edi+24], mm3
+	movq	mm0, [esi+32]
+	movq	mm1, [esi+40]
+	movq	[edi+32], mm0
+	movq	[edi+40], mm1
+	movq	mm2, [esi+48]
+	movq	mm3, [esi+56]
+	movq	[edi+48], mm2
+	movq	[edi+56], mm3
+
+	add		esi, 64			; update source pointer
+	add		edi, 64			; update destination pointer
+	dec		ecx				; count down
+	jnz		$memcpy_ic_1	; last 64-byte block?
+
+$memcpy_ic_2:
+	mov		ecx, ebx		; has valid low 6 bits of the byte count
+$memcpy_ic_3:
+	shr		ecx, 2			; dword count
+	and		ecx, 1111b		; only look at the "remainder" bits
+	neg		ecx				; set up to jump into the array
+	add		ecx, offset $memcpy_last_few // target = label - ecx, i.e. exactly ecx of the movsd's placed before $memcpy_last_few get executed
+	jmp		ecx				; jump to array of movsd''s
+
+$memcpy_uc_test:
+	cmp		ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
+	jae		$memcpy_bp_1
+
+$memcpy_64_test:
+	or		ecx, ecx		; tail end of block prefetch will jump here
+	jz		$memcpy_ic_2	; no more 64-byte blocks left
+
+// For larger blocks, which will spill beyond the cache, it's faster to
+// use the Streaming Store instruction MOVNTQ.   This write instruction
+// bypasses the cache and writes straight to main memory.  This code also
+// uses the software prefetch instruction to pre-read the data.
+align 16
+$memcpy_uc_1:				; 64-byte blocks, uncached copy
+
+	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
+
+	movq	mm0,[esi+0]		; read 64 bits
+	add		edi,64			; update destination pointer
+	movq	mm1,[esi+8]
+	add		esi,64			; update source pointer
+	movq	mm2,[esi-48]
+	movntq	[edi-64], mm0	; write 64 bits, bypassing the cache
+	movq	mm0,[esi-40]	;    note: movntq also prevents the CPU
+	movntq	[edi-56], mm1	;    from READING the destination address
+	movq	mm1,[esi-32]	;    into the cache, only to be over-written
+	movntq	[edi-48], mm2	;    so that also helps performance
+	movq	mm2,[esi-24]
+	movntq	[edi-40], mm0
+	movq	mm0,[esi-16]
+	movntq	[edi-32], mm1
+	movq	mm1,[esi-8]
+	movntq	[edi-24], mm2
+	movntq	[edi-16], mm0
+	dec		ecx
+	movntq	[edi-8], mm1
+	jnz		$memcpy_uc_1	; last 64-byte block?
+
+	jmp		$memcpy_ic_2		; almost done
+
+// For the largest size blocks, a special technique called Block Prefetch
+// can be used to accelerate the read operations.   Block Prefetch reads
+// one address per cache line, for a series of cache lines, in a short loop.
+// This is faster than using software prefetch.  The technique is great for
+// getting maximum read bandwidth, especially in DDR memory systems.
+$memcpy_bp_1:			; large blocks, block prefetch copy
+
+	cmp		ecx, CACHEBLOCK			; big enough to run another prefetch loop?
+	jl		$memcpy_64_test			; no, back to regular uncached copy
+
+	mov		eax, CACHEBLOCK / 2		; block prefetch loop, unrolled 2X
+	add		esi, CACHEBLOCK * 64	; move to the top of the block
+align 16
+$memcpy_bp_2:
+	mov		edx, [esi-64]		; grab one address per cache line
+	mov		edx, [esi-128]		; grab one address per cache line
+	sub		esi, 128			; go reverse order to suppress HW prefetcher
+	dec		eax					; count down the cache lines
+	jnz		$memcpy_bp_2		; keep grabbing more lines into cache
+
+	mov		eax, CACHEBLOCK		; now that it is in cache, do the copy
+align 16
+$memcpy_bp_3:
+	movq	mm0, [esi   ]		; read 64 bits
+	movq	mm1, [esi+ 8]
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	mm4, [esi+32]
+	movq	mm5, [esi+40]
+	movq	mm6, [esi+48]
+	movq	mm7, [esi+56]
+	add		esi, 64				; update source pointer
+	movntq	[edi   ], mm0		; write 64 bits, bypassing cache
+	movntq	[edi+ 8], mm1		;    note: movntq also prevents the CPU
+	movntq	[edi+16], mm2		;    from READING the destination address 
+	movntq	[edi+24], mm3		;    into the cache, only to be over-written,
+	movntq	[edi+32], mm4		;    so that also helps performance
+	movntq	[edi+40], mm5
+	movntq	[edi+48], mm6
+	movntq	[edi+56], mm7
+	add		edi, 64				; update dest pointer
+
+	dec		eax					; count down
+
+	jnz		$memcpy_bp_3		; keep copying
+	sub		ecx, CACHEBLOCK		; update the 64-byte block count
+	jmp		$memcpy_bp_1		; keep processing chunks
+
+// The smallest copy uses the X86 "movsd" instruction, in an optimized
+// form which is an "unrolled loop".   Then it handles the last few bytes.
+align 4
+	movsd
+	movsd			; perform last 1-15 dword copies
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd			; perform last 1-7 dword copies
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+
+$memcpy_last_few:		; dword aligned from before movsd''s
+	mov		ecx, ebx	; has valid low 2 bits of the byte count
+	and		ecx, 11b	; the last few cows must come home
+	jz		$memcpy_final	; no more, lets leave
+	rep		movsb		; the last 1, 2, or 3 bytes
+
+$memcpy_final: 
+	emms				; clean up the MMX state
+	sfence				; flush the write buffer
+	mov		eax, [dest]	; ret value = destination pointer // there is no C return statement: the result is whatever eax holds here
+
+    }
+}
+
+// mmx memcmp implementation (equality test only), size has to be a multiple of 8
+// returns 0 if equal, nonzero value if not equal
+// ~10 times faster than standard memcmp
+// (zerofrog)
+unsigned char memcmp_mmx(const void* src1, const void* src2, int cmpsize)
+{	/*
+	 * Compares cmpsize bytes at src1 and src2 for equality; the result is left
+	 * in al (there is no C return statement).
+	 *
+	 * Layout:
+	 *   - cmpsize >= 32: the first 32 bytes are checked as two 16-byte halves,
+	 *     then Cmp8 consumes 64 bytes per pass, then Done8 takes one more
+	 *     32-byte step if bit 0x20 of the remaining count is set.
+	 *   - the remaining count is then matched against exactly 24, 16 or 8;
+	 *     any other value falls through to Done, which reports "equal".
+	 * pmovmskb produces one bit per byte, so 0xff means all eight bytes of the
+	 * combined pcmpeqd mask matched.
+	 *
+	 * NOTE(review): a cmpsize that is not a multiple of 8 is caught only by the
+	 * assert. When the assert is compiled out, the leftover bytes (for
+	 * cmpsize < 32, the entire buffer) are never compared and the function can
+	 * report equality.
+	 */
+	assert( (cmpsize&7) == 0 );
+
+	__asm {
+		push esi
+		mov ecx, cmpsize
+		mov edx, src1
+		mov esi, src2
+		
+		cmp ecx, 32
+		jl Done4
+
+		// custom test first 8 to make sure things are ok
+		movq mm0, [esi]
+		movq mm1, [esi+8]
+		pcmpeqd mm0, [edx]
+		pcmpeqd mm1, [edx+8]
+		pand mm0, mm1
+		movq mm2, [esi+16]
+		pmovmskb eax, mm0
+		movq mm3, [esi+24]
+
+		// check if eq
+		cmp eax, 0xff
+		je NextComp
+		mov eax, 1
+		jmp End
+
+NextComp:
+		pcmpeqd mm2, [edx+16]
+		pcmpeqd mm3, [edx+24]
+		pand mm2, mm3
+		pmovmskb eax, mm2
+
+		sub ecx, 32
+		add esi, 32
+		add edx, 32
+
+		// check if eq
+		cmp eax, 0xff
+		je ContinueTest
+		mov eax, 1
+		jmp End
+
+		cmp ecx, 64 // NOTE(review): unreachable - the je/jmp pair just above always branches away and no label lands here
+		jl Done8 // NOTE(review): unreachable, see previous line
+
+Cmp8:
+		movq mm0, [esi]
+		movq mm1, [esi+8]
+		movq mm2, [esi+16]
+		movq mm3, [esi+24]
+		movq mm4, [esi+32]
+		movq mm5, [esi+40]
+		movq mm6, [esi+48]
+		movq mm7, [esi+56]
+		pcmpeqd mm0, [edx]
+		pcmpeqd mm1, [edx+8]
+		pcmpeqd mm2, [edx+16]
+		pcmpeqd mm3, [edx+24]
+		pand mm0, mm1
+		pcmpeqd mm4, [edx+32]
+		pand mm0, mm2
+		pcmpeqd mm5, [edx+40]
+		pand mm0, mm3
+		pcmpeqd mm6, [edx+48]
+		pand mm0, mm4
+		pcmpeqd mm7, [edx+56]
+		pand mm0, mm5
+		pand mm0, mm6
+		pand mm0, mm7
+		pmovmskb eax, mm0
+		
+		// check if eq
+		cmp eax, 0xff
+		je Continue
+		mov eax, 1
+		jmp End
+
+Continue:
+		sub ecx, 64
+		add esi, 64
+		add edx, 64
+ContinueTest:
+		cmp ecx, 64
+		jge Cmp8
+
+Done8:
+		test ecx, 0x20
+		jz Done4
+		movq mm0, [esi]
+		movq mm1, [esi+8]
+		movq mm2, [esi+16]
+		movq mm3, [esi+24]
+		pcmpeqd mm0, [edx]
+		pcmpeqd mm1, [edx+8]
+		pcmpeqd mm2, [edx+16]
+		pcmpeqd mm3, [edx+24]
+		pand mm0, mm1
+		pand mm0, mm2
+		pand mm0, mm3
+		pmovmskb eax, mm0
+		sub ecx, 32
+		add esi, 32
+		add edx, 32
+
+		// check if eq
+		cmp eax, 0xff
+		je Done4
+		mov eax, 1
+		jmp End
+
+Done4:
+		cmp ecx, 24
+		jne Done2
+		movq mm0, [esi]
+		movq mm1, [esi+8]
+		movq mm2, [esi+16]
+		pcmpeqd mm0, [edx]
+		pcmpeqd mm1, [edx+8]
+		pcmpeqd mm2, [edx+16]
+		pand mm0, mm1
+		pand mm0, mm2
+		pmovmskb eax, mm0
+
+		// check if eq
+		cmp eax, 0xff
+		setne al // al = 1 when the mask is not all ones (mismatch), 0 otherwise
+		jmp End
+
+Done2:
+		cmp ecx, 16
+		jne Done1
+
+		movq mm0, [esi]
+		movq mm1, [esi+8]
+		pcmpeqd mm0, [edx]
+		pcmpeqd mm1, [edx+8]
+		pand mm0, mm1
+		pmovmskb eax, mm0
+
+		// check if eq
+		cmp eax, 0xff
+		setne al
+		jmp End
+
+Done1:
+		cmp ecx, 8
+		jne Done
+
+		mov eax, [esi]
+		mov esi, [esi+4] // esi now holds the second dword of src2; the pointer itself is not used again
+		cmp eax, [edx]
+		je Next
+		mov eax, 1
+		jmp End
+
+Next:
+		cmp esi, [edx+4]
+		setne al
+		jmp End
+
+Done:
+		xor eax, eax
+
+End:
+		pop esi
+		emms // executed on every exit path
+	}
+}
+
+#else // _MSC_VER
+// assume gcc or mingw or win x64
+
+#include <memory.h>
+#include <string.h>
+
+// Portable stand-in used when the inline-assembly version is not compiled:
+// forwards the copy to the C library and yields the destination pointer.
+void * memcpy_amd(void *dest, const void *src, size_t n)
+{
+	// memcpy returns its destination argument, so the result can be passed straight through.
+	return memcpy(dest, src, n);
+}
+
+
+#endif