Turn the X86 emitter into a class, so the code pointer is no longer a global, yay! Created XCodeBlock that derives from XEmitter, and the Jit now derives from XCodeBlock so it can call ADD, SUB, JNZ, etc. without having to prefix them with "emit.". I think someone's gonna like this.

There's some cleanup still to be done, but hey, it works. There shouldn't be a noticeable speed difference. I hope GCC doesn't have a problem with the "member function pointers" I used. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1594 8ced0084-cf51-0410-be5f-012b33b47a6e
2025-07-24 14:49:42 -06:00 · 2008-12-19 21:24:52 +00:00
parent b5dcdcf779
commit 104acd5bc1
31 changed files with 1297 additions and 1153 deletions
--- a/Source/Core/Common/Src/ABI.cpp
+++ b/Source/Core/Common/Src/ABI.cpp
@ -25,7 +25,7 @@ using namespace Gen;
 // ====================================

 // Sets up a __cdecl function.
-void ABI_EmitPrologue(int maxCallParams)
+void XEmitter::ABI_EmitPrologue(int maxCallParams)
 {
 #ifdef _M_IX86
 	// Don't really need to do anything
@ -40,7 +40,8 @@ void ABI_EmitPrologue(int maxCallParams)
 #error Arch not supported
 #endif
 }
-void ABI_EmitEpilogue(int maxCallParams)
+
+void XEmitter::ABI_EmitEpilogue(int maxCallParams)
 {
 #ifdef _M_IX86
 	RET();
@ -60,14 +61,14 @@ void ABI_EmitEpilogue(int maxCallParams)
 // Shared code between Win32 and Unix32
 // ====================================

-void ABI_CallFunctionC(void *func, u32 param1) {
+void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
 	ABI_AlignStack(1 * 4);
 	PUSH(32, Imm32(param1));
 	CALL(func);
 	ABI_RestoreStack(1 * 4);
 }

-void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
+void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
 	ABI_AlignStack(2 * 4);
 	PUSH(32, Imm32(param2));
 	PUSH(32, Imm32(param1));
@ -76,14 +77,14 @@ void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
 }

 // Pass a register as a paremeter.
-void ABI_CallFunctionR(void *func, X64Reg reg1) {
+void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
 	ABI_AlignStack(1 * 4);
 	PUSH(32, R(reg1));
 	CALL(func);
 	ABI_RestoreStack(1 * 4);
 }

-void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
+void XEmitter::ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
 {
 	ABI_AlignStack(2 * 4);
 	PUSH(32, R(reg2));
@ -92,7 +93,7 @@ void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
 	ABI_RestoreStack(2 * 4);
 }

-void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
+void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
 {
 	ABI_AlignStack(2 * 4);
 	PUSH(32, arg1);
@ -101,7 +102,7 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
 	ABI_RestoreStack(2 * 4);
 }

-void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
 	// Note: 4 * 4 = 16 bytes, so alignment is preserved.
 	PUSH(EBP);
 	PUSH(EBX);
@ -109,14 +110,14 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
 	PUSH(EDI);
 }

-void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
 	POP(EDI);
 	POP(ESI);
 	POP(EBX);
 	POP(EBP);
 }

-unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
+unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
 	frameSize += 4; // reserve space for return address
 	unsigned int alignedSize =
 #ifdef __GNUC__
@ -128,7 +129,7 @@ unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
 }


-void ABI_AlignStack(unsigned int frameSize) {
+void XEmitter::ABI_AlignStack(unsigned int frameSize) {
 // Mac OS X requires the stack to be 16-byte aligned before every call.
 // Linux requires the stack to be 16-byte aligned before calls that put SSE
 // vectors on the stack, but since we do not keep track of which calls do that,
@ -145,7 +146,7 @@ void ABI_AlignStack(unsigned int frameSize) {
 #endif
 }

-void ABI_RestoreStack(unsigned int frameSize) {
+void XEmitter::ABI_RestoreStack(unsigned int frameSize) {
 	unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize);
 	alignedSize -= 4; // return address is POPped at end of call
 	if (alignedSize != 0) {
@ -155,26 +156,26 @@ void ABI_RestoreStack(unsigned int frameSize) {

 #else

-void ABI_CallFunctionC(void *func, u32 param1) {
+void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
 	MOV(32, R(ABI_PARAM1), Imm32(param1));
 	CALL(func);
 }

-void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
+void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
 	MOV(32, R(ABI_PARAM1), Imm32(param1));
 	MOV(32, R(ABI_PARAM2), Imm32(param2));
 	CALL(func);
 }

 // Pass a register as a paremeter.
-void ABI_CallFunctionR(void *func, X64Reg reg1) {
+void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
 	if (reg1 != ABI_PARAM1)
 		MOV(32, R(ABI_PARAM1), R(reg1));
 	CALL(func);
 }

 // Pass a register as a paremeter.
-void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
+void XEmitter::ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
 	if (reg1 != ABI_PARAM1)
 		MOV(32, R(ABI_PARAM1), R(reg1));
 	if (reg2 != ABI_PARAM2)
@ -182,7 +183,7 @@ void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
 	CALL(func);
 }

-void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
+void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
 {
 	if (!arg1.IsSimpleReg(ABI_PARAM1))
 		MOV(32, R(ABI_PARAM1), arg1);
@ -190,21 +191,21 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
 	CALL(func);
 }

-unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
+unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
 	return frameSize;
 }

-void ABI_AlignStack(unsigned int /*frameSize*/) {
+void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
 }

-void ABI_RestoreStack(unsigned int /*frameSize*/) {
+void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
 }

 #ifdef _WIN32

 // Win64 Specific Code
 // ====================================
-void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
 	//we only want to do this once
 	PUSH(RBX); 
 	PUSH(RSI); 
@ -218,7 +219,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
 	SUB(64, R(RSP), Imm8(0x28));
 }

-void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
 	ADD(64, R(RSP), Imm8(0x28));
 	POP(R15);
 	POP(R14); 
@ -232,7 +233,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {

 // Win64 Specific Code
 // ====================================
-void ABI_PushAllCallerSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
 	PUSH(RCX);
 	PUSH(RDX);
 	PUSH(RSI); 
@ -245,7 +246,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
 	SUB(64, R(RSP), Imm8(0x28));
 }

-void ABI_PopAllCallerSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
 	ADD(64, R(RSP), Imm8(0x28));
 	POP(R11);
 	POP(R10);
@ -260,7 +261,7 @@ void ABI_PopAllCallerSavedRegsAndAdjustStack() {
 #else
 // Unix64 Specific Code
 // ====================================
-void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
 	PUSH(RBX); 
 	PUSH(RBP);
 	PUSH(R12); 
@ -270,7 +271,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
 	PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
 }

-void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
 	POP(R15);
 	POP(R15);
 	POP(R14); 
@ -280,7 +281,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
 	POP(RBX); 
 }

-void ABI_PushAllCallerSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
 	PUSH(RCX);
 	PUSH(RDX);
 	PUSH(RSI); 
@ -292,7 +293,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
 	PUSH(R11);
 }

-void ABI_PopAllCallerSavedRegsAndAdjustStack() {
+void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
 	POP(R11);
 	POP(R11);
 	POP(R10);
--- a/Source/Core/Common/Src/ABI.h
+++ b/Source/Core/Common/Src/ABI.h
@ -18,8 +18,6 @@
 #ifndef _JIT_ABI_H
 #define _JIT_ABI_H

-#include "x64Emitter.h"
-
 // x86/x64 ABI:s, and helpers to help follow them when JIT-ing code.
 // All convensions return values in EAX (+ possibly EDX).

@ -81,42 +79,5 @@

 #endif

-// Utility functions
-// These only support u32 parameters, but that's enough for a lot of uses.
-// These will destroy the 1 or 2 first "parameter regs".
-void ABI_CallFunctionC(void *func, u32 param1);
-void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
-void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
-
-// Pass a register as a paremeter.
-void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
-void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
-
-// A function that doesn't have any control over what it will do to regs,
-// such as the dispatcher, should be surrounded by these.
-void ABI_PushAllCalleeSavedRegsAndAdjustStack();
-void ABI_PopAllCalleeSavedRegsAndAdjustStack();
-
-// A function that doesn't know anything about it's surroundings, should
-// be surrounded by these to establish a safe environment, where it can roam free.
-// An example is a backpatch injected function.
-void ABI_PushAllCallerSavedRegsAndAdjustStack();
-void ABI_PopAllCallerSavedRegsAndAdjustStack();
-
-unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
-void ABI_AlignStack(unsigned int frameSize);
-void ABI_RestoreStack(unsigned int frameSize);
-
-// Sets up a __cdecl function.
-// Only x64 really needs the parameter.
-void ABI_EmitPrologue(int maxCallParams);
-void ABI_EmitEpilogue(int maxCallParams);
-
-#ifdef _M_IX86
-inline int ABI_GetNumXMMRegs() { return 8; }
-#else
-inline int ABI_GetNumXMMRegs() { return 16; }
-#endif
-
 #endif  // _JIT_ABI_H

--- a/Source/Core/Common/Src/MemoryUtil.cpp
+++ b/Source/Core/Common/Src/MemoryUtil.cpp
@ -38,7 +38,7 @@
 // This is purposedely not a full wrapper for virtualalloc/mmap, but it
 // provides exactly the primitive operations that Dolphin needs.

-void* AllocateExecutableMemory(int size, bool low)
+void* AllocateExecutableMemory(size_t size, bool low)
 {
 #ifdef _WIN32
 	void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
@ -71,7 +71,7 @@ void* AllocateExecutableMemory(int size, bool low)
 }


-void* AllocateMemoryPages(int size)
+void* AllocateMemoryPages(size_t size)
 {
 #ifdef _WIN32
 	void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_READWRITE);
@ -99,7 +99,7 @@ void* AllocateMemoryPages(int size)
 }


-void FreeMemoryPages(void* ptr, int size)
+void FreeMemoryPages(void* ptr, size_t size)
 {
 #ifdef _WIN32
 	if (ptr)
@ -113,7 +113,7 @@ void FreeMemoryPages(void* ptr, int size)
 }


-void WriteProtectMemory(void* ptr, int size, bool allowExecute)
+void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
 {
 #ifdef _WIN32
 	VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, 0);
@ -123,7 +123,7 @@ void WriteProtectMemory(void* ptr, int size, bool allowExecute)
 }


-void UnWriteProtectMemory(void* ptr, int size, bool allowExecute)
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)
 {
 #ifdef _WIN32
 	VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READONLY, 0);
--- a/Source/Core/Common/Src/MemoryUtil.h
+++ b/Source/Core/Common/Src/MemoryUtil.h
@ -18,14 +18,14 @@
 #ifndef _MEMORYUTIL_H
 #define _MEMORYUTIL_H

-void* AllocateExecutableMemory(int size, bool low = true);
-void* AllocateMemoryPages(int size);
-void FreeMemoryPages(void* ptr, int size);
-void WriteProtectMemory(void* ptr, int size, bool executable = false);
-void UnWriteProtectMemory(void* ptr, int size, bool allowExecute);
+void* AllocateExecutableMemory(size_t size, bool low = true);
+void* AllocateMemoryPages(size_t size);
+void FreeMemoryPages(void* ptr, size_t size);
+void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
+void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute);


-inline int GetPageSize() {return(4096);}
+inline int GetPageSize() {return 4096;}


 #endif
--- a/Source/Core/Common/Src/Thunk.cpp
+++ b/Source/Core/Common/Src/Thunk.cpp
@ -18,33 +18,29 @@
 #include <map>

 #include "Common.h"
-#include "Thunk.h"
 #include "x64Emitter.h"
 #include "MemoryUtil.h"
 #include "ABI.h"
+#include "Thunk.h"

-using namespace Gen;
+ThunkManager thunks;

 #define THUNK_ARENA_SIZE 1024*1024*1

-namespace {
-static std::map<void *, const u8 *> thunks;
-u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
-u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
-
-static u8 *thunk_memory;
-static u8 *thunk_code;
-static const u8 *save_regs;
-static const u8 *load_regs;
-static u16 saved_mxcsr;
-}
-
-void Thunk_Init()
+namespace
 {
-	thunk_memory = (u8 *)AllocateExecutableMemory(THUNK_ARENA_SIZE);
-	thunk_code = thunk_memory;

-	GenContext ctx(&thunk_code);
+static u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
+static u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
+static u16 saved_mxcsr;
+
+}  // namespace
+
+using namespace Gen;
+
+void ThunkManager::Init()
+{
+	AllocCodeSpace(THUNK_ARENA_SIZE);
 	save_regs = GetCodePtr();
 	for (int i = 2; i < ABI_GetNumXMMRegs(); i++)
 		MOVAPS(M(saved_fp_state + i * 16), (X64Reg)(XMM0 + i));
@ -89,31 +85,27 @@ void Thunk_Init()
 	RET();
 }

-void Thunk_Reset()
+void ThunkManager::Reset()
 {
 	thunks.clear();
-	thunk_code = thunk_memory;
+	ResetCodePtr();
 }

-void Thunk_Shutdown()
+void ThunkManager::Shutdown()
 {
-	Thunk_Reset();
-	FreeMemoryPages(thunk_memory, THUNK_ARENA_SIZE);
-	thunk_memory = 0;
-	thunk_code = 0;
+	Reset();
+	FreeCodeSpace();
 }

-void *ProtectFunction(void *function, int num_params)
+void *ThunkManager::ProtectFunction(void *function, int num_params)
 {
 	std::map<void *, const u8 *>::iterator iter;
 	iter = thunks.find(function);
 	if (iter != thunks.end())
 		return (void *)iter->second;
-
-	if (!thunk_memory)
+	if (!region)
 		PanicAlert("Trying to protect functions before the emu is started. Bad bad bad.");

-	GenContext gen(&thunk_code);
 	const u8 *call_point = GetCodePtr();
 	// Make sure to align stack.
 #ifdef _M_X64
--- a/Source/Core/Common/Src/Thunk.h
+++ b/Source/Core/Common/Src/Thunk.h
@ -18,6 +18,11 @@
 #ifndef _THUNK_H
 #define _THUNK_H

+#include <map>
+
+#include "Common.h"
+#include "x64Emitter.h"
+
 // This simple class creates a wrapper around a C/C++ function that saves all fp state
 // before entering it, and restores it upon exit. This is required to be able to selectively
 // call functions from generated code, without inflicting the performance hit and increase
@ -30,10 +35,21 @@
 // NOT THREAD SAFE. This may only be used from the CPU thread.
 // Any other thread using this stuff will be FATAL.

-void Thunk_Init();
-void Thunk_Reset();
-void Thunk_Shutdown();
+class ThunkManager : public Gen::XCodeBlock
+{
+	std::map<void *, const u8 *> thunks;

-void *ProtectFunction(void *function, int num_params);
+	const u8 *save_regs;
+	const u8 *load_regs;
+
+public:
+	void Init();
+	void Reset();
+	void Shutdown();
+
+	void *ProtectFunction(void *function, int num_params);
+};
+
+extern ThunkManager thunks;

 #endif
--- a/Source/Core/Common/Src/x64Emitter.cpp
+++ b/Source/Core/Common/Src/x64Emitter.cpp
--- a/Source/Core/Common/Src/x64Emitter.h
+++ b/Source/Core/Common/Src/x64Emitter.h
@ -21,217 +21,264 @@
 #define _DOLPHIN_INTEL_CODEGEN

 #include "Common.h"
+#include "MemoryUtil.h"

 namespace Gen
 {
-	enum X64Reg
+
+enum X64Reg
+{
+	EAX = 0, EBX = 3, ECX = 1, EDX = 2,
+	ESI = 6, EDI = 7, EBP = 5, ESP = 4,
+	
+	RAX = 0, RBX = 3, RCX = 1, RDX = 2,
+	RSI = 6, RDI = 7, RBP = 5, RSP = 4,
+	R8  = 8, R9  = 9, R10 = 10,R11 = 11,
+	R12 = 12,R13 = 13,R14 = 14,R15 = 15,
+
+	AL = 0, BL = 3, CL = 1, DL = 2,
+	AH = 4, BH = 7, CH = 5, DH = 6,
+
+	AX = 0, BX = 3, CX = 1, DX = 2,
+	SI = 6, DI = 7, BP = 5, SP = 4,
+
+	XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, 
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
+
+	INVALID_REG = 0xFFFFFFFF
+};
+
+enum CCFlags
+{
+	CC_O   = 0,
+	CC_NO  = 1,
+	CC_B   = 2, CC_C  = 2, CC_NAE = 2,
+	CC_NB  = 3, CC_NC = 3, CC_AE  = 3,
+	CC_Z   = 4, CC_E   = 4,
+	CC_NZ  = 5,	CC_NE  = 5, 
+	CC_BE  = 6, CC_NA  = 6,
+	CC_NBE = 7, CC_A   = 7,
+	CC_S   = 8,
+	CC_NS  = 9,
+	CC_P   = 0xA, CC_PE  = 0xA,
+	CC_NP  = 0xB, CC_PO  = 0xB,
+	CC_L   = 0xC, CC_NGE = 0xC,
+	CC_NL  = 0xD, CC_GE  = 0xD,
+	CC_LE  = 0xE, CC_NG  = 0xE,
+	CC_NLE = 0xF, CC_G   = 0xF
+};
+
+enum
+{
+	NUMGPRs = 16,
+	NUMXMMs = 16,
+};
+
+enum
+{
+	SCALE_NONE = 0,
+	SCALE_1 = 1,
+	SCALE_2 = 2,
+	SCALE_4 = 4,
+	SCALE_8 = 8,
+	SCALE_ATREG = 16,
+	SCALE_RIP = 0xFF,
+	SCALE_IMM8  = 0xF0,
+	SCALE_IMM16 = 0xF1,
+	SCALE_IMM32 = 0xF2,
+	SCALE_IMM64 = 0xF3,
+};
+
+enum NormalOp {
+	nrmADD,
+	nrmADC,
+	nrmSUB,
+	nrmSBB,
+	nrmAND,
+	nrmOR ,
+	nrmXOR,
+	nrmMOV,
+	nrmTEST,
+	nrmCMP,
+	nrmXCHG,
+};
+
+class XEmitter;
+
+// RIP addressing does not benefit from micro op fusion on Core arch
+struct OpArg
+{
+	OpArg() {}  // dummy op arg, used for storage
+	OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
 	{
-		EAX = 0, EBX = 3, ECX = 1, EDX = 2,
-		ESI = 6, EDI = 7, EBP = 5, ESP = 4,
-		
-		RAX = 0, RBX = 3, RCX = 1, RDX = 2,
-		RSI = 6, RDI = 7, RBP = 5, RSP = 4,
-		R8  = 8, R9  = 9, R10 = 10,R11 = 11,
-		R12 = 12,R13 = 13,R14 = 14,R15 = 15,
+		operandReg = 0;
+		scale = (u8)_scale;
+		offsetOrBaseReg = (u8)rmReg;
+		indexReg = (u8)scaledReg;
+		//if scale == 0 never mind offseting
+		offset = _offset;
+	}
+	void WriteRex(XEmitter *emit, bool op64, int customOp = -1) const;
+	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
+	void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
+	// This one is public - must be written to
+	u64 offset;  // use RIP-relative as much as possible - 64-bit immediates are not available.
+	u8 operandReg;

-		AL = 0, BL = 3, CL = 1, DL = 2,
-		AH = 4, BH = 7, CH = 5, DH = 6,
+	void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
+	bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
+	bool IsSimpleReg() const {return scale == SCALE_NONE;}
+	bool IsSimpleReg(X64Reg reg) const {
+		if (!IsSimpleReg())
+			return false;
+		return GetSimpleReg() == reg;
+	}

-		AX = 0, BX = 3, CX = 1, DX = 2,
-		SI = 6, DI = 7, BP = 5, SP = 4,
-
-		XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, 
-		XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
-
-		INVALID_REG = 0xFFFFFFFF
-	};
-
-	enum CCFlags
+	bool CanDoOpWith(const OpArg &other) const
 	{
-		CC_O   = 0,
-		CC_NO  = 1,
-		CC_B   = 2, CC_C  = 2, CC_NAE = 2,
-		CC_NB  = 3, CC_NC = 3, CC_AE  = 3,
-		CC_Z   = 4, CC_E   = 4,
-		CC_NZ  = 5,	CC_NE  = 5, 
-		CC_BE  = 6, CC_NA  = 6,
-		CC_NBE = 7, CC_A   = 7,
-		CC_S   = 8,
-		CC_NS  = 9,
-		CC_P   = 0xA, CC_PE  = 0xA,
-		CC_NP  = 0xB, CC_PO  = 0xB,
-		CC_L   = 0xC, CC_NGE = 0xC,
-		CC_NL  = 0xD, CC_GE  = 0xD,
-		CC_LE  = 0xE, CC_NG  = 0xE,
-		CC_NLE = 0xF, CC_G   = 0xF
-	};
+		if (IsSimpleReg()) return true;
+		if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
+		return true;
+	}

-	enum
+	int GetImmBits() const
 	{
-		NUMGPRs = 16,
-		NUMXMMs = 16,
-	};
+		switch (scale)
+		{
+		case SCALE_IMM8: return 8;
+		case SCALE_IMM16: return 16;
+		case SCALE_IMM32: return 32;
+		case SCALE_IMM64: return 64;
+		default: return -1;
+		}
+	}

-	enum
+	X64Reg GetSimpleReg() const
 	{
-		SCALE_NONE = 0,
-		SCALE_1 = 1,
-		SCALE_2 = 2,
-		SCALE_4 = 4,
-		SCALE_8 = 8,
-		SCALE_ATREG = 16,
-		SCALE_RIP = 0xFF,
-		SCALE_IMM8  = 0xF0,
-		SCALE_IMM16 = 0xF1,
-		SCALE_IMM32 = 0xF2,
-		SCALE_IMM64 = 0xF3,
-	};
+		if (scale == SCALE_NONE)
+			return (X64Reg)offsetOrBaseReg;
+		else
+			return INVALID_REG;
+	}
+private:
+	u8 scale;
+	u8 offsetOrBaseReg;
+	u8 indexReg;
+};
+
+inline OpArg M(void *ptr)	    {return OpArg((u64)ptr, (int)SCALE_RIP);}
+inline OpArg R(X64Reg value)	{return OpArg(0, SCALE_NONE, value);}
+inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
+inline OpArg MDisp(X64Reg value, int offset) {
+	return OpArg((u32)offset, SCALE_ATREG, value); }
+inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
+{
+	return OpArg(offset, scale, base, scaled);
+}
+inline OpArg Imm8 (u8 imm)  {return OpArg(imm, SCALE_IMM8);}
+inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
+inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
+inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
+#ifdef _M_X64
+inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
+#else
+inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
+#endif
+
+struct FixupBranch
+{
+	u8 *ptr;
+	int type; //0 = 8bit 1 = 32bit
+};
+
+enum SSECompare
+{
+	EQ = 0,
+	LT,
+	LE,
+	UNORD,
+	NEQ,
+	NLT,
+	NLE,
+	ORD,
+};
+
+typedef const u8* JumpTarget;
+
+class XEmitter
+{
+	friend struct OpArg;  // for Write8 etc
+private:
+	u8 *code;
+
+	void Rex(int w, int r, int x, int b);
+	void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
+	void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
+	void WriteMulDivType(int bits, OpArg src, int ext);
+	void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2);
+	void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
+	void WriteMXCSR(OpArg arg, int ext);
+	void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
+
+protected:
+	inline void Write8(u8 value)   {*code++ = value;}
+	inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
+	inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
+	inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
+
+public:
+	XEmitter() { code = NULL; }
+	XEmitter(u8 *code_ptr) { code = code_ptr; }
+
+	void WriteModRM(int mod, int rm, int reg);
+	void WriteSIB(int scale, int index, int base);

 	void SetCodePtr(u8 *ptr);
 	void ReserveCodeSpace(int bytes);
 	const u8 *AlignCode4();
 	const u8 *AlignCode16();
 	const u8 *AlignCodePage();
-	const u8 *GetCodePtr();
+	const u8 *GetCodePtr() const;
 	u8 *GetWritableCodePtr();

+	// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
+	// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., 
+	// INC and DEC are slow on Intel Core, but not on AMD. They create a
+	// false flag dependency because they only update a subset of the flags.
+	// XCHG is SLOW and should be avoided.

-	// Safe way to temporarily redirect the code generator.
-	class GenContext 
-	{
-		u8 **code_ptr_ptr;
-		u8 *saved_ptr;
-	public:
-		GenContext(u8 **code_ptr_ptr_)
-		{
-			saved_ptr = GetWritableCodePtr();
-			code_ptr_ptr = code_ptr_ptr_;
-			SetCodePtr(*code_ptr_ptr);
-		}
-		~GenContext()
-		{
-			*code_ptr_ptr = GetWritableCodePtr();
-			SetCodePtr(saved_ptr);
-		}
-	};
-
-	enum NormalOp {
-		nrmADD,
-		nrmADC,
-		nrmSUB,
-		nrmSBB,
-		nrmAND,
-		nrmOR ,
-		nrmXOR,
-		nrmMOV,
-		nrmTEST,
-		nrmCMP,
-		nrmXCHG,
-	};
-
-	// Make the generation routine examine which direction to go
-	// probably has to be a static
-
-	// RIP addressing does not benefit from micro op fusion on Core arch
-	struct OpArg
-	{
-		OpArg() {} //dummy op arg, used for storage
-		OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
-		{
-			operandReg = 0;
-			scale = (u8)_scale;
-			offsetOrBaseReg = (u8)rmReg;
-			indexReg = (u8)scaledReg;
-			//if scale == 0 never mind offseting
-			offset = _offset;
-		}
-		void WriteRex(bool op64, int customOp = -1) const;
-		void WriteRest(int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
-		void WriteSingleByteOp(u8 op, X64Reg operandReg, int bits);
-		//This one is public - must be written to
-		u64 offset; //use RIP-relative as much as possible - avoid 64-bit immediates at all costs
-		u8 operandReg;
-
-		void WriteNormalOp(bool toRM, NormalOp op, const OpArg &operand, int bits) const;
-		bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
-		bool IsSimpleReg() const {return scale == SCALE_NONE;}
-		bool IsSimpleReg(X64Reg reg) const {
-			if (!IsSimpleReg())
-				return false;
-			return GetSimpleReg() == reg;
-		}
-		bool CanDoOpWith(const OpArg &other) const
-		{
-			if (IsSimpleReg()) return true;
-			if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
-			return true;
-		}
-
-		int GetImmBits() const
-		{
-			switch (scale)
-			{
-			case SCALE_IMM8: return 8;
-			case SCALE_IMM16: return 16;
-			case SCALE_IMM32: return 32;
-			case SCALE_IMM64: return 64;
-			default: return -1;
-			}
-		}
-		X64Reg GetSimpleReg() const
-		{
-			if (scale == SCALE_NONE)
-				return (X64Reg)offsetOrBaseReg;
-			else
-				return INVALID_REG;
-		}
-	private:
-		u8 scale;
-		u8 offsetOrBaseReg;
-		u8 indexReg;
-	};
-
-	inline OpArg M(void *ptr)	    {return OpArg((u64)ptr, (int)SCALE_RIP);}
-	inline OpArg R(X64Reg value)	{return OpArg(0, SCALE_NONE, value);}
-	inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
-	inline OpArg MDisp(X64Reg value, int offset) {
-		return OpArg((u32)offset, SCALE_ATREG, value); }
-	inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
-	{
-		return OpArg(offset, scale, base, scaled);
-	}
-	inline OpArg Imm8 (u8 imm)  {return OpArg(imm, SCALE_IMM8);}
-	inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
-	inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
-	inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
-#ifdef _M_X64
-	inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
-#else
-	inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
-#endif
-
+	// Debug breakpoint
 	void INT3();
+
+	// Do nothing
 	void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
+
+	// Save energy in wait-loops on P4 only. Probably not too useful.
 	void PAUSE();
-	void RET();
+
+	// Flag control
 	void STC();
 	void CLC();
 	void CMC();
+
+	// These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD!
+	void LAHF(); // 3 cycle vector path
+	void SAHF(); // direct path fast
+
+
+	// Stack control
 	void PUSH(X64Reg reg);
 	void POP(X64Reg reg);
 	void PUSH(int bits, const OpArg &reg);
 	void POP(int bits, const OpArg &reg);
 	void PUSHF();
 	void POPF();
-	
-	typedef const u8* JumpTarget;
-	
-	struct FixupBranch
-	{
-		u8 *ptr;
-		int type; //0 = 8bit 1 = 32bit
-	};

+	// Flow control
+	void RET();
+	void RET_FAST();
+	void UD2();
 	FixupBranch J(bool force5bytes = false);

 	void JMP(const u8 * addr, bool force5Bytes = false);
@ -239,7 +286,7 @@ namespace Gen
 	void JMPptr(const OpArg &arg);
 	void JMPself(); //infinite loop!

-	void CALL(void *fnptr);
+	void CALL(const void *fnptr);
 	void CALLptr(OpArg arg);

 	FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
@ -248,66 +295,20 @@ namespace Gen

 	void SetJumpTarget(const FixupBranch &branch);

-	//WARNING - INC and DEC slow on Intel Core, but not on AMD, since it creates
-	//false flags dependencies because they only update a subset of the flags
-
-	// ector - I hereby BAN inc and dec due to their horribleness :P
-	// void INC(int bits, OpArg arg);
-	// void DEC(int bits, OpArg arg);
-
 	void SETcc(CCFlags flag, OpArg dest);
-	// Note: CMOV brings small if any benefit on current cpus, unfortunately.
+	// Note: CMOV brings small if any benefit on current cpus.
 	void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);

+	// Fences
 	void LFENCE();
 	void MFENCE();
 	void SFENCE();

+	// Bit scan
 	void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
 	void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit

-	//These two can not be executed on early Intel 64-bit CPU:s, only on AMD!
-
-	void LAHF(); // 3 cycle vector path
-	void SAHF(); // direct path fast
-	
-	//Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
-	//LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., 
-
-	//Actually REP MOVSD could be useful :P
-
-	void MOVNTI(int bits, OpArg dest, X64Reg src);
-
-	void MUL(int bits, OpArg src); //UNSIGNED
-	void DIV(int bits, OpArg src);
-	void IMUL(int bits, OpArg src); //SIGNED
-	void IDIV(int bits, OpArg src);
-	void IMUL(int bits, X64Reg regOp, OpArg src);
-	void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
-
-
-	void NEG(int bits, OpArg src);
-	void NOT(int bits, OpArg src);
-
-	void ROL(int bits, OpArg dest, OpArg shift);
-	void ROR(int bits, OpArg dest, OpArg shift);
-	void RCL(int bits, OpArg dest, OpArg shift);
-	void RCR(int bits, OpArg dest, OpArg shift);
-	void SHL(int bits, OpArg dest, OpArg shift);
-	void SHR(int bits, OpArg dest, OpArg shift);
-	void SAR(int bits, OpArg dest, OpArg shift);
-
-
-	void CWD(int bits = 16);
-	inline void CDQ() {CWD(32);}
-	inline void CQO() {CWD(64);}
-	void CBW(int bits = 8);
-	inline void CWDE() {CBW(16);}
-	inline void CDQE() {CBW(32);}
-
-	void LEA(int bits, X64Reg dest, OpArg src);
-
-
+	// Cache control
 	enum PrefetchLevel
 	{
 		PF_NTA, //Non-temporal (data used once and only once)
@ -316,58 +317,82 @@ namespace Gen
 		PF_T2,  //Levels 3+ (aliased to T0 on AMD)
 	};
 	void PREFETCH(PrefetchLevel level, OpArg arg);
-	
+	void MOVNTI(int bits, OpArg dest, X64Reg src);
+	void MOVNTDQ(OpArg arg, X64Reg regOp);
+	void MOVNTPS(OpArg arg, X64Reg regOp);
+	void MOVNTPD(OpArg arg, X64Reg regOp);

+	// Multiplication / division
+	void MUL(int bits, OpArg src); //UNSIGNED
+	void IMUL(int bits, OpArg src); //SIGNED
+	void IMUL(int bits, X64Reg regOp, OpArg src);
+	void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
+	void DIV(int bits, OpArg src);
+	void IDIV(int bits, OpArg src);
+
+	// Shift 
+	void ROL(int bits, OpArg dest, OpArg shift);
+	void ROR(int bits, OpArg dest, OpArg shift);
+	void RCL(int bits, OpArg dest, OpArg shift);
+	void RCR(int bits, OpArg dest, OpArg shift);
+	void SHL(int bits, OpArg dest, OpArg shift);
+	void SHR(int bits, OpArg dest, OpArg shift);
+	void SAR(int bits, OpArg dest, OpArg shift);
+
+	// Extend EAX into EDX in various ways
+	void CWD(int bits = 16);
+	inline void CDQ() {CWD(32);}
+	inline void CQO() {CWD(64);}
+	void CBW(int bits = 8);
+	inline void CWDE() {CBW(16);}
+	inline void CDQE() {CBW(32);}
+
+	// Load effective address
+	void LEA(int bits, X64Reg dest, OpArg src);
+
+	// Integer arithmetic
+	void NEG (int bits, OpArg src);
 	void ADD (int bits, const OpArg &a1, const OpArg &a2);
 	void ADC (int bits, const OpArg &a1, const OpArg &a2);
 	void SUB (int bits, const OpArg &a1, const OpArg &a2);
 	void SBB (int bits, const OpArg &a1, const OpArg &a2);
 	void AND (int bits, const OpArg &a1, const OpArg &a2);
+	void CMP (int bits, const OpArg &a1, const OpArg &a2);
+
+	// Bit operations
+	void NOT (int bits, OpArg src);
 	void OR  (int bits, const OpArg &a1, const OpArg &a2);
 	void XOR (int bits, const OpArg &a1, const OpArg &a2);
 	void MOV (int bits, const OpArg &a1, const OpArg &a2);
 	void TEST(int bits, const OpArg &a1, const OpArg &a2);
-	void CMP (int bits, const OpArg &a1, const OpArg &a2);
-	
-	// XCHG is SLOW and should be avoided.
-	//void XCHG(int bits, const OpArg &a1, const OpArg &a2);

+	// Are these useful at all? Consider removing.
+	void XCHG(int bits, const OpArg &a1, const OpArg &a2);
 	void XCHG_AHAL();
+
+	// Byte swapping (32 and 64-bit only).
 	void BSWAP(int bits, X64Reg reg);
+
+	// Sign/zero extension
 	void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
 	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); 

-	enum SSECompare
-	{
-		EQ = 0,
-		LT,
-		LE,
-		UNORD,
-		NEQ,
-		NLT,
-		NLE,
-		ORD,
-	};
-
 	// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
 	void STMXCSR(OpArg memloc);
 	void LDMXCSR(OpArg memloc);

-	// Regular SSE/SSE2 instructions
+	// Prefixes
+	void LOCK();
+	void REP();
+	void REPNE();
+
+	void FWAIT();
+
+	// SSE/SSE2: Floating point arithmetic
 	void ADDSS(X64Reg regOp, OpArg arg);  
 	void ADDSD(X64Reg regOp, OpArg arg);  
 	void SUBSS(X64Reg regOp, OpArg arg);  
 	void SUBSD(X64Reg regOp, OpArg arg);  
-	void CMPSS(X64Reg regOp, OpArg arg, u8 compare);  
-	void CMPSD(X64Reg regOp, OpArg arg, u8 compare);  
-	void ANDSS(X64Reg regOp, OpArg arg);  
-	void ANDSD(X64Reg regOp, OpArg arg);  
-	void ANDNSS(X64Reg regOp, OpArg arg); 
-	void ANDNSD(X64Reg regOp, OpArg arg); 
-	void ORSS(X64Reg regOp, OpArg arg);   
-	void ORSD(X64Reg regOp, OpArg arg);   
-	void XORSS(X64Reg regOp, OpArg arg);   
-	void XORSD(X64Reg regOp, OpArg arg);   
 	void MULSS(X64Reg regOp, OpArg arg);  
 	void MULSD(X64Reg regOp, OpArg arg);  
 	void DIVSS(X64Reg regOp, OpArg arg);  
@ -381,45 +406,65 @@ namespace Gen
 	void RSQRTSS(X64Reg regOp, OpArg arg);
 	void RSQRTSD(X64Reg regOp, OpArg arg);

-	void COMISS(X64Reg regOp, OpArg arg);
-	void COMISD(X64Reg regOp, OpArg arg);
+	// SSE/SSE2: Floating point bitwise (yes)
+	void CMPSS(X64Reg regOp, OpArg arg, u8 compare);  
+	void CMPSD(X64Reg regOp, OpArg arg, u8 compare);  
+	void ANDSS(X64Reg regOp, OpArg arg);  
+	void ANDSD(X64Reg regOp, OpArg arg);  
+	void ANDNSS(X64Reg regOp, OpArg arg); 
+	void ANDNSD(X64Reg regOp, OpArg arg); 
+	void ORSS(X64Reg regOp, OpArg arg);   
+	void ORSD(X64Reg regOp, OpArg arg);   
+	void XORSS(X64Reg regOp, OpArg arg);   
+	void XORSD(X64Reg regOp, OpArg arg);   

+	// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
 	void ADDPS(X64Reg regOp, OpArg arg); 
 	void ADDPD(X64Reg regOp, OpArg arg); 
 	void SUBPS(X64Reg regOp, OpArg arg); 
 	void SUBPD(X64Reg regOp, OpArg arg); 
 	void CMPPS(X64Reg regOp, OpArg arg, u8 compare);  
-	void CMPPD(X64Reg regOp, OpArg arg, u8 compare);  
-	void ANDPS(X64Reg regOp, OpArg arg); 
-	void ANDPD(X64Reg regOp, OpArg arg); 
-	void ANDNPS(X64Reg regOp, OpArg arg);
-	void ANDNPD(X64Reg regOp, OpArg arg);
-	void ORPS(X64Reg regOp, OpArg arg);  
-	void ORPD(X64Reg regOp, OpArg arg);  
-	void XORPS(X64Reg regOp, OpArg arg);  
-	void XORPD(X64Reg regOp, OpArg arg);  
-	void MULPS(X64Reg regOp, OpArg arg); 
-	void MULPD(X64Reg regOp, OpArg arg); 
-	void DIVPS(X64Reg regOp, OpArg arg); 
-	void DIVPD(X64Reg regOp, OpArg arg); 
-	void MINPS(X64Reg regOp, OpArg arg); 
-	void MINPD(X64Reg regOp, OpArg arg); 
-	void MAXPS(X64Reg regOp, OpArg arg); 
-	void MAXPD(X64Reg regOp, OpArg arg); 
+	void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
+	void MULPS(X64Reg regOp, OpArg arg);
+	void MULPD(X64Reg regOp, OpArg arg);
+	void DIVPS(X64Reg regOp, OpArg arg);
+	void DIVPD(X64Reg regOp, OpArg arg);
+	void MINPS(X64Reg regOp, OpArg arg);
+	void MINPD(X64Reg regOp, OpArg arg);
+	void MAXPS(X64Reg regOp, OpArg arg);
+	void MAXPD(X64Reg regOp, OpArg arg);
 	void SQRTPS(X64Reg regOp, OpArg arg);
 	void SQRTPD(X64Reg regOp, OpArg arg);
 	void RSQRTPS(X64Reg regOp, OpArg arg);
 	void RSQRTPD(X64Reg regOp, OpArg arg);
+
+	// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
+	void ANDPS(X64Reg regOp, OpArg arg); 
+	void ANDPD(X64Reg regOp, OpArg arg); 
+	void ANDNPS(X64Reg regOp, OpArg arg);
+	void ANDNPD(X64Reg regOp, OpArg arg);
+	void ORPS(X64Reg regOp, OpArg arg);
+	void ORPD(X64Reg regOp, OpArg arg);
+	void XORPS(X64Reg regOp, OpArg arg);
+	void XORPD(X64Reg regOp, OpArg arg);
+
+	// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
 	void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);  
 	void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);  
-
+	
+	// SSE/SSE2: Useful alternative to shuffle in some cases.
 	void MOVDDUP(X64Reg regOp, OpArg arg);

+	void UNPCKLPD(X64Reg dest, OpArg src);
+	void UNPCKHPD(X64Reg dest, OpArg src);
+
+	// SSE/SSE2: Compares.
 	void COMISS(X64Reg regOp, OpArg arg);
 	void COMISD(X64Reg regOp, OpArg arg);
 	void UCOMISS(X64Reg regOp, OpArg arg);
 	void UCOMISD(X64Reg regOp, OpArg arg);

+	// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
 	void MOVAPS(X64Reg regOp, OpArg arg);
 	void MOVAPD(X64Reg regOp, OpArg arg);
 	void MOVAPS(OpArg arg, X64Reg regOp);
@ -435,20 +480,20 @@ namespace Gen
 	void MOVSS(OpArg arg, X64Reg regOp);
 	void MOVSD(OpArg arg, X64Reg regOp);

-	void MOVMSKPS(X64Reg dest, OpArg arg);
-	void MOVMSKPD(X64Reg dest, OpArg arg);
-
 	void MOVD_xmm(X64Reg dest, const OpArg &arg);
 	void MOVQ_xmm(X64Reg dest, OpArg arg);
 	void MOVD_xmm(const OpArg &arg, X64Reg src);
 	void MOVQ_xmm(OpArg arg, X64Reg src);

+	// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
+	void MOVMSKPS(X64Reg dest, OpArg arg);
+	void MOVMSKPD(X64Reg dest, OpArg arg);
+
+	// SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
 	void MASKMOVDQU(X64Reg dest, X64Reg src);
 	void LDDQU(X64Reg dest, OpArg src);

-	void UNPCKLPD(X64Reg dest, OpArg src);
-	void UNPCKHPD(X64Reg dest, OpArg src);
-
+	// SSE/SSE2: Data type conversions.
 	void CVTPS2PD(X64Reg dest, OpArg src);
 	void CVTPD2PS(X64Reg dest, OpArg src);
 	void CVTSS2SD(X64Reg dest, OpArg src);
@ -458,7 +503,7 @@ namespace Gen
 	void CVTPD2DQ(X64Reg regOp, OpArg arg);
 	void CVTDQ2PS(X64Reg regOp, const OpArg &arg);

-	//Integer SSE instructions
+	// SSE2: Packed integer instructions
 	void PACKSSDW(X64Reg dest, OpArg arg);
 	void PACKSSWB(X64Reg dest, OpArg arg);
 	//void PACKUSDW(X64Reg dest, OpArg arg);
@ -528,42 +573,138 @@ namespace Gen

 	void RTDSC();

-void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
-void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
-void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
-void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
+	// Utility functions
+	// These only support u32 parameters, but that's enough for a lot of uses.
+	// These will destroy the 1 or 2 first "parameter regs".
+	void ABI_CallFunctionC(void *func, u32 param1);
+	void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
+	void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
+
+	// Pass a register as a parameter.
+	void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
+	void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
+
+	// A function that doesn't have any control over what it will do to regs,
+	// such as the dispatcher, should be surrounded by these.
+	void ABI_PushAllCalleeSavedRegsAndAdjustStack();
+	void ABI_PopAllCalleeSavedRegsAndAdjustStack();
+
+	// A function that doesn't know anything about its surroundings should
+	// be surrounded by these to establish a safe environment, where it can roam free.
+	// An example is a backpatch injected function.
+	void ABI_PushAllCallerSavedRegsAndAdjustStack();
+	void ABI_PopAllCallerSavedRegsAndAdjustStack();
+
+	unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
+	void ABI_AlignStack(unsigned int frameSize);
+	void ABI_RestoreStack(unsigned int frameSize);
+
+	// Sets up a __cdecl function.
+	// Only x64 really needs the parameter.
+	void ABI_EmitPrologue(int maxCallParams);
+	void ABI_EmitEpilogue(int maxCallParams);
+
+	#ifdef _M_IX86
+	inline int ABI_GetNumXMMRegs() { return 8; }  // _M_IX86 (32-bit x86) build: reports 8 XMM registers.
+	#else
+	inline int ABI_GetNumXMMRegs() { return 16; }  // Every other build (presumably x64): reports 16 XMM registers.
+	#endif  // _M_IX86
+
+	// Strange call wrappers.
+	void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
+	void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
+	void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
+	void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);

 #if defined(_M_IX86) || !defined(_WIN32)

-#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
-#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e)) 
-#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f)) 
-#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g)) 
+	#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
+	#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e)) 
+	#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f)) 
+	#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g)) 

-#define DECLARE_IMPORT(x)
+	#define DECLARE_IMPORT(x)

 #else

-// Comments from VertexLoader.cpp about these horrors:
+	// Comments from VertexLoader.cpp about these horrors:

-// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
-// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
-// want to grab the function pointers from the import table instead.
+	// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
+	// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
+	// want to grab the function pointers from the import table instead.

-void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
-void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
-void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
-void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
+	void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
+	void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
+	void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
+	void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);

-#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
-#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
-#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
-#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
+	#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
+	#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
+	#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
+	#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)

-#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
+	#define DECLARE_IMPORT(x) extern "C" void *__imp_##x

 #endif
+};  // class XEmitter

-}
+
+// Everything that needs to generate X86 code should inherit from this.
+// The class owns one region of executable memory and keeps the inherited XEmitter
+// pointed at it, so derived classes can call all the MOV etc functions without
+// having to prefix them with gen-> or something similar.
+//
+// NOTE(review): the class owns raw memory but is still implicitly copyable. Copying an
+// instance would leave two objects releasing the same region, so never copy one.
+class XCodeBlock : public XEmitter
+{
+protected:
+	u8 *region;          // Start of the owned code region; NULL while nothing is allocated.
+	size_t region_size;  // Size of the region in bytes; 0 while nothing is allocated.
+
+public:
+	XCodeBlock() : region(NULL), region_size(0) {}
+
+	// Releases the region if the owner forgot to. FreeCodeSpace() is a no-op when
+	// nothing is allocated, so no extra check is needed here.
+	virtual ~XCodeBlock() { FreeCodeSpace(); }
+
+	// Call this before you generate any code.
+	// Allocates "size" bytes of executable memory and points the emitter at its start.
+	// A region still held from an earlier call is released first instead of being leaked.
+	void AllocCodeSpace(int size)
+	{
+		FreeCodeSpace();
+
+		region_size = size;
+		region = (u8*)AllocateExecutableMemory(region_size);
+		// Never advertise a size without memory behind it, otherwise ClearCodeSpace()
+		// and GetSpaceLeft() would act on a region that does not exist.
+		if (!region)
+			region_size = 0;
+		SetCodePtr(region);
+	}
+
+	// Always clear code space with breakpoints, so that if someone accidentally executes
+	// uninitialized, it just breaks into the debugger.
+	// Also rewinds the emitter to the start of the region.
+	void ClearCodeSpace()
+	{
+		// x86/64: 0xCC = breakpoint
+		// memset must not be handed a NULL pointer, even with a zero length.
+		if (region)
+			memset(region, 0xCC, region_size);
+		ResetCodePtr();
+	}
+
+	// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
+	// Safe to call repeatedly, and safe to call when nothing was ever allocated.
+	void FreeCodeSpace()
+	{
+		if (!region)
+			return;
+
+		FreeMemoryPages(region, region_size);
+		region = NULL;
+		region_size = 0;
+		// Don't leave the emitter pointing into memory that was just released.
+		ResetCodePtr();
+	}
+
+	// Cannot currently be undone. Will write protect the entire code region.
+	// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
+	// Does nothing while no region is allocated.
+	void WriteProtect()
+	{
+		if (region)
+			WriteProtectMemory(region, region_size, true);
+	}
+
+	// Rewinds the emitter to the start of the region without touching its contents.
+	void ResetCodePtr()
+	{
+		SetCodePtr(region);
+	}
+
+	// Number of bytes between the emitter's current position and the end of the region.
+	size_t GetSpaceLeft() const
+	{
+		return region_size - (GetCodePtr() - region);
+	}
+};
+
+}  // namespace

 #endif