diff --git a/common/include/Utilities/PageFaultSource.h b/common/include/Utilities/PageFaultSource.h index 823ef804a8bf0..a4d971b0109a2 100644 --- a/common/include/Utilities/PageFaultSource.h +++ b/common/include/Utilities/PageFaultSource.h @@ -342,8 +342,8 @@ class SpatialArrayReserve : public BaseVmReserveListener struct _EXCEPTION_POINTERS; extern int SysPageFaultExceptionFilter(struct _EXCEPTION_POINTERS* eps); -# define PCSX2_PAGEFAULT_PROTECT __try -# define PCSX2_PAGEFAULT_EXCEPT __except(SysPageFaultExceptionFilter(GetExceptionInformation())) {} +# define PCSX2_PAGEFAULT_PROTECT __try +# define PCSX2_PAGEFAULT_EXCEPT __except(SysPageFaultExceptionFilter(GetExceptionInformation())) {} #else # error PCSX2 - Unsupported operating system platform. @@ -352,5 +352,7 @@ extern int SysPageFaultExceptionFilter(struct _EXCEPTION_POINTERS* eps); extern void pxInstallSignalHandler(); extern void _platform_InstallSignalHandler(); +#include "Threading.h" extern SrcType_PageFault* Source_PageFault; +extern Threading::Mutex PageFault_Mutex; diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h index 85f0134117f98..6e6edb1e9dbe4 100644 --- a/common/include/Utilities/Threading.h +++ b/common/include/Utilities/Threading.h @@ -179,17 +179,20 @@ namespace Threading // from these little beasties! (these are all implemented internally using cross-platform // implementations of _InterlockedExchange and such) + extern u32 AtomicRead( volatile u32& Target ); + extern s32 AtomicRead( volatile s32& Target ); extern u32 AtomicExchange( volatile u32& Target, u32 value ); - extern u32 AtomicExchangeAdd( volatile u32& Target, u32 value ); - extern u32 AtomicIncrement( volatile u32& Target ); - extern u32 AtomicDecrement( volatile u32& Target ); extern s32 AtomicExchange( volatile s32& Target, s32 value ); + extern u32 AtomicExchangeAdd( volatile u32& Target, u32 value ); extern s32 AtomicExchangeAdd( volatile s32& Target, s32 value ); extern s32 AtomicExchangeSub( volatile s32& Target, s32 value ); + extern u32 AtomicIncrement( volatile u32& Target ); extern s32 AtomicIncrement( volatile s32& Target ); + extern u32 AtomicDecrement( volatile u32& Target ); extern s32 AtomicDecrement( volatile s32& Target ); extern bool AtomicBitTestAndReset( volatile u32& bitset, u8 bit ); + extern bool AtomicBitTestAndReset( volatile s32& bitset, u8 bit ); extern void* _AtomicExchangePointer( volatile uptr& target, uptr value ); extern void* _AtomicCompareExchangePointer( volatile uptr& target, uptr value, uptr comparand ); @@ -393,5 +396,34 @@ namespace Threading bool Failed() const { return !m_IsLocked; } }; + +// -------------------------------------------------------------------------------------- +// ScopedLockBool +// -------------------------------------------------------------------------------------- +// A ScopedLock in which you specify an external bool to get updated on locks/unlocks. +// Note that the isLockedBool should only be used as an indicator for the locked status, +// and not actually depended on for thread synchronization... 
+ + struct ScopedLockBool { + ScopedLock m_lock; + volatile __aligned(4) bool& m_bool; + + ScopedLockBool(Mutex& mutexToLock, volatile __aligned(4) bool& isLockedBool) + : m_lock(mutexToLock), + m_bool(isLockedBool) { + m_bool = m_lock.IsLocked(); + } + virtual ~ScopedLockBool() throw() { + m_bool = false; + } + void Acquire() { + m_lock.Acquire(); + m_bool = m_lock.IsLocked(); + } + void Release() { + m_bool = false; + m_lock.Release(); + } + }; } diff --git a/common/include/x86emitter/x86types.h b/common/include/x86emitter/x86types.h index c862ee25c00b4..9c5ead8fc72dd 100644 --- a/common/include/x86emitter/x86types.h +++ b/common/include/x86emitter/x86types.h @@ -35,10 +35,12 @@ enum XMMSSEType // as a project option. The multithreaded emitter relies on native compiler support for // TLS -- Macs are crap out of luck there (for now). +#include "Utilities/Threading.h" + #ifndef x86EMIT_MULTITHREADED -# define x86EMIT_MULTITHREADED 0 -#else -# if !PCSX2_THREAD_LOCAL +# if PCSX2_THREAD_LOCAL +# define x86EMIT_MULTITHREADED 1 +# else // No TLS support? Force-clear the MT flag: # pragma message("x86emitter: TLS not available, multithreaded emitter disabled.") # undef x86EMIT_MULTITHREADED diff --git a/common/src/Utilities/Linux/LnxHostSys.cpp b/common/src/Utilities/Linux/LnxHostSys.cpp index ffacbc92321c5..c2804862baa9a 100644 --- a/common/src/Utilities/Linux/LnxHostSys.cpp +++ b/common/src/Utilities/Linux/LnxHostSys.cpp @@ -46,6 +46,12 @@ static void SysPageFaultSignalFilter( int signal, siginfo_t *siginfo, void * ) // Note: Use of stdio functions isn't safe here. Avoid console logs, // assertions, file logs, or just about anything else useful. + + // Note: This signal can be accessed by the EE or MTVU thread + // Source_PageFault is a global variable with its own state information + // so for now we lock this exception code unless someone can fix this better... + Threading::ScopedLock lock(PageFault_Mutex); + Source_PageFault->Dispatch( PageFaultInfo( (uptr)siginfo->si_addr & ~m_pagemask ) ); // resumes execution right where we left off (re-executes instruction that diff --git a/common/src/Utilities/ThreadTools.cpp b/common/src/Utilities/ThreadTools.cpp index 10ef9ea0b4deb..01ea10f1b1399 100644 --- a/common/src/Utilities/ThreadTools.cpp +++ b/common/src/Utilities/ThreadTools.cpp @@ -786,72 +786,70 @@ void Threading::WaitEvent::Wait() // InterlockedExchanges / AtomicExchanges (PCSX2's Helper versions) // -------------------------------------------------------------------------------------- // define some overloads for InterlockedExchanges for commonly used types, like u32 and s32. +// Note: For all of these atomic operations below to be atomic, the variables need to be 4-byte +// aligned. 
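The two Threading.h additions above are meant to work together. Here is a hypothetical usage sketch (not part of the patch; names are illustration-only): AtomicRead polls a 4-byte-aligned counter lock-free, while ScopedLockBool mirrors a mutex's held state into a flag that other threads may treat as a hint only.

#include "Utilities/Threading.h"

using namespace Threading;

static Mutex s_mtxWork;
static volatile __aligned(4) bool s_busy = false; // mirrored by ScopedLockBool
static volatile u32 s_itemsDone = 0;              // 4-byte aligned, so plain reads are atomic

void WorkerDoOneItem()
{
    ScopedLockBool busy(s_mtxWork, s_busy); // sets s_busy while the mutex is held
    // ... process one unit of work under the lock ...
    AtomicIncrement(s_itemsDone);
} // ~ScopedLockBool() clears s_busy on scope exit

u32 PollProgressFromAnyThread()
{
    // s_busy is only an indicator; real synchronization still goes through s_mtxWork
    return AtomicRead(s_itemsDone);
}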
Read: http://msdn.microsoft.com/en-us/library/ms684122%28v=vs.85%29.aspx -__fi bool Threading::AtomicBitTestAndReset( volatile u32& bitset, u8 bit ) -{ - return _interlockedbittestandreset( (volatile long*)& bitset, bit ) != 0; +__fi u32 Threading::AtomicRead(volatile u32& Target) { + return Target; // Properly-aligned 32-bit reads are atomic } - -__fi u32 Threading::AtomicExchange( volatile u32& Target, u32 value ) -{ - return _InterlockedExchange( (volatile long*)&Target, value ); +__fi s32 Threading::AtomicRead(volatile s32& Target) { + return Target; // Properly-aligned 32-bit reads are atomic } -__fi u32 Threading::AtomicExchangeAdd( volatile u32& Target, u32 value ) -{ - return _InterlockedExchangeAdd( (volatile long*)&Target, value ); +__fi bool Threading::AtomicBitTestAndReset( volatile u32& bitset, u8 bit ) { + return _interlockedbittestandreset( (volatile long*)& bitset, bit ) != 0; } - -__fi u32 Threading::AtomicIncrement( volatile u32& Target ) -{ - return _InterlockedExchangeAdd( (volatile long*)&Target, 1 ); +__fi bool Threading::AtomicBitTestAndReset( volatile s32& bitset, u8 bit ) { + return _interlockedbittestandreset( (volatile long*)& bitset, bit ) != 0; } -__fi u32 Threading::AtomicDecrement( volatile u32& Target ) -{ - return _InterlockedExchangeAdd( (volatile long*)&Target, -1 ); +__fi u32 Threading::AtomicExchange(volatile u32& Target, u32 value ) { + return _InterlockedExchange( (volatile long*)&Target, value ); } - -__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value ) -{ +__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value ) { return _InterlockedExchange( (volatile long*)&Target, value ); } -__fi s32 Threading::AtomicExchangeAdd( volatile s32& Target, s32 value ) -{ +__fi u32 Threading::AtomicExchangeAdd( volatile u32& Target, u32 value ) { + return _InterlockedExchangeAdd( (volatile long*)&Target, value ); +} +__fi s32 Threading::AtomicExchangeAdd( volatile s32& Target, s32 value ) { return _InterlockedExchangeAdd( (volatile long*)&Target, value ); } -__fi s32 Threading::AtomicExchangeSub( volatile s32& Target, s32 value ) -{ +__fi s32 Threading::AtomicExchangeSub( volatile s32& Target, s32 value ) { return _InterlockedExchangeAdd( (volatile long*)&Target, -value ); } -__fi s32 Threading::AtomicIncrement( volatile s32& Target ) -{ +__fi u32 Threading::AtomicIncrement( volatile u32& Target ) { + return _InterlockedExchangeAdd( (volatile long*)&Target, 1 ); +} +__fi s32 Threading::AtomicIncrement( volatile s32& Target) { return _InterlockedExchangeAdd( (volatile long*)&Target, 1 ); } -__fi s32 Threading::AtomicDecrement( volatile s32& Target ) -{ +__fi u32 Threading::AtomicDecrement( volatile u32& Target ) { return _InterlockedExchangeAdd( (volatile long*)&Target, -1 ); } +__fi s32 Threading::AtomicDecrement(volatile s32& Target) { + return _InterlockedExchangeAdd((volatile long*)&Target, -1); +} -__fi void* Threading::_AtomicExchangePointer( volatile uptr& target, uptr value ) +__fi void* Threading::_AtomicExchangePointer(volatile uptr& target, uptr value) { #ifdef _M_AMD64 // high-level atomic ops, please leave these 64 bit checks in place. 
- return (void*)_InterlockedExchange64( &(volatile s64&)target, value ); + return (void*)_InterlockedExchange64(&(volatile s64&)target, value); #else - return (void*)_InterlockedExchange( (volatile long*)&target, value ); + return (void*)_InterlockedExchange((volatile long*)&target, value); #endif } -__fi void* Threading::_AtomicCompareExchangePointer( volatile uptr& target, uptr value, uptr comparand ) +__fi void* Threading::_AtomicCompareExchangePointer(volatile uptr& target, uptr value, uptr comparand) { #ifdef _M_AMD64 // high-level atomic ops, please leave these 64 bit checks in place. - return (void*)_InterlockedCompareExchange64( &(volatile s64&)target, value ); + return (void*)_InterlockedCompareExchange64(&(volatile s64&)target, value); #else - return (void*)_InterlockedCompareExchange( &(volatile long&)target, value, comparand ); + return (void*)_InterlockedCompareExchange(&(volatile long&)target, value, comparand); #endif } diff --git a/common/src/Utilities/VirtualMemory.cpp b/common/src/Utilities/VirtualMemory.cpp index 02521f4a5cca4..2e8742354e8b4 100644 --- a/common/src/Utilities/VirtualMemory.cpp +++ b/common/src/Utilities/VirtualMemory.cpp @@ -26,11 +26,11 @@ template class EventSource< IEventListener_PageFault >; SrcType_PageFault* Source_PageFault = NULL; +Threading::Mutex PageFault_Mutex; void pxInstallSignalHandler() { - if (!Source_PageFault) - { + if(!Source_PageFault) { Source_PageFault = new SrcType_PageFault(); } diff --git a/common/src/Utilities/Windows/WinHostSys.cpp b/common/src/Utilities/Windows/WinHostSys.cpp index f010aeec9307b..965e6ae287c8d 100644 --- a/common/src/Utilities/Windows/WinHostSys.cpp +++ b/common/src/Utilities/Windows/WinHostSys.cpp @@ -25,6 +25,10 @@ int SysPageFaultExceptionFilter( EXCEPTION_POINTERS* eps ) if( eps->ExceptionRecord->ExceptionCode != EXCEPTION_ACCESS_VIOLATION ) return EXCEPTION_CONTINUE_SEARCH; + // Note: This exception can be accessed by the EE or MTVU thread + // Source_PageFault is a global variable with its own state information + // so for now we lock this exception code unless someone can fix this better... + Threading::ScopedLock lock(PageFault_Mutex); Source_PageFault->Dispatch( PageFaultInfo( (uptr)eps->ExceptionRecord->ExceptionInformation[1] ) ); return Source_PageFault->WasHandled() ? EXCEPTION_CONTINUE_EXECUTION : EXCEPTION_CONTINUE_SEARCH; } diff --git a/pcsx2/Config.h b/pcsx2/Config.h index 1d56d24060909..36194e2a910ce 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -377,7 +377,8 @@ struct Pcsx2Config IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits. 
WaitLoop :1, // enables constant loop detection and fast-forwarding vuFlagHack :1, // microVU specific flag hack - vuBlockHack :1; // microVU specific block flag no-propagation hack + vuBlockHack :1, // microVU specific block flag no-propagation hack + vuThread :1; // Enable Threaded VU1 BITFIELD_END u8 EECycleRate; // EE cycle rate selector (1.0, 1.5, 2.0) @@ -471,6 +472,7 @@ TraceLogFilters& SetTraceConfig(); // ------------ CPU / Recompiler Options --------------- +#define THREAD_VU1 (EmuConfig.Cpu.Recompiler.UseMicroVU1 && EmuConfig.Speedhacks.vuThread) #define CHECK_MICROVU0 (EmuConfig.Cpu.Recompiler.UseMicroVU0) #define CHECK_MICROVU1 (EmuConfig.Cpu.Recompiler.UseMicroVU1) #define CHECK_EEREC (EmuConfig.Cpu.Recompiler.EnableEE && GetCpuProviders().IsRecAvailable_EE()) diff --git a/pcsx2/FiFo.cpp b/pcsx2/FiFo.cpp index 0eb816b7beaa7..27ae9eead58e1 100644 --- a/pcsx2/FiFo.cpp +++ b/pcsx2/FiFo.cpp @@ -17,9 +17,8 @@ #include "PrecompiledHeader.h" #include "Common.h" -#include "Gif.h" -#include "Gif_Unit.h" #include "GS.h" +#include "Gif_Unit.h" #include "Vif.h" #include "Vif_Dma.h" #include "IPU/IPU.h" diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index 779f10a8e6739..b00e8b65f385f 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -19,7 +19,6 @@ #include #include "GS.h" -#include "Gif.h" #include "Gif_Unit.h" #include "Counters.h" diff --git a/pcsx2/GS.h b/pcsx2/GS.h index e3fac7b2c8ffa..a9631824b3824 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -245,6 +245,7 @@ enum MTGS_RingCommand , GS_RINGTYPE_MODECHANGE // for issued mode changes. , GS_RINGTYPE_CRC , GS_RINGTYPE_GSPACKET +, GS_RINGTYPE_MTVU_GSPACKET }; @@ -263,8 +264,8 @@ class SysMtgsThread : public SysThreadBase public: // note: when m_ReadPos == m_WritePos, the fifo is empty - uint m_ReadPos; // cur pos gs is reading from - uint m_WritePos; // cur pos ee thread is writing to + __aligned(4) uint m_ReadPos; // cur pos gs is reading from + __aligned(4) uint m_WritePos; // cur pos ee thread is writing to volatile bool m_RingBufferIsBusy; volatile u32 m_SignalRingEnable; @@ -273,7 +274,9 @@ class SysMtgsThread : public SysThreadBase volatile s32 m_QueuedFrameCount; volatile u32 m_VsyncSignalListener; - Mutex m_mtx_RingBufferBusy; + Mutex m_mtx_RingBufferBusy; // Is obtained while processing ring-buffer data + Mutex m_mtx_RingBufferBusy2; // This one gets released on semaXGkick waiting... + Mutex m_mtx_WaitGS; Semaphore m_sem_OnRingReset; Semaphore m_sem_Vsync; @@ -304,8 +307,7 @@ class SysMtgsThread : public SysThreadBase virtual ~SysMtgsThread() throw(); // Waits for the GS to empty out the entire ring buffer contents. - // Used primarily for plugin startup/shutdown. 
- void WaitGS(); + void WaitGS(bool syncRegs=true, bool weakWait=false, bool isMTVU=false); void ResetGS(); void PrepDataPacket( MTGS_RingCommand cmd, u32 size ); diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp index 642b682a54531..0a085b87d419b 100644 --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ -17,7 +17,6 @@ #include "Common.h" #include "GS.h" -#include "Gif.h" #include "Gif_Unit.h" #include "Vif_Dma.h" @@ -87,6 +86,7 @@ __fi void gifInterrupt() } static u32 WRITERING_DMA(u32 *pMem, u32 qwc) { + //qwc = min(qwc, 1024u); uint size = gifUnit.TransferGSPacketData(GIF_TRANS_DMA, (u8*)pMem, qwc*16) / 16; incGifChAddr(size); return size; diff --git a/pcsx2/Gif.h b/pcsx2/Gif.h index 81238a269e562..bdbe46fe4a6bc 100644 --- a/pcsx2/Gif.h +++ b/pcsx2/Gif.h @@ -35,15 +35,17 @@ enum GIF_PATH { enum GIF_TRANSFER_TYPE { GIF_TRANS_INVALID = 0x000, // Invalid GIF_TRANS_XGKICK = 0x100, // Path 1 - GIF_TRANS_DIRECT = 0x201, // Path 2 - GIF_TRANS_DIRECTHL = 0x301, // Path 2 - GIF_TRANS_DMA = 0x402, // Path 3 - GIF_TRANS_FIFO = 0x502 // Path 3 + GIF_TRANS_MTVU = 0x200, // Path 1 + GIF_TRANS_DIRECT = 0x301, // Path 2 + GIF_TRANS_DIRECTHL = 0x401, // Path 2 + GIF_TRANS_DMA = 0x502, // Path 3 + GIF_TRANS_FIFO = 0x602 // Path 3 }; -static const char Gif_TransferStr[6][32] = { +static const char Gif_TransferStr[7][32] = { "Invalid Transfer Type", "GIF_TRANS_XGKICK", + "GIF_TRANS_MTVU", "GIF_TRANS_DIRECT", "GIF_TRANS_DIRECTHL", "GIF_TRANS_DMA", diff --git a/pcsx2/Gif_Logger.cpp b/pcsx2/Gif_Logger.cpp index bf28de58764e2..7811c3cc1e8d0 100644 --- a/pcsx2/Gif_Logger.cpp +++ b/pcsx2/Gif_Logger.cpp @@ -15,7 +15,6 @@ #include "PrecompiledHeader.h" #include "Common.h" -#include "Gif.h" #include "Gif_Unit.h" #define GIF_PARSE DevCon.WriteLn diff --git a/pcsx2/Gif_Unit.cpp b/pcsx2/Gif_Unit.cpp index 206043a0ffc08..eefd76cf07c41 100644 --- a/pcsx2/Gif_Unit.cpp +++ b/pcsx2/Gif_Unit.cpp @@ -19,6 +19,7 @@ #include "GS.h" #include "Gif_Unit.h" #include "Vif_Dma.h" +#include "MTVU.h" Gif_Unit gifUnit; @@ -76,12 +77,32 @@ bool Gif_HandlerAD(u8* pMem) { return false; } +// Returns true if pcsx2 needed to process the packet... +bool Gif_HandlerAD_Debug(u8* pMem) { + u32 reg = pMem[8]; + if (reg == 0x50) { Console.Error("GIF Handler Debug - BITBLTBUF"); return 1; } + elif (reg == 0x52) { Console.Error("GIF Handler Debug - TRXREG"); return 1; } + elif (reg == 0x53) { Console.Error("GIF Handler Debug - TRXDIR"); return 1; } + elif (reg == 0x60) { Console.Error("GIF Handler Debug - SIGNAL"); return 1; } + elif (reg == 0x61) { Console.Error("GIF Handler Debug - FINISH"); return 1; } + elif (reg == 0x62) { Console.Error("GIF Handler Debug - LABEL"); return 1; } + elif (reg >= 0x63 && reg != 0x7f) { + DevCon.Warning("GIF Handler Debug - Write to unknown register! [reg=%x]", reg); + } + return 0; +} + void Gif_FinishIRQ() { if (CSRreg.FINISH && !(GSIMR&0x200)) { gsIrq(); } } +// Used in MTVU mode... 
MTVU will later complete a real packet
+void Gif_AddGSPacketMTVU(GS_Packet& gsPack, GIF_PATH path) {
+	GetMTGS().SendSimpleGSPacket(GS_RINGTYPE_MTVU_GSPACKET, 0, 0, path);
+}
+
 void Gif_AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path) {
 	//DevCon.WriteLn("Adding Completed Gif Packet [size=%x]", gsPack.size);
 	if (COPY_GS_PACKET_TO_MTGS) {
@@ -91,6 +112,7 @@ void Gif_AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path) {
 		GetMTGS().SendDataPacket();
 	}
 	else {
+		pxAssertDev(!gsPack.readAmount, "Gif Unit - gsPack.readAmount only valid for MTVU path 1!");
 		AtomicExchangeAdd(gifUnit.gifPath[path].readAmount, gsPack.size);
 		GetMTGS().SendSimpleGSPacket(GS_RINGTYPE_GSPACKET, gsPack.offset, gsPack.size, path);
 	}
@@ -102,35 +124,47 @@ void Gif_AddBlankGSPacket(u32 size, GIF_PATH path) {
 	GetMTGS().SendSimpleGSPacket(GS_RINGTYPE_GSPACKET, ~0u, size, path);
 }
 
-void Gif_MTGS_Wait() {
-	GetMTGS().WaitGS();
-}
-
-void Gif_Execute() {
-	gifUnit.Execute();
+void Gif_MTGS_Wait(bool isMTVU) {
+	GetMTGS().WaitGS(false, true, isMTVU);
 }
 
 void SaveStateBase::gifPathFreeze(u32 path) {
 	Gif_Path& gifPath = gifUnit.gifPath[path];
-	pxAssertDev(gifPath.readAmount==0, "Gif Path readAmount should be 0!");
+	pxAssertDev(!gifPath.readAmount, "Gif Path readAmount should be 0!");
+	pxAssertDev(!gifPath.gsPack.readAmount, "GS Pack readAmount should be 0!");
+	pxAssertDev(!gifPath.GetPendingGSPackets(), "MTVU GS Pack Queue should be 0!");
+
 	if (IsSaving()) { // Move all the buffered data to the start of buffer
 		gifPath.RealignPacket(); // May add readAmount which we need to clear on load
 	}
 	u8* bufferPtr = gifPath.buffer; // Backup current buffer ptr
-	Freeze(gifPath);
+	Freeze(gifPath.mtvu.fakePackets);
+	FreezeMem(&gifPath, sizeof(gifPath) - sizeof(gifPath.mtvu));
 	FreezeMem(bufferPtr, gifPath.curSize);
 	gifPath.buffer = bufferPtr;
-	if (!IsSaving()) gifPath.readAmount = 0;
+	if(!IsSaving()) {
+		gifPath.readAmount = 0;
+		gifPath.gsPack.readAmount = 0;
+	}
 }
 
 void SaveStateBase::gifFreeze() {
-	Gif_MTGS_Wait();
+	bool mtvuMode = THREAD_VU1;
+	pxAssert(vu1Thread.IsDone());
+	GetMTGS().WaitGS();
 	FreezeTag("Gif Unit");
+	Freeze(mtvuMode);
 	Freeze(gifUnit.stat);
 	Freeze(gifUnit.gsSIGNAL);
 	Freeze(gifUnit.lastTranType);
 	gifPathFreeze(GIF_PATH_1);
 	gifPathFreeze(GIF_PATH_2);
 	gifPathFreeze(GIF_PATH_3);
+	if (!IsSaving()) {
+		if (mtvuMode != THREAD_VU1) {
+			DevCon.Warning("gifUnit: MTVU Mode has switched between save/load state");
+			// ToDo: gifUnit.SwitchMTVU(mtvuMode);
+		}
+	}
 }
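Before the Gif_Unit.h changes that follow, a distilled model of the handoff these functions implement may help. This is a hypothetical sketch, not patch code (the real implementation is Gif_Path_MTVU and the MTVU methods of Gif_Path below): the EE thread only counts placeholder packets, the MTVU thread publishes finished XGKICK packets to a mutex-guarded queue, and the MTGS thread pops them.

#include <deque>
#include <mutex>

struct MiniPath1Handoff {          // stand-in names, illustration only
    unsigned fakePackets;          // EE thread: placeholders still pending
    std::mutex mtx;                // guards 'finished' below
    std::deque<int> finished;      // MTVU thread -> MTGS thread

    MiniPath1Handoff() : fakePackets(0) {}

    void EE_QueuePlaceholder() { fakePackets++; } // EE thread only
    void VU_PublishPacket(int id) {               // MTVU thread, after XGKICK
        std::lock_guard<std::mutex> lock(mtx);
        finished.push_back(id);
    }
    bool MTGS_PopPacket(int& id) {                // MTGS thread
        std::lock_guard<std::mutex> lock(mtx);
        if (finished.empty()) return false;
        id = finished.front();
        finished.pop_front();
        return true;
    }
};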
diff --git a/pcsx2/Gif_Unit.h b/pcsx2/Gif_Unit.h
index 9649bf1f9e797..dc792c3dd85eb 100644
--- a/pcsx2/Gif_Unit.h
+++ b/pcsx2/Gif_Unit.h
@@ -14,11 +14,16 @@
  */
 
 #pragma once
+#include <deque>
 #include "System/SysThreads.h"
+#include "Gif.h"
 
 struct GS_Packet;
-extern void Gif_MTGS_Wait();
+extern void Gif_MTGS_Wait(bool isMTVU);
 extern void Gif_FinishIRQ();
 extern bool Gif_HandlerAD(u8* pMem);
+extern bool Gif_HandlerAD_Debug(u8* pMem);
+extern void Gif_AddBlankGSPacket(u32 size, GIF_PATH path);
+extern void Gif_AddGSPacketMTVU (GS_Packet& gsPack, GIF_PATH path);
 extern void Gif_AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path);
 extern void Gif_ParsePacket(u8* data, u32 size, GIF_PATH path);
 extern void Gif_ParsePacket(GS_Packet& gsPack, GIF_PATH path);
@@ -105,10 +110,11 @@ struct Gif_Tag {
 };
 
 struct GS_Packet {
-	u32  offset; // Path buffer offset for start of packet
-	u32  size;   // Full size of GS-Packet
-	s32  cycles; // EE Cycles taken to process this GS packet
-	bool done;   // 0 = Incomplete, 1 = Complete
+	u32  offset;     // Path buffer offset for start of packet
+	u32  size;       // Full size of GS-Packet
+	s32  cycles;     // EE Cycles taken to process this GS packet
+	s32  readAmount; // Dummy read-amount data needed for proper buffer calculations
+	bool done;       // 0 = Incomplete, 1 = Complete
 	GS_Packet() { Reset(); }
 	void Reset() { memzero(*this); }
 };
@@ -124,8 +130,16 @@ static __fi void incTag(u32& offset, u32& size, u32 incAmount) {
 	offset += incAmount;
 }
 
+struct Gif_Path_MTVU {
+	u32 fakePackets;   // Fake packets pending to be sent to MTGS
+	Mutex gsPackMutex; // Used for atomic access to gsPackQueue
+	std::deque<GS_Packet> gsPackQueue; // VU1 programs' XGkick(s)
+	Gif_Path_MTVU() { Reset(); }
+	void Reset() { fakePackets = 0; gsPackQueue.clear(); }
+};
+
 struct Gif_Path {
-	volatile s32 __aligned(4) readAmount; // Amount of data MTGS still needs to read
+	__aligned(4) volatile s32 readAmount; // Amount of data MTGS still needs to read
 	u8* buffer;    // Path packet buffer
 	u32 buffSize;  // Full size of buffer
 	u32 buffLimit; // Cut off limit to wrap around
@@ -135,6 +149,7 @@ struct Gif_Path {
 	GS_Packet gsPack; // Current GS Packet info
 	GIF_PATH idx;     // Gif Path Index
 	GIF_PATH_STATE state; // Path State
+	Gif_Path_MTVU mtvu;   // Must be last for saved states
 
 	Gif_Path() {}
 	~Gif_Path() { _aligned_free(buffer); }
@@ -156,6 +171,7 @@
 			//curOffset = curSize;
 			return;
 		}
+		mtvu.Reset();
 		curSize   = 0;
 		curOffset = 0;
 		readAmount = 0;
@@ -163,32 +179,38 @@
 		gsPack.Reset();
 	}
 
+	bool isMTVU() { return !idx && THREAD_VU1; }
+	s32  getReadAmount() { return AtomicRead(readAmount) + gsPack.readAmount; }
 	bool hasDataRemaining() { return curOffset < curSize; }
-	bool isDone() { return !hasDataRemaining() && state == GIF_PATH_IDLE; }
+	bool isDone() { return isMTVU() ? !mtvu.fakePackets
+	                                : (!hasDataRemaining() && state == GIF_PATH_IDLE); }
 
 	// Waits on the MTGS to process gs packets
 	void mtgsReadWait() {
-		//pxAssertDev(AtomicExchangeAdd(readAmount, 0) != 0, "Gif Path Buffer Overflow!");
-		DevCon.WriteLn(Color_Red, "Gif Path[%d] - MTGS Wait! [r=0x%x]",
-			idx+1, AtomicExchangeAdd(readAmount, 0));
-		Gif_MTGS_Wait();
+		if (IsDevBuild) {
+			DevCon.WriteLn(Color_Red, "Gif Path[%d] - MTGS Wait! [r=0x%x]", idx+1, getReadAmount());
+			Gif_MTGS_Wait(isMTVU());
+			DevCon.WriteLn(Color_Green, "Gif Path[%d] - MTGS Wait! 
[r=0x%x]", idx+1, getReadAmount()); + return; + } + Gif_MTGS_Wait(isMTVU()); } // Moves packet data to start of buffer void RealignPacket() { - extern void Gif_AddBlankGSPacket(u32 size, GIF_PATH path); GUNIT_LOG("Path Buffer: Realigning packet!"); s32 offset = curOffset - gsPack.size; s32 sizeToAdd = curSize - offset; s32 intersect = sizeToAdd - offset; if (intersect < 0) intersect = 0; for(;;) { - s32 frontFree = offset - AtomicExchangeAdd(readAmount, 0); + s32 frontFree = offset - getReadAmount(); if (frontFree >= sizeToAdd - intersect) break; mtgsReadWait(); } if (offset < (s32)buffLimit) { // Needed for correct readAmount values - Gif_AddBlankGSPacket(buffLimit - offset, idx); + if (isMTVU()) gsPack.readAmount += buffLimit - offset; + else Gif_AddBlankGSPacket(buffLimit - offset, idx); } //DevCon.WriteLn("Realign Packet [%d]", curSize - offset); if (intersect) memmove(buffer, &buffer[offset], curSize - offset); @@ -200,12 +222,12 @@ struct Gif_Path { void CopyGSPacketData(u8* pMem, u32 size, bool aligned = false) { if (curSize + size > buffSize) { // Move gsPack to front of buffer - DevCon.Warning("CopyGSPacketData: Realigning packet!"); + GUNIT_LOG("CopyGSPacketData: Realigning packet!"); RealignPacket(); } for(;;) { s32 offset = curOffset - gsPack.size; - s32 readPos = offset - AtomicExchangeAdd(readAmount, 0); + s32 readPos = offset - getReadAmount(); if (readPos >= 0) break; // MTGS is reading in back of curOffset if ((s32)buffLimit + readPos > (s32)curSize + (s32)size) break; // Enough free front space mtgsReadWait(); // Let MTGS run to free up buffer space @@ -217,12 +239,21 @@ struct Gif_Path { } // If completed a GS packet (with EOP) then returned GS_Packet.done = 1 + // MTVU: This function only should be called called on EE thread GS_Packet ExecuteGSPacket() { + if (mtvu.fakePackets) { // For MTVU mode... + mtvu.fakePackets--; + GS_Packet fakePack; + fakePack.done = 1; // Fake packets don't get processed by pcsx2 + fakePack.size =~0u; // Used to indicate that its a fake packet + return fakePack; + } + pxAssert(!isMTVU()); for(;;) { if (!gifTag.isValid) { // Need new Gif Tag // We don't have enough data for a Gif Tag if (curOffset + 16 > curSize) { - GUNIT_LOG("Path Buffer: Not enough data for gif tag! [%d]", curSize-curOffset); + //GUNIT_LOG("Path Buffer: Not enough data for gif tag! [%d]", curSize-curOffset); return gsPack; } @@ -249,7 +280,7 @@ struct Gif_Path { while(gifTag.nLoop && !dblSIGNAL) { if (curOffset + 16 > curSize) return gsPack; // Exit Early if (gifTag.curReg() == GIF_REG_A_D) { - dblSIGNAL = Gif_HandlerAD(&buffer[curOffset]); + if (!isMTVU()) dblSIGNAL = Gif_HandlerAD(&buffer[curOffset]); } incTag(curOffset, gsPack.size, 16); // 1 QWC gifTag.packedStep(); @@ -271,6 +302,84 @@ struct Gif_Path { } } } + + // MTVU: Gets called on VU XGkicks on MTVU thread + void ExecuteGSPacketMTVU() { + // Move packet to start of buffer + if (curOffset > buffLimit) { + RealignPacket(); + } + if (IsDevBuild) { // We check the packet to see if it actually + for(;;) { // needed to be processed by pcsx2... 
+ if (curOffset + 16 > curSize) break; + gifTag.setTag(&buffer[curOffset], 1); + + if(!gifTag.hasAD && curOffset + 16 + gifTag.len > curSize) break; + incTag(curOffset, gsPack.size, 16); // Tag Size + + if (gifTag.hasAD) { // Only can be true if GIF_FLG_PACKED + while(gifTag.nLoop) { + if (curOffset + 16 > curSize) break; // Exit Early + if (gifTag.curReg() == GIF_REG_A_D) { + pxAssert(!Gif_HandlerAD_Debug(&buffer[curOffset])); + } + incTag(curOffset, gsPack.size, 16); // 1 QWC + gifTag.packedStep(); + } + } + else incTag(curOffset, gsPack.size, gifTag.len); // Data length + if (curOffset >= curSize) break; + if (gifTag.tag.EOP) break; + } + pxAssert(curOffset == curSize); + gifTag.isValid = false; + } + else { + // We assume every packet is a full GS Packet + // And we don't process anything on pcsx2 side + gsPack.size += curSize - curOffset; + curOffset = curSize; + } + } + + // MTVU: Gets called after VU1 execution on MTVU thread + void FinishGSPacketMTVU() { + if (1) { + ScopedLock lock(mtvu.gsPackMutex); + AtomicExchangeAdd(readAmount, gsPack.size + gsPack.readAmount); + mtvu.gsPackQueue.push_back(gsPack); + } + gsPack.Reset(); + gsPack.offset = curOffset; + } + + // MTVU: Gets called by MTGS thread + GS_Packet GetGSPacketMTVU() { + ScopedLock lock(mtvu.gsPackMutex); + if (mtvu.gsPackQueue.size()) { + GS_Packet t = mtvu.gsPackQueue[0]; + return t; // XGkick GS packet(s) + } + Console.Error("MTVU: Expected gsPackQueue to have elements!"); + pxAssert(0); + return GS_Packet(); // gsPack.size will be 0 + } + + // MTVU: Gets called by MTGS thread + void PopGSPacketMTVU() { + ScopedLock lock(mtvu.gsPackMutex); + if (mtvu.gsPackQueue.size()) { + mtvu.gsPackQueue.pop_front(); + } + } + + // MTVU: Returns the amount of pending + // GS Packets that MTGS hasn't yet processed + u32 GetPendingGSPackets() { + ScopedLock lock(mtvu.gsPackMutex); + u32 t = mtvu.gsPackQueue.size(); + return t; + } }; struct Gif_Unit { @@ -280,8 +389,8 @@ struct Gif_Unit { GIF_TRANSFER_TYPE lastTranType; // Last Transfer Type Gif_Unit() : stat(gifRegs.stat) { - gifPath[0].Init(GIF_PATH_1, _1mb*8, _16kb + _1kb); - gifPath[1].Init(GIF_PATH_2, _1mb*8, _1mb + _1kb); + gifPath[0].Init(GIF_PATH_1, _1mb*9, _1mb + _1kb); + gifPath[1].Init(GIF_PATH_2, _1mb*9, _1mb + _1kb); gifPath[2].Init(GIF_PATH_3, _1mb*9, _1mb + _1kb); } @@ -307,24 +416,24 @@ struct Gif_Unit { // Adds a finished GS Packet to the MTGS ring buffer __fi void AddCompletedGSPacket(GS_Packet& gsPack, GIF_PATH path) { - Gif_AddCompletedGSPacket(gsPack, path); + if (gsPack.size==~0u) Gif_AddGSPacketMTVU (gsPack, path); + else Gif_AddCompletedGSPacket(gsPack, path); if (PRINT_GIF_PACKET) Gif_ParsePacket(gsPack, path); } // Returns GS Packet Size in bytes - u32 GetGSPacketSize(GIF_PATH pathIdx, u8* pMem, u32 offset = 0) { - u32 memMask = pathIdx ? 0xffffffffu : 0x3fffu; - u32 size = 0; + u32 GetGSPacketSize(GIF_PATH pathIdx, u8* pMem, u32 offset = 0, u32 size = ~0u) { + u32 memMask = pathIdx ? ~0u : 0x3fffu; + u32 curSize = 0; for(;;) { Gif_Tag gifTag(&pMem[offset & memMask]); - incTag(offset, size, 16 + gifTag.len); // Tag + Data length - if (pathIdx == GIF_PATH_1 && size >= 0x4000) { + incTag(offset, curSize, 16 + gifTag.len); // Tag + Data length + if (pathIdx == GIF_PATH_1 && curSize >= 0x4000) { Console.Warning("Gif Unit - GS packet size exceeded VU memory size!"); return 0; // Bios does this... 
(Fixed if you delay vu1's xgkick by 103 vu cycles) } - if (gifTag.tag.EOP) { - return size; - } + if (curSize >= size) return size; + if (gifTag.tag.EOP) return curSize; } } @@ -332,8 +441,22 @@ struct Gif_Unit { // The return value is the amount of data (in bytes) that was processed // If transfer cannot take place at this moment the return value is 0 u32 TransferGSPacketData(GIF_TRANSFER_TYPE tranType, u8* pMem, u32 size, bool aligned=false) { - - GIF_LOG("%s - [path=%d][size=%d]", Gif_TransferStr[(tranType>>8)&0xf], (tranType&3)+1, size); + + if (THREAD_VU1) { + Gif_Path& path1 = gifPath[GIF_PATH_1]; + if (tranType == GIF_TRANS_XGKICK) { // This is on the MTVU thread + path1.CopyGSPacketData(pMem, size, aligned); + path1.ExecuteGSPacketMTVU(); + return size; + } + if (tranType == GIF_TRANS_MTVU) { // This is on the EE thread + path1.mtvu.fakePackets++; + if (CanDoGif()) Execute(); + return 0; + } + } + + GUNIT_LOG("%s - [path=%d][size=%d]", Gif_TransferStr[(tranType>>8)&0xf], (tranType&3)+1, size); if (size == 0) { GUNIT_WARN("Gif Unit - Size == 0"); return 0; } if(!CanDoGif()) { GUNIT_WARN("Gif Unit - Signal or PSE Set or Dir = GS to EE"); } pxAssertDev((stat.APATH==0) || checkPaths(1,1,1), "Gif Unit - APATH wasn't cleared?"); @@ -344,6 +467,7 @@ struct Gif_Unit { } if (tranType == GIF_TRANS_DMA) { if(!CanDoPath3()) { if (!Path3Masked()) stat.P3Q = 1; return 0; } // DMA Stall + //if (stat.P2Q) DevCon.WriteLn("P2Q while path 3"); } if (tranType == GIF_TRANS_XGKICK) { if(!CanDoPath1()) { stat.P1Q = 1; } // We always buffer path1 packets @@ -404,7 +528,7 @@ struct Gif_Unit { GS_Packet gsPack = path.ExecuteGSPacket(); if(!gsPack.done) { if (stat.APATH == 3 && CanDoP3Slice() && !gsSIGNAL.queued) { - if(!didPath3 && checkPaths(1,1,0)) { // Path3 slicing + if(!didPath3 && /*!Path3Masked() &&*/ checkPaths(1,1,0)) { // Path3 slicing didPath3 = true; stat.APATH = 0; stat.IP3 = 1; @@ -433,7 +557,7 @@ struct Gif_Unit { } if (!gsSIGNAL.queued && !gifPath[0].isDone()) { stat.APATH = 1; stat.P1Q = 0; } elif (!gsSIGNAL.queued && !gifPath[1].isDone()) { stat.APATH = 2; stat.P2Q = 0; } - elif (!gsSIGNAL.queued && !gifPath[2].isDone() && !Path3Masked()) + elif (!gsSIGNAL.queued && !gifPath[2].isDone() && !Path3Masked() /*&& !stat.P2Q*/) { stat.APATH = 3; stat.P3Q = 0; stat.IP3 = 0; } else { stat.APATH = 0; stat.OPH = 0; break; } } diff --git a/pcsx2/Hw.cpp b/pcsx2/Hw.cpp index 3f45f280f8d48..bcfca1fc7cf90 100644 --- a/pcsx2/Hw.cpp +++ b/pcsx2/Hw.cpp @@ -19,7 +19,6 @@ #include "Hardware.h" #include "newVif.h" #include "IPU/IPUdma.h" -#include "Gif.h" #include "Gif_Unit.h" using namespace R5900; diff --git a/pcsx2/HwWrite.cpp b/pcsx2/HwWrite.cpp index d7e9a215bfb04..08317d393c57e 100644 --- a/pcsx2/HwWrite.cpp +++ b/pcsx2/HwWrite.cpp @@ -17,7 +17,6 @@ #include "PrecompiledHeader.h" #include "Common.h" #include "Hardware.h" -#include "Gif.h" #include "Gif_Unit.h" #include "ps2/HwInternal.h" diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 1b3b33beee4f6..35cff8d9fcf0f 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -21,6 +21,7 @@ #include "GS.h" #include "Gif_Unit.h" +#include "MTVU.h" #include "Elfheader.h" #include "SamplProf.h" @@ -242,36 +243,29 @@ void SysMtgsThread::OpenPlugin() GSsetGameCRC( ElfCRC, 0 ); } -class RingBufferLock : public ScopedLock -{ - typedef ScopedLock _parent; - -protected: - SysMtgsThread& m_mtgs; +struct RingBufferLock { + ScopedLock m_lock1; + ScopedLock m_lock2; + SysMtgsThread& m_mtgs; -public: - RingBufferLock( SysMtgsThread& mtgs ) - : ScopedLock( 
mtgs.m_mtx_RingBufferBusy ) - , m_mtgs( mtgs ) - { + RingBufferLock(SysMtgsThread& mtgs) + : m_lock1(mtgs.m_mtx_RingBufferBusy), + m_lock2(mtgs.m_mtx_RingBufferBusy2), + m_mtgs(mtgs) { m_mtgs.m_RingBufferIsBusy = true; } - - virtual ~RingBufferLock() throw() - { + virtual ~RingBufferLock() throw() { m_mtgs.m_RingBufferIsBusy = false; } - - void Acquire() - { - _parent::Acquire(); + void Acquire() { + m_lock1.Acquire(); + m_lock2.Acquire(); m_mtgs.m_RingBufferIsBusy = true; } - - void Release() - { + void Release() { m_mtgs.m_RingBufferIsBusy = false; - _parent::Release(); + m_lock2.Release(); + m_lock1.Release(); } }; @@ -281,10 +275,9 @@ void SysMtgsThread::ExecuteTaskInThread() PacketTagType prevCmd; #endif - RingBufferLock busy( *this ); + RingBufferLock busy (*this); - while( true ) - { + while(true) { busy.Release(); // Performance note: Both of these perform cancellation tests, but pthread_testcancel @@ -299,8 +292,7 @@ void SysMtgsThread::ExecuteTaskInThread() // ever be modified by this thread. while( m_ReadPos != volatize(m_WritePos)) { - if( EmuConfig.GS.DisableOutput ) - { + if (EmuConfig.GS.DisableOutput) { m_ReadPos = m_WritePos; continue; } @@ -327,7 +319,7 @@ void SysMtgsThread::ExecuteTaskInThread() switch( tag.command ) { -#if COPY_GS_PACKET_TO_MTGS == 1 // d +#if COPY_GS_PACKET_TO_MTGS == 1 case GS_RINGTYPE_P1: { uint datapos = (m_ReadPos+1) & RingBufferMask; @@ -412,6 +404,21 @@ void SysMtgsThread::ExecuteTaskInThread() break; } + case GS_RINGTYPE_MTVU_GSPACKET: { + MTVU_LOG("MTGS - Waiting on semaXGkick!"); + vu1Thread.KickStart(true); + busy.m_lock2.Release(); + // Wait for MTVU to complete vu1 program + vu1Thread.semaXGkick.WaitWithoutYield(); + busy.m_lock2.Acquire(); + Gif_Path& path = gifUnit.gifPath[GIF_PATH_1]; + GS_Packet gsPack = path.GetGSPacketMTVU(); // Get vu1 program's xgkick packet(s) + if (gsPack.size) GSgifTransfer((u32*)&path.buffer[gsPack.offset], gsPack.size/16); + AtomicExchangeSub(path.readAmount, gsPack.size + gsPack.readAmount); + path.PopGSPacketMTVU(); // Should be done last, for proper Gif_MTGS_Wait() + break; + } + default: { switch( tag.command ) @@ -572,27 +579,43 @@ void SysMtgsThread::OnCleanupInThread() } // Waits for the GS to empty out the entire ring buffer contents. -// Used primarily for plugin startup/shutdown. -void SysMtgsThread::WaitGS() +// If syncRegs, then writes pcsx2's gs regs to MTGS's internal copy +// If weakWait, then this function is allowed to exit after MTGS finished a path1 packet +// If isMTVU, then this implies this function is being called from the MTVU thread... +void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU) { pxAssertDev( !IsSelf(), "This method is only allowed from threads *not* named MTGS." ); if( m_ExecMode == ExecMode_NoThreadYet || !IsRunning() ) return; if( !pxAssertDev( IsOpen(), "MTGS Warning! WaitGS issued on a closed thread." ) ) return; - if( volatize(m_ReadPos) != m_WritePos ) - { + Gif_Path& path = gifUnit.gifPath[GIF_PATH_1]; + u32 startP1Packs = weakWait ? path.GetPendingGSPackets() : 0; + + if (isMTVU || volatize(m_ReadPos) != m_WritePos) { SetEvent(); RethrowException(); - - do { - m_mtx_RingBufferBusy.Wait(); + for(;;) { + if (weakWait) m_mtx_RingBufferBusy2.Wait(); + else m_mtx_RingBufferBusy .Wait(); RethrowException(); - } while( volatize(m_ReadPos) != m_WritePos ); + if(!isMTVU && volatize(m_ReadPos) == m_WritePos) break; + u32 curP1Packs = weakWait ? 
path.GetPendingGSPackets() : 0;
+			if (weakWait && ((startP1Packs-curP1Packs) || !curP1Packs)) break;
+			// On weakWait we will stop waiting on the MTGS thread if the
+			// MTGS thread has processed a vu1 xgkick packet, or is pending on
+			// its final vu1 xgkick packet (!curP1Packs)...
+			// Note: m_WritePos doesn't seem to have proper atomic write
+			// code, so reading it from the MTVU thread might be dangerous;
+			// hence it has been avoided...
+		}
 	}
 
-	// Completely synchronize GS and MTGS register states.
-	memcpy_fast( RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs) );
+	if (syncRegs) {
+		ScopedLock lock(m_mtx_WaitGS);
+		// Completely synchronize GS and MTGS register states.
+		memcpy_fast(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
+	}
 }
 
 // Sets the gsEvent flag and releases a timeslice.
diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp
new file mode 100644
index 0000000000000..5e7de04466197
--- /dev/null
+++ b/pcsx2/MTVU.cpp
@@ -0,0 +1,37 @@
+/* PCSX2 - PS2 Emulator for PCs
+ * Copyright (C) 2002-2010 PCSX2 Dev Team
+ *
+ * PCSX2 is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with PCSX2.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "PrecompiledHeader.h"
+#include "Common.h"
+#include "MTVU.h"
+#include "newVif.h"
+
+__aligned16 VU_Thread vu1Thread(CpuVU1, VU1);
+
+// Calls the vif unpack functions from the MTVU thread
+void MTVU_Unpack(void* data, VIFregisters& vifRegs) {
+	bool isFill = vifRegs.cycle.cl < vifRegs.cycle.wl;
+	if (newVifDynaRec) dVifUnpack<1>((u8*)data, isFill);
+	else _nVifUnpack(1, (u8*)data, vifRegs.mode, isFill);
+}
+
+// Called on Saving/Loading states...
+void SaveStateBase::mtvuFreeze() {
+	FreezeTag("MTVU");
+	pxAssert(vu1Thread.IsDone());
+	if (!IsSaving()) vu1Thread.Reset();
+	Freeze(vu1Thread.vuCycles);
+	Freeze(vu1Thread.vuCycleIdx);
+}
diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h
new file mode 100644
index 0000000000000..1e18065bf7115
--- /dev/null
+++ b/pcsx2/MTVU.h
@@ -0,0 +1,305 @@
+
+#pragma once
+#include "System/SysThreads.h"
+#include "Vif.h"
+#include "Vif_Dma.h"
+#include "VUmicro.h"
+#include "Gif_Unit.h"
+
+extern void MTVU_Unpack(void* data, VIFregisters& vifRegs);
+#define volatize(x) (*reinterpret_cast<volatile u32*>(&(x)))
+#define size_u32(x) (((u32)x+3u)>>2) // Rounds up a size in bytes for size in u32's
+#define MTVU_ALWAYS_KICK 0
+#define MTVU_SYNC_MODE   0
+#define MTVU_LOG(...) do{} while(0)
+//#define MTVU_LOG DevCon.WriteLn
+
+enum MTVU_EVENT {
+	MTVU_VU_EXECUTE,     // Execute VU program
+	MTVU_VU_WRITE_MICRO, // Write to VU micro-mem
+	MTVU_VU_WRITE_DATA,  // Write to VU data-mem
+	MTVU_VIF_WRITE_COL,  // Write to Vif col reg
+	MTVU_VIF_WRITE_ROW,  // Write to Vif row reg
+	MTVU_VIF_UNPACK,     // Execute Vif Unpack
+	MTVU_NULL_PACKET,    // Go back to beginning of buffer
+	MTVU_RESET
+};
+
+// Notes:
+// - This class should only be accessed from the EE thread...
+// - buffer_size must be power of 2
+// - ring-buffer has no complete pending packets when read_pos==write_pos
+struct VU_Thread : public pxThread {
+	static const u32 buffer_size = (_1mb * 16) / sizeof(u32);
+	static const u32 buffer_mask = buffer_size - 1;
+	__aligned(4) u32 buffer[buffer_size];
+	__aligned(4) volatile s32 read_pos; // Only modified by VU thread
+	__aligned(4) volatile bool isBusy;  // Is thread processing data?
+	__aligned(4) s32 write_pos;         // Only modified by EE thread
+	__aligned(4) s32 write_offset;      // Only modified by EE thread
+	__aligned(4) Mutex mtxBusy;
+	__aligned(4) Semaphore semaEvent;
+	__aligned(4) Semaphore semaXGkick;
+	__aligned(4) BaseVUmicroCPU*& vuCPU;
+	__aligned(4) VURegs& vuRegs;
+	__aligned16  vifStruct vif;
+	__aligned16  VIFregisters vifRegs;
+	__aligned(4) u32 vuCycles[4]; // Used for VU cycle stealing hack
+	__aligned(4) u32 vuCycleIdx;  // Used for VU cycle stealing hack
+
+	VU_Thread(BaseVUmicroCPU*& _vuCPU, VURegs& _vuRegs) :
+		vuCPU(_vuCPU), vuRegs(_vuRegs) {
+		m_name = L"MTVU";
+		Reset();
+	}
+	virtual ~VU_Thread() throw() {
+		pxThread::Cancel();
+	}
+	void InitThread() {
+		Start(); // Starts the pxThread
+	}
+	void Reset() {
+		read_pos     = 0;
+		write_pos    = 0;
+		write_offset = 0;
+		vuCycleIdx   = 0;
+		isBusy = false;
+		memzero(vif);
+		memzero(vifRegs);
+		memzero(vuCycles);
+	}
+protected:
+	// Should only be called by ReserveSpace()
+	__ri void WaitOnSize(s32 size) {
+		for(;;) {
+			s32 readPos = GetReadPos();
+			if (readPos <= write_pos) break;        // MTVU is reading in back of write_pos
+			if (readPos >  write_pos + size) break; // Enough free front space
+			if (1) { // Let MTVU run to free up buffer space
+				KickStart();
+				if (IsDevBuild) DevCon.WriteLn("WaitOnSize()");
+				ScopedLock lock(mtxBusy);
+			}
+		}
+	}
+
+	// Makes sure there's enough room in the ring buffer
+	// to write a continuous 'size * sizeof(u32)' bytes
+	void ReserveSpace(s32 size) {
+		pxAssert(write_pos < buffer_size);
+		pxAssert(size < buffer_size);
+		pxAssert(size > 0);
+		pxAssert(write_offset == 0);
+		if (write_pos + size > buffer_size) {
+			pxAssert(write_pos > 0);
+			WaitOnSize(1); // Size of MTVU_NULL_PACKET
+			Write(MTVU_NULL_PACKET);
+			write_offset = 0;
+			AtomicExchange(volatize(write_pos), 0);
+		}
+		WaitOnSize(size);
+	}
+
+	// Use this when reading read_pos from ee thread
+	__fi volatile s32 GetReadPos() {
+		return AtomicRead(read_pos);
+	}
+	// Use this when reading write_pos from vu thread
+	__fi volatile s32 GetWritePos() {
+		return AtomicRead(volatize(write_pos));
+	}
+	// Gets the effective write pointer after adding write_offset
+	__fi u32* GetWritePtr() {
+		return &buffer[(write_pos + write_offset) & buffer_mask];
+	}
+
+	__fi void incReadPos(s32 offset) { // Offset in u32 sizes
+		s32 temp = (read_pos + offset) & buffer_mask;
+		AtomicExchange(read_pos, temp);
+	}
+	__fi void incWritePos() { // Adds write_offset
+		s32 temp = (write_pos + write_offset) & buffer_mask;
+		write_offset = 0;
+		AtomicExchange(volatize(write_pos), temp);
+		if (MTVU_ALWAYS_KICK) KickStart();
+		if (MTVU_SYNC_MODE)   WaitVU();
+	}
+
+	__fi u32 Read() {
+		u32 ret = buffer[read_pos];
+		incReadPos(1);
+		return ret;
+	}
+	__fi void Read(void* dest, u32 size) { // Size in bytes
+		memcpy_fast(dest, &buffer[read_pos], size);
+		incReadPos(size_u32(size));
+	}
+
+	__fi void Write(u32 val) {
+		GetWritePtr()[0] = val;
+		write_offset += 1;
+	}
+	__fi void Write(void* src, u32 size) { // Size in bytes
+		memcpy_fast(GetWritePtr(), src, size);
+		write_offset += size_u32(size);
+	}
+
+	void ExecuteTaskInThread() {
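+		// The worker runs recompiled VU1 code, so its loop needs the same
+		// page-fault protection the EE/GS threads use (PCSX2_PAGEFAULT_PROTECT
+		// and PCSX2_PAGEFAULT_EXCEPT are defined in PageFaultSource.h).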
PCSX2_PAGEFAULT_PROTECT { + ExecuteRingBuffer(); + } PCSX2_PAGEFAULT_EXCEPT; + } + + void ExecuteRingBuffer() { + for(;;) { + semaEvent.WaitWithoutYield(); + ScopedLockBool lock(mtxBusy, isBusy); + while (read_pos != GetWritePos()) { + u32 tag = Read(); + switch (tag) { + case MTVU_VU_EXECUTE: { + vuRegs.cycle = 0; + s32 addr = Read(); + vifRegs.top = Read(); + vifRegs.itop = Read(); + if (addr != -1) vuRegs.VI[REG_TPC].UL = addr; + vuCPU->Execute(vu1RunCycles); + gifUnit.gifPath[GIF_PATH_1].FinishGSPacketMTVU(); + semaXGkick.Post(); // Tell MTGS a path1 packet is complete + AtomicExchange(vuCycles[vuCycleIdx], vuRegs.cycle); + vuCycleIdx = (vuCycleIdx + 1) & 3; + break; + } + case MTVU_VU_WRITE_MICRO: { + u32 vu_micro_addr = Read(); + u32 size = Read(); + vuCPU->Clear(vu_micro_addr, size); + Read(&vuRegs.Micro[vu_micro_addr], size); + break; + } + case MTVU_VU_WRITE_DATA: { + u32 vu_data_addr = Read(); + u32 size = Read(); + Read(&vuRegs.Mem[vu_data_addr], size); + break; + } + case MTVU_VIF_WRITE_COL: + Read(&vif.MaskCol, sizeof(vif.MaskCol)); + break; + case MTVU_VIF_WRITE_ROW: + Read(&vif.MaskRow, sizeof(vif.MaskRow)); + break; + case MTVU_VIF_UNPACK: { + u32 vif_copy_size = (uptr)&vif.StructEnd - (uptr)&vif.tag; + Read(&vif.tag, vif_copy_size); + Read(&vifRegs, sizeof(vifRegs)); + u32 size = Read(); + MTVU_Unpack(&buffer[read_pos], vifRegs); + incReadPos(size_u32(size)); + break; + } + case MTVU_NULL_PACKET: + AtomicExchange(read_pos, 0); + break; + jNO_DEFAULT; + } + } + } + } + + // Returns Average number of vu Cycles from last 4 runs + u32 Get_vuCycles() { // Used for vu cycle stealing hack + return (AtomicRead(vuCycles[0]) + AtomicRead(vuCycles[1]) + + AtomicRead(vuCycles[2]) + AtomicRead(vuCycles[3])) >> 2; + } +public: + + // Get MTVU to start processing its packets if it isn't already + void KickStart(bool forceKick = false) { + if ((forceKick && !semaEvent.Count()) + || (!isBusy && GetReadPos() != write_pos)) semaEvent.Post(); + } + + // Used for assertions... 
+ bool IsDone() { return !isBusy && GetReadPos() == GetWritePos(); } + + // Waits till MTVU is done processing + void WaitVU() { + MTVU_LOG("MTVU - WaitVU!"); + for(;;) { + if (IsDone()) break; + //DevCon.WriteLn("WaitVU()"); + pxAssert(THREAD_VU1); + KickStart(); + ScopedLock lock(mtxBusy); + } + } + + void ExecuteVU(u32 vu_addr, u32 vif_top, u32 vif_itop) { + MTVU_LOG("MTVU - ExecuteVU!"); + ReserveSpace(4); + Write(MTVU_VU_EXECUTE); + Write(vu_addr); + Write(vif_top); + Write(vif_itop); + incWritePos(); + gifUnit.TransferGSPacketData(GIF_TRANS_MTVU, NULL, 0); + KickStart(); + u32 cycles = std::min(Get_vuCycles(), 3000u); + cpuRegs.cycle += cycles * EmuConfig.Speedhacks.VUCycleSteal; + } + + void VifUnpack(vifStruct& _vif, VIFregisters& _vifRegs, u8* data, u32 size) { + MTVU_LOG("MTVU - VifUnpack!"); + u32 vif_copy_size = (uptr)&_vif.StructEnd - (uptr)&_vif.tag; + ReserveSpace(1 + size_u32(vif_copy_size) + size_u32(sizeof(_vifRegs)) + 1 + size_u32(size)); + Write(MTVU_VIF_UNPACK); + Write(&_vif.tag, vif_copy_size); + Write(&_vifRegs, sizeof(_vifRegs)); + Write(size); + Write(data, size); + incWritePos(); + KickStart(); + } + + // Writes to VU's Micro Memory (size in bytes) + void WriteMicroMem(u32 vu_micro_addr, void* data, u32 size) { + MTVU_LOG("MTVU - WriteMicroMem!"); + ReserveSpace(3 + size_u32(size)); + Write(MTVU_VU_WRITE_MICRO); + Write(vu_micro_addr); + Write(size); + Write(data, size); + incWritePos(); + } + + // Writes to VU's Data Memory (size in bytes) + void WriteDataMem(u32 vu_data_addr, void* data, u32 size) { + MTVU_LOG("MTVU - WriteDataMem!"); + ReserveSpace(3 + size_u32(size)); + Write(MTVU_VU_WRITE_DATA); + Write(vu_data_addr); + Write(size); + Write(data, size); + incWritePos(); + } + + void WriteCol(vifStruct& _vif) { + MTVU_LOG("MTVU - WriteCol!"); + ReserveSpace(1 + size_u32(sizeof(_vif.MaskCol))); + Write(MTVU_VIF_WRITE_COL); + Write(&_vif.MaskCol, sizeof(_vif.MaskCol)); + incWritePos(); + } + + void WriteRow(vifStruct& _vif) { + MTVU_LOG("MTVU - WriteRow!"); + ReserveSpace(1 + size_u32(sizeof(_vif.MaskRow))); + Write(MTVU_VIF_WRITE_ROW); + Write(&_vif.MaskRow, sizeof(_vif.MaskRow)); + incWritePos(); + } +}; + +extern __aligned16 VU_Thread vu1Thread; + diff --git a/pcsx2/Memory.cpp b/pcsx2/Memory.cpp index 5192d62fae6a1..1ad1469d6da94 100644 --- a/pcsx2/Memory.cpp +++ b/pcsx2/Memory.cpp @@ -38,8 +38,9 @@ BIOS #include #include "IopCommon.h" -#include "VUmicro.h" #include "GS.h" +#include "VUmicro.h" +#include "MTVU.h" #include "ps2/HwInternal.h" #include "ps2/BiosTools.h" @@ -102,6 +103,7 @@ static vtlbHandler vu0_micro_mem, vu1_micro_mem, + vu1_data_mem, hw_by_page[0x10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, @@ -131,7 +133,11 @@ void memMapVUmicro() // VU0/VU1 memory (data) // VU0 is 4k, mirrored 4 times across a 16k area. vtlb_MapBlock(VU0.Mem,0x11004000,0x00004000,0x1000); - vtlb_MapBlock(VU1.Mem,0x1100c000,0x00004000); + // Note: In order for the below conditional to work correctly + // support needs to be coded to reset the memMappings when MTVU is + // turned off/on. For now we just always use the vu data handlers... 
+	if (1||THREAD_VU1) vtlb_MapHandler(vu1_data_mem,0x1100c000,0x00004000);
+	else               vtlb_MapBlock  (VU1.Mem,     0x1100c000,0x00004000);
 }
 
 void memMapPhy()
@@ -431,128 +437,186 @@ static void __fastcall _ext_memWrite128(u32 mem, const mem128_t *value)
 
 typedef void __fastcall ClearFunc_t( u32 addr, u32 qwc );
 
-template<int vunum>
-static __fi void ClearVuFunc( u32 addr, u32 size )
-{
-	if( vunum==0 )
-		CpuVU0->Clear(addr,size);
-	else
-		CpuVU1->Clear(addr,size);
+template<int vunum> static __fi void ClearVuFunc(u32 addr, u32 size) {
+	if (vunum) CpuVU1->Clear(addr, size);
+	else       CpuVU0->Clear(addr, size);
 }
 
-template<int vunum>
-static mem8_t __fastcall vuMicroRead8(u32 addr)
-{
-	addr&=(vunum==0)?0xfff:0x3fff;
-	VURegs* vu=(vunum==0)?&VU0:&VU1;
-
+// VU Micro Memory Reads...
+template<int vunum> static mem8_t __fc vuMicroRead8(u32 addr) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
 	return vu->Micro[addr];
 }
-
-template<int vunum>
-static mem16_t __fastcall vuMicroRead16(u32 addr)
-{
-	addr&=(vunum==0)?0xfff:0x3fff;
-	VURegs* vu=(vunum==0)?&VU0:&VU1;
-
+template<int vunum> static mem16_t __fc vuMicroRead16(u32 addr) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
 	return *(u16*)&vu->Micro[addr];
 }
-
-template<int vunum>
-static mem32_t __fastcall vuMicroRead32(u32 addr)
-{
-	addr&=(vunum==0)?0xfff:0x3fff;
-	VURegs* vu=(vunum==0)?&VU0:&VU1;
-
+template<int vunum> static mem32_t __fc vuMicroRead32(u32 addr) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
 	return *(u32*)&vu->Micro[addr];
 }
-
-template<int vunum>
-static void __fastcall vuMicroRead64(u32 addr,mem64_t* data)
-{
-	addr&=(vunum==0)?0xfff:0x3fff;
-	VURegs* vu=(vunum==0)?&VU0:&VU1;
-
+template<int vunum> static void __fc vuMicroRead64(u32 addr,mem64_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
 	*data=*(u64*)&vu->Micro[addr];
 }
-
-template<int vunum>
-static void __fastcall vuMicroRead128(u32 addr,mem128_t* data)
-{
-	addr&=(vunum==0)?0xfff:0x3fff;
-	VURegs* vu=(vunum==0)?&VU0:&VU1;
-
+template<int vunum> static void __fc vuMicroRead128(u32 addr,mem128_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
 	CopyQWC(data,&vu->Micro[addr]);
 }
 
 // Profiled VU writes: Happen very infrequently, with exception of BIOS initialization (at most twice per
 //   frame in-game, and usually none at all after BIOS), so cpu clears aren't much of a big deal.
-
-template<int vunum>
-static void __fastcall vuMicroWrite8(u32 addr,mem8_t data)
-{
-	addr &= (vunum==0) ? 0xfff : 0x3fff;
-	VURegs& vu = (vunum==0) ? VU0 : VU1;
-
-	if (vu.Micro[addr]!=data)
-	{
-		ClearVuFunc<vunum>(addr&(~7), 8); // Clear before writing new data (clearing 8 bytes because an instruction is 8 bytes) (cottonvibes)
-		vu.Micro[addr]=data;
+template<int vunum> static void __fc vuMicroWrite8(u32 addr,mem8_t data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteMicroMem(addr, &data, sizeof(u8));
+		return;
+	}
+	if (vu->Micro[addr]!=data) { // Clear before writing new data
+		ClearVuFunc<vunum>(addr, 8); //(clearing 8 bytes because an instruction is 8 bytes) (cottonvibes)
+		vu->Micro[addr] =data;
 	}
 }
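The handlers above settle into a pattern worth calling out: reads synchronize with the VU thread (WaitVU) because they need its results, while writes only queue, since they merely have to arrive before the next program the VU thread runs. A hypothetical EE-side sequence under THREAD_VU1 (the wrapper function and data are illustration-only):

static void ExampleMicroUploadAndRun()
{
    u8 prog[8] = {0};                                 // illustration only
    vu1Thread.WriteMicroMem(0x0, prog, sizeof(prog)); // queued; EE keeps running
    vu1ExecMicro(0x0);                                // also queued, stays ordered
    u8 firstByte = vuMicroRead8<1>(0x0);              // read path: WaitVU() stalls first
    (void)firstByte;
}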
-
-template<int vunum>
-static void __fastcall vuMicroWrite16(u32 addr,mem16_t data)
-{
-	addr &= (vunum==0) ? 0xfff : 0x3fff;
-	VURegs& vu = (vunum==0) ? VU0 : VU1;
-
-	if (*(u16*)&vu.Micro[addr]!=data)
-	{
-		ClearVuFunc<vunum>(addr&(~7), 8);
-		*(u16*)&vu.Micro[addr]=data;
+template<int vunum> static void __fc vuMicroWrite16(u32 addr, mem16_t data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteMicroMem(addr, &data, sizeof(u16));
+		return;
+	}
+	if (*(u16*)&vu->Micro[addr]!=data) {
+		ClearVuFunc<vunum>(addr, 8);
+		*(u16*)&vu->Micro[addr] =data;
 	}
 }
-
-template<int vunum>
-static void __fastcall vuMicroWrite32(u32 addr,mem32_t data)
-{
-	addr &= (vunum==0) ? 0xfff : 0x3fff;
-	VURegs& vu = (vunum==0) ? VU0 : VU1;
-
-	if (*(u32*)&vu.Micro[addr]!=data)
-	{
-		ClearVuFunc<vunum>(addr&(~7), 8);
-		*(u32*)&vu.Micro[addr]=data;
+template<int vunum> static void __fc vuMicroWrite32(u32 addr, mem32_t data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteMicroMem(addr, &data, sizeof(u32));
+		return;
+	}
+	if (*(u32*)&vu->Micro[addr]!=data) {
+		ClearVuFunc<vunum>(addr, 8);
+		*(u32*)&vu->Micro[addr] =data;
 	}
 }
-
-template<int vunum>
-static void __fastcall vuMicroWrite64(u32 addr,const mem64_t* data)
-{
-	addr &= (vunum==0) ? 0xfff : 0x3fff;
-	VURegs& vu = (vunum==0) ? VU0 : VU1;
-
-	if (*(u64*)&vu.Micro[addr]!=data[0])
-	{
-		ClearVuFunc<vunum>(addr&(~7), 8);
-		*(u64*)&vu.Micro[addr]=data[0];
+template<int vunum> static void __fc vuMicroWrite64(u32 addr, const mem64_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteMicroMem(addr, (void*)data, sizeof(u64));
+		return;
+	}
+	if (*(u64*)&vu->Micro[addr]!=data[0]) {
+		ClearVuFunc<vunum>(addr, 8);
+		*(u64*)&vu->Micro[addr] =data[0];
+	}
+}
+template<int vunum> static void __fc vuMicroWrite128(u32 addr, const mem128_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteMicroMem(addr, (void*)data, sizeof(u128));
+		return;
+	}
+	if ((u128&)vu->Micro[addr]!=*data) {
+		ClearVuFunc<vunum>(addr, 16);
+		CopyQWC(&vu->Micro[addr],data);
 	}
 }
 
-template<int vunum>
-static void __fastcall vuMicroWrite128(u32 addr,const mem128_t* data)
-{
-	addr &= (vunum==0) ? 0xfff : 0x3fff;
-	VURegs& vu = (vunum==0) ? VU0 : VU1;
-
-	if ((u128&)vu.Micro[addr] != *data)
-	{
-		ClearVuFunc<vunum>(addr&(~7), 16);
-		CopyQWC(&vu.Micro[addr],data);
+// VU Data Memory Reads...
+template<int vunum> static mem8_t __fc vuDataRead8(u32 addr) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
+	return vu->Mem[addr];
+}
+template<int vunum> static mem16_t __fc vuDataRead16(u32 addr) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
+	return *(u16*)&vu->Mem[addr];
+}
+template<int vunum> static mem32_t __fc vuDataRead32(u32 addr) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
+	return *(u32*)&vu->Mem[addr];
+}
+template<int vunum> static void __fc vuDataRead64(u32 addr, mem64_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
+	*data=*(u64*)&vu->Mem[addr];
+}
+template<int vunum> static void __fc vuDataRead128(u32 addr, mem128_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) vu1Thread.WaitVU();
+	CopyQWC(data,&vu->Mem[addr]);
+}
+
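The data-memory write handlers follow. One property worth showing with a hypothetical snippet (names are illustration-only): a queued write needs no flush before a queued execute, because both travel through the same ring buffer in FIFO order.

static void ExampleDataUploadThenRun()
{
    __aligned16 u128 initData;
    memzero(initData);
    vu1Thread.WriteDataMem(0x0, &initData, sizeof(initData)); // ring message #1
    vu1ExecMicro(0x800);                                      // ring message #2
    // the VU1 program at 0x800 is guaranteed to observe initData at VU mem 0x0
}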
+// VU Data Memory Writes...
+template<int vunum> static void __fc vuDataWrite8(u32 addr, mem8_t data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteDataMem(addr, &data, sizeof(u8));
+		return;
+	}
+	vu->Mem[addr] = data;
+}
+template<int vunum> static void __fc vuDataWrite16(u32 addr, mem16_t data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteDataMem(addr, &data, sizeof(u16));
+		return;
+	}
+	*(u16*)&vu->Mem[addr] = data;
+}
+template<int vunum> static void __fc vuDataWrite32(u32 addr, mem32_t data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteDataMem(addr, &data, sizeof(u32));
+		return;
+	}
+	*(u32*)&vu->Mem[addr] = data;
+}
+template<int vunum> static void __fc vuDataWrite64(u32 addr, const mem64_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteDataMem(addr, (void*)data, sizeof(u64));
+		return;
+	}
+	*(u64*)&vu->Mem[addr] = data[0];
+}
+template<int vunum> static void __fc vuDataWrite128(u32 addr, const mem128_t* data) {
+	VURegs* vu = vunum ? &VU1 : &VU0;
+	addr &= vunum ? 0x3fff: 0xfff;
+	if (vunum && THREAD_VU1) {
+		vu1Thread.WriteDataMem(addr, (void*)data, sizeof(u128));
+		return;
+	}
+	CopyQWC(&vu->Mem[addr], data);
 }
 
 void memSetPageAddr(u32 vaddr, u32 paddr)
 {
 	//Console.WriteLn("memSetPageAddr: %8.8x -> %8.8x", vaddr, paddr);
@@ -640,9 +704,8 @@ void eeMemoryReserve::Commit()
 // Resets memory mappings, unmaps TLBs, reloads bios roms, etc.
 void eeMemoryReserve::Reset()
 {
-	if (!mmap_faultHandler)
-	{
-		pxAssume(Source_PageFault);
+	if(!mmap_faultHandler) {
+		pxAssert(Source_PageFault);
 		mmap_faultHandler = new mmap_PageFaultHandler();
 	}
 
@@ -674,7 +737,8 @@ void eeMemoryReserve::Reset()
 	// Dynarec versions of VUs
 	vu0_micro_mem = vtlb_RegisterHandlerTempl1(vuMicro,0);
 	vu1_micro_mem = vtlb_RegisterHandlerTempl1(vuMicro,1);
-
+	vu1_data_mem  = (1||THREAD_VU1) ? vtlb_RegisterHandlerTempl1(vuData,1) : NULL;
+
 	//////////////////////////////////////////////////////////////////////////////////////////
 	// IOP's "secret" Hardware Register mapping, accessible from the EE (and meant for use
 	// by debugging or BIOS only).  The IOP's hw regs are divided into three main pages in
diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp
index e0a35365548a0..ebe37c7529ab8 100644
--- a/pcsx2/Pcsx2Config.cpp
+++ b/pcsx2/Pcsx2Config.cpp
@@ -64,6 +64,7 @@ void Pcsx2Config::SpeedhackOptions::LoadSave( IniInterface& ini )
 	IniBitBool( WaitLoop );
 	IniBitBool( vuFlagHack );
 	IniBitBool( vuBlockHack );
+	IniBitBool( vuThread );
 }
 
 void Pcsx2Config::ProfilerOptions::LoadSave( IniInterface& ini )
diff --git a/pcsx2/R5900.cpp b/pcsx2/R5900.cpp
index f698f02784063..9c14ab3dea142 100644
--- a/pcsx2/R5900.cpp
+++ b/pcsx2/R5900.cpp
@@ -21,6 +21,7 @@
 #include "R3000A.h"
 #include "VUmicro.h"
 #include "COP0.h"
+#include "MTVU.h"
 
 #include "System/SysThreads.h"
 #include "R5900Exceptions.h"
@@ -54,6 +55,7 @@ extern SysMainMemory& GetVmMemory();
 
 void cpuReset()
 {
+	vu1Thread.WaitVU();
 	if (GetMTGS().IsOpen())
 		GetMTGS().WaitGS(); // GS better be done processing before we reset the EE, just in case.
 
@@ -281,9 +283,6 @@ static __fi void _cpuTestInterrupts()
 	TESTINT(DMAC_GIF, gifInterrupt);
 	TESTINT(DMAC_SIF0, EEsif0Interrupt);
 	TESTINT(DMAC_SIF1, EEsif1Interrupt);
-
-	//extern void Gif_Execute();
-	//TESTINT(DMAC_GIF_UNIT, Gif_Execute);
 
 	// Profile-guided Optimization (sorta)
	// The following ints are rarely called. 
diff --git a/pcsx2/SPR.cpp b/pcsx2/SPR.cpp
index 068c986c3be0a..f174d7e680ca4 100644
--- a/pcsx2/SPR.cpp
+++ b/pcsx2/SPR.cpp
@@ -18,6 +18,7 @@
 
 #include "SPR.h"
 #include "VUmicro.h"
+#include "MTVU.h"
 
 extern void mfifoGIFtransfer(int);
 
@@ -31,19 +32,23 @@ void sprInit()
 {
 }
 
-static void TestClearVUs(u32 madr, u32 size)
+static void TestClearVUs(u32 madr, u32 qwc)
 {
 	if (madr >= 0x11000000)
 	{
 		if (madr < 0x11004000)
 		{
 			DbgCon.Warning("scratch pad clearing vu0");
-			CpuVU0->Clear(madr&0xfff, size);
+			CpuVU0->Clear(madr&0xfff, qwc * 16);
 		}
 		else if (madr >= 0x11008000 && madr < 0x1100c000)
 		{
 			DbgCon.Warning("scratch pad clearing vu1");
-			CpuVU1->Clear(madr&0x3fff, size);
+			if (THREAD_VU1) {
+				DevCon.Error("MTVU Warning: SPR Accessing VU1 Memory!!!");
+				vu1Thread.WaitVU();
+			}
+			CpuVU1->Clear(madr&0x3fff, qwc * 16);
 		}
 	}
 }
@@ -83,7 +88,7 @@ int _SPR0chain()
 	memcpy_qwc(pMem, &psSu128(spr0ch.sadr), partialqwc);
 
 	// clear VU mem also!
-	TestClearVUs(spr0ch.madr, partialqwc << 2); // Wtf is going on here? AFAIK, only VIF should affect VU micromem (cottonvibes)
+	TestClearVUs(spr0ch.madr, partialqwc);
 
 	spr0ch.madr += partialqwc << 4;
 	spr0ch.sadr += partialqwc << 4;
@@ -135,7 +140,7 @@ void _SPR0interleave()
 			case NO_MFD:
 			case MFD_RESERVED:
 				// clear VU mem also!
-				TestClearVUs(spr0ch.madr, spr0ch.qwc << 2);
+				TestClearVUs(spr0ch.madr, spr0ch.qwc);
 				memcpy_qwc(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc);
 				break;
 		}
diff --git a/pcsx2/SaveState.cpp b/pcsx2/SaveState.cpp
index 49ddcca5d86f7..ae7d8c6a0c2be 100644
--- a/pcsx2/SaveState.cpp
+++ b/pcsx2/SaveState.cpp
@@ -21,6 +21,7 @@
 #include "ps2/BiosTools.h"
 #include "COP0.h"
 #include "VUmicro.h"
+#include "MTVU.h"
 #include "Cache.h"
 
 #include "AppConfig.h"
@@ -150,10 +151,9 @@ static const uint MainMemorySizeInBytes =
 
 SaveStateBase& SaveStateBase::FreezeMainMemory()
 {
-	if (IsLoading())
-		PreLoadPrep();
-	else
-		m_memory->MakeRoomFor( m_idx + MainMemorySizeInBytes );
+	vu1Thread.WaitVU(); // Finish VU1 just in case...
+	if (IsLoading()) PreLoadPrep();
+	else m_memory->MakeRoomFor( m_idx + MainMemorySizeInBytes );
 
 	// First Block - Memory Dumps
 	// ---------------------------
@@ -175,8 +175,8 @@ SaveStateBase& SaveStateBase::FreezeMainMemory()
 
 SaveStateBase& SaveStateBase::FreezeInternals()
 {
-	if( IsLoading() )
-		PreLoadPrep();
+	vu1Thread.WaitVU(); // Finish VU1 just in case...
+	if (IsLoading()) PreLoadPrep();
 
 	// Second Block - Various CPU Registers and States
 	// -----------------------------------------------
diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h
index df3fad3fbfffd..6e11c275b4e5d 100644
--- a/pcsx2/SaveState.h
+++ b/pcsx2/SaveState.h
@@ -24,7 +24,7 @@
 // the lower 16 bit value. IF the change is breaking of all compatibility with old
 // states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
 
-static const u32 g_SaveVersion = (0x9A02 << 16) | 0x0000;
+static const u32 g_SaveVersion = (0x9A03 << 16) | 0x0000;
 
 // this function is meant to be used in the place of GSfreeze, and provides a safe layer
 // between the GS saving function and the MTGS's needs. :)
@@ -193,6 +193,7 @@ class SaveStateBase
 
 	// Load/Save functions for the various components of our glorious emulator!
 
+	void mtvuFreeze();
 	void rcntFreeze();
 	void vuMicroFreeze();
 	void vif0Freeze();
diff --git a/pcsx2/System.h b/pcsx2/System.h
index ab43897e7616d..506bf00bb87f7 100644
--- a/pcsx2/System.h
+++ b/pcsx2/System.h
@@ -154,7 +154,7 @@ class SysCpuProviderPack
 // implemented by the provisioning interface.
extern SysCpuProviderPack& GetCpuProviders(); -extern void SysLogMachineCaps(); // Detects cpu type and fills cpuInfo structs. +extern void SysLogMachineCaps(); // Detects cpu type and fills cpuInfo structs. extern void SysClearExecutionCache(); // clears recompiled execution caches! extern void SysOutOfMemory_EmergencyResponse(uptr blocksize); diff --git a/pcsx2/VU1micro.cpp b/pcsx2/VU1micro.cpp index 2b9240bee9f6a..c1cec190ddc44 100644 --- a/pcsx2/VU1micro.cpp +++ b/pcsx2/VU1micro.cpp @@ -19,10 +19,9 @@ #include "PrecompiledHeader.h" #include "Common.h" - #include - #include "VUmicro.h" +#include "MTVU.h" #ifdef PCSX2_DEBUG u32 vudump = 0; @@ -39,6 +38,10 @@ void vu1ResetRegs() } void vu1Finish() { + if (THREAD_VU1) { + if (VU0.VI[REG_VPU_STAT].UL & 0x100) DevCon.Error("MTVU: VU0.VI[REG_VPU_STAT].UL & 0x100"); + return; + } while (VU0.VI[REG_VPU_STAT].UL & 0x100) { VUM_LOG("vu1ExecMicro > Stalling until current microprogram finishes"); CpuVU1->Execute(vu1RunCycles); @@ -47,10 +50,15 @@ void vu1Finish() { void __fastcall vu1ExecMicro(u32 addr) { + if (THREAD_VU1) { + vu1Thread.ExecuteVU(addr, vif1Regs.top, vif1Regs.itop); + vif1Regs.stat.VEW = false; + VU0.VI[REG_VPU_STAT].UL &= ~0xFF00; + return; + } static int count = 0; vu1Finish(); - VUM_LOG("vu1ExecMicro %x", addr); VUM_LOG("vu1ExecMicro %x (count=%d)", addr, count++); VU0.VI[REG_VPU_STAT].UL &= ~0xFF00; diff --git a/pcsx2/VU1microInterp.cpp b/pcsx2/VU1microInterp.cpp index 2446b1a96758c..7e575598a071a 100644 --- a/pcsx2/VU1microInterp.cpp +++ b/pcsx2/VU1microInterp.cpp @@ -18,6 +18,7 @@ #include "Common.h" #include "VUmicro.h" +#include "MTVU.h" extern void _vuFlushAll(VURegs* VU); @@ -173,6 +174,14 @@ InterpVU1::InterpVU1() IsInterpreter = true; } +void InterpVU1::Reset() { + vu1Thread.WaitVU(); +} + +void InterpVU1::Shutdown() { + vu1Thread.WaitVU(); +} + void InterpVU1::Step() { VU1.VI[REG_TPC].UL &= VU1_PROGMASK; diff --git a/pcsx2/VUmicro.h b/pcsx2/VUmicro.h index a4fe2d4cc0606..097231ab2c1bd 100644 --- a/pcsx2/VUmicro.h +++ b/pcsx2/VUmicro.h @@ -193,8 +193,8 @@ class InterpVU1 : public BaseVUmicroCPU wxString GetLongName() const { return L"VU1 Interpreter"; } void Reserve() { } - void Shutdown() throw() { } - void Reset() { } + void Shutdown() throw(); + void Reset(); void Step(); void Execute(u32 cycles); diff --git a/pcsx2/VUmicroMem.cpp b/pcsx2/VUmicroMem.cpp index 184cad74e6610..4b1127135c762 100644 --- a/pcsx2/VUmicroMem.cpp +++ b/pcsx2/VUmicroMem.cpp @@ -54,7 +54,8 @@ void vuMemoryReserve::Reset() pxAssert( VU0.Mem ); pxAssert( VU1.Mem ); - memMapVUmicro(); + // Below memMap is already called by "void eeMemoryReserve::Reset()" + //memMapVUmicro(); // === VU0 Initialization === memzero(VU0.ACC); diff --git a/pcsx2/VUops.cpp b/pcsx2/VUops.cpp index 241ea00c286b5..bcf40df61e1bf 100644 --- a/pcsx2/VUops.cpp +++ b/pcsx2/VUops.cpp @@ -18,6 +18,7 @@ #include "VUops.h" #include "GS.h" #include "Gif_Unit.h" +#include "MTVU.h" #include @@ -2018,7 +2019,8 @@ static __ri void _vuEEXP(VURegs * VU) { static __ri void _vuXITOP(VURegs * VU) { if (_It_ == 0) return; - VU->VI[_It_].US[0] = VU->GetVifRegs().itop; + if (VU==&VU1 && THREAD_VU1) VU->VI[_It_].US[0] = vu1Thread.vifRegs.itop; + else VU->VI[_It_].US[0] = VU->GetVifRegs().itop; } static __ri void _vuXGKICK(VURegs * VU) @@ -2041,7 +2043,8 @@ static __ri void _vuXGKICK(VURegs * VU) static __ri void _vuXTOP(VURegs * VU) { if(_It_ == 0) return; - VU->VI[_It_].US[0] = (u16)VU->GetVifRegs().top; + if (VU==&VU1 && THREAD_VU1) VU->VI[_It_].US[0] = (u16)vu1Thread.vifRegs.top; + else 
VU->VI[_It_].US[0] = (u16)VU->GetVifRegs().top; } #define GET_VF0_FLAG(reg) (((reg)==0)?(1< 0); + if (idx && THREAD_VU1) { + vu1Thread.WriteMicroMem(addr, (u8*)data, size*4); + return; + } if (memcmp_mmx(VUx.Micro + addr, data, size*4)) { // Clear VU memory before writing! - // (VUs expect size to be 32-bit scale, same as VIF's internal working sizes) - if (!idx) CpuVU0->Clear(addr, size); - else CpuVU1->Clear(addr, size); + if (!idx) CpuVU0->Clear(addr, size*4); + else CpuVU1->Clear(addr, size*4); memcpy_fast(VUx.Micro + addr, data, size*4); } } @@ -387,7 +392,9 @@ vifOp(vifCode_STCol) { return 1; } pass2 { - return _vifCode_STColRow(data, &vifX.MaskCol._u32[vifX.tag.addr]); + u32 ret = _vifCode_STColRow(data, &vifX.MaskCol._u32[vifX.tag.addr]); + if (idx && THREAD_VU1) { vu1Thread.WriteCol(vifX); } + return ret; } pass3 { VifCodeLog("STCol"); } return 0; @@ -401,7 +408,9 @@ vifOp(vifCode_STRow) { return 1; } pass2 { - return _vifCode_STColRow(data, &vifX.MaskRow._u32[vifX.tag.addr]); + u32 ret = _vifCode_STColRow(data, &vifX.MaskRow._u32[vifX.tag.addr]); + if (idx && THREAD_VU1) { vu1Thread.WriteRow(vifX); } + return ret; } pass3 { VifCodeLog("STRow"); } return 0; @@ -447,7 +456,9 @@ vifOp(vifCode_Unpack) { vifUnpackSetup(data); return 1; } - pass2 { return nVifUnpack((u8*)data); } + pass2 { + return nVifUnpack((u8*)data); + } pass3 { vifStruct& vifX = GetVifX; VIFregisters& vifRegs = vifXRegs; diff --git a/pcsx2/Vif_Dma.h b/pcsx2/Vif_Dma.h index 4746d661d0cec..e461cbaaa6808 100644 --- a/pcsx2/Vif_Dma.h +++ b/pcsx2/Vif_Dma.h @@ -56,14 +56,18 @@ union tTRXREG { // NOTE, if debugging vif stalls, use sega classics, spyro, gt4, and taito struct vifStruct { - u128 MaskRow, MaskCol; + __aligned16 u128 MaskRow; + __aligned16 u128 MaskCol; + + struct { // These must be together for MTVU + vifCode tag; + int cmd; + int cl; + u8 usn; + u8 StructEnd; // Address of this is used to calculate end of struct + }; - vifCode tag; - int cmd; int irq; - int cl; - int qwcalign; - u8 usn; bool done; bool vifstalled; @@ -72,17 +76,13 @@ struct vifStruct { // GS registers used for calculating the size of the last local->host transfer initiated on the GS // Transfer size calculation should be restricted to GS emulation in the future tBITBLTBUF BITBLTBUF; - tTRXREG TRXREG; - u32 GSLastDownloadSize; + tTRXREG TRXREG; + u32 GSLastDownloadSize; - u8 irqoffset; // 32bit offset where next vif code is - u32 savedtag; // need this for backwards compat with save states + u8 irqoffset; // 32bit offset where next vif code is u32 vifpacketsize; - u8 inprogress; - u32 lastcmd; - u8 dmamode; - u8 Unused_GifWaitState; // Only here for saved state compatibility - //u8 GifWaitState; // 0 = General PATH checking, 1 = Flush path 3, 2 == Wait for VU1 + u8 inprogress; + u8 dmamode; }; extern __aligned16 vifStruct vif0, vif1; diff --git a/pcsx2/Vif_Transfer.cpp b/pcsx2/Vif_Transfer.cpp index eee54b477a0c8..c202057f47b31 100644 --- a/pcsx2/Vif_Transfer.cpp +++ b/pcsx2/Vif_Transfer.cpp @@ -94,7 +94,6 @@ _vifT void vifTransferLoop(u32* &data) { vifCmdHandler[idx][vifX.cmd & 0x7f](0, data); data++; pSize--; - vifX.lastcmd = (vifXRegs.code >> 24) & 0x7f; if (analyzeIbit(data, iBit)) break; continue; } diff --git a/pcsx2/Vif_Unpack.cpp b/pcsx2/Vif_Unpack.cpp index fcd651e32d1ea..a3719e3cb989e 100644 --- a/pcsx2/Vif_Unpack.cpp +++ b/pcsx2/Vif_Unpack.cpp @@ -17,6 +17,7 @@ #include "Common.h" #include "Vif.h" #include "Vif_Dma.h" +#include "MTVU.h" enum UnpackOffset { OFFSET_X = 0, @@ -36,10 +37,10 @@ template< uint idx, uint mode, bool 
doMask >
static __ri void writeXYZW(u32 offnum, u32 &dest, u32 data) {
 	int n = 0;
 
-	vifStruct& vif = GetVifX;
+	vifStruct& vif = MTVU_VifX;
 
 	if (doMask) {
-		const VIFregisters& regs = vifXRegs;
+		const VIFregisters& regs = MTVU_VifXRegs;
 		switch (vif.cl) {
 			case 0: n = (regs.mask >> (offnum * 2)) & 0x3; break;
 			case 1: n = (regs.mask >> ( 8 + (offnum * 2))) & 0x3; break;
diff --git a/pcsx2/gui/CpuUsageProvider.cpp b/pcsx2/gui/CpuUsageProvider.cpp
index 8c638283d52c7..f2ba2f5574bb5 100644
--- a/pcsx2/gui/CpuUsageProvider.cpp
+++ b/pcsx2/gui/CpuUsageProvider.cpp
@@ -23,21 +23,24 @@
 #endif
 
 #include "GS.h"
+#include "MTVU.h"
 
-void AllThreeThreads::LoadWithCurrentTimes()
+void AllPCSX2Threads::LoadWithCurrentTimes()
 {
 	ee = GetCoreThread().GetCpuTime();
 	gs = GetMTGS().GetCpuTime();
+	vu = vu1Thread.GetCpuTime();
 	ui = GetThreadCpuTime();
 	update = GetCPUTicks();
 }
 
-AllThreeThreads AllThreeThreads::operator-( const AllThreeThreads& right ) const
+AllPCSX2Threads AllPCSX2Threads::operator-( const AllPCSX2Threads& right ) const
 {
-	AllThreeThreads retval;
+	AllPCSX2Threads retval;
 	retval.ee = ee - right.ee;
 	retval.gs = gs - right.gs;
+	retval.vu = vu - right.vu;
 	retval.ui = ui - right.ui;
 	retval.update = update - right.update;
 
@@ -48,6 +51,7 @@ DefaultCpuUsageProvider::DefaultCpuUsageProvider()
 {
 	m_pct_ee = 0;
 	m_pct_gs = 0;
+	m_pct_vu = 0;
 	m_pct_ui = 0;
 	m_writepos = 0;
 
@@ -69,16 +73,17 @@ void DefaultCpuUsageProvider::UpdateStats()
 {
 	// Measure deltas between the first and last positions in the ring buffer:
 
-	AllThreeThreads& newone( m_queue[m_writepos] );
+	AllPCSX2Threads& newone( m_queue[m_writepos] );
 	newone.LoadWithCurrentTimes();
 	m_writepos = (m_writepos+1) % QueueDepth;
 
-	const AllThreeThreads deltas( newone - m_queue[m_writepos] );
+	const AllPCSX2Threads deltas( newone - m_queue[m_writepos] );
 
 	// get the real time passed, scaled to the Thread's tick frequency.
 	u64 timepass = (deltas.update * GetThreadTicksPerSecond()) / GetTickFrequency();
 
 	m_pct_ee = (deltas.ee * 100) / timepass;
 	m_pct_gs = (deltas.gs * 100) / timepass;
+	m_pct_vu = (deltas.vu * 100) / timepass;
 	m_pct_ui = (deltas.ui * 100) / timepass;
 }
 
@@ -92,6 +97,11 @@ int DefaultCpuUsageProvider::GetGsPct() const
 	return m_pct_gs;
 }
 
+int DefaultCpuUsageProvider::GetVUPct() const
+{
+	return m_pct_vu;
+}
+
 int DefaultCpuUsageProvider::GetGuiPct() const
 {
 	return m_pct_ui;
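Annotation: the usage figures come from deltas across a small ring of samples. With QueueDepth = 4, each UpdateStats() compares the newest sample against the one taken three updates earlier, smoothing the percentages over several UI refreshes; the percentage itself is just thread-ticks over wall-ticks. The arithmetic, run with made-up sample numbers:

    #include <cstdio>

    int main() {
        // Hypothetical deltas between ring-buffer slots (all in thread ticks;
        // deltas.update is wall time already rescaled to the same frequency):
        unsigned long long wallTicks = 40000;
        unsigned long long eeTicks   = 30000; // EE core ran 75% of the window
        unsigned long long vuTicks   = 18000; // MTVU worker ran 45% of it
        std::printf("EE: %llu%%  VU: %llu%%\n",
                    eeTicks * 100 / wallTicks,   // -> 75
                    vuTicks * 100 / wallTicks);  // -> 45
        return 0;
    }

Note that with MTVU active the EE and VU columns measure different threads, so on a multi-core CPU the title-bar percentages can legitimately sum to more than 100.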
diff --git a/pcsx2/gui/CpuUsageProvider.h b/pcsx2/gui/CpuUsageProvider.h
index d24f03648bb07..18528dcf0e71a 100644
--- a/pcsx2/gui/CpuUsageProvider.h
+++ b/pcsx2/gui/CpuUsageProvider.h
@@ -27,6 +27,7 @@ class BaseCpuUsageProvider
 	virtual void UpdateStats()=0;
 	virtual int GetEEcorePct() const=0;
 	virtual int GetGsPct() const=0;
+	virtual int GetVUPct() const=0;
 	virtual int GetGuiPct() const=0;
 };
 
@@ -44,16 +45,17 @@ class CpuUsageProvider : public BaseCpuUsageProvider
 	virtual void UpdateStats() { m_Implementation->UpdateStats(); }
 	virtual int GetEEcorePct() const { return m_Implementation->GetEEcorePct(); }
 	virtual int GetGsPct() const { return m_Implementation->GetGsPct(); }
+	virtual int GetVUPct() const { return m_Implementation->GetVUPct(); }
 	virtual int GetGuiPct() const { return m_Implementation->GetGuiPct(); }
 };
 
-struct AllThreeThreads
+struct AllPCSX2Threads
 {
-	u64 ee, gs, ui;
+	u64 ee, gs, vu, ui;
 	u64 update;
 
 	void LoadWithCurrentTimes();
-	AllThreeThreads operator-( const AllThreeThreads& right ) const;
+	AllPCSX2Threads operator-( const AllPCSX2Threads& right ) const;
 };
 
 class DefaultCpuUsageProvider :
@@ -64,11 +66,12 @@ class DefaultCpuUsageProvider :
 	static const uint QueueDepth = 4;
 
 protected:
-	AllThreeThreads m_queue[QueueDepth];
+	AllPCSX2Threads m_queue[QueueDepth];
 	uint m_writepos;
 
 	u32 m_pct_ee;
 	u32 m_pct_gs;
+	u32 m_pct_vu;
 	u32 m_pct_ui;
 
 public:
@@ -80,6 +83,7 @@ class DefaultCpuUsageProvider :
 	void UpdateStats();
 	int GetEEcorePct() const;
 	int GetGsPct() const;
+	int GetVUPct() const;
 	int GetGuiPct() const;
 
 protected:
diff --git a/pcsx2/gui/CpuUsageProviderMSW.cpp b/pcsx2/gui/CpuUsageProviderMSW.cpp
index f46a5d952aaca..8f735e44be0a3 100644
--- a/pcsx2/gui/CpuUsageProviderMSW.cpp
+++ b/pcsx2/gui/CpuUsageProviderMSW.cpp
@@ -55,6 +55,7 @@ class CpuUsageProviderMSW : public BaseCpuUsageProvider
 	void UpdateStats();
 	int GetEEcorePct() const;
 	int GetGsPct() const;
+	int GetVUPct() const;
 	int GetGuiPct() const;
 };
 
@@ -264,6 +265,11 @@ int CpuUsageProviderMSW::GetGsPct() const
 	return 0;
 }
 
+int CpuUsageProviderMSW::GetVUPct() const
+{
+	return 0;
+}
+
 int CpuUsageProviderMSW::GetGuiPct() const
 {
 	return 0;
diff --git a/pcsx2/gui/FrameForGS.cpp b/pcsx2/gui/FrameForGS.cpp
index 1326ea3a99871..8d92308143748 100644
--- a/pcsx2/gui/FrameForGS.cpp
+++ b/pcsx2/gui/FrameForGS.cpp
@@ -533,10 +533,18 @@ void GSFrame::OnUpdateTitle( wxTimerEvent& evt )
 	}
 
 	FastFormatUnicode cpuUsage;
-	if( m_CpuUsage.IsImplemented() )
-	{
+	if (m_CpuUsage.IsImplemented()) {
 		m_CpuUsage.UpdateStats();
-		cpuUsage.Write( L" | EE: %3d%% | GS: %3d%% | UI: %3d%%", m_CpuUsage.GetEEcorePct(), m_CpuUsage.GetGsPct(), m_CpuUsage.GetGuiPct() );
+		if (THREAD_VU1) { // Display VU thread's usage
+			cpuUsage.Write(L" | EE: %3d%% | GS: %3d%% | VU: %3d%% | UI: %3d%%",
+				m_CpuUsage.GetEEcorePct(), m_CpuUsage.GetGsPct(),
+				m_CpuUsage.GetVUPct(), m_CpuUsage.GetGuiPct());
+		}
+		else {
+			cpuUsage.Write(L" | EE: %3d%% | GS: %3d%% | UI: %3d%%",
+				m_CpuUsage.GetEEcorePct(), m_CpuUsage.GetGsPct(),
+				m_CpuUsage.GetGuiPct());
+		}
 	}
 
 	const u64& smode2 = 
*(u64*)PS2GS_BASE(GS_SMODE2); diff --git a/pcsx2/gui/Panels/ConfigurationPanels.h b/pcsx2/gui/Panels/ConfigurationPanels.h index a9186bbc83a70..19366862e98b8 100644 --- a/pcsx2/gui/Panels/ConfigurationPanels.h +++ b/pcsx2/gui/Panels/ConfigurationPanels.h @@ -335,6 +335,7 @@ namespace Panels pxCheckBox* m_check_fastCDVD; pxCheckBox* m_check_vuFlagHack; pxCheckBox* m_check_vuBlockHack; + pxCheckBox* m_check_vuThread; public: virtual ~SpeedHacksPanel() throw() {} diff --git a/pcsx2/gui/Panels/SpeedhacksPanel.cpp b/pcsx2/gui/Panels/SpeedhacksPanel.cpp index f6a37e1aae55b..d7db4d164a794 100644 --- a/pcsx2/gui/Panels/SpeedhacksPanel.cpp +++ b/pcsx2/gui/Panels/SpeedhacksPanel.cpp @@ -161,10 +161,13 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) wxPanelWithHelpers* vuHacksPanel = new wxPanelWithHelpers( right, wxVERTICAL, _("microVU Hacks") ); m_check_vuFlagHack = new pxCheckBox( vuHacksPanel, _("mVU Flag Hack"), - _("Good Speedup and High Compatibility; may cause garbage graphics, SPS, etc... [Recommended]") ); + _("Good Speedup and High Compatibility; may cause bad graphics... [Recommended]" ) ); m_check_vuBlockHack = new pxCheckBox( vuHacksPanel, _("mVU Block Hack"), - _("Good Speedup and High Compatibility; may cause garbage graphics, SPS, etc...") ); + _("Good Speedup and High Compatibility; may cause bad graphics, SPS, etc...") ); + + m_check_vuThread = new pxCheckBox( vuHacksPanel, _("MTVU (Multi-Threaded microVU1)"), + _("Good Speedup and High Compatibility; may cause hanging... [Recommended if 3+ cores]") ); m_check_vuFlagHack->SetToolTip( pxEt( "!ContextTip:Speedhacks:vuFlagHack", L"Updates Status Flags only on blocks which will read them, instead of all the time. " @@ -176,6 +179,12 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) L"This should be pretty safe. It is unknown if this breaks any game..." ) ); + m_check_vuThread->SetToolTip( pxEt( "!ContextTip:Speedhacks:vuThread", + L"Runs VU1 on its own thread (microVU1-only). Generally a speedup on CPUs with 3 or more cores. " + L"This is safe for most games, but a few games are incompatible and may hang. " + L"In the case of GS limited games, it may be a slowdown (especially on dual core CPUs)." 
+ ) ); + // ------------------------------------------------------------------------ // All other hacks Section: @@ -226,7 +235,8 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) *vuHacksPanel += m_check_vuFlagHack; *vuHacksPanel += m_check_vuBlockHack; - *vuHacksPanel += 57; // Aligns left and right boxes in default language and font size + *vuHacksPanel += m_check_vuThread; + //*vuHacksPanel += 57; // Aligns left and right boxes in default language and font size *miscHacksPanel += m_check_intc; *miscHacksPanel += m_check_waitloop; @@ -304,6 +314,7 @@ void Panels::SpeedHacksPanel::ApplyConfigToGui( AppConfig& configToApply, int fl m_check_vuFlagHack ->SetValue(opts.vuFlagHack); m_check_vuBlockHack ->SetValue(opts.vuBlockHack); + m_check_vuThread ->SetValue(opts.vuThread); m_check_intc ->SetValue(opts.IntcStat); m_check_waitloop ->SetValue(opts.WaitLoop); m_check_fastCDVD ->SetValue(opts.fastCDVD); @@ -333,6 +344,7 @@ void Panels::SpeedHacksPanel::Apply() opts.IntcStat = m_check_intc->GetValue(); opts.vuFlagHack = m_check_vuFlagHack->GetValue(); opts.vuBlockHack = m_check_vuBlockHack->GetValue(); + opts.vuThread = m_check_vuThread->GetValue(); // If the user has a command line override specified, we need to disable it // so that their changes take effect diff --git a/pcsx2/ps2/LegacyDmac.cpp b/pcsx2/ps2/LegacyDmac.cpp index 931dbd5a5efbc..a8473971fbb22 100644 --- a/pcsx2/ps2/LegacyDmac.cpp +++ b/pcsx2/ps2/LegacyDmac.cpp @@ -17,6 +17,7 @@ #include "PrecompiledHeader.h" #include "Common.h" #include "Hardware.h" +#include "MTVU.h" #include "IPU/IPUdma.h" #include "ps2/HwInternal.h" @@ -91,7 +92,7 @@ __fi void setDmacStat(u32 num) } // Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits) -__fi tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write) +__fi tDMA_TAG* SPRdmaGetAddr(u32 addr, bool write) { // if (addr & 0xf) { DMA_LOG("*PCSX2*: DMA address not 128bit aligned: %8.8x", addr); } @@ -114,6 +115,10 @@ __fi tDMA_TAG *SPRdmaGetAddr(u32 addr, bool write) } else if ((addr >= 0x11004000) && (addr < 0x11010000)) { + if (THREAD_VU1) { + DevCon.Error("MTVU: SPRdmaGetAddr Accessing VU Memory!"); + vu1Thread.WaitVU(); + } //Access for VU Memory return (tDMA_TAG*)vtlb_GetPhyPtr(addr & 0x1FFFFFF0); } diff --git a/pcsx2/vtlb.cpp b/pcsx2/vtlb.cpp index 845815a565847..3473107b5ddb4 100644 --- a/pcsx2/vtlb.cpp +++ b/pcsx2/vtlb.cpp @@ -41,7 +41,7 @@ using namespace R5900; using namespace vtlb_private; -#define verify pxAssume +#define verify pxAssert namespace vtlb_private { @@ -512,14 +512,14 @@ void vtlb_MapBlock(void* base, u32 start, u32 size, u32 blocksize) { verify(0==(start&VTLB_PAGE_MASK)); verify(0==(size&VTLB_PAGE_MASK) && size>0); - if (!blocksize) + if(!blocksize) blocksize = size; verify(0==(blocksize&VTLB_PAGE_MASK) && blocksize>0); verify(0==(size%blocksize)); s32 baseint = (s32)base; u32 end = start + (size - VTLB_PAGE_SIZE); - pxAssume( (end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap) ); + verify((end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap)); while (start <= end) { @@ -544,7 +544,7 @@ void vtlb_Mirror(u32 new_region,u32 start,u32 size) verify(0==(size&VTLB_PAGE_MASK) && size>0); u32 end = start + (size-VTLB_PAGE_SIZE); - pxAssume( (end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap) ); + verify((end>>VTLB_PAGE_BITS) < ArraySize(vtlbdata.pmap)); while(start <= end) { diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj index 89c316ffd8513..db32bc6b83551 100644 --- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj +++ 
b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
@@ -1322,6 +1322,14 @@
+
+
+
+
diff --git a/pcsx2/x86/microVU.cpp b/pcsx2/x86/microVU.cpp
index bffd943ed4f37..f2b83a7e941e0 100644
--- a/pcsx2/x86/microVU.cpp
+++ b/pcsx2/x86/microVU.cpp
@@ -99,6 +99,7 @@ void mVUreset(microVU& mVU, bool resetReserve) {
 	mVU.prog.x86start = z;
 	mVU.prog.x86ptr = z;
 	mVU.prog.x86end = z + ((mVU.cacheSize - mVUcacheSafeZone) * _1mb);
+	//memset(mVU.prog.x86start, 0xcc, mVU.cacheSize*_1mb);
 
 	for(u32 i = 0; i < (mVU.progSize / 2); i++) {
 		if(!mVU.prog.prog[i]) {
@@ -279,7 +280,6 @@ _mVUt __fi void* mVUsearchProg(u32 startPC, uptr pState) {
 //------------------------------------------------------------------
 // recMicroVU0 / recMicroVU1
 //------------------------------------------------------------------
-
 recMicroVU0::recMicroVU0() { m_Idx = 0; IsInterpreter = false; }
 recMicroVU1::recMicroVU1() { m_Idx = 1; IsInterpreter = false; }
 void recMicroVU0::Vsync() throw() { mVUvsyncUpdate(microVU0); }
@@ -290,8 +290,10 @@ void recMicroVU0::Reserve() {
 	mVUinit(microVU0, 0);
 }
 void recMicroVU1::Reserve() {
-	if (AtomicExchange(m_Reserved, 1) == 0)
+	if (AtomicExchange(m_Reserved, 1) == 0) {
 		mVUinit(microVU1, 1);
+		vu1Thread.InitThread();
+	}
 }
 
 void recMicroVU0::Shutdown() throw() {
@@ -299,8 +301,10 @@ void recMicroVU0::Shutdown() throw() {
 	mVUclose(microVU0);
 }
 void recMicroVU1::Shutdown() throw() {
-	if (AtomicExchange(m_Reserved, 0) == 1)
+	if (AtomicExchange(m_Reserved, 0) == 1) {
+		vu1Thread.WaitVU();
 		mVUclose(microVU1);
+	}
 }
 
 void recMicroVU0::Reset() {
@@ -309,6 +313,7 @@ void recMicroVU0::Reset() {
 }
 void recMicroVU1::Reset() {
 	if(!pxAssertDev(m_Reserved, "MicroVU1 CPU Provider has not been reserved prior to reset!")) return;
+	vu1Thread.WaitVU();
 	mVUreset(microVU1, true);
 }
 
@@ -325,8 +330,10 @@ void recMicroVU0::Execute(u32 cycles) {
 void recMicroVU1::Execute(u32 cycles) {
 	pxAssert(m_Reserved); // please allocate me first! :|
 
-	if(!(VU0.VI[REG_VPU_STAT].UL & 0x100)) return;
-	((mVUrecCall)microVU1.startFunct)(VU1.VI[REG_TPC].UL, vu1RunCycles);
+	if (!THREAD_VU1) {
+		if(!(VU0.VI[REG_VPU_STAT].UL & 0x100)) return;
+	}
+	((mVUrecCall)microVU1.startFunct)(VU1.VI[REG_TPC].UL, cycles);
 }
 
 void recMicroVU0::Clear(u32 addr, u32 size) {
diff --git a/pcsx2/x86/microVU.h b/pcsx2/x86/microVU.h
index 8ae475cd61f70..30c76a9f55526 100644
--- a/pcsx2/x86/microVU.h
+++ b/pcsx2/x86/microVU.h
@@ -24,8 +24,8 @@ using namespace x86Emitter;
 #include
 #include "Common.h"
 #include "VU.h"
+#include "MTVU.h"
 #include "GS.h"
-#include "Gif.h"
 #include "Gif_Unit.h"
 #include "iR5900.h"
 #include "R5900OpcodeTables.h"
@@ -217,9 +217,11 @@ struct microVU {
 
 	VURegs& regs() const { return ::vuRegs[index]; }
 
-	__fi VIFregisters& getVifRegs() const { return regs().GetVifRegs(); }
-	__fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; }
-	__fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; }
+	__fi REG_VI& getVI(uint reg) const { return regs().VI[reg]; }
+	__fi VECTOR& getVF(uint reg) const { return regs().VF[reg]; }
+	__fi VIFregisters& getVifRegs() const {
+		return (index && THREAD_VU1) ? vu1Thread.vifRegs : regs().GetVifRegs();
+	}
 };
 
 // microVU rec structs
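Annotation: getVifRegs() is the subtle part of this hunk. Once VU1 runs asynchronously, the live vif1 registers belong to the EE thread, which has usually raced ahead by the time a queued program executes; vu1Thread therefore keeps its own VIFregisters copy, captured when the command was queued, and microVU1 reads that copy instead. A toy illustration of the snapshot idea (types and names hypothetical):

    #include <cstdio>

    struct VIFregs { int top, itop; };

    struct Vu1Worker {
        VIFregs vifRegs;                      // worker-visible snapshot
        void executeVU(const VIFregs& live) { // called at enqueue time
            vifRegs = live;                   // snapshot, then run asynchronously
        }
    };

    int main() {
        VIFregs live = { 0x100, 0x200 };
        Vu1Worker vu1;
        vu1.executeVU(live);
        live.top = 0x300;                     // EE/VIF races ahead...
        std::printf("%x\n", vu1.vifRegs.top); // ...worker still sees 0x100
        return 0;
    }

The same indirection appears earlier in _vuXTOP/_vuXITOP and again through the MTVU_VifX/MTVU_VifXRegs macros in the VIF unpack code.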
diff --git a/pcsx2/x86/microVU_Branch.inl b/pcsx2/x86/microVU_Branch.inl
index e60aac2ea25fc..4e75c8173c668 100644
--- a/pcsx2/x86/microVU_Branch.inl
+++ b/pcsx2/x86/microVU_Branch.inl
@@ -77,8 +77,10 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) {
 	xMOV(ptr32[&mVU.regs().VI[REG_CLIP_FLAG].UL], gprT2);
 
 	if (isEbit || isVU1) { // Clear 'is busy' Flags
-		xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
-		xAND(ptr32[&mVU.getVifRegs().stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
+		if (!mVU.index || !THREAD_VU1) {
+			xAND(ptr32[&VU0.VI[REG_VPU_STAT].UL], (isVU1 ? ~0x100 : ~0x001)); // VBS0/VBS1 flag
+			xAND(ptr32[&mVU.getVifRegs().stat], ~VIF1_STAT_VEW); // Clear VU 'is busy' signal for vif
+		}
 	}
 
 	if (isEbit != 2) { // Save PC, and Jump to Exit Point
diff --git a/pcsx2/x86/microVU_Execute.inl b/pcsx2/x86/microVU_Execute.inl
index 485d9a1d43201..02d39a7812c27 100644
--- a/pcsx2/x86/microVU_Execute.inl
+++ b/pcsx2/x86/microVU_Execute.inl
@@ -199,7 +199,10 @@ _mVUt void mVUcleanUp() {
 
 	mVU.cycles = mVU.totalCycles - mVU.cycles;
 	mVU.regs().cycle += mVU.cycles;
-	cpuRegs.cycle += ((mVU.cycles < 3000) ? mVU.cycles : 3000) * EmuConfig.Speedhacks.VUCycleSteal;
+
+	if (!vuIndex || !THREAD_VU1) {
+		cpuRegs.cycle += std::min(mVU.cycles, 3000u) * EmuConfig.Speedhacks.VUCycleSteal;
+	}
 
 	//static int ax = 0; ax++;
 	//if (!(ax % 100000)) {
 	//	for (u32 i = 0; i < (mVU.progSize / 2); i++) {
diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl
index 8c8dd58580f97..b1c6aad445c27 100644
--- a/pcsx2/x86/microVU_Misc.inl
+++ b/pcsx2/x86/microVU_Misc.inl
@@ -239,7 +239,14 @@ __fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false)
 }
 
 // Gets called by mVUaddrFix at execution-time
-static void __fastcall mVUwarningRegAccess(u32 prog, u32 pc) { Console.Error("microVU0 Warning: Accessing VU1 Regs! [%04x] [%x]", pc, prog); }
+static void __fc mVUwarningRegAccess(u32 prog, u32 pc) {
+	Console.Error("microVU0 Warning: Accessing VU1 Regs! [%04x] [%x]", pc, prog);
+}
+
+static void __fc mVUwaitMTVU() {
+	if (IsDevBuild) DevCon.WriteLn("microVU0: Waiting on VU1 thread to access VU1 regs!");
+	if (THREAD_VU1) vu1Thread.WaitVU();
+}
 
 // Transforms the Address in gprReg to valid VU0/VU1 Address
 __fi void mVUaddrFix(mV, const x32& gprReg)
@@ -249,28 +256,31 @@ __fi void mVUaddrFix(mV, const x32& gprReg)
 	{
 		xSHL(gprReg, 4);
 	}
 	else {
-		if (IsDevBuild && !isCOP2) mVUbackupRegs(mVU, true);
 		xTEST(gprReg, 0x400);
 		xForwardJNZ8 jmpA; // if addr & 0x4000, reads VU1's VF regs and VI regs
 			xAND(gprReg, 0xff); // if !(addr & 0x4000), wrap around
-			xForwardJump8 jmpB;
+			xForwardJump32 jmpB;
 		jmpA.SetTarget();
-			if (IsDevBuild && !isCOP2) { // Let's see which games do this!
-				xPUSH(gprT1); // Note: Kernel does it via COP2 to initialize VU1!
-				xPUSH(gprT2); // So we don't spam console, we'll only check micro-mode...
-				xPUSH(gprT3);
-				xMOV (gprT2, mVU.prog.cur->idx);
-				xMOV (gprT3, xPC);
-				xCALL(mVUwarningRegAccess);
+			if (THREAD_VU1 || (IsDevBuild && !isCOP2)) {
+				mVUbackupRegs(mVU, true);
+				xPUSH(gprT1);
+				xPUSH(gprT2);
+				xPUSH(gprT3);
+				if (IsDevBuild && !isCOP2) { // Let's see which games do this!
+					xMOV (gprT2, mVU.prog.cur->idx); // Note: Kernel does it via COP2 to initialize VU1!
+					xMOV (gprT3, xPC); // So we don't spam console, we'll only check micro-mode...
+ xCALL(mVUwarningRegAccess); + } + xCALL(mVUwaitMTVU); xPOP (gprT3); xPOP (gprT2); xPOP (gprT1); + mVUrestoreRegs(mVU, true); } xAND(gprReg, 0x3f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs! xADD(gprReg, (u128*)VU1.VF - (u128*)VU0.Mem); jmpB.SetTarget(); xSHL(gprReg, 4); // multiply by 16 (shift left by 4) - if (IsDevBuild && !isCOP2) mVUrestoreRegs(mVU, true); } } diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index c1540ccbf4364..8f2a6031a3de2 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -57,7 +57,6 @@ _vifT extern void dVifUnpack (const u8* data, bool isFill); // nVifBlock - Ordered for Hashing; the 'num' field and the lower 6 bits of upkType are // used as the hash bucket selector. -// struct __aligned16 nVifBlock { u8 num; // [00] Num Field u8 upkType; // [01] Unpack Type [usn*1:mask*1:upk*4] @@ -74,6 +73,8 @@ struct __aligned16 nVifBlock { #define _tParams nVifBlock, _hSize, _cmpS struct nVifStruct { + __aligned16 nVifBlock block; + // Buffer for partial transfers (should always be first to ensure alignment) // Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword) __aligned16 u8 buffer[256*16]; diff --git a/pcsx2/x86/newVif_Dynarec.cpp b/pcsx2/x86/newVif_Dynarec.cpp index db52bce079c4d..925f24f69f084 100644 --- a/pcsx2/x86/newVif_Dynarec.cpp +++ b/pcsx2/x86/newVif_Dynarec.cpp @@ -19,30 +19,28 @@ #include "PrecompiledHeader.h" #include "newVif_UnpackSSE.h" +#include "MTVU.h" -static __aligned16 nVifBlock _vBlock = {0}; - -void dVifReserve(int idx) -{ - if (!nVif[idx].recReserve) +void dVifReserve(int idx) { + if(!nVif[idx].recReserve) nVif[idx].recReserve = new RecompiledCodeReserve(pxsFmt(L"VIF%u Unpack Recompiler Cache", idx)); nVif[idx].recReserve->Reserve( nVif[idx].recReserveSizeMB * _1mb, idx ? HostMemoryMap::VIF1rec : HostMemoryMap::VIF0rec ); } void dVifReset(int idx) { - pxAssertDev(nVif[idx].recReserve, "Dynamic VIF recompiler reserve must be created prior to VIF use or reset!"); - if (!nVif[idx].vifBlocks) + if(!nVif[idx].vifBlocks) nVif[idx].vifBlocks = new HashBucket<_tParams>(); else nVif[idx].vifBlocks->clear(); nVif[idx].recReserve->Reset(); - nVif[idx].numBlocks = 0; - nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr(); + nVif[idx].numBlocks = 0; + nVif[idx].recWritePtr = nVif[idx].recReserve->GetPtr(); + //memset(nVif[idx].recWritePtr, 0xcc, nVif[idx].recReserveSizeMB * _1mb); } void dVifClose(int idx) { @@ -74,7 +72,8 @@ VifUnpackSSE_Dynarec::VifUnpackSSE_Dynarec(const nVifStruct& vif_, const nVifBlo } __fi void VifUnpackSSE_Dynarec::SetMasks(int cS) const { - const vifStruct& vif = v.idx ? vif1 : vif0; + const int idx = v.idx; + const vifStruct& vif = MTVU_VifX; u32 m0 = vB.mask; u32 m1 = m0 & 0xaaaaaaaa; @@ -126,7 +125,8 @@ void VifUnpackSSE_Dynarec::doMaskWrite(const xRegisterSSE& regX) const { } void VifUnpackSSE_Dynarec::writeBackRow() const { - xMOVAPS(ptr128[&((v.idx ? vif1 : vif0).MaskRow)], xmmRow); + const int idx = v.idx; + xMOVAPS(ptr128[&(MTVU_VifX.MaskRow)], xmmRow); DevCon.WriteLn("nVif: writing back row reg! [doMode = 2]"); // ToDo: Do we need to write back to vifregs.rX too!? :/ } @@ -208,25 +208,25 @@ void VifUnpackSSE_Dynarec::CompileRoutine() { } _vifT static __fi u8* dVifsetVUptr(uint cl, uint wl, bool isFill) { - vifStruct& vif = GetVifX; - const VURegs& VU = vuRegs[idx]; - const uint vuMemLimit = idx ? 0x4000 : 0x1000; + nVifStruct& v = nVif[idx]; + vifStruct& vif = MTVU_VifX; + const VURegs& VU = vuRegs[idx]; + const uint vuMemLimit = idx ? 
0x4000 : 0x1000; - u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10)); - u8* endmem = VU.Mem + vuMemLimit; - uint length = (_vBlock.num > 0) ? (_vBlock.num * 16) : 4096; // 0 = 256 + u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit-0x10)); + u8* endmem = VU.Mem + vuMemLimit; + uint length = (v.block.num > 0) ? (v.block.num * 16) : 4096; // 0 = 256 if (!isFill) { // Accounting for skipping mode: Subtract the last skip cycle, since the skipped part of the run // shouldn't count as wrapped data. Otherwise, a trailing skip can cause the emu to drop back // to the interpreter. -- Refraction (test with MGS3) - uint skipSize = (cl - wl) * 16; - uint blocks = _vBlock.num / wl; + uint blocks = v.block.num / wl; length += (blocks-1) * skipSize; } - if ( (startmem+length) <= endmem ) { + if ((startmem + length) <= endmem) { return startmem; } //Console.WriteLn("nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x", v.idx, vif.tag.addr, vif.tag.addr + (_vBlock.num * 16), _vBlock.num, wl, cl); @@ -245,12 +245,12 @@ static __fi void dVifRecLimit(int idx) { } } -_vifT static __fi bool dVifExecuteUnpack(const u8* data, bool isFill) +_vifT static __ri bool dVifExecuteUnpack(const u8* data, bool isFill) { - const nVifStruct& v = nVif[idx]; - VIFregisters& vifRegs = vifXRegs; + nVifStruct& v = nVif[idx]; + VIFregisters& vifRegs = MTVU_VifXRegs; - if (nVifBlock* b = v.vifBlocks->find(&_vBlock)) { + if (nVifBlock* b = v.vifBlocks->find(&v.block)) { if (u8* dest = dVifsetVUptr(vifRegs.cycle.cl, vifRegs.cycle.wl, isFill)) { //DevCon.WriteLn("Running Recompiled Block!"); ((nVifrecCall)b->startPtr)((uptr)dest, (uptr)data); @@ -266,39 +266,37 @@ _vifT static __fi bool dVifExecuteUnpack(const u8* data, bool isFill) _vifT __fi void dVifUnpack(const u8* data, bool isFill) { - const nVifStruct& v = nVif[idx]; - vifStruct& vif = GetVifX; - VIFregisters& vifRegs = vifXRegs; + nVifStruct& v = nVif[idx]; + vifStruct& vif = MTVU_VifX; + VIFregisters& vifRegs = MTVU_VifXRegs; - const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5); - const int doMask = isFill? 1 : (vif.cmd & 0x10); + const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5); + const int doMask = isFill? 1 : (vif.cmd & 0x10); - _vBlock.upkType = upkType; - _vBlock.num = (u8&)vifRegs.num; - _vBlock.mode = (u8&)vifRegs.mode; - _vBlock.cl = vifRegs.cycle.cl; - _vBlock.wl = vifRegs.cycle.wl; + v.block.upkType = upkType; + v.block.num = (u8&)vifRegs.num; + v.block.mode = (u8&)vifRegs.mode; + v.block.cl = vifRegs.cycle.cl; + v.block.wl = vifRegs.cycle.wl; // Zero out the mask parameter if it's unused -- games leave random junk // values here which cause false recblock cache misses. - _vBlock.mask = doMask ? vifRegs.mask : 0; + v.block.mask = doMask ? vifRegs.mask : 0; //DevCon.WriteLn("nVif%d: Recompiled Block! [%d]", idx, nVif[idx].numBlocks++); //DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]", - // _vBlock.num, _vBlock.upkType, _vBlock.scl, _vBlock.cl, _vBlock.wl, _vBlock.mode, - // doMask >> 4, doMask ? wxsFormat( L"0x%08x", _vBlock.mask ).c_str() : L"ignored" + // v.Block.num, v.Block.upkType, v.Block.scl, v.Block.cl, v.Block.wl, v.Block.mode, + // doMask >> 4, doMask ? 
wxsFormat( L"0x%08x", v.Block.mask ).c_str() : L"ignored" //); if (dVifExecuteUnpack(data, isFill)) return; xSetPtr(v.recWritePtr); - _vBlock.startPtr = (uptr)xGetAlignedCallTarget(); - v.vifBlocks->add(_vBlock); - VifUnpackSSE_Dynarec( v, _vBlock ).CompileRoutine(); + v.block.startPtr = (uptr)xGetAlignedCallTarget(); + v.vifBlocks->add(v.block); + VifUnpackSSE_Dynarec(v, v.block).CompileRoutine(); nVif[idx].recWritePtr = xGetPtr(); - // [TODO] : Ideally we should test recompile buffer limits prior to each instruction, - // which would be safer and more memory efficient than using an 0.25 meg recEnd marker. dVifRecLimit(idx); // Run the block we just compiled. Various conditions may force us to still use diff --git a/pcsx2/x86/newVif_Unpack.cpp b/pcsx2/x86/newVif_Unpack.cpp index c316818c2bcd7..c90843b0ee93b 100644 --- a/pcsx2/x86/newVif_Unpack.cpp +++ b/pcsx2/x86/newVif_Unpack.cpp @@ -21,6 +21,7 @@ #include "Common.h" #include "Vif_Dma.h" #include "newVif.h" +#include "MTVU.h" __aligned16 nVifStruct nVif[2]; @@ -75,7 +76,7 @@ nVifStruct::nVifStruct() vifBlocks = NULL; numBlocks = 0; - recReserveSizeMB = 8; + recReserveSizeMB = 8; } void reserveNewVif(int idx) @@ -87,8 +88,8 @@ void resetNewVif(int idx) // Safety Reset : Reassign all VIF structure info, just in case the VU1 pointers have // changed for some reason. - nVif[idx].idx = idx; - nVif[idx].bSize = 0; + nVif[idx].idx = idx; + nVif[idx].bSize = 0; memzero(nVif[idx].buffer); if (newVifDynaRec) dVifReset(idx); @@ -106,8 +107,8 @@ static __fi u8* getVUptr(uint idx, int offset) { _vifT int nVifUnpack(const u8* data) { - nVifStruct& v = nVif[idx]; - vifStruct& vif = GetVifX; + nVifStruct& v = nVif[idx]; + vifStruct& vif = GetVifX; VIFregisters& vifRegs = vifXRegs; const uint ret = aMin(vif.vifpacketsize, vif.tag.size); @@ -118,6 +119,7 @@ _vifT int nVifUnpack(const u8* data) { if (v.bSize) { // Last transfer was partial memcpy_fast(&v.buffer[v.bSize], data, size); v.bSize += size; + size = v.bSize; data = v.buffer; vif.cl = 0; @@ -125,8 +127,11 @@ _vifT int nVifUnpack(const u8* data) { if (!vifRegs.num) vifRegs.num = 256; } - if (newVifDynaRec) dVifUnpack(data, isFill); - else _nVifUnpack(idx, data, vifRegs.mode, isFill); + if (!idx || !THREAD_VU1) { + if (newVifDynaRec) dVifUnpack(data, isFill); + else _nVifUnpack(idx, data, vifRegs.mode, isFill); + } + else vu1Thread.VifUnpack(vif, vifRegs, (u8*)data, size); vif.tag.size = 0; vif.cmd = 0; @@ -147,12 +152,10 @@ _vifT int nVifUnpack(const u8* data) { // We can optimize the calculation either way as some games have big partial chunks (Guitar Hero). // Skipping writes are easy, filling is a bit more complex, so for now until we can // be sure its right (if it happens) it just prints debug stuff and processes the old way. - if(!isFill) - { - vifRegs.num -= (size / vSize); + if (!isFill) { + vifRegs.num -= (size / vSize); } - else - { + else { int guessedsize = (size / vSize); guessedsize = vifRegs.num - (((guessedsize / vifRegs.cycle.cl) * (vifRegs.cycle.wl - vifRegs.cycle.cl)) + guessedsize); @@ -164,14 +167,11 @@ _vifT int nVifUnpack(const u8* data) { if (vif.cl <= vifRegs.cycle.cl) size -= vSize; else if (vif.cl == vifRegs.cycle.wl) vif.cl = 0; } - else - { + else { size -= vSize; if (vif.cl >= vifRegs.cycle.wl) vif.cl = 0; } } - - DevCon.Warning("Fill!! 
Partial num left = %x, guessed %x", vifRegs.num, guessedsize);
 		}
 	}
 
@@ -236,8 +236,8 @@ static void setMasks(const vifStruct& vif, const VIFregisters& v) {
 
 template< int idx, bool doMode, bool isFill >
 __ri void __fastcall _nVifUnpackLoop(const u8* data) {
 
-	vifStruct& vif = GetVifX;
-	VIFregisters& vifRegs = vifXRegs;
+	vifStruct& vif = MTVU_VifX;
+	VIFregisters& vifRegs = MTVU_VifXRegs;
 
 	// skipSize used for skipping writes only
 	const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16;
@@ -253,8 +253,8 @@ __ri void __fastcall _nVifUnpackLoop(const u8* data) {
 	//uint vn = (vif.cmd >> 2) & 0x3;
 	//uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle
 
-	const nVifCall* fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
-	const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][ ((usn*2*16) + upkNum) ];
+	const nVifCall*      fnbase = &nVifUpk[ ((usn*2*16) + upkNum) * (4*1) ];
+	const UNPACKFUNCTYPE ft     = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][ ((usn*2*16) + upkNum) ];
 
 	pxAssume (vif.cl == 0);
 	pxAssume (vifRegs.cycle.wl > 0);
diff --git a/pcsx2/x86/sVU_Lower.cpp b/pcsx2/x86/sVU_Lower.cpp
index 749c02814942c..391ea7741f0e0 100644
--- a/pcsx2/x86/sVU_Lower.cpp
+++ b/pcsx2/x86/sVU_Lower.cpp
@@ -26,7 +26,6 @@
 #include "sVU_Micro.h"
 #include "sVU_Debug.h"
 #include "sVU_zerorec.h"
-#include "Gif.h"
 #include "Gif_Unit.h"
 
 using namespace x86Emitter;
diff --git a/pcsx2/x86/sVU_zerorec.cpp b/pcsx2/x86/sVU_zerorec.cpp
index 6a1f93bfb9fa4..7bd57dee8e908 100644
--- a/pcsx2/x86/sVU_zerorec.cpp
+++ b/pcsx2/x86/sVU_zerorec.cpp
@@ -32,6 +32,7 @@
 #include "GS.h"
 #include "Gif.h"
 #include "VU.h"
+#include "MTVU.h"
 #include "R5900.h"
 #include "iR5900.h"
 
@@ -456,15 +457,14 @@ void SuperVUReset(int vuindex)
 	s_recVUPtr[vuindex] = *s_recVUMem[vuindex];
 }
 
-// clear the block and any joining blocks
+// clear the block and any joining blocks (size given in bytes)
 static void __fastcall SuperVUClear(u32 startpc, u32 size, int vuindex)
 {
 	vector<VuFunctionHeader::RANGE>::iterator itrange;
 	list<VuFunctionHeader*>::iterator it = s_listVUHeaders[vuindex].begin();
-	u32 endpc = startpc + ((size * 4 + 7) & ~7); // Adding this code to ensure size is always a multiple of 8, it can be simplified to startpc+size if size is always a multiple of 8 (cottonvibes)
+	u32 endpc = startpc + ((size + 7) & ~7); // Ensure size is a multiple of u64 (round up)
 
 	while (it != s_listVUHeaders[vuindex].end())
 	{
-		// for every fn, check if it has code in the range
 		for(itrange = (*it)->ranges.begin(); itrange != (*it)->ranges.end(); itrange++)
 		{
@@ -4641,11 +4641,13 @@ void recSuperVU1::Reserve()
 
 void recSuperVU1::Shutdown() throw()
 {
+	vu1Thread.WaitVU();
 	SuperVUDestroy( 1 );
 }
 
 void recSuperVU1::Reset()
 {
+	vu1Thread.WaitVU();
 	SuperVUReset( 1 );
 }
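Closing annotation: taken together, the patch funnels every EE-to-VU1 interaction (WriteMicroMem, WriteDataMem, WriteRow/WriteCol, VifUnpack, ExecuteVU) through a single-producer/single-consumer channel owned by vu1Thread, with WaitVU() as the only blocking synchronization point. A minimal SPSC ring buffer in the same spirit, written with C++11 atomics rather than the project's AtomicExchange helpers; names and sizes are illustrative:

    #include <atomic>
    #include <cstdio>

    template <typename T, unsigned N> // N must be a power of two
    class SpscRing {
        T buf[N];
        std::atomic<unsigned> readPos { 0 }, writePos { 0 };
    public:
        bool push(const T& v) {            // producer (EE) thread only
            unsigned w = writePos.load(std::memory_order_relaxed);
            if (w - readPos.load(std::memory_order_acquire) == N) return false; // full
            buf[w % N] = v;
            writePos.store(w + 1, std::memory_order_release);
            return true;
        }
        bool pop(T& v) {                   // consumer (VU worker) thread only
            unsigned r = readPos.load(std::memory_order_relaxed);
            if (r == writePos.load(std::memory_order_acquire)) return false;    // empty
            v = buf[r % N];
            readPos.store(r + 1, std::memory_order_release);
            return true;
        }
        bool empty() const {               // what a WaitVU-style fence polls on
            return readPos.load(std::memory_order_acquire)
                == writePos.load(std::memory_order_acquire);
        }
    };

    int main() {
        SpscRing<int, 8> ring;
        ring.push(42);
        int v;
        while (ring.pop(v)) std::printf("%d\n", v);
        return 0;
    }

Each side touches only its own index, with release/acquire pairing on the other side's index, which is what lets the EE keep queueing work without taking a lock. A WaitVU-style fence then reduces to polling until the channel is empty, though the real implementation must also track whether the worker is mid-program, not just whether the queue has drained.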