From: Noumi Akira Date: Thu, 2 Jul 2009 08:34:43 +0000 (+0900) Subject: add FrameReconstructor_SSE2. X-Git-Tag: ex-1-preview-1~3 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=6744cea84bd5cff9946494599b34f82d592c0202;p=qtheora%2Fmain.git add FrameReconstructor_SSE2. --- diff --git a/Lib/QTheoraEx/FrameDecoder.c b/Lib/QTheoraEx/FrameDecoder.c index 08b7afd..a7d4647 100644 --- a/Lib/QTheoraEx/FrameDecoder.c +++ b/Lib/QTheoraEx/FrameDecoder.c @@ -630,6 +630,15 @@ BOOL QT_FrameDecoder_Setup( /* */ + if (g_QT_Enable_SSE2) { + t->Reconstructor = (FrameReconstructor_SSE2_t*)QT_MemoryPool_Allocate(pool, sizeof(FrameReconstructor_SSE2_t)); + if (t->Reconstructor == NULL) { + return FALSE; + } + } + + /* */ + t->QIndex = -1; /* */ @@ -691,6 +700,20 @@ BOOL QT_FrameDecoder_Setup( /* */ + if (g_QT_Enable_SSE2) { + t->UpdateDequantizeMatrix = QT_UpdateDequantizeMatrix_SSE2; + } else { + t->UpdateDequantizeMatrix = NULL; + } + + if (g_QT_Enable_SSE2) { + t->Reconstruct = QT_ReconstructFrame_SSE2; + } else { + t->Reconstruct = QT_ReconstructFrame; + } + + /* */ + return TRUE; } diff --git a/Lib/QTheoraEx/FrameDecoder.h b/Lib/QTheoraEx/FrameDecoder.h index 9e58844..0255c1f 100644 --- a/Lib/QTheoraEx/FrameDecoder.h +++ b/Lib/QTheoraEx/FrameDecoder.h @@ -75,6 +75,15 @@ struct MotionVector { typedef struct MotionVector MotionVector_t; +/* FrameReconstructor_SSE2 */ +struct FrameReconstructor_SSE2 { + + INT16 Matrix[2][3][64]; + +}; /* FrameReconstructor_SSE2 */ + +typedef struct FrameReconstructor_SSE2 FrameReconstructor_SSE2_t; + /* QT_FrameDecoder */ struct QT_FrameDecoder; @@ -94,6 +103,8 @@ struct QT_FrameDecoder { DequantizeMatrix_t Dequantize; + FrameReconstructor_SSE2_t* Reconstructor; + LoopFilter_t Filter; INT32 QIndex; @@ -126,6 +137,10 @@ struct QT_FrameDecoder { BOOL (*Decode)(FrameDecoder_t*, const VOID*, SIZE_T); + VOID (*UpdateDequantizeMatrix)(FrameDecoder_t*); + + VOID (*Reconstruct)(FrameDecoder_t*); + }; /* QT_FrameDecoder */ /* */ diff --git 
a/Lib/QTheoraEx/FrameDecoder_Impl.h b/Lib/QTheoraEx/FrameDecoder_Impl.h index 9bcb6be..a7e6dcf 100644 --- a/Lib/QTheoraEx/FrameDecoder_Impl.h +++ b/Lib/QTheoraEx/FrameDecoder_Impl.h @@ -610,6 +610,10 @@ static BOOL FrameDecoder_Decode( &(t->Filter), &(t->Setup->Filter), t->QIndex); + + if (t->UpdateDequantizeMatrix != NULL) { + t->UpdateDequantizeMatrix(t); + } } /* */ @@ -717,7 +721,7 @@ static BOOL FrameDecoder_Decode( /* */ - QT_ReconstructFrame(t); + t->Reconstruct(t); /* */ diff --git a/Lib/QTheoraEx/FrameReconstructor.h b/Lib/QTheoraEx/FrameReconstructor.h index 68f9ea8..754dc1c 100644 --- a/Lib/QTheoraEx/FrameReconstructor.h +++ b/Lib/QTheoraEx/FrameReconstructor.h @@ -5,9 +5,21 @@ #include "FrameDecoder.h" +/* */ + void QT_FrameLoopFilter( FrameDecoder_t* t); void QT_ReconstructFrame( FrameDecoder_t* t); +/* */ + +void QT_UpdateDequantizeMatrix_SSE2( + FrameDecoder_t* t); + +void QT_ReconstructFrame_SSE2( + FrameDecoder_t* t); + +/* */ + diff --git a/Lib/QTheoraEx/FrameReconstructor_SSE2.c b/Lib/QTheoraEx/FrameReconstructor_SSE2.c new file mode 100644 index 0000000..8b5e765 --- /dev/null +++ b/Lib/QTheoraEx/FrameReconstructor_SSE2.c @@ -0,0 +1,872 @@ +/* FrameReconstructor_SSE2.c */ +/* 2009/07/02 */ + +#include "StdAfx.h" + +#include "FrameReconstructor.h" + +#include "MotionComp_SSE2.h" + +/* */ + +#pragma warning(disable : 4799) + +/* */ + +static void Transpose_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + + t0 = _mm_loadu_si128(X + 0); + t1 = _mm_loadu_si128(X + 1); + t2 = _mm_loadu_si128(X + 2); + t3 = _mm_loadu_si128(X + 3); + t4 = _mm_loadu_si128(X + 4); + t5 = _mm_loadu_si128(X + 5); + t6 = _mm_loadu_si128(X + 6); + t7 = _mm_loadu_si128(X + 7); + + u0 = _mm_unpacklo_epi16(t0, t1); + u1 = _mm_unpackhi_epi16(t0, t1); + u2 = _mm_unpacklo_epi16(t2, t3); + u3 = _mm_unpackhi_epi16(t2, t3); + u4 = 
_mm_unpacklo_epi16(t4, t5); + u5 = _mm_unpackhi_epi16(t4, t5); + u6 = _mm_unpacklo_epi16(t6, t7); + u7 = _mm_unpackhi_epi16(t6, t7); + + t0 = _mm_unpacklo_epi32(u0, u2); + t1 = _mm_unpacklo_epi32(u1, u3); + t2 = _mm_unpackhi_epi32(u0, u2); + t3 = _mm_unpackhi_epi32(u1, u3); + t4 = _mm_unpacklo_epi32(u4, u6); + t5 = _mm_unpacklo_epi32(u5, u7); + t6 = _mm_unpackhi_epi32(u4, u6); + t7 = _mm_unpackhi_epi32(u5, u7); + + Y[0] = _mm_unpacklo_epi64(t0, t4); + Y[1] = _mm_unpackhi_epi64(t0, t4); + Y[2] = _mm_unpacklo_epi64(t2, t6); + Y[3] = _mm_unpackhi_epi64(t2, t6); + Y[4] = _mm_unpacklo_epi64(t1, t5); + Y[5] = _mm_unpackhi_epi64(t1, t5); + Y[6] = _mm_unpacklo_epi64(t3, t7); + Y[7] = _mm_unpackhi_epi64(t3, t7); +} + +void QT_UpdateDequantizeMatrix_SSE2( + FrameDecoder_t* t) +{ + FrameReconstructor_SSE2_t* r = t->Reconstructor; + + INT32 i, p; + + for (i = 0; i < 2; i++) { + for (p = 0; p < 3; p++) { + const INT16* x = t->Dequantize.Matrix[i][p]; + INT16* y = r->Matrix[i][p]; + Transpose_SSE2(x, y); + } + } +} + +/* */ + +static __inline void Block_CopyPlane8x8_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + Plane_t* r) +{ + const UINT8* s = r->Plane + y * r->Pitch + x; + UINT8* d = p->Plane + y * p->Pitch + x; + + __m64 s0, s1, s2, s3, s4, s5, s6, s7; + + s0 = *((const __m64*)s); s += r->Pitch; + s1 = *((const __m64*)s); s += r->Pitch; + s2 = *((const __m64*)s); s += r->Pitch; + s3 = *((const __m64*)s); s += r->Pitch; + s4 = *((const __m64*)s); s += r->Pitch; + s5 = *((const __m64*)s); s += r->Pitch; + s6 = *((const __m64*)s); s += r->Pitch; + s7 = *((const __m64*)s); + + *((__m64*)d) = s0; d += p->Pitch; + *((__m64*)d) = s1; d += p->Pitch; + *((__m64*)d) = s2; d += p->Pitch; + *((__m64*)d) = s3; d += p->Pitch; + *((__m64*)d) = s4; d += p->Pitch; + *((__m64*)d) = s5; d += p->Pitch; + *((__m64*)d) = s6; d += p->Pitch; + *((__m64*)d) = s7; +} + +static __inline void Block_CopyPlane16x16_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + Plane_t* r) +{ + const UINT8* s = r->Plane + y * 
r->Pitch + x; + UINT8* d = p->Plane + y * p->Pitch + x; + + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + + _mm_store_si128((__m128i*)d, s0); d += p->Pitch; + _mm_store_si128((__m128i*)d, s1); d += p->Pitch; + _mm_store_si128((__m128i*)d, s2); d += p->Pitch; + _mm_store_si128((__m128i*)d, s3); d += p->Pitch; + _mm_store_si128((__m128i*)d, s4); d += p->Pitch; + _mm_store_si128((__m128i*)d, s5); d += p->Pitch; + _mm_store_si128((__m128i*)d, s6); d += p->Pitch; + _mm_store_si128((__m128i*)d, s7); d += p->Pitch; + + s0 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_load_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_load_si128((const __m128i*)s); + + _mm_store_si128((__m128i*)d, s0); d += p->Pitch; + _mm_store_si128((__m128i*)d, s1); d += p->Pitch; + _mm_store_si128((__m128i*)d, s2); d += p->Pitch; + _mm_store_si128((__m128i*)d, s3); d += p->Pitch; + _mm_store_si128((__m128i*)d, s4); d += p->Pitch; + _mm_store_si128((__m128i*)d, s5); d += p->Pitch; + _mm_store_si128((__m128i*)d, s6); d += p->Pitch; + _mm_store_si128((__m128i*)d, s7); +} + +/* */ + +ALIGN(0x10) static const UINT16 IPRED[8] = { + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static __inline void Block_CopyIntra8x8_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + 
const INT16* c) +{ + UINT8* d = p->Plane + y * p->Pitch + x; + + const __m128i* B = (const __m128i*)IPRED; + const __m128i* C = (const __m128i*)c; + + __m128i s0, s1, s2, s3; + const __m128i z = _mm_setzero_si128(); + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], B[0]), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], B[0]), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], B[0]), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], B[0]), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch; + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], B[0]), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], B[0]), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], B[0]), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], B[0]), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); +} + +static __inline void Block_ReviseInter8x8_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const INT16* c) +{ + UINT8* d = p->Plane + y * p->Pitch + x; + const UINT8* s = d; + + const __m128i* C = (const __m128i*)c; + + __m128i b0, b1, b2, b3; + __m128i s0, s1, s2, s3; + const __m128i z = _mm_setzero_si128(); + + b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[0], b0), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[1], b1), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[2], b2), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[3], b3), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + 
_mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); d += p->Pitch; + + b0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); s += p->Pitch; + b3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)s), z); + + s0 = _mm_packus_epi16(_mm_adds_epi16(C[4], b0), z); + s1 = _mm_packus_epi16(_mm_adds_epi16(C[5], b1), z); + s2 = _mm_packus_epi16(_mm_adds_epi16(C[6], b2), z); + s3 = _mm_packus_epi16(_mm_adds_epi16(C[7], b3), z); + + _mm_storel_epi64((__m128i*)d, s0); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s1); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s2); d += p->Pitch; + _mm_storel_epi64((__m128i*)d, s3); +} + +/* */ + +ALIGN(0x10) static const UINT16 COS[8][8] = { + { 8, 8, 8, 8, 8, 8, 8, 8 }, /* 0 */ + { 64277, 64277, 64277, 64277, 64277, 64277, 64277, 64277 }, /* 1 */ + { 60547, 60547, 60547, 60547, 60547, 60547, 60547, 60547 }, /* 2 */ + { 54491, 54491, 54491, 54491, 54491, 54491, 54491, 54491 }, /* 3 */ + { 46341, 46341, 46341, 46341, 46341, 46341, 46341, 46341 }, /* 4 */ + { 36410, 36410, 36410, 36410, 36410, 36410, 36410, 36410 }, /* 5 */ + { 25080, 25080, 25080, 25080, 25080, 25080, 25080, 25080 }, /* 6 */ + { 12785, 12785, 12785, 12785, 12785, 12785, 12785, 12785 }, /* 7 */ +}; + +#define MUL1(T,X) _mm_add_epi16(_mm_mulhi_epi16(X, C[T]), X) +#define MUL0(T,X) _mm_mulhi_epi16(X, C[T]) + +static __inline void IDCT_R_8_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* C = (const __m128i*)COS[0]; + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i s0; + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + + /* Stage.1 */ + + s0 = _mm_add_epi16(X[0], X[4]); + t0 = MUL1(4, s0); + + s0 = _mm_sub_epi16(X[0], X[4]); + t1 = MUL1(4, s0); + + t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, 
X[6])); + t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6])); + + t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7])); + t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3])); + + t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3])); + t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7])); + + /* Stage.2 */ + + s0 = _mm_sub_epi16(t4, t5); + t4 = _mm_add_epi16(t4, t5); + t5 = MUL1(4, s0); + + s0 = _mm_sub_epi16(t7, t6); + t7 = _mm_add_epi16(t7, t6); + t6 = MUL1(4, s0); + + /* Stage.3 */ + + s0 = _mm_sub_epi16(t0, t3); + t0 = _mm_add_epi16(t0, t3); + + t3 = _mm_sub_epi16(t1, t2); + t1 = _mm_add_epi16(t1, t2); + + t2 = _mm_sub_epi16(t6, t5); + t6 = _mm_add_epi16(t6, t5); + + /* Stage.4 */ + + Y[0] = _mm_add_epi16(t0, t7); + Y[1] = _mm_add_epi16(t1, t6); + Y[2] = _mm_add_epi16(t3, t2); + Y[3] = _mm_add_epi16(s0, t4); + Y[4] = _mm_sub_epi16(s0, t4); + Y[5] = _mm_sub_epi16(t3, t2); + Y[6] = _mm_sub_epi16(t1, t6); + Y[7] = _mm_sub_epi16(t0, t7); +} + +static __inline void IDCT_C_8_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* C = (const __m128i*)COS[0]; + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i s0; + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + + /* Stage.1 */ + + s0 = _mm_add_epi16(X[0], X[4]); + t0 = MUL1(4, s0); + + s0 = _mm_sub_epi16(X[0], X[4]); + t1 = MUL1(4, s0); + + t2 = _mm_sub_epi16(MUL0(6, X[2]), MUL1(2, X[6])); + t3 = _mm_add_epi16(MUL1(2, X[2]), MUL0(6, X[6])); + + t4 = _mm_sub_epi16(MUL0(7, X[1]), MUL1(1, X[7])); + t5 = _mm_sub_epi16(MUL1(3, X[5]), MUL1(5, X[3])); + + t6 = _mm_add_epi16(MUL1(5, X[5]), MUL1(3, X[3])); + t7 = _mm_add_epi16(MUL1(1, X[1]), MUL0(7, X[7])); + + /* Stage.2 */ + + s0 = _mm_sub_epi16(t4, t5); + t4 = _mm_add_epi16(t4, t5); + t5 = MUL1(4, s0); + + s0 = _mm_sub_epi16(t7, t6); + t7 = _mm_add_epi16(t7, t6); + t6 = MUL1(4, s0); + + /* Stage.3 */ + + s0 = _mm_sub_epi16(t0, t3); + t0 = _mm_add_epi16(t0, t3); + + t3 = _mm_sub_epi16(t1, t2); + t1 = _mm_add_epi16(t1, t2); + + t2 = _mm_sub_epi16(t6, t5); + t6 = 
_mm_add_epi16(t6, t5); + + /* Stage.4 */ + + Y[0] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t0, t7), C[0]), 4); + Y[1] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t1, t6), C[0]), 4); + Y[2] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(t3, t2), C[0]), 4); + Y[3] = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(s0, t4), C[0]), 4); + Y[4] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(s0, t4), C[0]), 4); + Y[5] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t3, t2), C[0]), 4); + Y[6] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t1, t6), C[0]), 4); + Y[7] = _mm_srai_epi16(_mm_add_epi16(_mm_sub_epi16(t0, t7), C[0]), 4); +} + +static __inline void Transpose_U_SSE2( + const INT16* x, + INT16* y) +{ + const __m128i* X = (const __m128i*)x; + __m128i* Y = (__m128i*)y; + + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i t0, t1, t2, t3, t4, t5, t6, t7; + + u0 = _mm_unpacklo_epi16(X[0], X[1]); + u1 = _mm_unpackhi_epi16(X[0], X[1]); + u2 = _mm_unpacklo_epi16(X[2], X[3]); + u3 = _mm_unpackhi_epi16(X[2], X[3]); + u4 = _mm_unpacklo_epi16(X[4], X[5]); + u5 = _mm_unpackhi_epi16(X[4], X[5]); + u6 = _mm_unpacklo_epi16(X[6], X[7]); + u7 = _mm_unpackhi_epi16(X[6], X[7]); + + t0 = _mm_unpacklo_epi32(u0, u2); + t1 = _mm_unpacklo_epi32(u1, u3); + t2 = _mm_unpackhi_epi32(u0, u2); + t3 = _mm_unpackhi_epi32(u1, u3); + t4 = _mm_unpacklo_epi32(u4, u6); + t5 = _mm_unpacklo_epi32(u5, u7); + t6 = _mm_unpackhi_epi32(u4, u6); + t7 = _mm_unpackhi_epi32(u5, u7); + + Y[0] = _mm_unpacklo_epi64(t0, t4); + Y[1] = _mm_unpackhi_epi64(t0, t4); + Y[2] = _mm_unpacklo_epi64(t2, t6); + Y[3] = _mm_unpackhi_epi64(t2, t6); + Y[4] = _mm_unpacklo_epi64(t1, t5); + Y[5] = _mm_unpackhi_epi64(t1, t5); + Y[6] = _mm_unpacklo_epi64(t3, t7); + Y[7] = _mm_unpackhi_epi64(t3, t7); +} + +/* */ + +static const UINT8 TZZ[64] = { + 0, 2, 3, 9, 10, 20, 21, 35, + 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, + 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, + 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 
44, 52, 55, 59, 62, + 28, 42, 43, 53, 54, 60, 61, 63 +}; + +static __inline void DequantizeIDCT8x8_SSE2( + const INT16* block, + const INT16* matrix, + INT16* coeff) +{ + ALIGN(0x10) INT16 c0[64]; + + { /* Reorder */ + const UINT8* t = TZZ; + + INT16* c = c0; + INT16* e = c + 64; + for (; c < e; c += 4, t += 4) { + c[0] = block[t[0]]; + c[1] = block[t[1]]; + c[2] = block[t[2]]; + c[3] = block[t[3]]; + } + } + + { /* Dequantize */ + const __m128i* m = (const __m128i*)matrix; + __m128i* d = (__m128i*)c0; + + d[0] = _mm_mullo_epi16(d[0], m[0]); + d[1] = _mm_mullo_epi16(d[1], m[1]); + d[2] = _mm_mullo_epi16(d[2], m[2]); + d[3] = _mm_mullo_epi16(d[3], m[3]); + d[4] = _mm_mullo_epi16(d[4], m[4]); + d[5] = _mm_mullo_epi16(d[5], m[5]); + d[6] = _mm_mullo_epi16(d[6], m[6]); + d[7] = _mm_mullo_epi16(d[7], m[7]); + } + + /* iDCT Row */ + IDCT_R_8_SSE2(c0, coeff); + + /* Transpose */ + Transpose_U_SSE2(coeff, c0); + + /* iDCT Colum */ + IDCT_C_8_SSE2(c0, coeff); +} + +/* */ + +struct DecodeCoefficientsContext { + + INT32 EOB_Run[64]; + + INT8* Run [64]; + INT16* Coeff[64]; + +}; /* DecodeCoefficientsContext */ + +typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t; + +static INT32 DecodeCoefficients( + FrameDecoder_t* t, + DecodeCoefficientsContext_t* ctx, + INT16* block) +{ + INT16* b = block; + INT16* e = b + 64; + + INT32 i = 0; + + while (b < e) { + if (ctx->EOB_Run[i] > 0) { + ctx->EOB_Run[i] -= 1; + break; + + } else { + INT32 run = *((ctx->Run [i])++); + INT16 coeff = *((ctx->Coeff[i])++); + + if (run < 0) { + ctx->EOB_Run[i] = coeff; + + } else { + INT16* p = b + run; + if (p >= e) { + break; + } + + while (b < p) { + *(b++) = 0; + } + + *(b++) = coeff; + + i = b - block; + } + } + } + + while (b < e) { + *(b++) = 0; + } + + return i; +} + +/* */ + +static void Reconstruct_IntraBlock( + FrameDecoder_t* t, + Plane_t* p, + INT32 x, + INT32 y, + INT16 dc, + INT32 plane, + Plane_t* r, + DecodeCoefficientsContext_t* ctx) +{ + ALIGN(0x10) INT16 block[64]; + 
ALIGN(0x10) INT16 coeff[64]; + + const INT16 (*mat)[64] = t->Reconstructor->Matrix[0]; + + if (dc == NOT_CODED) { + Block_CopyPlane8x8_SSE2(p, x, y, r); + return; + } + + DecodeCoefficients(t, ctx, block); + + block[0] = dc; + + DequantizeIDCT8x8_SSE2(block, mat[plane], coeff); + + Block_CopyIntra8x8_SSE2(p, x, y, coeff); +} + +static void Reconstruct_InterBlock( + FrameDecoder_t* t, + Plane_t* p, + INT32 x, + INT32 y, + INT16 dc, + INT32 plane, + Plane_t* r, + DecodeCoefficientsContext_t* ctx) +{ + ALIGN(0x10) INT16 block[64]; + ALIGN(0x10) INT16 coeff[64]; + + const INT16 (*mat)[64] = t->Reconstructor->Matrix[1]; + + if (dc == NOT_CODED) { + if (r != NULL) { + Block_CopyPlane8x8_SSE2(p, x, y, r); + } + return; + } + + DecodeCoefficients(t, ctx, block); + + block[0] = dc; + + DequantizeIDCT8x8_SSE2(block, mat[plane], coeff); + + Block_ReviseInter8x8_SSE2(p, x, y, coeff); +} + +/* */ + +/* */ + +static const INT8 S_PX[16] = { + 0*8, 1*8, 1*8, 0*8, + 0*8, 0*8, 1*8, 1*8, + 2*8, 2*8, 3*8, 3*8, + 3*8, 2*8, 2*8, 3*8 +}; + +static const INT8 S_PY[16] = { + 0*8, 0*8, 1*8, 1*8, + 2*8, 3*8, 3*8, 2*8, + 2*8, 3*8, 3*8, 2*8, + 1*8, 1*8, 0*8, 0*8 +}; + +static const INT8 M_PX[4] = { + 0*16, 0*16, + 1*16, 1*16 +}; + +static const INT8 M_PY[4] = { + 0*16, 1*16, + 1*16, 0*16 +}; + +/* */ + +static void Reconstruct_YPlane_SSE2( + FrameDecoder_t* t) +{ + INT32 x, y; + + INT32 sx = t->Index->SX[0] * 32; + INT32 sy = t->Index->SY[0] * 32; + + INT32 mx = t->Index->MX * 16; + INT32 my = t->Index->MY * 16; + + INT32 bx = t->Index->BX[0]; + + const UINT16* bi = t->Index->BIndex[0]; + + Plane_t* g = t->Frame[0]; + Plane_t* p = t->Frame[1]; + Plane_t* r = t->Frame[2]; + + const UINT8* mm = t->MBMode; + const MotionVector_t* mv = t->MV; + + DecodeCoefficientsContext_t ctx = { 0 }; + + INT32 i; + for (i = 0; i < 64; i++) { + ctx.Run [i] = t->BRun [0][i]; + ctx.Coeff[i] = t->BCoeff[0][i]; + } + + for (y = 0; y < sy; y += 32) { + for (x = 0; x < sx; x += 32) { + INT32 i = 0; + + INT32 m; + for 
(m = 0; m < 4; m++, i += 4) { + INT32 x0 = x + M_PX[m]; + INT32 y0 = y + M_PY[m]; + if (x0 < mx && y0 < my) { + switch (*mm) { + case 0: /* INTER_NOMV */ + Block_CopyPlane16x16_SSE2(p, x0, y0, r); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, NULL, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, NULL, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, NULL, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, NULL, &ctx); + break; + + case 1: /* INTRA */ + Reconstruct_IntraBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_IntraBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_IntraBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_IntraBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 2: /* INTER_MV */ + case 3: /* INTER_MV_LAST */ + case 4: /* INTER_MV_LAST2 */ + MotionComp_Block16x16_SSE2(p, x0, y0, r, mv); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 5: /* INTER_GOLDEN_NOMV */ + Block_CopyPlane16x16_SSE2(p, x0, y0, g); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 6: /* INTER_GOLDEN_MV */ + 
MotionComp_Block16x16_SSE2(p, x0, y0, g, mv); + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + + case 7: /* INTER_MV_FOUR */ + { + const MotionVector_t* v = mv; + + const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx; + + if (dc[0] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 0, r, v++); + } + + if (dc[1] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 0, r, v++); + } + + if (dc[0 + bx] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 0, y0 + 8, r, v++); + } + + if (dc[1 + bx] != NOT_CODED) { + MotionComp_Block8x8Y_SSE2(p, x0 + 8, y0 + 8, r, v++); + } + + Reconstruct_InterBlock(t, p, x + S_PX[i + 0], y + S_PY[i + 0], t->DC[bi[0]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 1], y + S_PY[i + 1], t->DC[bi[1]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 2], y + S_PY[i + 2], t->DC[bi[2]], 0, r, &ctx); + Reconstruct_InterBlock(t, p, x + S_PX[i + 3], y + S_PY[i + 3], t->DC[bi[3]], 0, r, &ctx); + break; + } + + } /* switch */ + + bi += 4; + mm += 1; + mv += 4; + } + } + } + } +} + +/* */ + +static void Reconstruct_CPlane_SSE2( + FrameDecoder_t* t) +{ + INT32 x, y; + + INT32 sx = t->Index->SX[1] * 32; + INT32 sy = t->Index->SY[1] * 32; + + INT32 mx = t->Index->MX * 8; + INT32 my = t->Index->MY * 8; + + INT32 bx = t->Index->BX[1]; + + const UINT16* bi = t->Index->BIndex[1]; + + Plane_t* g = t->Frame[0]; + Plane_t* p = t->Frame[1]; + Plane_t* r = t->Frame[2]; + + const INT16* DC0 = t->DC + t->Index->BC[0]; + const INT16* DC1 = DC0 + t->Index->BC[1]; + + const UINT8* m = t->BMode + t->Index->BC[0]; + + DecodeCoefficientsContext_t ctx[2] = { 0 }; + + INT32 i; + for (i = 0; i < 64; i++) { + 
ctx[0].Run [i] = t->BRun [1][i]; + ctx[0].Coeff[i] = t->BCoeff[1][i]; + + ctx[1].Run [i] = t->BRun [2][i]; + ctx[1].Coeff[i] = t->BCoeff[2][i]; + } + + for (y = 0; y < sy; y += 32) { + for (x = 0; x < sx; x += 32) { + INT32 i; + for (i = 0; i < 16; i++) { + INT32 xx = x + S_PX[i]; + INT32 yy = y + S_PY[i]; + + if (xx < mx && yy < my) { + INT32 idx = (xx >> 3) + (yy >> 3) * bx; + + switch (m[idx]) { + case 0: /* INTER_NOMV */ + Block_CopyPlane8x8_SSE2(p + 1, xx, yy, r + 1); + Block_CopyPlane8x8_SSE2(p + 2, xx, yy, r + 2); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, NULL, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, NULL, ctx + 1); + break; + + case 1: /* INTRA */ + Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 2: /* INTER_MV */ + case 3: /* INTER_MV_LAST */ + case 4: /* INTER_MV_LAST2 */ + MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx); + MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, t->MVC + idx); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 5: /* INTER_GOLDEN_NOMV */ + Block_CopyPlane8x8_SSE2(p + 1, xx, yy, g + 1); + Block_CopyPlane8x8_SSE2(p + 2, xx, yy, g + 2); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 6: /* INTER_GOLDEN_MV */ + MotionComp_Block8x8C_SSE2(p + 1, xx, yy, g + 1, t->MVC + idx); + MotionComp_Block8x8C_SSE2(p + 2, xx, yy, g + 2, t->MVC + idx); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + case 7: /* INTER_MV_FOUR */ + MotionComp_Block8x8C_SSE2(p + 1, xx, yy, r + 1, t->MVC + idx); + MotionComp_Block8x8C_SSE2(p + 2, xx, yy, r + 2, 
t->MVC + idx); + + Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], 1, r + 1, ctx + 0); + Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], 2, r + 2, ctx + 1); + break; + + } /* switch */ + } + } + } + } +} + +/* */ + +void QT_ReconstructFrame_SSE2( + FrameDecoder_t* t) +{ + Reconstruct_YPlane_SSE2(t); + + Reconstruct_CPlane_SSE2(t); + + if (t->Filter.Limit > 0) { + QT_FrameLoopFilter(t); + } +} + +/* */ + diff --git a/Lib/QTheoraEx/MotionComp_SSE2.c b/Lib/QTheoraEx/MotionComp_SSE2.c new file mode 100644 index 0000000..078145f --- /dev/null +++ b/Lib/QTheoraEx/MotionComp_SSE2.c @@ -0,0 +1,466 @@ +/* MotionComp_SSE2.c */ +/* 2009/07/02 */ + +#include "StdAfx.h" + +#include "MotionComp_SSE2.h" + +/* */ + +#pragma warning(disable : 4799) + +/* */ + +static void Block_Extract8x8( + const Plane_t* plane, + INT32 x, + INT32 y, + UINT8* block) +{ + INT32 i, j; + + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + INT32 xx = x + j; + INT32 yy = y + i; + + if (xx < 0) { + xx = 0; + } else if (xx >= plane->CX) { + xx = plane->CX - 1; + } + + if (yy < 0) { + yy = 0; + } else if (yy >= plane->CY) { + yy = plane->CY - 1; + } + + block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx]; + } + } +} + +/* */ + +void MotionComp_Compensate16x16_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y) +{ + if (x >= 0 && x + 16 < r->CX && + y >= 0 && y + 16 < r->CY) { + const UINT8* s = r->Plane + y * r->Pitch + x; + UINT8* d = p; + + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + + 
_mm_store_si128((__m128i*)d, s0); d += pitch; + _mm_store_si128((__m128i*)d, s1); d += pitch; + _mm_store_si128((__m128i*)d, s2); d += pitch; + _mm_store_si128((__m128i*)d, s3); d += pitch; + _mm_store_si128((__m128i*)d, s4); d += pitch; + _mm_store_si128((__m128i*)d, s5); d += pitch; + _mm_store_si128((__m128i*)d, s6); d += pitch; + _mm_store_si128((__m128i*)d, s7); d += pitch; + + s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch; + s7 = _mm_loadu_si128((const __m128i*)s); + + _mm_store_si128((__m128i*)d, s0); d += pitch; + _mm_store_si128((__m128i*)d, s1); d += pitch; + _mm_store_si128((__m128i*)d, s2); d += pitch; + _mm_store_si128((__m128i*)d, s3); d += pitch; + _mm_store_si128((__m128i*)d, s4); d += pitch; + _mm_store_si128((__m128i*)d, s5); d += pitch; + _mm_store_si128((__m128i*)d, s6); d += pitch; + _mm_store_si128((__m128i*)d, s7); + + } else { + MotionComp_Compensate8x8_SSE2(p, pitch, r, x, y ); + MotionComp_Compensate8x8_SSE2(p + 8, pitch, r, x + 8, y ); + MotionComp_Compensate8x8_SSE2(p + 8 * pitch, pitch, r, x, y + 8); + MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8); + } +} + +void MotionComp_Compensate8x8_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y) +{ + ALIGN(0x10) UINT8 b[64]; + + const UINT8* s = r->Plane + y * r->Pitch + x; + INT32 p0 = r->Pitch; + UINT8* d = p; + + __m64 s0, s1, s2, s3, s4, s5, s6, s7; + + if (x < 0 || x + 8 >= r->CX || + y < 0 || y + 8 >= r->CY) { + s = b; + p0 = 8; + Block_Extract8x8(r, x, y, b); + } + + s0 = *((const __m64*)s); s += p0; + s1 = *((const __m64*)s); s += p0; + s2 = *((const __m64*)s); s += p0; + s3 = *((const 
__m64*)s); s += p0; + s4 = *((const __m64*)s); s += p0; + s5 = *((const __m64*)s); s += p0; + s6 = *((const __m64*)s); s += p0; + s7 = *((const __m64*)s); + + *((__m64*)d) = s0; d += pitch; + *((__m64*)d) = s1; d += pitch; + *((__m64*)d) = s2; d += pitch; + *((__m64*)d) = s3; d += pitch; + *((__m64*)d) = s4; d += pitch; + *((__m64*)d) = s5; d += pitch; + *((__m64*)d) = s6; d += pitch; + *((__m64*)d) = s7; +} + +/* */ + +ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + +void MotionComp_Compensate16x16H_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x0, + INT32 y0, + INT32 x1, + INT32 y1) +{ + if (x0 >= 0 && x0 + 16 < r->CX && + y0 >= 0 && y0 + 16 < r->CY && + x1 >= 0 && x1 + 16 < r->CX && + y1 >= 0 && y1 + 16 < r->CY) { + const UINT8* s0 = r->Plane + y0 * r->Pitch + x0; + const UINT8* s1 = r->Plane + y1 * r->Pitch + x1; + + UINT8* d = p; + + __m128i S0, S1, D; + const __m128i M = *((const __m128i*)MASK_1); + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = 
_mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += pitch; + + S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch; + S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch; + D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M)); + _mm_store_si128((__m128i*)d, D); d += 
pitch; /* NOTE(review): completes the "d += pitch;" begun on the previous
          source line; this is the tail of MotionComp_Compensate16x16H_SSE2,
          whose head (signature, locals S0/S1/D/M, s0/s1/d, and the bounds
          test) lies above this chunk. */

	/* Remaining rows of the fully unrolled 16-byte-wide half-pel average.
	 * _mm_avg_epu8 computes the rounding-up average (a + b + 1) >> 1;
	 * subtracting ((a ^ b) & M) -- MASK_1 is presumably 0x01 in every
	 * byte, confirm at its definition -- turns that into the truncating
	 * average (a + b) >> 1 of the two predictor rows. */
	S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
	S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D); d += pitch;

	S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
	S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D); d += pitch;

	S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
	S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D); d += pitch;

	/* Last row: no pointer advance needed. */
	S0 = _mm_loadu_si128((const __m128i*)s0);
	S1 = _mm_loadu_si128((const __m128i*)s1);
	D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
	_mm_store_si128((__m128i*)d, D);

	} else {
		/* Presumably the out-of-bounds case (the condition is above this
		 * chunk -- TODO confirm): fall back to four 8x8 half-pel blocks,
		 * which handle edge extraction themselves. */
		MotionComp_Compensate8x8H_SSE2(p, pitch, r, x0, y0, x1, y1);
		MotionComp_Compensate8x8H_SSE2(p + 8, pitch, r, x0 + 8, y0, x1 + 8, y1);
		MotionComp_Compensate8x8H_SSE2(p + 8 * pitch, pitch, r, x0, y0 + 8, x1, y1 + 8);
		MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
	}
}

/* Half-pel motion compensation of one 8x8 block (MMX).
 *
 * Averages the two 8x8 predictors at (x0,y0) and (x1,y1) in reference
 * plane r and stores the result to p with stride `pitch`.  If either
 * predictor is not fully inside the plane, both are first copied into
 * 16-byte-aligned scratch buffers via Block_Extract8x8 (defined
 * elsewhere; presumably performs edge clamping -- TODO confirm).
 *
 * NOTE(review): uses __m64 (MMX) registers and never executes EMMS
 * itself -- see the "#pragma warning(disable : 4799)" at the top of this
 * file; the caller is presumably responsible for _mm_empty().
 *
 * NOTE(review): the bounds test uses ">= CX/CY", so a block ending
 * exactly on the right/bottom edge (x0 + 8 == CX) also takes the slow
 * extraction path -- conservative but not wrong; confirm intent. */
void MotionComp_Compensate8x8H_SSE2(
	UINT8* p,
	INT32 pitch,
	const Plane_t* r,
	INT32 x0,
	INT32 y0,
	INT32 x1,
	INT32 y1)
{
	ALIGN(0x10) UINT8 b0[64], b1[64];

	const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
	INT32 p0 = r->Pitch;

	const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
	INT32 p1 = r->Pitch;

	UINT8* d = p;

	__m64 S0, S1, D;
	const __m64 M = *((const __m64*)MASK_1);

	if (x0 < 0 || x0 + 8 >= r->CX ||
	    y0 < 0 || y0 + 8 >= r->CY ||
	    x1 < 0 || x1 + 8 >= r->CX ||
	    y1 < 0 || y1 + 8 >= r->CY) {
		s0 = b0;
		p0 = 8;

		s1 = b1;
		p1 = 8;

		Block_Extract8x8(r, x0, y0, b0);
		Block_Extract8x8(r, x1, y1, b1);
	}

	/* Eight unrolled rows; same truncating-average trick as above,
	 * in 8-byte MMX form: avg_pu8 rounds up, the xor/and/sub removes
	 * the rounding bias. */
	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	S0 = *((const __m64*)s0); s0 += p0;
	S1 = *((const __m64*)s1); s1 += p1;
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D; d += pitch;

	/* Last row: no pointer advance needed. */
	S0 = *((const __m64*)s0);
	S1 = *((const __m64*)s1);
	D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	*((__m64*)d) = D;
}

/* */

/* 16x16 (luma macroblock) motion-compensation dispatch.
 * dx/dy flag a half-pel component of the motion vector; vx[0]/vy[0]
 * is the full-pel vector, vx[1]/vy[1] (set in the else branch) the
 * second sample position for half-pel averaging. */
void MotionComp_Block16x16_SSE2(
	Plane_t* p,
	INT32 x,
	INT32 y,
	const Plane_t* r,
	const MotionVector_t* mv)
{
	INT32 dx = ((mv->X & 1) != 0);
	INT32 dy = ((mv->Y & 1) != 0);

	INT32 vx[2] = { mv->X >> 1 };
	INT32 vy[2] = { mv->Y >> 1 };

	UINT8* d = p->Plane + y * p->Pitch + x;

	if (dx == 0 && dy == 0) {
		/* Full-pel vector: plain copy path. */
		MotionComp_Compensate16x16_SSE2(
			d,
			p->Pitch,
			r,
			x + vx[0],
			y + vy[0]);

	} else {
		vx[1] = vx[0];
		vy[1] = vy[0];

		/* Half-pel: bump the second sample for non-negative components,
		 * the first one otherwise. */
		vx[mv->X >= 0] += dx;
		vy[mv->Y >= 0] += dy;

MotionComp_Compensate16x16H_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0], + x + vx[1], + y + vy[1]); + } +} + +void MotionComp_Block8x8Y_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv) +{ + INT32 dx = ((mv->X & 1) != 0); + INT32 dy = ((mv->Y & 1) != 0); + + INT32 vx[2] = { mv->X >> 1 }; + INT32 vy[2] = { mv->Y >> 1 }; + + UINT8* d = p->Plane + y * p->Pitch + x; + + if (dx == 0 && dy == 0) { + MotionComp_Compensate8x8_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0]); + + } else { + vx[1] = vx[0]; + vy[1] = vy[0]; + + vx[mv->X >= 0] += dx; + vy[mv->Y >= 0] += dy; + + MotionComp_Compensate8x8H_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0], + x + vx[1], + y + vy[1]); + } +} + +void MotionComp_Block8x8C_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv0) +{ + MotionVector_t mv = { + (mv0->X >> 1) | (mv0->X & 1), + (mv0->Y >> 1) | (mv0->Y & 1) + }; + + INT32 dx = ((mv.X & 1) != 0); + INT32 dy = ((mv.Y & 1) != 0); + + INT32 vx[2] = { mv.X >> 1 }; + INT32 vy[2] = { mv.Y >> 1 }; + + UINT8* d = p->Plane + y * p->Pitch + x; + + if (dx == 0 && dy == 0) { + MotionComp_Compensate8x8_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0]); + + } else { + vx[1] = vx[0]; + vy[1] = vy[0]; + + vx[mv.X >= 0] += dx; + vy[mv.Y >= 0] += dy; + + MotionComp_Compensate8x8H_SSE2( + d, + p->Pitch, + r, + x + vx[0], + y + vy[0], + x + vx[1], + y + vy[1]); + } +} + +/* */ + diff --git a/Lib/QTheoraEx/MotionComp_SSE2.h b/Lib/QTheoraEx/MotionComp_SSE2.h new file mode 100644 index 0000000..59020a0 --- /dev/null +++ b/Lib/QTheoraEx/MotionComp_SSE2.h @@ -0,0 +1,66 @@ +/* MotionComp_SSE2.h */ +/* 2009/07/02 */ + +#pragma once + +#include "FrameDecoder.h" + +/* */ + +void MotionComp_Compensate16x16_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y); + +void MotionComp_Compensate8x8_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x, + INT32 y); + +void 
MotionComp_Compensate16x16H_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x0, + INT32 y0, + INT32 x1, + INT32 y1); + +void MotionComp_Compensate8x8H_SSE2( + UINT8* p, + INT32 pitch, + const Plane_t* r, + INT32 x0, + INT32 y0, + INT32 x1, + INT32 y1); + +/* */ + +void MotionComp_Block16x16_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv); + +void MotionComp_Block8x8Y_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv); + +void MotionComp_Block8x8C_SSE2( + Plane_t* p, + INT32 x, + INT32 y, + const Plane_t* r, + const MotionVector_t* mv); + +/* */ + diff --git a/Lib/QTheoraEx/QTheoraEx.vcproj b/Lib/QTheoraEx/QTheoraEx.vcproj index 275e3f8..fc350d8 100644 --- a/Lib/QTheoraEx/QTheoraEx.vcproj +++ b/Lib/QTheoraEx/QTheoraEx.vcproj @@ -165,10 +165,18 @@ > + + + + + +