From 44cd6da305fa2062bfdf4ff2ca07406ea3823a84 Mon Sep 17 00:00:00 2001
From: Noumi Akira
Date: Thu, 9 Jul 2009 16:36:38 +0900
Subject: [PATCH] add MMX CSC code.

---
 Lib/QTheoraEx/CSConverter.h             |  10 +++
 Lib/QTheoraEx/CSConverter_MMX.c         | 134 ++++++++++++++++++++++++++++++++
 Lib/QTheoraEx/FrameReconstructor_SSE2.c |   2 +-
 Lib/QTheoraEx/QTheoraEx.vcproj          |   4 +
 Lib/QTheoraEx/TheoraDecoder.c           |   5 ++
 5 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 Lib/QTheoraEx/CSConverter_MMX.c

diff --git a/Lib/QTheoraEx/CSConverter.h b/Lib/QTheoraEx/CSConverter.h
index 8439e11..336162f 100644
--- a/Lib/QTheoraEx/CSConverter.h
+++ b/Lib/QTheoraEx/CSConverter.h
@@ -13,6 +13,16 @@ void QT_CSConvert_YUY2(
 
 /* */
 
+void QT_CSConvert_YV12_MMX( /* MMX variant; defined in CSConverter_MMX.c */
+    const QT_Output_t* output,
+    QT_Frame_t*        frame);
+
+void QT_CSConvert_YUY2_MMX( /* MMX variant; defined in CSConverter_MMX.c */
+    const QT_Output_t* output,
+    QT_Frame_t*        frame);
+
+/* */
+
 void QT_CSConvert_YV12_SSE2(
     const QT_Output_t* output,
     QT_Frame_t*        frame);
diff --git a/Lib/QTheoraEx/CSConverter_MMX.c b/Lib/QTheoraEx/CSConverter_MMX.c
new file mode 100644
index 0000000..1c73953
--- /dev/null
+++ b/Lib/QTheoraEx/CSConverter_MMX.c
@@ -0,0 +1,134 @@
+/* CSConverter_MMX.c */
+/* 2009/07/09 */
+
+#include "StdAfx.h"
+
+#include "TheoraDecoder.h"
+
+#include "CSConverter.h"
+
+/* MSVC warning C4799 (function has no EMMS instruction) is disabled below: CopyCSC_8 touches __m64 data but leaves _mm_empty() to the two converters that call it. */
+
+#pragma warning(disable : 4799)
+
+/* CopyCSC_8: copies one row from s to d in 8-byte (__m64) units; cx is the row length in bytes. */
+
+static __inline void CopyCSC_8(
+    UINT8*       d,
+    const UINT8* s,
+    INT32        cx)
+{
+    UINT8* p = d;
+    UINT8* e = p + cx; /* one past the last requested destination byte */
+
+    const UINT8* q = s;
+
+    for (; p < e; p += 8, q += 8) { /* whole 8-byte steps only: a cx that is not a multiple of 8 is effectively rounded up */
+        *((__m64*)p) = *((const __m64*)q);
+    }
+}
+
+/* QT_CSConvert_YV12_MMX: copies the three planes of output into frame as three consecutive planes, reading the source rows from last to first (vertical flip). */
+
+void QT_CSConvert_YV12_MMX(
+    const QT_Output_t* output,
+    QT_Frame_t*        frame)
+{
+    UINT8* pb0 = (UINT8*)(frame->Frame); /* first destination plane starts at the frame base */
+    UINT8* pb1 = pb0 + frame->Rasters * frame->Pitch; /* second plane follows Rasters rows of Pitch bytes */
+    UINT8* pb2 = pb1 + frame->Rasters * frame->Pitch / 4; /* third plane follows a quarter-size second plane */
+    UINT8* end;
+
+    const UINT8* s0 = output->Plane[0] + (output->CY - 1) * output->CX; /* last row of Plane[0] */
+    const UINT8* s1 = output->Plane[2] + (output->CY / 2 - 1) * output->CX / 2; /* last half-width row of Plane[2]; feeds the second destination plane */
+    const UINT8* s2 = output->Plane[1] + (output->CY / 2 - 1) * output->CX / 2; /* last half-width row of Plane[1]; feeds the third destination plane */
+
+    INT32 cx2 = output->CX / 2; /* row length used for Plane[1] and Plane[2] */
+
+    pb0 += frame->Y * frame->Pitch + frame->X; /* apply the (X, Y) offset inside the first plane */
+    pb1 += frame->Y * frame->Pitch / 2 + frame->X / 2; /* NOTE(review): this skips Y rows of Pitch / 2 bytes, yet only CY / 2 such rows are copied below -- confirm the intended row offset */
+    pb2 += frame->Y * frame->Pitch / 2 + frame->X / 2; /* NOTE(review): same offset expression as pb1 -- confirm */
+
+    end = pb0 + output->CY * frame->Pitch;
+    while (pb0 < end) { /* CY rows: destination moves forward, source moves backward */
+        CopyCSC_8(pb0, s0, output->CX);
+        pb0 += frame->Pitch;
+        s0  -= output->CX;
+    }
+
+    end = pb1 + (output->CY / 2) * (frame->Pitch / 2);
+    while (pb1 < end) { /* CY / 2 rows of Plane[2], destination pitch is Pitch / 2 */
+        CopyCSC_8(pb1, s1, cx2);
+        pb1 += frame->Pitch / 2;
+        s1  -= cx2;
+    }
+
+    end = pb2 + (output->CY / 2) * (frame->Pitch / 2);
+    while (pb2 < end) { /* CY / 2 rows of Plane[1], destination pitch is Pitch / 2 */
+        CopyCSC_8(pb2, s2, cx2);
+        pb2 += frame->Pitch / 2;
+        s2  -= cx2;
+    }
+
+    _mm_empty(); /* empty the MMX state once, after all CopyCSC_8 calls */
+}
+
+void QT_CSConvert_YUY2_MMX( /* interleaves Plane[0] (y), Plane[1] (u) and Plane[2] (v) into packed y-u-y-v bytes in frame, two output rows per pass, source rows read from last to first */
+    const QT_Output_t* output,
+    QT_Frame_t*        frame)
+{
+    UINT8* pb  = (UINT8*)(frame->Frame) + frame->Y * frame->Pitch + frame->X; /* NOTE(review): X is added as a byte count although each luma sample yields 2 output bytes below -- confirm the unit */
+    UINT8* end = pb + output->CY * frame->Pitch;
+
+    const UINT8* s0 = output->Plane[0] + (output->CY - 1) * output->CX; /* last row of Plane[0] */
+    const UINT8* s1 = output->Plane[1] + (output->CY / 2 - 1) * output->CX / 2; /* last half-width row of Plane[1] */
+    const UINT8* s2 = output->Plane[2] + (output->CY / 2 - 1) * output->CX / 2; /* last half-width row of Plane[2] */
+
+    __m64 Y0, Y1, UV0, UV1;
+    __m64 P0, P1;
+
+    for (; pb < end; pb += frame->Pitch * 2, s0 -= output->CX * 2, s1 -= output->CX / 2, s2 -= output->CX / 2) { /* each pass: two output rows, two Plane[0] rows, one row each of Plane[1] and Plane[2] */
+        UINT8* pb0 = pb;
+        UINT8* pb1 = pb + frame->Pitch; /* second output row of the pair */
+        UINT8* pe0 = pb + output->CX * 2; /* an output row is 2 * CX bytes long */
+
+        const UINT8* y0 = s0;
+        const UINT8* y1 = s0 - output->CX; /* the preceding source row goes to the following output row */
+        const UINT8* u  = s1;
+        const UINT8* v  = s2;
+
+        for (; pb0 < pe0; pb0 += 8 * 2, pb1 += 8 * 2, y0 += 8, y1 += 8, u += 4, v += 4) { /* per step: 8 y bytes per row plus 4 u and 4 v bytes in, 16 bytes out per row */
+            Y0 = *((const __m64*)y0);
+            Y1 = *((const __m64*)y1);
+
+            UV0 = _mm_unpacklo_pi8( /* byte order: u0 v0 u1 v1 u2 v2 u3 v3 */
+                _mm_cvtsi32_si64(*((const UINT32*)u)),
+                _mm_cvtsi32_si64(*((const UINT32*)v)));
+
+            UV1 = _mm_unpackhi_pi32(UV0, UV0); /* u2 v2 u3 v3 duplicated into the low half */
+
+            P0 = _mm_unpacklo_pi8(Y0, UV0); /* y0 u0 y1 v0 y2 u1 y3 v1 */
+
+            Y0 = _mm_unpackhi_pi32(Y0, Y0); /* y4..y7 duplicated into the low half */
+
+            P1 = _mm_unpacklo_pi8(Y0, UV1); /* y4 u2 y5 v2 y6 u3 y7 v3 */
+
+            *((__m64*)(pb0 + 0)) = P0;
+            *((__m64*)(pb0 + 8)) = P1;
+
+            P0 = _mm_unpacklo_pi8(Y1, UV0); /* second row reuses the same u/v bytes */
+
+            Y1 = _mm_unpackhi_pi32(Y1, Y1);
+
+            P1 = _mm_unpacklo_pi8(Y1, UV1);
+
+            *((__m64*)(pb1 + 0)) = P0;
+            *((__m64*)(pb1 + 8)) = P1;
+        }
+    }
+
+    _mm_empty(); /* empty the MMX state before returning */
+}
+
+/* */
+
+
diff --git a/Lib/QTheoraEx/FrameReconstructor_SSE2.c b/Lib/QTheoraEx/FrameReconstructor_SSE2.c
index a6413e9..f822dbd 100644
--- a/Lib/QTheoraEx/FrameReconstructor_SSE2.c
+++ b/Lib/QTheoraEx/FrameReconstructor_SSE2.c
@@ -13,7 +13,7 @@
 
 /* */
 
-static void Transpose_SSE2(
+static __inline void Transpose_SSE2(
     const INT16* x,
     INT16*       y)
 {
diff --git a/Lib/QTheoraEx/QTheoraEx.vcproj b/Lib/QTheoraEx/QTheoraEx.vcproj
index 434fc6b..cca0a39 100644
--- a/Lib/QTheoraEx/QTheoraEx.vcproj
+++ b/Lib/QTheoraEx/QTheoraEx.vcproj
@@ -161,6 +161,10 @@
 >
+
+
diff --git a/Lib/QTheoraEx/TheoraDecoder.c b/Lib/QTheoraEx/TheoraDecoder.c
index 89c583f..4f0bc22 100644
--- a/Lib/QTheoraEx/TheoraDecoder.c
+++ b/Lib/QTheoraEx/TheoraDecoder.c
@@ -283,11 +283,14 @@ BOOL QT_SetupCSC(
     INT32 cs)
 {
     extern BOOL g_QT_Enable_SSE2;
+    extern BOOL g_QT_Enable_MMX; /* consulted only when g_QT_Enable_SSE2 is false */
 
     switch (cs) {
     case QTCS_YV12:
         if (g_QT_Enable_SSE2) {
             t->Convert = QT_CSConvert_YV12_SSE2;
+        } else if (g_QT_Enable_MMX) { /* MMX converter sits between the SSE2 one and the plain one */
+            t->Convert = QT_CSConvert_YV12_MMX;
         } else {
             t->Convert = QT_CSConvert_YV12;
         }
@@ -296,6 +299,8 @@ BOOL QT_SetupCSC(
     case QTCS_YUY2:
         if (g_QT_Enable_SSE2) {
             t->Convert = QT_CSConvert_YUY2_SSE2;
+        } else if (g_QT_Enable_MMX) { /* same selection order as the QTCS_YV12 case */
+            t->Convert = QT_CSConvert_YUY2_MMX;
         } else {
             t->Convert = QT_CSConvert_YUY2;
         }
-- 
2.11.0