1 /* MotionComp_SSE2.c */
6 #include "MotionComp_SSE2.h"
10 #pragma warning(disable : 4799)
/*
 * Block_Extract8x8: copy an 8x8 pixel block from `plane` at (x, y) into the
 * caller-supplied 64-byte `block` buffer (row-major, 8 bytes per row),
 * clamping source coordinates that fall outside the plane to the nearest
 * edge pixel.  This is the slow fallback used when a motion-compensated
 * block straddles the picture boundary.
 * NOTE(review): parts of this definition are elided in this view; the
 * branches for xx < 0 / yy < 0 are presumed to clamp to 0 symmetrically
 * with the visible >= CX / >= CY cases — confirm against the full file.
 */
14 static void Block_Extract8x8(
22 for (i = 0; i < 8; i++) {
23 for (j = 0; j < 8; j++) {
/* clamp the source column to the right edge of the plane */
29 } else if (xx >= plane->CX) {
/* clamp the source row to the bottom edge of the plane */
35 } else if (yy >= plane->CY) {
/* fetch the (possibly edge-clamped) source pixel */
39 block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
/*
 * MotionComp_Compensate16x16_SSE2: full-pel 16x16 motion compensation.
 * Copies a 16x16 block from reference plane `r` at (x, y) into destination
 * `p` (destination pitch `pitch`, assumed 16-byte aligned for the SSE2
 * stores).  Fast path: when the whole block lies strictly inside the
 * reference plane, copy 16 rows with unaligned 128-bit loads and aligned
 * 128-bit stores (two unrolled groups of 8 rows).  Otherwise fall back to
 * four 8x8 compensations, which handle edge clamping themselves.
 * NOTE(review): the `x + 16 < r->CX` test is strict, so a block flush
 * against the right/bottom edge also takes the (correct but slower)
 * 8x8 fallback path — presumably intentional; confirm.
 */
46 void MotionComp_Compensate16x16_SSE2(
53 if (x >= 0 && x + 16 < r->CX &&
54 y >= 0 && y + 16 < r->CY) {
55 const UINT8* s = r->Plane + y * r->Pitch + x;
58 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
/* rows 0-7: unaligned loads from the reference plane */
60 s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
61 s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
62 s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
63 s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
64 s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
65 s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
66 s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
67 s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
/* rows 0-7: aligned stores to the destination */
69 _mm_store_si128((__m128i*)d, s0); d += pitch;
70 _mm_store_si128((__m128i*)d, s1); d += pitch;
71 _mm_store_si128((__m128i*)d, s2); d += pitch;
72 _mm_store_si128((__m128i*)d, s3); d += pitch;
73 _mm_store_si128((__m128i*)d, s4); d += pitch;
74 _mm_store_si128((__m128i*)d, s5); d += pitch;
75 _mm_store_si128((__m128i*)d, s6); d += pitch;
76 _mm_store_si128((__m128i*)d, s7); d += pitch;
/* rows 8-15: second unrolled group */
78 s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
79 s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
80 s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
81 s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
82 s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
83 s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
84 s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
85 s7 = _mm_loadu_si128((const __m128i*)s);
87 _mm_store_si128((__m128i*)d, s0); d += pitch;
88 _mm_store_si128((__m128i*)d, s1); d += pitch;
89 _mm_store_si128((__m128i*)d, s2); d += pitch;
90 _mm_store_si128((__m128i*)d, s3); d += pitch;
91 _mm_store_si128((__m128i*)d, s4); d += pitch;
92 _mm_store_si128((__m128i*)d, s5); d += pitch;
93 _mm_store_si128((__m128i*)d, s6); d += pitch;
94 _mm_store_si128((__m128i*)d, s7);
/* slow path: decompose into quadrants; each 8x8 call re-checks bounds
 * and clamps at the picture edge via Block_Extract8x8 */
97 MotionComp_Compensate8x8_SSE2(p, pitch, r, x, y );
98 MotionComp_Compensate8x8_SSE2(p + 8, pitch, r, x + 8, y );
99 MotionComp_Compensate8x8_SSE2(p + 8 * pitch, pitch, r, x, y + 8);
100 MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
/*
 * MotionComp_Compensate8x8_SSE2: full-pel 8x8 motion compensation.
 * Copies an 8x8 block from reference plane `r` at (x, y) to `p`.  If the
 * block would read outside the plane, the source is first extracted into
 * the aligned local buffer `b` with edge clamping (Block_Extract8x8) and
 * copied from there instead (elided lines presumably redirect `s`/`p0` to
 * `b` — confirm).  Rows are moved 8 bytes at a time via MMX __m64 moves;
 * no _mm_empty() is issued here (see the C4799 pragma at the top of the
 * file), so the caller is responsible for the x87/MMX state.
 */
104 void MotionComp_Compensate8x8_SSE2(
111 ALIGN(0x10) UINT8 b[64];
113 const UINT8* s = r->Plane + y * r->Pitch + x;
117 __m64 s0, s1, s2, s3, s4, s5, s6, s7;
/* out-of-bounds: take the clamped-extraction slow path */
119 if (x < 0 || x + 8 >= r->CX ||
120 y < 0 || y + 8 >= r->CY) {
123 Block_Extract8x8(r, x, y, b);
/* load 8 source rows (p0 is the source pitch in effect) */
126 s0 = *((const __m64*)s); s += p0;
127 s1 = *((const __m64*)s); s += p0;
128 s2 = *((const __m64*)s); s += p0;
129 s3 = *((const __m64*)s); s += p0;
130 s4 = *((const __m64*)s); s += p0;
131 s5 = *((const __m64*)s); s += p0;
132 s6 = *((const __m64*)s); s += p0;
133 s7 = *((const __m64*)s);
/* store 8 destination rows (final s7 store is outside this view) */
135 *((__m64*)d) = s0; d += pitch;
136 *((__m64*)d) = s1; d += pitch;
137 *((__m64*)d) = s2; d += pitch;
138 *((__m64*)d) = s3; d += pitch;
139 *((__m64*)d) = s4; d += pitch;
140 *((__m64*)d) = s5; d += pitch;
141 *((__m64*)d) = s6; d += pitch;
/*
 * Per-byte 0x01 mask.  _mm_avg_epu8 computes (a + b + 1) >> 1 (rounds up);
 * subtracting ((a ^ b) & 1) afterwards yields the truncating average
 * (a + b) >> 1 required for half-pel prediction.  16-byte aligned so it
 * can be loaded with a direct __m128i/__m64 dereference.
 */
147 ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
/*
 * MotionComp_Compensate16x16H_SSE2: half-pel 16x16 motion compensation.
 * Averages two 16x16 reference blocks, at (x0, y0) and (x1, y1), into `p`.
 * Each row computes the truncating per-byte average
 *     D = avg_epu8(S0, S1) - ((S0 ^ S1) & 1)
 * i.e. (S0 + S1) >> 1, since avg_epu8 alone would round up (see MASK_1).
 * Fast path requires BOTH source blocks strictly inside the plane; the
 * 16 rows are fully unrolled.  Otherwise falls back to four half-pel 8x8
 * compensations which perform edge clamping.
 */
149 void MotionComp_Compensate16x16H_SSE2(
158 if (x0 >= 0 && x0 + 16 < r->CX &&
159 y0 >= 0 && y0 + 16 < r->CY &&
160 x1 >= 0 && x1 + 16 < r->CX &&
161 y1 >= 0 && y1 + 16 < r->CY) {
162 const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
163 const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
/* M = 0x01 in every byte; corrects avg_epu8's round-up to round-down */
168 const __m128i M = *((const __m128i*)MASK_1);
/* row 0 */
170 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
171 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
172 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
173 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 1 */
175 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
176 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
177 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
178 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 2 */
180 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
181 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
182 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
183 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 3 */
185 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
186 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
187 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
188 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 4 */
190 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
191 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
192 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
193 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 5 */
195 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
196 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
197 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
198 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 6 */
200 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
201 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
202 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
203 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 7 */
205 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
206 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
207 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
208 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 8 */
210 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
211 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
212 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
213 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 9 */
215 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
216 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
217 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
218 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 10 */
220 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
221 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
222 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
223 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 11 */
225 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
226 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
227 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
228 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 12 */
230 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
231 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
232 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
233 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 13 */
235 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
236 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
237 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
238 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 14 */
240 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
241 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
242 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
243 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 15: no trailing pointer bumps needed */
245 S0 = _mm_loadu_si128((const __m128i*)s0);
246 S1 = _mm_loadu_si128((const __m128i*)s1);
247 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
248 _mm_store_si128((__m128i*)d, D);
/* slow path: quadrant decomposition with edge-clamping 8x8 half-pel MC */
251 MotionComp_Compensate8x8H_SSE2(p, pitch, r, x0, y0 , x1, y1 );
252 MotionComp_Compensate8x8H_SSE2(p + 8, pitch, r, x0 + 8, y0 , x1 + 8, y1 );
253 MotionComp_Compensate8x8H_SSE2(p + 8 * pitch, pitch, r, x0, y0 + 8, x1, y1 + 8);
254 MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
/*
 * MotionComp_Compensate8x8H_SSE2: half-pel 8x8 motion compensation.
 * Averages two 8x8 reference blocks at (x0, y0) and (x1, y1) into `p`
 * using the same truncating-average identity as the 16x16 version:
 *     D = avg_pu8(S0, S1) - ((S0 ^ S1) & 1)  ==  (S0 + S1) >> 1 per byte.
 * If either block would read outside the plane, both are first extracted
 * with edge clamping into the aligned buffers b0/b1 (elided lines
 * presumably redirect s0/s1 and the pitches p0/p1 to those buffers —
 * confirm).  Uses MMX __m64 ops; no _mm_empty() here (C4799 pragma),
 * the caller owns the x87/MMX state transition.
 */
258 void MotionComp_Compensate8x8H_SSE2(
267 ALIGN(0x10) UINT8 b0[64], b1[64];
269 const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
272 const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
/* M = 0x01 in every byte (low half of MASK_1) */
278 const __m64 M = *((const __m64*)MASK_1);
/* either block out of bounds -> clamped extraction slow path */
280 if (x0 < 0 || x0 + 8 >= r->CX ||
281 y0 < 0 || y0 + 8 >= r->CY ||
282 x1 < 0 || x1 + 8 >= r->CX ||
283 y1 < 0 || y1 + 8 >= r->CY) {
290 Block_Extract8x8(r, x0, y0, b0);
291 Block_Extract8x8(r, x1, y1, b1);
/* row 0 */
294 S0 = *((const __m64*)s0); s0 += p0;
295 S1 = *((const __m64*)s1); s1 += p1;
296 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
297 *((__m64*)d) = D; d += pitch;
/* row 1 */
299 S0 = *((const __m64*)s0); s0 += p0;
300 S1 = *((const __m64*)s1); s1 += p1;
301 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
302 *((__m64*)d) = D; d += pitch;
/* row 2 */
304 S0 = *((const __m64*)s0); s0 += p0;
305 S1 = *((const __m64*)s1); s1 += p1;
306 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
307 *((__m64*)d) = D; d += pitch;
/* row 3 */
309 S0 = *((const __m64*)s0); s0 += p0;
310 S1 = *((const __m64*)s1); s1 += p1;
311 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
312 *((__m64*)d) = D; d += pitch;
/* row 4 */
314 S0 = *((const __m64*)s0); s0 += p0;
315 S1 = *((const __m64*)s1); s1 += p1;
316 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
317 *((__m64*)d) = D; d += pitch;
/* row 5 */
319 S0 = *((const __m64*)s0); s0 += p0;
320 S1 = *((const __m64*)s1); s1 += p1;
321 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
322 *((__m64*)d) = D; d += pitch;
/* row 6 */
324 S0 = *((const __m64*)s0); s0 += p0;
325 S1 = *((const __m64*)s1); s1 += p1;
326 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
327 *((__m64*)d) = D; d += pitch;
/* row 7: final store is outside this view */
329 S0 = *((const __m64*)s0);
330 S1 = *((const __m64*)s1);
331 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
/*
 * MotionComp_Block16x16_SSE2: dispatch 16x16 motion compensation for a
 * half-pel motion vector `mv` (units of half pixels, given how the low
 * bit is split off below).  Full-pel case copies directly; otherwise two
 * full-pel predictors are averaged by the H variant.
 */
337 void MotionComp_Block16x16_SSE2(
342 const MotionVector_t* mv)
/* dx/dy: 1 if the vector has a half-pel component in that axis */
344 INT32 dx = ((mv->X & 1) != 0);
345 INT32 dy = ((mv->Y & 1) != 0);
/* two candidate full-pel offsets per axis; vx[1]/vy[1] appear
 * zero-initialized here — NOTE(review): elided lines presumably copy
 * vx[0] -> vx[1] (and vy likewise) before the adjustment below; confirm */
347 INT32 vx[2] = { mv->X >> 1 };
348 INT32 vy[2] = { mv->Y >> 1 };
350 UINT8* d = p->Plane + y * p->Pitch + x;
/* pure full-pel vector: plain block copy */
352 if (dx == 0 && dy == 0) {
353 MotionComp_Compensate16x16_SSE2(
/* half-pel: bump one predictor by 1, choosing which by the vector's
 * sign so rounding is toward the vector direction */
364 vx[mv->X >= 0] += dx;
365 vy[mv->Y >= 0] += dy;
367 MotionComp_Compensate16x16H_SSE2(
/*
 * MotionComp_Block8x8Y_SSE2: 8x8 luma motion compensation with a
 * half-pel vector.  Same dispatch logic as the 16x16 block version:
 * full-pel -> direct copy, half-pel -> average of two full-pel
 * predictors via the H variant.
 */
378 void MotionComp_Block8x8Y_SSE2(
383 const MotionVector_t* mv)
/* half-pel flags per axis */
385 INT32 dx = ((mv->X & 1) != 0);
386 INT32 dy = ((mv->Y & 1) != 0);
/* NOTE(review): as in Block16x16, vx[1]/vy[1] look zero-initialized here;
 * elided lines presumably duplicate element 0 — confirm */
388 INT32 vx[2] = { mv->X >> 1 };
389 INT32 vy[2] = { mv->Y >> 1 };
391 UINT8* d = p->Plane + y * p->Pitch + x;
393 if (dx == 0 && dy == 0) {
394 MotionComp_Compensate8x8_SSE2(
/* bump the predictor selected by the vector's sign */
405 vx[mv->X >= 0] += dx;
406 vy[mv->Y >= 0] += dy;
408 MotionComp_Compensate8x8H_SSE2(
/*
 * MotionComp_Block8x8C_SSE2: 8x8 chroma motion compensation.  The luma
 * vector `mv0` is scaled to chroma resolution: halve each component while
 * OR-ing in the dropped low bit, so any fractional luma motion keeps a
 * half-pel chroma component.  Dispatch then mirrors the luma 8x8 path.
 * (The tail of this function is outside this view.)
 */
419 void MotionComp_Block8x8C_SSE2(
424 const MotionVector_t* mv0)
/* luma -> chroma vector: >>1 with sticky low bit */
426 MotionVector_t mv = {
427 (mv0->X >> 1) | (mv0->X & 1),
428 (mv0->Y >> 1) | (mv0->Y & 1)
/* half-pel flags per axis of the chroma vector */
431 INT32 dx = ((mv.X & 1) != 0);
432 INT32 dy = ((mv.Y & 1) != 0);
/* NOTE(review): vx[1]/vy[1] look zero-initialized; elided lines presumably
 * duplicate element 0 before the sign-based bump — confirm */
434 INT32 vx[2] = { mv.X >> 1 };
435 INT32 vy[2] = { mv.Y >> 1 };
437 UINT8* d = p->Plane + y * p->Pitch + x;
439 if (dx == 0 && dy == 0) {
440 MotionComp_Compensate8x8_SSE2(
454 MotionComp_Compensate8x8H_SSE2(