OSDN Git Service

078145f17ca33fbd8e5bc5ad2321b79f46722339
[qtheora/main.git] / Lib / QTheoraEx / MotionComp_SSE2.c
1 /* MotionComp_SSE2.c */
2 /* 2009/07/02        */
3
4 #include "StdAfx.h"
5
6 #include "MotionComp_SSE2.h"
7
8 /* */
9
10 #pragma warning(disable : 4799)
11
12 /* */
13
14 static void Block_Extract8x8(
15         const Plane_t* plane,
16         INT32          x,
17         INT32          y,
18         UINT8*         block)
19 {
20         INT32 i, j;
21
22         for (i = 0; i < 8; i++) {
23                 for (j = 0; j < 8; j++) {
24                         INT32 xx = x + j;
25                         INT32 yy = y + i;
26
27                         if (xx < 0) {
28                                 xx = 0;
29                         } else if (xx >= plane->CX) {
30                                 xx = plane->CX - 1;
31                         }
32
33                         if (yy < 0) {
34                                 yy = 0;
35                         } else if (yy >= plane->CY) {
36                                 yy = plane->CY - 1;
37                         }
38
39                         block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
40                 }
41         }
42 }
43
44 /* */
45
46 void MotionComp_Compensate16x16_SSE2(
47         UINT8*         p,
48         INT32          pitch,
49         const Plane_t* r,
50         INT32          x,
51         INT32          y)
52 {
53         if (x >= 0 && x + 16 < r->CX &&
54                 y >= 0 && y + 16 < r->CY) {
55                 const UINT8* s = r->Plane + y * r->Pitch + x;
56                 UINT8*       d = p;
57
58                 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
59
60                 s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
61                 s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
62                 s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
63                 s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
64                 s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
65                 s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
66                 s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
67                 s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
68
69                 _mm_store_si128((__m128i*)d, s0); d += pitch;
70                 _mm_store_si128((__m128i*)d, s1); d += pitch;
71                 _mm_store_si128((__m128i*)d, s2); d += pitch;
72                 _mm_store_si128((__m128i*)d, s3); d += pitch;
73                 _mm_store_si128((__m128i*)d, s4); d += pitch;
74                 _mm_store_si128((__m128i*)d, s5); d += pitch;
75                 _mm_store_si128((__m128i*)d, s6); d += pitch;
76                 _mm_store_si128((__m128i*)d, s7); d += pitch;
77
78                 s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
79                 s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
80                 s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
81                 s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
82                 s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
83                 s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
84                 s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
85                 s7 = _mm_loadu_si128((const __m128i*)s);
86
87                 _mm_store_si128((__m128i*)d, s0); d += pitch;
88                 _mm_store_si128((__m128i*)d, s1); d += pitch;
89                 _mm_store_si128((__m128i*)d, s2); d += pitch;
90                 _mm_store_si128((__m128i*)d, s3); d += pitch;
91                 _mm_store_si128((__m128i*)d, s4); d += pitch;
92                 _mm_store_si128((__m128i*)d, s5); d += pitch;
93                 _mm_store_si128((__m128i*)d, s6); d += pitch;
94                 _mm_store_si128((__m128i*)d, s7);
95
96         } else {
97                 MotionComp_Compensate8x8_SSE2(p,                 pitch, r, x,     y    );
98                 MotionComp_Compensate8x8_SSE2(p + 8,             pitch, r, x + 8, y    );
99                 MotionComp_Compensate8x8_SSE2(p     + 8 * pitch, pitch, r, x,     y + 8);
100                 MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
101         }
102 }
103
104 void MotionComp_Compensate8x8_SSE2(
105         UINT8*         p,
106         INT32          pitch,
107         const Plane_t* r,
108         INT32          x,
109         INT32          y)
110 {
111         ALIGN(0x10) UINT8 b[64];
112
113         const UINT8* s  = r->Plane + y * r->Pitch + x;
114         INT32        p0 = r->Pitch;
115         UINT8*       d  = p;
116
117         __m64 s0, s1, s2, s3, s4, s5, s6, s7;
118
119         if (x < 0 || x + 8 >= r->CX ||
120                 y < 0 || y + 8 >= r->CY) {
121                 s  = b;
122                 p0 = 8;
123                 Block_Extract8x8(r, x, y, b);
124         }
125
126         s0 = *((const __m64*)s); s += p0;
127         s1 = *((const __m64*)s); s += p0;
128         s2 = *((const __m64*)s); s += p0;
129         s3 = *((const __m64*)s); s += p0;
130         s4 = *((const __m64*)s); s += p0;
131         s5 = *((const __m64*)s); s += p0;
132         s6 = *((const __m64*)s); s += p0;
133         s7 = *((const __m64*)s);
134
135         *((__m64*)d) = s0; d += pitch;
136         *((__m64*)d) = s1; d += pitch;
137         *((__m64*)d) = s2; d += pitch;
138         *((__m64*)d) = s3; d += pitch;
139         *((__m64*)d) = s4; d += pitch;
140         *((__m64*)d) = s5; d += pitch;
141         *((__m64*)d) = s6; d += pitch;
142         *((__m64*)d) = s7;
143 }
144
145 /* */
146
147 ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
148
149 void MotionComp_Compensate16x16H_SSE2(
150         UINT8*         p,
151         INT32          pitch,
152         const Plane_t* r,
153         INT32          x0,
154         INT32          y0,
155         INT32          x1,
156         INT32          y1)
157 {
158         if (x0 >= 0 && x0 + 16 < r->CX &&
159                 y0 >= 0 && y0 + 16 < r->CY &&
160                 x1 >= 0 && x1 + 16 < r->CX &&
161                 y1 >= 0 && y1 + 16 < r->CY) {
162                 const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
163                 const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
164
165                 UINT8* d = p;
166
167                 __m128i S0, S1, D;
168                 const __m128i M = *((const __m128i*)MASK_1);
169
170                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
171                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
172                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
173                 _mm_store_si128((__m128i*)d, D); d += pitch;
174
175                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
176                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
177                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
178                 _mm_store_si128((__m128i*)d, D); d += pitch;
179
180                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
181                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
182                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
183                 _mm_store_si128((__m128i*)d, D); d += pitch;
184
185                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
186                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
187                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
188                 _mm_store_si128((__m128i*)d, D); d += pitch;
189
190                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
191                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
192                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
193                 _mm_store_si128((__m128i*)d, D); d += pitch;
194
195                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
196                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
197                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
198                 _mm_store_si128((__m128i*)d, D); d += pitch;
199
200                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
201                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
202                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
203                 _mm_store_si128((__m128i*)d, D); d += pitch;
204
205                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
206                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
207                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
208                 _mm_store_si128((__m128i*)d, D); d += pitch;
209
210                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
211                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
212                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
213                 _mm_store_si128((__m128i*)d, D); d += pitch;
214
215                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
216                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
217                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
218                 _mm_store_si128((__m128i*)d, D); d += pitch;
219
220                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
221                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
222                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
223                 _mm_store_si128((__m128i*)d, D); d += pitch;
224
225                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
226                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
227                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
228                 _mm_store_si128((__m128i*)d, D); d += pitch;
229
230                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
231                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
232                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
233                 _mm_store_si128((__m128i*)d, D); d += pitch;
234
235                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
236                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
237                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
238                 _mm_store_si128((__m128i*)d, D); d += pitch;
239
240                 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
241                 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
242                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
243                 _mm_store_si128((__m128i*)d, D); d += pitch;
244
245                 S0 = _mm_loadu_si128((const __m128i*)s0);
246                 S1 = _mm_loadu_si128((const __m128i*)s1);
247                 D  = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
248                 _mm_store_si128((__m128i*)d, D);
249
250         } else {
251                 MotionComp_Compensate8x8H_SSE2(p,                 pitch, r, x0,     y0    , x1,     y1    );
252                 MotionComp_Compensate8x8H_SSE2(p + 8,             pitch, r, x0 + 8, y0    , x1 + 8, y1    );
253                 MotionComp_Compensate8x8H_SSE2(p     + 8 * pitch, pitch, r, x0,     y0 + 8, x1,     y1 + 8);
254                 MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
255         }
256 }
257
258 void MotionComp_Compensate8x8H_SSE2(
259         UINT8*         p,
260         INT32          pitch,
261         const Plane_t* r,
262         INT32          x0,
263         INT32          y0,
264         INT32          x1,
265         INT32          y1)
266 {
267         ALIGN(0x10) UINT8 b0[64], b1[64];
268
269         const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
270         INT32        p0 = r->Pitch;
271
272         const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
273         INT32        p1 = r->Pitch;
274
275         UINT8* d = p;
276
277         __m64 S0, S1, D;
278         const __m64 M = *((const __m64*)MASK_1);
279
280         if (x0 < 0 || x0 + 8 >= r->CX ||
281                 y0 < 0 || y0 + 8 >= r->CY ||
282                 x1 < 0 || x1 + 8 >= r->CX ||
283                 y1 < 0 || y1 + 8 >= r->CY) {
284                 s0 = b0;
285                 p0 = 8;
286
287                 s1 = b1;
288                 p1 = 8;
289
290                 Block_Extract8x8(r, x0, y0, b0);
291                 Block_Extract8x8(r, x1, y1, b1);
292         }
293
294         S0 = *((const __m64*)s0); s0 += p0;
295         S1 = *((const __m64*)s1); s1 += p1;
296         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
297         *((__m64*)d) = D; d += pitch;
298
299         S0 = *((const __m64*)s0); s0 += p0;
300         S1 = *((const __m64*)s1); s1 += p1;
301         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
302         *((__m64*)d) = D; d += pitch;
303
304         S0 = *((const __m64*)s0); s0 += p0;
305         S1 = *((const __m64*)s1); s1 += p1;
306         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
307         *((__m64*)d) = D; d += pitch;
308
309         S0 = *((const __m64*)s0); s0 += p0;
310         S1 = *((const __m64*)s1); s1 += p1;
311         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
312         *((__m64*)d) = D; d += pitch;
313
314         S0 = *((const __m64*)s0); s0 += p0;
315         S1 = *((const __m64*)s1); s1 += p1;
316         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
317         *((__m64*)d) = D; d += pitch;
318
319         S0 = *((const __m64*)s0); s0 += p0;
320         S1 = *((const __m64*)s1); s1 += p1;
321         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
322         *((__m64*)d) = D; d += pitch;
323
324         S0 = *((const __m64*)s0); s0 += p0;
325         S1 = *((const __m64*)s1); s1 += p1;
326         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
327         *((__m64*)d) = D; d += pitch;
328
329         S0 = *((const __m64*)s0);
330         S1 = *((const __m64*)s1);
331         D  = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
332         *((__m64*)d) = D;
333 }
334
335 /* */
336
337 void MotionComp_Block16x16_SSE2(
338         Plane_t*              p,
339         INT32                 x,
340         INT32                 y,
341         const Plane_t*        r,
342         const MotionVector_t* mv)
343 {
344         INT32 dx = ((mv->X & 1) != 0);
345         INT32 dy = ((mv->Y & 1) != 0);
346
347         INT32 vx[2] = { mv->X >> 1 };
348         INT32 vy[2] = { mv->Y >> 1 };
349
350         UINT8* d = p->Plane + y * p->Pitch + x;
351
352         if (dx == 0 && dy == 0) {
353                 MotionComp_Compensate16x16_SSE2(
354                         d,
355                         p->Pitch,
356                         r,
357                         x + vx[0],
358                         y + vy[0]);
359
360         } else {
361                 vx[1] = vx[0];
362                 vy[1] = vy[0];
363
364                 vx[mv->X >= 0] += dx;
365                 vy[mv->Y >= 0] += dy;
366
367                 MotionComp_Compensate16x16H_SSE2(
368                         d,
369                         p->Pitch,
370                         r,
371                         x + vx[0],
372                         y + vy[0],
373                         x + vx[1],
374                         y + vy[1]);
375         }
376 }
377
378 void MotionComp_Block8x8Y_SSE2(
379         Plane_t*              p,
380         INT32                 x,
381         INT32                 y,
382         const Plane_t*        r,
383         const MotionVector_t* mv)
384 {
385         INT32 dx = ((mv->X & 1) != 0);
386         INT32 dy = ((mv->Y & 1) != 0);
387
388         INT32 vx[2] = { mv->X >> 1 };
389         INT32 vy[2] = { mv->Y >> 1 };
390
391         UINT8* d = p->Plane + y * p->Pitch + x;
392
393         if (dx == 0 && dy == 0) {
394                 MotionComp_Compensate8x8_SSE2(
395                         d,
396                         p->Pitch,
397                         r,
398                         x + vx[0],
399                         y + vy[0]);
400
401         } else {
402                 vx[1] = vx[0];
403                 vy[1] = vy[0];
404
405                 vx[mv->X >= 0] += dx;
406                 vy[mv->Y >= 0] += dy;
407
408                 MotionComp_Compensate8x8H_SSE2(
409                         d,
410                         p->Pitch,
411                         r,
412                         x + vx[0],
413                         y + vy[0],
414                         x + vx[1],
415                         y + vy[1]);
416         }
417 }
418
419 void MotionComp_Block8x8C_SSE2(
420         Plane_t*              p,
421         INT32                 x,
422         INT32                 y,
423         const Plane_t*        r,
424         const MotionVector_t* mv0)
425 {
426         MotionVector_t mv = {
427                 (mv0->X >> 1) | (mv0->X & 1),
428                 (mv0->Y >> 1) | (mv0->Y & 1)
429         };
430
431         INT32 dx = ((mv.X & 1) != 0);
432         INT32 dy = ((mv.Y & 1) != 0);
433
434         INT32 vx[2] = { mv.X >> 1 };
435         INT32 vy[2] = { mv.Y >> 1 };
436
437         UINT8* d = p->Plane + y * p->Pitch + x;
438
439         if (dx == 0 && dy == 0) {
440                 MotionComp_Compensate8x8_SSE2(
441                         d,
442                         p->Pitch,
443                         r,
444                         x + vx[0],
445                         y + vy[0]);
446
447         } else {
448                 vx[1] = vx[0];
449                 vy[1] = vy[0];
450
451                 vx[mv.X >= 0] += dx;
452                 vy[mv.Y >= 0] += dy;
453
454                 MotionComp_Compensate8x8H_SSE2(
455                         d,
456                         p->Pitch,
457                         r,
458                         x + vx[0],
459                         y + vy[0],
460                         x + vx[1],
461                         y + vy[1]);
462         }
463 }
464
465 /* */
466