1 /* MotionComp_SSE2.c */
6 #include "MotionComp_SSE2.h"
10 #pragma warning(disable : 4799)
/*
 * Block_Extract8x8: copy an 8x8 pixel block from `plane` at (x, y) into the
 * caller-supplied 64-byte `block` buffer (row-major, 8 bytes per row),
 * clamping source coordinates that fall outside the plane to the nearest
 * edge pixel.  This is the slow fallback used when a motion-compensated
 * block straddles the picture boundary.
 * NOTE(review): parts of this definition are elided in this view; the
 * branches for xx < 0 / yy < 0 are presumed to clamp to 0 symmetrically
 * with the visible >= CX / >= CY cases — confirm against the full file.
 */
14 static void Block_Extract8x8(
22 for (i = 0; i < 8; i++) {
23 for (j = 0; j < 8; j++) {
/* clamp the source column to the right edge of the plane */
29 } else if (xx >= plane->CX) {
/* clamp the source row to the bottom edge of the plane */
35 } else if (yy >= plane->CY) {
/* fetch the (possibly edge-clamped) source pixel */
39 block[i * 8 + j] = plane->Plane[yy * plane->Pitch + xx];
/*
 * MotionComp_Compensate16x16_SSE2: full-pel 16x16 motion compensation.
 * Copies a 16x16 block from reference plane `r` at (x, y) into destination
 * `p` (destination pitch `pitch`, assumed 16-byte aligned for the SSE2
 * stores).  Fast path: when the whole block lies strictly inside the
 * reference plane, copy 16 rows with unaligned 128-bit loads and aligned
 * 128-bit stores (two unrolled groups of 8 rows).  Otherwise fall back to
 * four 8x8 compensations, which handle edge clamping themselves.
 * NOTE(review): the `x + 16 < r->CX` test is strict, so a block flush
 * against the right/bottom edge also takes the (correct but slower)
 * 8x8 fallback path — presumably intentional; confirm.
 */
46 void MotionComp_Compensate16x16_SSE2(
53 if (x >= 0 && x + 16 < r->CX &&
54 y >= 0 && y + 16 < r->CY) {
55 const UINT8* s = r->Plane + y * r->Pitch + x;
58 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
/* rows 0-7: unaligned loads from the reference plane */
60 s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
61 s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
62 s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
63 s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
64 s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
65 s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
66 s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
67 s7 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
/* rows 0-7: aligned stores to the destination */
69 _mm_store_si128((__m128i*)d, s0); d += pitch;
70 _mm_store_si128((__m128i*)d, s1); d += pitch;
71 _mm_store_si128((__m128i*)d, s2); d += pitch;
72 _mm_store_si128((__m128i*)d, s3); d += pitch;
73 _mm_store_si128((__m128i*)d, s4); d += pitch;
74 _mm_store_si128((__m128i*)d, s5); d += pitch;
75 _mm_store_si128((__m128i*)d, s6); d += pitch;
76 _mm_store_si128((__m128i*)d, s7); d += pitch;
/* rows 8-15: second unrolled group */
78 s0 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
79 s1 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
80 s2 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
81 s3 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
82 s4 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
83 s5 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
84 s6 = _mm_loadu_si128((const __m128i*)s); s += r->Pitch;
85 s7 = _mm_loadu_si128((const __m128i*)s);
87 _mm_store_si128((__m128i*)d, s0); d += pitch;
88 _mm_store_si128((__m128i*)d, s1); d += pitch;
89 _mm_store_si128((__m128i*)d, s2); d += pitch;
90 _mm_store_si128((__m128i*)d, s3); d += pitch;
91 _mm_store_si128((__m128i*)d, s4); d += pitch;
92 _mm_store_si128((__m128i*)d, s5); d += pitch;
93 _mm_store_si128((__m128i*)d, s6); d += pitch;
94 _mm_store_si128((__m128i*)d, s7);
/* slow path: decompose into quadrants; each 8x8 call re-checks bounds
 * and clamps at the picture edge via Block_Extract8x8 */
97 MotionComp_Compensate8x8_SSE2(p, pitch, r, x, y );
98 MotionComp_Compensate8x8_SSE2(p + 8, pitch, r, x + 8, y );
99 MotionComp_Compensate8x8_SSE2(p + 8 * pitch, pitch, r, x, y + 8);
100 MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
/*
 * MotionComp_Compensate8x8_SSE2: full-pel 8x8 motion compensation.
 * Copies an 8x8 block from reference plane `r` at (x, y) to `p`.  If the
 * block would read outside the plane, the source is first extracted into
 * the aligned local buffer `b` with edge clamping (Block_Extract8x8) and
 * copied from there instead (elided lines presumably redirect `s`/`p0` to
 * `b` — confirm).  Rows are moved 8 bytes at a time via MMX __m64 moves;
 * no _mm_empty() is issued here (see the C4799 pragma at the top of the
 * file), so the caller is responsible for the x87/MMX state.
 */
104 void MotionComp_Compensate8x8_SSE2(
111 ALIGN(0x10) UINT8 b[64];
113 const UINT8* s = r->Plane + y * r->Pitch + x;
117 __m64 s0, s1, s2, s3, s4, s5, s6, s7;
/* out-of-bounds: take the clamped-extraction slow path */
119 if (x < 0 || x + 8 >= r->CX ||
120 y < 0 || y + 8 >= r->CY) {
123 Block_Extract8x8(r, x, y, b);
/* load 8 source rows (p0 is the source pitch in effect) */
126 s0 = *((const __m64*)s); s += p0;
127 s1 = *((const __m64*)s); s += p0;
128 s2 = *((const __m64*)s); s += p0;
129 s3 = *((const __m64*)s); s += p0;
130 s4 = *((const __m64*)s); s += p0;
131 s5 = *((const __m64*)s); s += p0;
132 s6 = *((const __m64*)s); s += p0;
133 s7 = *((const __m64*)s);
/* store 8 destination rows (final s7 store is outside this view) */
135 *((__m64*)d) = s0; d += pitch;
136 *((__m64*)d) = s1; d += pitch;
137 *((__m64*)d) = s2; d += pitch;
138 *((__m64*)d) = s3; d += pitch;
139 *((__m64*)d) = s4; d += pitch;
140 *((__m64*)d) = s5; d += pitch;
141 *((__m64*)d) = s6; d += pitch;
/*
 * Per-byte 0x01 mask.  _mm_avg_epu8 computes (a + b + 1) >> 1 (rounds up);
 * subtracting ((a ^ b) & 1) afterwards yields the truncating average
 * (a + b) >> 1 required for half-pel prediction.  16-byte aligned so it
 * can be loaded with a direct __m128i/__m64 dereference.
 */
147 ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
/*
 * MotionComp_Compensate16x16H_SSE2: half-pel 16x16 motion compensation.
 * Averages two 16x16 reference blocks, at (x0, y0) and (x1, y1), into `p`.
 * Each row computes the truncating per-byte average
 *     D = avg_epu8(S0, S1) - ((S0 ^ S1) & 1)
 * i.e. (S0 + S1) >> 1, since avg_epu8 alone would round up (see MASK_1).
 * Fast path requires BOTH source blocks strictly inside the plane; the
 * 16 rows are fully unrolled.  Otherwise falls back to four half-pel 8x8
 * compensations which perform edge clamping.
 */
149 void MotionComp_Compensate16x16H_SSE2(
158 if (x0 >= 0 && x0 + 16 < r->CX &&
159 y0 >= 0 && y0 + 16 < r->CY &&
160 x1 >= 0 && x1 + 16 < r->CX &&
161 y1 >= 0 && y1 + 16 < r->CY) {
162 const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
163 const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
/* M = 0x01 in every byte; corrects avg_epu8's round-up to round-down */
168 const __m128i M = *((const __m128i*)MASK_1);
/* row 0 */
170 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
171 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
172 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
173 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 1 */
175 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
176 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
177 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
178 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 2 */
180 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
181 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
182 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
183 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 3 */
185 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
186 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
187 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
188 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 4 */
190 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
191 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
192 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
193 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 5 */
195 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
196 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
197 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
198 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 6 */
200 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
201 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
202 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
203 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 7 */
205 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
206 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
207 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
208 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 8 */
210 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
211 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
212 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
213 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 9 */
215 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
216 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
217 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
218 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 10 */
220 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
221 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
222 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
223 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 11 */
225 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
226 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
227 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
228 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 12 */
230 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
231 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
232 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
233 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 13 */
235 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
236 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
237 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
238 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 14 */
240 S0 = _mm_loadu_si128((const __m128i*)s0); s0 += r->Pitch;
241 S1 = _mm_loadu_si128((const __m128i*)s1); s1 += r->Pitch;
242 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
243 _mm_store_si128((__m128i*)d, D); d += pitch;
/* row 15: no trailing pointer bumps needed */
245 S0 = _mm_loadu_si128((const __m128i*)s0);
246 S1 = _mm_loadu_si128((const __m128i*)s1);
247 D = _mm_sub_epi8(_mm_avg_epu8(S0, S1), _mm_and_si128(_mm_xor_si128(S0, S1), M));
248 _mm_store_si128((__m128i*)d, D);
/* slow path: quadrant decomposition with edge-clamping 8x8 half-pel MC */
251 MotionComp_Compensate8x8H_SSE2(p, pitch, r, x0, y0 , x1, y1 );
252 MotionComp_Compensate8x8H_SSE2(p + 8, pitch, r, x0 + 8, y0 , x1 + 8, y1 );
253 MotionComp_Compensate8x8H_SSE2(p + 8 * pitch, pitch, r, x0, y0 + 8, x1, y1 + 8);
254 MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
/*
 * MotionComp_Compensate8x8H_SSE2: half-pel 8x8 motion compensation.
 * Averages two 8x8 reference blocks at (x0, y0) and (x1, y1) into `p`
 * using the same truncating-average identity as the 16x16 version:
 *     D = avg_pu8(S0, S1) - ((S0 ^ S1) & 1)  ==  (S0 + S1) >> 1 per byte.
 * If either block would read outside the plane, both are first extracted
 * with edge clamping into the aligned buffers b0/b1 (elided lines
 * presumably redirect s0/s1 and the pitches p0/p1 to those buffers —
 * confirm).  Uses MMX __m64 ops; no _mm_empty() here (C4799 pragma),
 * the caller owns the x87/MMX state transition.
 */
258 void MotionComp_Compensate8x8H_SSE2(
267 ALIGN(0x10) UINT8 b0[64], b1[64];
269 const UINT8* s0 = r->Plane + y0 * r->Pitch + x0;
272 const UINT8* s1 = r->Plane + y1 * r->Pitch + x1;
/* M = 0x01 in every byte (low half of MASK_1) */
278 const __m64 M = *((const __m64*)MASK_1);
/* either block out of bounds -> clamped extraction slow path */
280 if (x0 < 0 || x0 + 8 >= r->CX ||
281 y0 < 0 || y0 + 8 >= r->CY ||
282 x1 < 0 || x1 + 8 >= r->CX ||
283 y1 < 0 || y1 + 8 >= r->CY) {
290 Block_Extract8x8(r, x0, y0, b0);
291 Block_Extract8x8(r, x1, y1, b1);
/* row 0 */
294 S0 = *((const __m64*)s0); s0 += p0;
295 S1 = *((const __m64*)s1); s1 += p1;
296 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
297 *((__m64*)d) = D; d += pitch;
/* row 1 */
299 S0 = *((const __m64*)s0); s0 += p0;
300 S1 = *((const __m64*)s1); s1 += p1;
301 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
302 *((__m64*)d) = D; d += pitch;
/* row 2 */
304 S0 = *((const __m64*)s0); s0 += p0;
305 S1 = *((const __m64*)s1); s1 += p1;
306 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
307 *((__m64*)d) = D; d += pitch;
/* row 3 */
309 S0 = *((const __m64*)s0); s0 += p0;
310 S1 = *((const __m64*)s1); s1 += p1;
311 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
312 *((__m64*)d) = D; d += pitch;
/* row 4 */
314 S0 = *((const __m64*)s0); s0 += p0;
315 S1 = *((const __m64*)s1); s1 += p1;
316 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
317 *((__m64*)d) = D; d += pitch;
/* row 5 */
319 S0 = *((const __m64*)s0); s0 += p0;
320 S1 = *((const __m64*)s1); s1 += p1;
321 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
322 *((__m64*)d) = D; d += pitch;
/* row 6 */
324 S0 = *((const __m64*)s0); s0 += p0;
325 S1 = *((const __m64*)s1); s1 += p1;
326 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
327 *((__m64*)d) = D; d += pitch;
/* row 7: final store is outside this view */
329 S0 = *((const __m64*)s0);
330 S1 = *((const __m64*)s1);
331 D = _mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
/*
 * MotionComp_Block16x16_SSE2: dispatch 16x16 motion compensation for a
 * half-pel motion vector `mv` (units of half pixels, given how the low
 * bit is split off below).  Full-pel case copies directly; otherwise two
 * full-pel predictors are averaged by the H variant.
 */
337 void MotionComp_Block16x16_SSE2(
342 const MotionVector_t* mv)
/* dx/dy: 1 if the vector has a half-pel component in that axis */
344 INT32 dx = ((mv->X & 1) != 0);
345 INT32 dy = ((mv->Y & 1) != 0);
/* two candidate full-pel offsets per axis; vx[1]/vy[1] appear
 * zero-initialized here — NOTE(review): elided lines presumably copy
 * vx[0] -> vx[1] (and vy likewise) before the adjustment below; confirm */
347 INT32 vx[2] = { mv->X >> 1 };
348 INT32 vy[2] = { mv->Y >> 1 };
350 UINT8* d = p->Plane + y * p->Pitch + x;
/* pure full-pel vector: plain block copy */
352 if (dx == 0 && dy == 0) {
353 MotionComp_Compensate16x16_SSE2(
/* half-pel: bump one predictor by 1, choosing which by the vector's
 * sign so rounding is toward the vector direction */
364 vx[mv->X >= 0] += dx;
365 vy[mv->Y >= 0] += dy;
367 MotionComp_Compensate16x16H_SSE2(
/*
 * MotionComp_Block8x8Y_SSE2: 8x8 luma motion compensation with a
 * half-pel vector.  Same dispatch logic as the 16x16 block version:
 * full-pel -> direct copy, half-pel -> average of two full-pel
 * predictors via the H variant.
 */
378 void MotionComp_Block8x8Y_SSE2(
383 const MotionVector_t* mv)
/* half-pel flags per axis */
385 INT32 dx = ((mv->X & 1) != 0);
386 INT32 dy = ((mv->Y & 1) != 0);
/* NOTE(review): as in Block16x16, vx[1]/vy[1] look zero-initialized here;
 * elided lines presumably duplicate element 0 — confirm */
388 INT32 vx[2] = { mv->X >> 1 };
389 INT32 vy[2] = { mv->Y >> 1 };
391 UINT8* d = p->Plane + y * p->Pitch + x;
393 if (dx == 0 && dy == 0) {
394 MotionComp_Compensate8x8_SSE2(
/* bump the predictor selected by the vector's sign */
405 vx[mv->X >= 0] += dx;
406 vy[mv->Y >= 0] += dy;
408 MotionComp_Compensate8x8H_SSE2(
/*
 * MotionComp_Block8x8C_SSE2: 8x8 chroma motion compensation.  The luma
 * vector `mv0` is scaled to chroma resolution: halve each component while
 * OR-ing in the dropped low bit, so any fractional luma motion keeps a
 * half-pel chroma component.  Dispatch then mirrors the luma 8x8 path.
 * (The tail of this function is outside this view.)
 */
419 void MotionComp_Block8x8C_SSE2(
424 const MotionVector_t* mv0)
/* luma -> chroma vector: >>1 with sticky low bit */
426 MotionVector_t mv = {
427 (mv0->X >> 1) | (mv0->X & 1),
428 (mv0->Y >> 1) | (mv0->Y & 1)
/* half-pel flags per axis of the chroma vector */
431 INT32 dx = ((mv.X & 1) != 0);
432 INT32 dy = ((mv.Y & 1) != 0);
/* NOTE(review): vx[1]/vy[1] look zero-initialized; elided lines presumably
 * duplicate element 0 before the sign-based bump — confirm */
434 INT32 vx[2] = { mv.X >> 1 };
435 INT32 vy[2] = { mv.Y >> 1 };
437 UINT8* d = p->Plane + y * p->Pitch + x;
439 if (dx == 0 && dy == 0) {
440 MotionComp_Compensate8x8_SSE2(
454 MotionComp_Compensate8x8H_SSE2(