1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
18 #include "avcenc_lib.h"
19 /* 3/29/01 fast half-pel search based on neighboring guess */
20 /* value ranging from 0 to 4, high complexity (more accurate) to
21 low complexity (less accurate) */
22 #define HP_DISTANCE_TH 5 // 2 /* half-pel distance threshold */
24 #define PREF_16_VEC 129 /* 1MV bias versus 4MVs*/
/* Approximate distance between two half-pel positions, indexed as
   [previous half-pel guess][candidate k].  Position 0 is the center;
   positions 1..8 spiral around it (see xh[]/yh[] in AVCFindHalfPelMB).
   Candidates whose tabulated distance from the guess exceeds
   HP_DISTANCE_TH can be pruned from the search. */
26 const static int distance_tab[9][9] = /* [hp_guess][k] */
28 {0, 1, 1, 1, 1, 1, 1, 1, 1},
29 {1, 0, 1, 2, 3, 4, 3, 2, 1},
30 {1, 0, 0, 0, 1, 2, 3, 2, 1},
31 {1, 2, 1, 0, 1, 2, 3, 4, 3},
32 {1, 2, 1, 0, 0, 0, 1, 2, 3},
33 {1, 4, 3, 2, 1, 0, 1, 2, 3},
34 {1, 2, 3, 2, 1, 0, 0, 0, 1},
35 {1, 2, 3, 4, 3, 2, 1, 0, 1},
36 {1, 0, 1, 2, 3, 2, 1, 0, 0}
/* Saturate x to the 0..255 pixel range.  x>>31 is all-ones for a
   negative x (so ~(x>>31) masks the result to 0) and 0 for a positive
   overflow (so the result becomes 0xFF). */
39 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \
40 x = 0xFF & (~(x>>31));}
/* Same saturation idea for a value carried in bits 16..23 of a packed
   word; after clipping, the >>5 applies the 6-tap filter's /32
   normalization and the mask isolates the byte lane again. */
42 #define CLIP_UPPER16(x) if((uint)x >= 0x20000000){ \
43 x = 0xFF0000 & (~(x>>31));} \
45 x = (x>>5)&0xFF0000; \
48 /*=====================================================================
49 Function: AVCFindHalfPelMB
51 Purpose: Find half pel resolution MV surrounding the full-pel MV
52 =====================================================================*/
/* Refine a full-pel motion vector to half-pel and then quarter-pel
   accuracy for one 16x16 macroblock.
   encvid  - encoder state (supplies current MB, lambda, work buffers)
   cur     - (overwritten below with encvid->currYMB) current original MB
   mot     - full-pel MV for this MB (mot->x, mot->y, in quarter/half units
             per caller's convention -- NOTE(review): unit not visible here)
   ncand   - reference-frame pixels at the full-pel candidate position
   cmvx/y  - predicted MV used for the rate term of the cost
   Returns the best SATD+rate cost (return statement not visible in this
   extract).  Side effects: sets encvid->best_hpel_pos and
   encvid->best_qpel_pos (-1 means the center position won). */
54 int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
55 int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
57 AVCPictureData *currPic = encvid->common->currPic;
58 int lx = currPic->pitch;
59 int d, dmin, satd_min;
61 int lambda_motion = encvid->lambda_motion;
62 uint8 *mvbits = encvid->mvbits;
64 /* list of candidate to go through for half-pel search*/
65 uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
66 uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */
/* Spiral offsets (in half-pel units) of the 8 half-pel candidates
   around the center, and (in quarter-pel units) of the 8 quarter-pel
   candidates; index 0 of xh/yh is the center itself. */
68 int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
69 int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
70 int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
71 int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};
74 OSCL_UNUSED_ARG(xpos);
75 OSCL_UNUSED_ARG(ypos);
76 OSCL_UNUSED_ARG(hp_guess);
/* Interpolate all half-pel (and supporting) planes around ncand into
   the subpel_pred work buffer. */
78 GenerateHalfPelPred(subpel_pred, ncand, lx);
80 cur = encvid->currYMB; // pre-load current original MB
84 // find cost for the current full-pel position
85 dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD
86 mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
/* Half-pel pass: try the 8 neighbors, keeping the best SATD + MV-rate
   cost.  (Declarations of cand/mvcost/h/hmin/q/qmin and the update
   logic between these lines are missing from this extract.) */
92 for (h = 1; h < 9; h++)
94 d = SATD_MB(hpel_cand[h], cur, dmin);
95 mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
102 satd_min = d - mvcost;
109 encvid->best_hpel_pos = hmin;
111 /*** search for quarter-pel ****/
/* Build the 8 quarter-pel candidates by bilinear averaging around the
   winning half-pel position, then repeat the cost search. */
112 GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);
114 encvid->best_qpel_pos = qmin = -1;
116 for (q = 0; q < 8; q++)
118 d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
119 mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
125 satd_min = d - mvcost;
134 encvid->best_qpel_pos = qmin;
142 /** This function generates sub-pel prediction around the full-pel candidate.
143 Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */
144 /** The sub-pel position is labeled in spiral manner from the center. */
/* Generate all half-pel interpolated planes around the full-pel
   candidate ncand (reference pitch lx) into subpel_pred.  Each sub-pel
   plane is SUBPEL_PRED_BLK_SIZE bytes, 24 pixels wide (word alignment)
   -- see the comment block above this function.  Uses the standard
   H.264 6-tap filter (1,-5,20,20,-5,1); horizontal intermediates are
   kept at full 16-bit precision in tmp_horz so the diagonal (middle)
   positions can be filtered from unrounded values. */
146 void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
148 /* let's do straightforward way first */
153 int16 tmp_horz[18*22], *dst_16, *src_16;
154 register int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp register
158 /* first copy full-pel to the first array */
159 /* to be optimized later based on byte-offset load */
/* Copy a 24x22 window of reference pixels (3 left/above the MB for the
   filter taps) into the first plane, packing 4 bytes per 32-bit store. */
160 ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
163 dst -= 4; /* offset */
164 for (j = 0; j < 22; j++) /* 24x22 */
171 tmp32 |= (tmp8 << 8);
173 tmp32 |= (tmp8 << 16);
175 tmp32 |= (tmp8 << 24);
176 *((uint32*)(dst += 4)) = tmp32;
182 /* from the first array, we do horizontal interp */
/* Pass 1a: horizontal 6-tap filter for the top/bottom border rows,
   storing the un-normalized 16-bit sums into tmp_horz.  The a..f
   registers are rotated so each new pixel load feeds four output taps. */
183 ref = subpel_pred + 2;
184 dst_16 = tmp_horz; /* 17 x 22 */
186 for (j = 4; j > 0; j--)
188 for (i = 16; i > 0; i -= 4)
196 *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
198 *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
200 *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
202 *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);
206 /* do the 17th column here */
208 *dst_16 = e + d - 5 * (f + c) + 20 * (a + b);
209 dst_16 += 2; /* stride for tmp_horz is 18 */
210 ref += 8; /* stride for ref is 24 */
211 if (j == 3) // move 18 lines down
213 dst_16 += 324;//18*18;
/* Pass 1b: horizontal filter for the 18 middle rows; in addition to
   filling tmp_horz, round ((x+16)>>5), clip, and store the horizontal
   half-pel plane (V0Q_H2Q). */
219 dst_16 -= 360;//20*18;
220 dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/
222 for (j = 18; j > 0; j--)
224 for (i = 16; i > 0; i -= 4)
232 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
234 tmp32 = (tmp32 + 16) >> 5;
239 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
241 tmp32 = (tmp32 + 16) >> 5;
246 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
248 tmp32 = (tmp32 + 16) >> 5;
253 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
255 tmp32 = (tmp32 + 16) >> 5;
261 /* do the 17th column here */
263 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
265 tmp32 = (tmp32 + 16) >> 5;
269 dst += 8; /* stride for dst is 24 */
270 dst_16 += 2; /* stride for tmp_horz is 18 */
271 ref += 8; /* stride for ref is 24 */
275 /* Do middle point filtering*/
/* Pass 2: vertical 6-tap filter over the 16-bit horizontal sums in
   tmp_horz to produce the center (diagonal) half-pel plane (V2Q_H2Q);
   rounding is (x+512)>>10 because two un-normalized passes are combined.
   Note this walks column-major: dst += 24 steps one ROW down. */
276 src_16 = tmp_horz; /* 17 x 22 */
277 dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/
279 for (i = 0; i < 17; i++)
281 for (j = 16; j > 0; j -= 4)
290 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
291 tmp32 = (tmp32 + 512) >> 10;
293 *(dst += 24) = tmp32;
296 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
297 tmp32 = (tmp32 + 512) >> 10;
299 *(dst += 24) = tmp32;
302 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
303 tmp32 = (tmp32 + 512) >> 10;
305 *(dst += 24) = tmp32;
308 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
309 tmp32 = (tmp32 + 512) >> 10;
311 *(dst += 24) = tmp32;
316 d = src_16[90]; // 18*5
317 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
318 tmp32 = (tmp32 + 512) >> 10;
/* Step back up one column's worth and over to the next column. */
322 src_16 -= ((18 << 4) - 1);
323 dst -= ((24 << 4) - 1);
326 /* do vertical interpolation */
/* Pass 3a: vertical filter straight from the full-pel copy for the two
   left border columns of the vertical half-pel plane (V2Q_H0Q);
   rounding (x+16)>>5, column-major like pass 2. */
327 ref = subpel_pred + 2;
328 dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
331 for (i = 2; i > 0; i--)
333 for (j = 16; j > 0; j -= 4)
342 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
343 tmp32 = (tmp32 + 16) >> 5;
345 *(dst += 24) = tmp32; // 10th
348 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
349 tmp32 = (tmp32 + 16) >> 5;
351 *(dst += 24) = tmp32; // 10th
354 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
355 tmp32 = (tmp32 + 16) >> 5;
357 *(dst += 24) = tmp32; // 10th
360 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
361 tmp32 = (tmp32 + 16) >> 5;
363 *(dst += 24) = tmp32; // 10th
368 d = ref[120]; // 24*5
369 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
370 tmp32 = (tmp32 + 16) >> 5;
372 dst[24] = tmp32; // 10th
374 dst -= ((24 << 4) - 1);
375 ref -= ((24 << 4) - 1);
378 // note that using SIMD here doesn't help much, the cycle almost stays the same
379 // one can just use the above code and change the for(i=2 to for(i=18
/* Pass 3b: packed-SIMD variant of the vertical filter for the
   remaining 16 columns -- two pixel lanes are processed per 32-bit
   word (even bytes in a/c/e, odd bytes in b/d/f via the
   (x>>8)&0xFF00FF masks).  Offsets 24/48/72/120 index rows 1,2,3,5 of
   the 24-wide planes. */
380 for (i = 16; i > 0; i -= 4)
383 for (j = 17; j > 0; j--)
385 a = *((uint32*)ref); /* load 4 bytes */
386 b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
389 c = *((uint32*)(ref + 120));
390 d = (c >> 8) & 0xFF00FF;
396 e = *((uint32*)(ref + 72)); /* e, f */
397 f = (e >> 8) & 0xFF00FF;
400 c = *((uint32*)(ref + 48)); /* c, d */
401 d = (c >> 8) & 0xFF00FF;
412 e = *((uint32*)(ref += 24)); /* e, f */
413 f = (e >> 8) & 0xFF00FF;
416 c = *((uint32*)(ref + 72)); /* c, d */
417 d = (c >> 8) & 0xFF00FF;
441 a |= (b << 8); /* pack it back */
443 *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned.
444 *((uint16*)(dst + 2)) = a >> 16;
447 dst -= 404; // 24*17-4
449 /* if(msk & 0xFF00FF00) // need clipping
451 VertInterpWClip(dst,ref); // re-do 4 column with clip
/* Slow-path vertical 6-tap interpolation WITH per-pixel clipping.
   Re-filters a 4-column strip when the packed-SIMD pass in
   GenerateHalfPelPred detected a lane that needs saturation (see the
   commented-out msk check there).  dst/ref both have pitch 24 and are
   walked column-major: dst += 24 steps one row down, and the trailing
   adjustments step back up 16 rows and over one column. */
458 void VertInterpWClip(uint8 *dst, uint8 *ref)
461 int a, b, c, d, e, f;
467 for (i = 4; i > 0; i--)
469 for (j = 16; j > 0; j -= 4)
/* 6-tap (1,-5,20,20,-5,1) with (x+16)>>5 rounding; the a..f registers
   rotate through the four unrolled taps below (clip statements between
   these lines are missing from this extract). */
478 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
479 tmp32 = (tmp32 + 16) >> 5;
481 *(dst += 24) = tmp32; // 10th
484 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
485 tmp32 = (tmp32 + 16) >> 5;
487 *(dst += 24) = tmp32; // 10th
490 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
491 tmp32 = (tmp32 + 16) >> 5;
493 *(dst += 24) = tmp32; // 10th
496 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
497 tmp32 = (tmp32 + 16) >> 5;
499 *(dst += 24) = tmp32; // 10th
/* 17th row of the column */
504 d = ref[120]; // 24*5
505 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
506 tmp32 = (tmp32 + 16) >> 5;
508 dst[24] = tmp32; // 10th
510 dst -= ((24 << 4) - 1);
511 ref -= ((24 << 4) - 1);
/* Build the 8 quarter-pel candidate blocks around the winning half-pel
   position by bilinear averaging ((x+y+1)>>1) of the four surrounding
   half/full-pel planes supplied in bilin_base[0..3].  The 8 candidates
   live 384 bytes apart in qpel_cand (c1 walks them with += 384);
   `offset` rewinds from candidate 8 back to candidate 1, advanced by
   one pixel. */
518 void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
520 // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2
523 uint8 *c1 = qpel_cand;
/* top-left / top-right / bottom-left / bottom-right source planes */
524 uint8 *tl = bilin_base[0];
525 uint8 *tr = bilin_base[1];
526 uint8 *bl = bilin_base[2];
527 uint8 *br = bilin_base[3];
529 int offset = 1 - (384 * 7);
/* Even half-pel position: quarter-pel points form a diamond, so each
   candidate averages two different neighbors (a..d loaded from the
   four planes in lines missing from this extract). */
531 if (!(hpel_pos&1)) // diamond pattern
544 *c1 = (c + a + 1) >> 1;
545 *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
546 *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
547 *(c1 += 384) = (b + d + 1) >> 1; /* c4 */
551 *(c1 += 384) = (c + d + 1) >> 1; /* c5 */
552 *(c1 += 384) = (b + d + 1) >> 1; /* c6 */
553 *(c1 += 384) = (b + c + 1) >> 1; /* c7 */
554 *(c1 += 384) = (b + a + 1) >> 1; /* c8 */
558 // advance to the next line, pitch is 24
/* Odd half-pel position: star pattern -- candidates alternate between
   averaging the center plane with its horizontal and vertical
   neighbors. */
577 *c1 = (a + b + 1) >> 1;
579 *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
581 *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
583 *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
585 *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
587 *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
589 *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
590 *(c1 += 384) = (a + c + 1) >> 1; /* c8 */
594 // advance to the next line, pitch is 24
607 /* assuming cand always has a pitch of 24 */
608 int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
613 dmin = (dmin << 16) | 24;
614 cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);