1 /* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
18 #include "avcenc_lib.h"
19 /* 3/29/01 fast half-pel search based on neighboring guess */
20 /* value ranging from 0 to 4, high complexity (more accurate) to
21 low complexity (less accurate) */
22 #define HP_DISTANCE_TH 5 // 2 /* half-pel distance threshold */
24 #define PREF_16_VEC 129 /* 1MV bias versus 4MVs*/
/* Approximate distance between two half-pel positions, indexed as
   [previous half-pel guess][candidate k].  Position 0 is the center;
   positions 1..8 spiral around it (see xh[]/yh[] in AVCFindHalfPelMB).
   Candidates whose tabulated distance from the guess exceeds
   HP_DISTANCE_TH can be pruned from the search. */
26 const static int distance_tab[9][9] = /* [hp_guess][k] */
28 {0, 1, 1, 1, 1, 1, 1, 1, 1},
29 {1, 0, 1, 2, 3, 4, 3, 2, 1},
30 {1, 0, 0, 0, 1, 2, 3, 2, 1},
31 {1, 2, 1, 0, 1, 2, 3, 4, 3},
32 {1, 2, 1, 0, 0, 0, 1, 2, 3},
33 {1, 4, 3, 2, 1, 0, 1, 2, 3},
34 {1, 2, 3, 2, 1, 0, 0, 0, 1},
35 {1, 2, 3, 4, 3, 2, 1, 0, 1},
36 {1, 0, 1, 2, 3, 2, 1, 0, 0}
/* Saturate x to the 0..255 pixel range.  x>>31 is all-ones for a
   negative x (so ~(x>>31) masks the result to 0) and 0 for a positive
   overflow (so the result becomes 0xFF). */
39 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \
40 x = 0xFF & (~(x>>31));}
/* Same saturation idea for a value carried in bits 16..23 of a packed
   word; after clipping, the >>5 applies the 6-tap filter's /32
   normalization and the mask isolates the byte lane again. */
42 #define CLIP_UPPER16(x) if((uint)x >= 0x20000000){ \
43 x = 0xFF0000 & (~(x>>31));} \
45 x = (x>>5)&0xFF0000; \
48 /*=====================================================================
49 Function: AVCFindHalfPelMB
51 Purpose: Find half pel resolution MV surrounding the full-pel MV
52 =====================================================================*/
/* Refine a full-pel motion vector to half-pel and then quarter-pel
   accuracy for one 16x16 macroblock.
   encvid  - encoder state (supplies current MB, lambda, work buffers)
   cur     - (overwritten below with encvid->currYMB) current original MB
   mot     - full-pel MV for this MB (mot->x, mot->y, in quarter/half units
             per caller's convention -- NOTE(review): unit not visible here)
   ncand   - reference-frame pixels at the full-pel candidate position
   cmvx/y  - predicted MV used for the rate term of the cost
   Returns the best SATD+rate cost (return statement not visible in this
   extract).  Side effects: sets encvid->best_hpel_pos and
   encvid->best_qpel_pos (-1 means the center position won). */
54 int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
55 int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
57 AVCPictureData *currPic = encvid->common->currPic;
58 int lx = currPic->pitch;
59 int d, dmin, satd_min;
61 int lambda_motion = encvid->lambda_motion;
62 uint8 *mvbits = encvid->mvbits;
64 /* list of candidate to go through for half-pel search*/
65 uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
66 uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */
/* Spiral offsets (in half-pel units) of the 8 half-pel candidates
   around the center, and (in quarter-pel units) of the 8 quarter-pel
   candidates; index 0 of xh/yh is the center itself. */
68 int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
69 int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
70 int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
71 int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};
74 OSCL_UNUSED_ARG(xpos);
75 OSCL_UNUSED_ARG(ypos);
76 OSCL_UNUSED_ARG(hp_guess);
/* Interpolate all half-pel (and supporting) planes around ncand into
   the subpel_pred work buffer. */
78 GenerateHalfPelPred(subpel_pred, ncand, lx);
80 cur = encvid->currYMB; // pre-load current original MB
84 // find cost for the current full-pel position
85 dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD
86 mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
/* Half-pel pass: try the 8 neighbors, keeping the best SATD + MV-rate
   cost.  (Declarations of cand/mvcost/h/hmin/q/qmin and the update
   logic between these lines are missing from this extract.) */
92 for (h = 1; h < 9; h++)
94 d = SATD_MB(hpel_cand[h], cur, dmin);
95 mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
102 satd_min = d - mvcost;
109 encvid->best_hpel_pos = hmin;
111 /*** search for quarter-pel ****/
/* Build the 8 quarter-pel candidates by bilinear averaging around the
   winning half-pel position, then repeat the cost search. */
112 GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);
114 encvid->best_qpel_pos = qmin = -1;
116 for (q = 0; q < 8; q++)
118 d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
119 mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
125 satd_min = d - mvcost;
134 encvid->best_qpel_pos = qmin;
142 /** This function generates sub-pel prediction around the full-pel candidate.
143 Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */
144 /** The sub-pel position is labeled in spiral manner from the center. */
/* Generate all half-pel interpolated planes around the full-pel
   candidate ncand (reference pitch lx) into subpel_pred.  Each sub-pel
   plane is SUBPEL_PRED_BLK_SIZE bytes, 24 pixels wide (word alignment)
   -- see the comment block above this function.  Uses the standard
   H.264 6-tap filter (1,-5,20,20,-5,1); horizontal intermediates are
   kept at full 16-bit precision in tmp_horz so the diagonal (middle)
   positions can be filtered from unrounded values. */
146 void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
148 /* let's do straightforward way first */
153 int16 tmp_horz[18*22], *dst_16, *src_16;
154 register int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp register
158 /* first copy full-pel to the first array */
159 /* to be optimized later based on byte-offset load */
/* Copy a 24x22 window of reference pixels (3 left/above the MB for the
   filter taps) into the first plane, packing 4 bytes per 32-bit store. */
160 ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
163 dst -= 4; /* offset */
164 for (j = 0; j < 22; j++) /* 24x22 */
171 tmp32 |= (tmp8 << 8);
173 tmp32 |= (tmp8 << 16);
175 tmp32 |= (tmp8 << 24);
176 *((uint32*)(dst += 4)) = tmp32;
182 /* from the first array, we do horizontal interp */
/* Pass 1a: horizontal 6-tap filter for the top/bottom border rows,
   storing the un-normalized 16-bit sums into tmp_horz.  The a..f
   registers are rotated so each new pixel load feeds four output taps. */
183 ref = subpel_pred + 2;
184 dst_16 = tmp_horz; /* 17 x 22 */
186 for (j = 4; j > 0; j--)
188 for (i = 16; i > 0; i -= 4)
196 *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
198 *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
200 *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
202 *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);
206 /* do the 17th column here */
208 *dst_16 = e + d - 5 * (f + c) + 20 * (a + b);
209 dst_16 += 2; /* stride for tmp_horz is 18 */
210 ref += 8; /* stride for ref is 24 */
211 if (j == 3) // move 18 lines down
213 dst_16 += 324;//18*18;
/* Pass 1b: horizontal filter for the 18 middle rows; in addition to
   filling tmp_horz, round ((x+16)>>5), clip, and store the horizontal
   half-pel plane (V0Q_H2Q). */
219 dst_16 -= 360;//20*18;
220 dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/
222 for (j = 18; j > 0; j--)
224 for (i = 16; i > 0; i -= 4)
232 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
234 tmp32 = (tmp32 + 16) >> 5;
239 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
241 tmp32 = (tmp32 + 16) >> 5;
246 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
248 tmp32 = (tmp32 + 16) >> 5;
253 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
255 tmp32 = (tmp32 + 16) >> 5;
261 /* do the 17th column here */
263 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
265 tmp32 = (tmp32 + 16) >> 5;
269 dst += 8; /* stride for dst is 24 */
270 dst_16 += 2; /* stride for tmp_horz is 18 */
271 ref += 8; /* stride for ref is 24 */
275 /* Do middle point filtering*/
/* Pass 2: vertical 6-tap filter over the 16-bit horizontal sums in
   tmp_horz to produce the center (diagonal) half-pel plane (V2Q_H2Q);
   rounding is (x+512)>>10 because two un-normalized passes are combined.
   Note this walks column-major: dst += 24 steps one ROW down. */
276 src_16 = tmp_horz; /* 17 x 22 */
277 dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/
279 for (i = 0; i < 17; i++)
281 for (j = 16; j > 0; j -= 4)
290 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
291 tmp32 = (tmp32 + 512) >> 10;
293 *(dst += 24) = tmp32;
296 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
297 tmp32 = (tmp32 + 512) >> 10;
299 *(dst += 24) = tmp32;
302 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
303 tmp32 = (tmp32 + 512) >> 10;
305 *(dst += 24) = tmp32;
308 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
309 tmp32 = (tmp32 + 512) >> 10;
311 *(dst += 24) = tmp32;
316 d = src_16[90]; // 18*5
317 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
318 tmp32 = (tmp32 + 512) >> 10;
/* Step back up one column's worth and over to the next column. */
322 src_16 -= ((18 << 4) - 1);
323 dst -= ((24 << 4) - 1);
326 /* do vertical interpolation */
/* Pass 3a: vertical filter straight from the full-pel copy for the two
   left border columns of the vertical half-pel plane (V2Q_H0Q);
   rounding (x+16)>>5, column-major like pass 2. */
327 ref = subpel_pred + 2;
328 dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
331 for (i = 2; i > 0; i--)
333 for (j = 16; j > 0; j -= 4)
342 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
343 tmp32 = (tmp32 + 16) >> 5;
345 *(dst += 24) = tmp32; // 10th
348 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
349 tmp32 = (tmp32 + 16) >> 5;
351 *(dst += 24) = tmp32; // 10th
354 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
355 tmp32 = (tmp32 + 16) >> 5;
357 *(dst += 24) = tmp32; // 10th
360 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
361 tmp32 = (tmp32 + 16) >> 5;
363 *(dst += 24) = tmp32; // 10th
368 d = ref[120]; // 24*5
369 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
370 tmp32 = (tmp32 + 16) >> 5;
372 dst[24] = tmp32; // 10th
374 dst -= ((24 << 4) - 1);
375 ref -= ((24 << 4) - 1);
378 // note that using SIMD here doesn't help much, the cycle almost stays the same
379 // one can just use the above code and change the for(i=2 to for(i=18
/* Pass 3b: packed-SIMD variant of the vertical filter for the
   remaining 16 columns -- two pixel lanes are processed per 32-bit
   word (even bytes in a/c/e, odd bytes in b/d/f via the
   (x>>8)&0xFF00FF masks).  Offsets 24/48/72/120 index rows 1,2,3,5 of
   the 24-wide planes. */
380 for (i = 16; i > 0; i -= 4)
383 for (j = 17; j > 0; j--)
385 a = *((uint32*)ref); /* load 4 bytes */
386 b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
389 c = *((uint32*)(ref + 120));
390 d = (c >> 8) & 0xFF00FF;
396 e = *((uint32*)(ref + 72)); /* e, f */
397 f = (e >> 8) & 0xFF00FF;
400 c = *((uint32*)(ref + 48)); /* c, d */
401 d = (c >> 8) & 0xFF00FF;
412 e = *((uint32*)(ref += 24)); /* e, f */
413 f = (e >> 8) & 0xFF00FF;
416 c = *((uint32*)(ref + 72)); /* c, d */
417 d = (c >> 8) & 0xFF00FF;
441 a |= (b << 8); /* pack it back */
443 *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned.
444 *((uint16*)(dst + 2)) = a >> 16;
447 dst -= 404; // 24*17-4
449 /* if(msk & 0xFF00FF00) // need clipping
451 VertInterpWClip(dst,ref); // re-do 4 column with clip
/* Slow-path vertical 6-tap interpolation WITH per-pixel clipping.
   Re-filters a 4-column strip when the packed-SIMD pass in
   GenerateHalfPelPred detected a lane that needs saturation (see the
   commented-out msk check there).  dst/ref both have pitch 24 and are
   walked column-major: dst += 24 steps one row down, and the trailing
   adjustments step back up 16 rows and over one column. */
458 void VertInterpWClip(uint8 *dst, uint8 *ref)
461 int a, b, c, d, e, f;
467 for (i = 4; i > 0; i--)
469 for (j = 16; j > 0; j -= 4)
/* 6-tap (1,-5,20,20,-5,1) with (x+16)>>5 rounding; the a..f registers
   rotate through the four unrolled taps below (clip statements between
   these lines are missing from this extract). */
478 tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
479 tmp32 = (tmp32 + 16) >> 5;
481 *(dst += 24) = tmp32; // 10th
484 tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
485 tmp32 = (tmp32 + 16) >> 5;
487 *(dst += 24) = tmp32; // 10th
490 tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
491 tmp32 = (tmp32 + 16) >> 5;
493 *(dst += 24) = tmp32; // 10th
496 tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
497 tmp32 = (tmp32 + 16) >> 5;
499 *(dst += 24) = tmp32; // 10th
/* 17th row of the column */
504 d = ref[120]; // 24*5
505 tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
506 tmp32 = (tmp32 + 16) >> 5;
508 dst[24] = tmp32; // 10th
510 dst -= ((24 << 4) - 1);
511 ref -= ((24 << 4) - 1);
/* Build the 8 quarter-pel candidate blocks around the winning half-pel
   position by bilinear averaging ((x+y+1)>>1) of the four surrounding
   half/full-pel planes supplied in bilin_base[0..3].  The 8 candidates
   live 384 bytes apart in qpel_cand (c1 walks them with += 384);
   `offset` rewinds from candidate 8 back to candidate 1, advanced by
   one pixel. */
518 void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
520 // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2
523 uint8 *c1 = qpel_cand;
/* top-left / top-right / bottom-left / bottom-right source planes */
524 uint8 *tl = bilin_base[0];
525 uint8 *tr = bilin_base[1];
526 uint8 *bl = bilin_base[2];
527 uint8 *br = bilin_base[3];
529 int offset = 1 - (384 * 7);
/* Even half-pel position: quarter-pel points form a diamond, so each
   candidate averages two different neighbors (a..d loaded from the
   four planes in lines missing from this extract). */
531 if (!(hpel_pos&1)) // diamond pattern
544 *c1 = (c + a + 1) >> 1;
545 *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
546 *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
547 *(c1 += 384) = (b + d + 1) >> 1; /* c4 */
551 *(c1 += 384) = (c + d + 1) >> 1; /* c5 */
552 *(c1 += 384) = (b + d + 1) >> 1; /* c6 */
553 *(c1 += 384) = (b + c + 1) >> 1; /* c7 */
554 *(c1 += 384) = (b + a + 1) >> 1; /* c8 */
558 // advance to the next line, pitch is 24
/* Odd half-pel position: star pattern -- candidates alternate between
   averaging the center plane with its horizontal and vertical
   neighbors. */
577 *c1 = (a + b + 1) >> 1;
579 *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
581 *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
583 *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
585 *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
587 *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
589 *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
590 *(c1 += 384) = (a + c + 1) >> 1; /* c8 */
594 // advance to the next line, pitch is 24
607 /* assuming cand always has a pitch of 24 */
608 int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
613 dmin = (dmin << 16) | 24;
614 cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);